# Setup

In [1]:
# NOTE(review): `!` runs this command in a temporary subshell, so it presumably
# does not change the environment of the already-running kernel -- confirm, and
# prefer starting Jupyter with (or selecting a kernel from) the `akita` env.
!source activate /home/gilbee3/anaconda3/envs/akita

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True) # Seaborn desaturates matplotlib colors (this is useful if you use both seaborn and basic matplot lib and want colors to be consistent)
import scipy.stats as stats
sns.set(palette='husl', context = 'poster', style='white')
import os
import scikit_posthocs as sp
import pybedtools

In [6]:
# Project root = current working directory with its last two path components
# removed, so every path below depends on where the notebook is launched from.
BASE_PATH = "/".join(os.getcwd().split("/")[:-2]) # base directory level

BIN_PATH = os.path.join(BASE_PATH, "bin")  # where my scripts live
DATA_PATH = os.path.join(BASE_PATH, "data")  # where I dump new data.
RESULTS_PATH = os.path.join(BASE_PATH, "results")  # where I analyze results

SRC_PATH = os.path.join(BASE_PATH, "src")  # where any packages needed to run analyses live. I haven't started structuring things this way yet.

# Directory holding the per-individual comparison files; comp_data_setup reads
# this module-level name, and later cells rebind it.
COMP_PATH = os.path.join(DATA_PATH,"pairwise/hsmrca")
# Alternative comparison set (vs. the reference); uncomment to switch.
#COMP_PATH = os.path.join(DATA_PATH,"pairwise/reference")

In [7]:
COMP_PATH

'/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/hsmrca'

## Formatting data for analysis
The tables built in this step are written to CSV files, so downstream analyses can load them directly instead of rebuilding them each time the notebook is started.

In [None]:
# NOTE(review): comp_data_setup is defined further down (under "## Functions");
# on a fresh kernel that cell has to be run before this one.
COMP_PATH = os.path.join(DATA_PATH,"pairwise/hsmrca")
anc_list = os.path.join(DATA_PATH, "reference/lists/listOfPairwiseComps_hsmrca.txt")
anc_g, anc_m, anc_s = comp_data_setup(anc_list, 'hsmrca_ancestral')
# comp_data_setup leaves '' in genome_avg_spearman for individuals whose
# comparison file was not loaded; drop those rows.
anc_g = anc_g[anc_g.genome_avg_spearman!='']

In [None]:
write_comp_dfs('anc', anc_g, anc_m, anc_s)

## Functions

In [None]:
def write_comp_dfs(base, g, m, s,
                   out_dir='/wynton/group/capra/projects/modern_human_3Dgenome/results/comp_tables'):
    """Write the three comparison tables for one comparison set to CSV.

    Parameters
    ----------
    base : str
        Prefix used in the output file names (e.g. 'anc').
    g, m, s : pandas.DataFrame
        Genome-wide averages, per-window MSE and per-window Spearman tables.
    out_dir : str, optional
        Directory the files are written to. Defaults to the project's
        ``results/comp_tables`` directory, i.e. the location that was
        previously hard-coded.

    Returns
    -------
    None
    """
    outputs = (
        (g, '%s_genomewide_averages.csv'),
        (m, '%s_window_mse.csv'),
        (s, '%s_window_spearman.csv'),
    )
    for table, name_template in outputs:
        table.to_csv(os.path.join(out_dir, name_template % base))

In [None]:
def comp_data_setup(list_file, base, comp_path=None):
    """Collect per-individual comparison files into analysis-ready tables.

    Parameters
    ----------
    list_file : str
        Space-separated file with two columns: individual name and the
        sequence it was compared against.
    base : str
        Name of the comparison target. Unless it is 'hg38_reference', only
        rows of ``list_file`` whose second column equals ``base`` are used.
    comp_path : str, optional
        Directory containing the ``3dcomp_<individual>_vs_<base>.txt`` files.
        Defaults to the notebook-level ``COMP_PATH``, looked up at call time.

    Returns
    -------
    genome_wide : pandas.DataFrame
        One row per listed individual with ``genome_avg_mse`` and
        ``genome_avg_spearman`` (left as '' when no file was loaded) plus
        ``super_pop``, ``sub_pop``, ``sex`` and ``id`` taken from the
        underscore-separated individual name.
    mse, spear : pandas.DataFrame
        Per-window values indexed by (chr, windowStartPos), one column per
        loaded individual; column labels are split on '_' into a MultiIndex.
        The window index is taken from the first loaded individual.

    Raises
    ------
    ValueError
        If no comparison file could be loaded.
    """
    if comp_path is None:
        # COMP_PATH is rebound by several notebook cells; read it when called.
        comp_path = COMP_PATH

    comp_list = pd.read_table(list_file, sep=' ', names=['1KG', 'ancestor'])
    if base != 'hg38_reference':
        base_df = comp_list[comp_list.ancestor == base].set_index('1KG')
    else:
        base_df = comp_list.set_index('1KG')

    comp_dict = {}
    print('first for')
    for row in base_df.index:
        ind = str(row)
        path = os.path.join(comp_path, '3dcomp_%s_vs_%s.txt' % (ind, base))
        if not os.path.exists(path):
            continue
        try:
            df = pd.read_table(path)
            comp_dict[ind] = df[df.chr != 'chrX']
        except Exception as err:
            # Still best-effort, but say which file was skipped and why
            # (and no longer swallow KeyboardInterrupt).
            print('skipping %s: %r' % (path, err))
    print('done')

    if not comp_dict:
        raise ValueError(
            "no readable comparison files for base %r in %s" % (base, comp_path))

    indivs = list(comp_dict)
    windows = comp_dict[indivs[0]].set_index(['chr', 'windowStartPos']).index

    ### Window by window
    mse = pd.DataFrame(index=windows, columns=indivs)
    spear = pd.DataFrame(index=windows, columns=indivs)

    ### Genome wide averages
    # '' marks "no data"; callers filter on it. Explicit object dtype so the
    # float averages can be stored in the same column.
    base_df['genome_avg_mse'] = pd.Series('', index=base_df.index, dtype=object)
    base_df['genome_avg_spearman'] = pd.Series('', index=base_df.index, dtype=object)
    print('second for')
    for ind, df in comp_dict.items():
        # Single .loc[row, col] assignment: the chained form
        # (base_df.col.loc[ind] = ...) is not written back under copy-on-write.
        base_df.loc[ind, 'genome_avg_mse'] = float(np.mean(df['mse']))
        base_df.loc[ind, 'genome_avg_spearman'] = float(np.mean(df['spearman']))

        by_window = df.set_index(['chr', 'windowStartPos'])
        mse[ind] = by_window['mse']
        spear[ind] = by_window['spearman']
    print('done')

    # Individual names look like <super_pop>_<sub_pop>_<sex>_<id>.
    name_parts = base_df.index.to_series().str.split("_", expand=True)
    base_df['super_pop'] = name_parts[0]
    base_df['sub_pop'] = name_parts[1]
    base_df['sex'] = name_parts[2]
    base_df['id'] = name_parts[3]

    mse.columns = mse.columns.str.split('_', expand=True)
    spear.columns = spear.columns.str.split('_', expand=True)

    return base_df, mse, spear

In [None]:
def corr_plot(df):
    """Scatter genome-wide MSE against divergence, coloured by super-population.

    Draws a seaborn ``lmplot`` of ``divergence`` vs ``genome_avg_mse`` with one
    hue per ``super_pop`` and titles it with the correlation coefficient (and
    its square) of a single linear regression over all rows of ``df``.
    """
    sns.lmplot(
        data=df,
        x='genome_avg_mse',
        y='divergence',
        hue='super_pop',
        aspect=1.2,
        scatter_kws={'s': 2},
        line_kws={'color': 'gray'},
    )
    fit = stats.linregress(df['genome_avg_mse'].values, df['divergence'].values)
    r = fit[2]  # third element of the linregress result is the correlation coefficient
    plt.title('r = %0.2f, r2 = %0.2f' % (r, r**2))

In [None]:
def per_window_hists(mse, spear, div):
    """Draw three 100-bin histograms of the flattened per-window tables.

    One figure each for ``mse``, ``spear`` and ``div`` (in that order), each
    with its own fixed x-axis range and label.
    """
    flattened = [table.to_numpy().flatten() for table in (mse, spear, div)]
    x_ranges = [(0, 0.1), (0.5, 1), (0, .5)]
    x_labels = [
        'MSE (per window per individual)',
        'Spearman Correlation (per window per individual)',
        '3D Divergence (per window per individual)',
    ]

    for values, (lower, upper), label in zip(flattened, x_ranges, x_labels):
        plt.figure(figsize=(10,4))
        plt.hist(values, bins=100)
        plt.xlim(lower, upper)
        plt.xlabel(label)

In [None]:
def all_indivs_genomeplot(df):
    """Plot the across-individual mean of every window along the genome.

    ``df`` is expected to be indexed by (chromosome, window start) with one
    column per individual. The row-wise mean is drawn in two figures,
    chr1-chr8 and chr9-chr22, with chromosomes laid end to end on the x axis
    and every second chromosome shaded light gray.
    """
    windows = pd.DataFrame(df.mean(axis=1), columns=['mean_div'])

    def _plot_panel(chroms):
        """Draw one figure containing the given chromosomes side by side."""
        fig = plt.figure(figsize=(25,5))
        ax = fig.add_subplot(111)
        ax.set_facecolor('white')

        tick_positions = []
        offset = 0  # x position at which the next chromosome starts
        for num, chrm in enumerate(chroms, start=1):
            # x coordinate per window: half of (window start + 2**20), shifted by
            # the running offset. This is not the true window midpoint, but it is
            # monotonic in the start position and kept so figures are unchanged.
            windows['center_loc'] = (windows.index.get_level_values(1) + 2**20)/2 + offset
            chrom_windows = windows.loc[chrm]
            centers = chrom_windows['center_loc']

            ax.plot('center_loc', 'mean_div', data=chrom_windows, lw=1.5)
            # The + 10 leaves a small gap between chromosomes; drop it for none.
            offset = centers.max() + 10
            tick_positions.append(centers.mean())  # label sits mid-chromosome
            if (num % 2) == 0:  # alternate gray and white background
                ax.axvspan(centers.min(), centers.max(), facecolor='lightgray')

        ax.set_xticks(tick_positions)
        ax.set_xticklabels(chroms)
        ax.set_xlim(0, offset)
        #ax.set_ylim(0,1.5)
        plt.show()

    # Split into a top and a bottom panel; a single list with every chromosome
    # would give one linear plot instead.
    _plot_panel(['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8'])
    _plot_panel(['chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16',
                 'chr17','chr18','chr19','chr20','chr21','chr22'])

In [None]:
def one_chrm_pop_plot(df, chrm):
    """Plot per-window super-population means along a single chromosome.

    ``df`` is indexed by (chromosome, window start) and has the
    super-population as its first column level. One line is drawn per
    super-population for the windows of ``chrm``.
    """
    pops = ['AFR','AMR','EUR','EAS','SAS']

    # One row-wise mean per super-population, assembled with populations as columns.
    windows = pd.DataFrame([df[pop].mean(axis=1) for pop in pops]).transpose()
    windows.columns = pops

    # Mean across the five population means (computed but not plotted).
    windows['AVERAGE'] = windows.mean(axis=1)

    chrom_means = windows.loc[chrm]

    fig, ax = plt.subplots(figsize=[40,10])
    for pop in pops:
        sns.lineplot(y=chrom_means[pop],
                     x=chrom_means.index,
                     label=pop)

In [None]:
def genome_wide_plots_stats(df, stat):
    """Plot ``stat`` by population and test for super-population differences.

    Draws a violin plot per ``super_pop`` and a box plot per ``sub_pop``
    (coloured by ``super_pop``), then prints a Kruskal-Wallis test across the
    five super-populations and a Holm-adjusted Conover post-hoc table.
    """
    plt.figure(figsize=(10,8))
    sns.violinplot(data=df, x='super_pop', y=stat)

    plt.figure(figsize=(30,8))
    sns.boxplot(data=df, x='sub_pop', y=stat, hue='super_pop',
                dodge=False, showfliers=False)
    plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)

    # One sample of `stat` per super-population, in a fixed order.
    samples = [df[df.super_pop == pop][stat]
               for pop in ('AFR', 'AMR', 'EUR', 'EAS', 'SAS')]
    kruskal = stats.kruskal(*samples)
    conover = sp.posthoc_conover(df, val_col=stat, group_col='super_pop', p_adjust = 'holm')

    print(kruskal)
    print(conover)

In [None]:
def std_genomeplot(df):
    """Plot the across-individual standard deviation of every window along the genome.

    ``df`` is expected to be indexed by (chromosome, window start) with one
    column per individual. The row-wise standard deviation is drawn in two
    figures, chr1-chr8 and chr9-chr22, with chromosomes laid end to end on the
    x axis and every second chromosome shaded light gray.
    """
    windows = pd.DataFrame(df.std(axis=1), columns=['std_div'])

    def _plot_panel(chroms):
        """Draw one figure containing the given chromosomes side by side."""
        fig = plt.figure(figsize=(25,5))
        ax = fig.add_subplot(111)
        ax.set_facecolor('white')

        tick_positions = []
        offset = 0  # x position at which the next chromosome starts
        for num, chrm in enumerate(chroms, start=1):
            # x coordinate per window: half of (window start + 2**20), shifted by
            # the running offset. This is not the true window midpoint, but it is
            # monotonic in the start position and kept so figures are unchanged.
            windows['center_loc'] = (windows.index.get_level_values(1) + 2**20)/2 + offset
            chrom_windows = windows.loc[chrm]
            centers = chrom_windows['center_loc']

            ax.plot('center_loc', 'std_div', data=chrom_windows, lw=1.5)
            # The + 10 leaves a small gap between chromosomes; drop it for none.
            offset = centers.max() + 10
            tick_positions.append(centers.mean())  # label sits mid-chromosome
            if (num % 2) == 0:  # alternate gray and white background
                ax.axvspan(centers.min(), centers.max(), facecolor='lightgray')

        ax.set_xticks(tick_positions)
        ax.set_xticklabels(chroms)
        ax.set_xlim(0, offset)
        #ax.set_ylim(0,1.5)
        plt.show()

    # Split into a top and a bottom panel; a single list with every chromosome
    # would give one linear plot instead.
    _plot_panel(['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8'])
    _plot_panel(['chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16',
                 'chr17','chr18','chr19','chr20','chr21','chr22'])

# Genome wide average

## Setup

In [None]:
# Load the cached genome-wide averages and drop individuals whose
# genome_avg_mse is missing (NaN after the CSV round trip).
anc = pd.read_table('/wynton/group/capra/projects/modern_human_3Dgenome/results/comp_tables/anc_genomewide_averages.csv',
                    sep=',', index_col=0)
drops = anc[np.isnan(anc.genome_avg_mse)].index
anc = anc.drop(index=drops)

# NOTE(review): only write_comp_dfs('anc', ...) appears in the visible cells;
# the 'ref_*' tables are presumably produced by an earlier run of the
# formatting step against the reference comparisons -- confirm.
ref = pd.read_table('/wynton/group/capra/projects/modern_human_3Dgenome/results/comp_tables/ref_genomewide_averages.csv',
                    sep=',', index_col=0)
drops = ref[np.isnan(ref.genome_avg_mse)].index
ref = ref.drop(index=drops)

In [None]:
# Divergence is defined here as 1 - genome-wide average Spearman correlation.
anc['divergence'] = 1-anc.genome_avg_spearman
ref['divergence'] = 1-ref.genome_avg_spearman

## Correlations

In [None]:
ref.index

In [None]:
anc.index.unique()

In [None]:
corr_plot(anc)

In [None]:
corr_plot(ref)

## Divergence

In [None]:
def plot_pop_divergence(df, ref):
    """Strip plot of genome-wide divergence per sub-population.

    Sub-populations are ordered on the x axis first by the mean divergence of
    their super-population and then by their own mean divergence.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``super_pop``, ``sub_pop`` and ``divergence`` columns.
        It is not modified.
    ref : bool
        Selects which hard-coded palette ordering is used.

    Returns
    -------
    None
    """
    sup_order = df.groupby(by=["super_pop"])["divergence"].mean().sort_values().index
    sub_order = df.groupby(by=["sub_pop"])["divergence"].mean().sort_values().index

    # population label -> rank (0 = lowest mean divergence)
    sup_rank = pd.Series(range(len(sup_order)), index=sup_order)
    sub_rank = pd.Series(range(len(sub_order)), index=sub_order)

    # Rank on a separate frame so the caller's DataFrame does not silently
    # gain 'super_rank' / 'sub_rank' columns.
    ranked = df[['super_pop', 'sub_pop']].assign(
        super_rank=df['super_pop'].map(sup_rank),
        sub_rank=df['sub_pop'].map(sub_rank),
    )
    my_order = (ranked.sort_values(['super_rank', 'sub_rank'])['sub_pop']
                .drop_duplicates().values)

    # One block of related colours per super-population.
    amr_pal = sns.diverging_palette(50,230, s=82, l=58, n=10)[-4:]
    eur_pal = sns.diverging_palette(50,230, s=82, l=58, n=15)[:5]
    afr_pal = sns.diverging_palette(50,369, s=82, l=58, n=17)[-7:]
    sas_pal = sns.diverging_palette(190,369, s=82, l=58, n=12)[:5]
    eas_pal = sns.diverging_palette(110,105, s=75, l=65, n=10)[:5]

    # The palette order is hard-coded rather than derived from sup_order.
    # NOTE(review): the ref=True branch leaves out sas_pal, so it has fewer
    # colours than the other branch -- confirm this is intended.
    if ref:
        my_pal = eur_pal + eas_pal + amr_pal + afr_pal
    else:
        my_pal = eur_pal + amr_pal + eas_pal + sas_pal + afr_pal

    # This changes the global seaborn style for every later plot as well.
    sns.set(font_scale=4, context = 'poster', style='white')
    plt.figure(figsize=(100,30))
    sns.stripplot(x='sub_pop', y='divergence', data=df, dodge=False,s=20,
                  palette=my_pal,
                  order = my_order)
    plt.xlabel('\nPopulation', fontsize=100)
    plt.ylabel('Genome-Wide\n3D Divergence\n', fontsize=100)
    plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0, fontsize=100)
    plt.tick_params(axis='both', which='major', labelsize=80)
    #plt.savefig('fig2.svg', format='svg')

In [None]:
# Super- and sub-populations of `ref`, ordered by increasing mean divergence
# (kept at notebook level so the orderings can be inspected in the next cells).
sup_order = ref.groupby(by=["super_pop"])["divergence"].mean().sort_values().index
sub_order = ref.groupby(by=["sub_pop"])["divergence"].mean().sort_values().index

In [None]:
sup_order

In [None]:
sub_order

In [None]:
plot_pop_divergence(ref, True)

In [None]:
plot_pop_divergence(anc, False)

# Window wise metrics

### Distributions

In [None]:
sns.set(palette='husl', font_scale=1, context = 'poster', style='white')

In [None]:

# Per-window tables: four header rows rebuild the column MultiIndex and the
# first two columns rebuild the two-level row index.
anc_mse = pd.read_table('/wynton/group/capra/projects/modern_human_3Dgenome/results/comp_tables/anc_window_mse.csv'
                    , sep=',', header=[0,1,2,3], index_col=[0,1])
anc_spear = pd.read_table('/wynton/group/capra/projects/modern_human_3Dgenome/results/comp_tables/anc_window_spearman.csv'
                    , sep=',', header=[0,1,2,3], index_col=[0,1])
# Per-window divergence = 1 - Spearman correlation.
anc_div = 1-anc_spear

# Same tables for the reference comparison (currently disabled).
# ref_mse = pd.read_table('/wynton/group/capra/projects/modern_human_3Dgenome/results/comp_tables/ref_window_mse.csv'
#                     , sep=',', header=[0,1,2,3], index_col=[0,1])
# ref_spear = pd.read_table('/wynton/group/capra/projects/modern_human_3Dgenome/results/comp_tables/ref_window_spearman.csv'
#                     , sep=',', header=[0,1,2,3], index_col=[0,1])
# ref_div = 1-ref_spear

## HSMRCA

In [None]:
per_window_hists(anc_mse, anc_spear, anc_div)

In [None]:
# Finer-binned (500 bins) view of the divergence histogram, restricted to 0-0.1.
div_flat = anc_div.to_numpy().flatten()
plt.figure(figsize=(10,4))
plt.hist(div_flat, bins=500)
plt.xlim(0, .1)
plt.xlabel('3D Divergence (per window per individual)')

In [None]:
# Per-window standard deviation of divergence within each super-population,
# plotted along chr20 and saved as chr20std.svg in the working directory.
# NOTE(review): this cell rebinds the notebook-level names `df` and `windows`.
df = anc_div
chrm='chr20'

# Despite the *_avg names, these hold per-window standard deviations.
afr_avg = df['AFR'].std(axis=1)
amr_avg = df['AMR'].std(axis=1)
eur_avg = df['EUR'].std(axis=1)
eas_avg = df['EAS'].std(axis=1)
sas_avg = df['SAS'].std(axis=1)

windows = pd.DataFrame([afr_avg, amr_avg, eur_avg, eas_avg, sas_avg]).transpose()
windows.columns = ['AFR','AMR','EUR','EAS','SAS']

# Mean of the five per-population values (computed but not plotted below).
windows['AVERAGE'] = windows.mean(axis=1)

df = windows.loc[chrm]

fig, ax = plt.subplots(figsize=[15,5])
sns.lineplot(y=df.AFR, 
             x=df.index, 
            label='AFR')
sns.lineplot(y=df.EUR, 
             x=df.index, 
            label='EUR')
sns.lineplot(y=df.EAS, 
             x=df.index, 
            label='EAS')
sns.lineplot(y=df.SAS, 
             x=df.index, 
            label='SAS')
sns.lineplot(y=df.AMR, 
             x=df.index, 
            label='AMR')



plt.ylabel('3D Divergence\nStandard Deviation')
plt.xlabel('Chromosome 20 Position')

plt.savefig('chr20std.svg', format='svg')

# Enrichment?

In [None]:
genes = pybedtools.BedTool('/wynton/group/capra/projects/modern_human_3Dgenome/data/phenotype_enrichment_divergent_from_ancestral/data/grch38_gene_annotations.bed')

In [None]:
# Per-window mean divergence within each super-population, plus the mean over
# every individual column ('All').
mean_df = pd.DataFrame()
mean_df['AFR'] = anc_div['AFR'].mean(axis=1)
mean_df['AMR'] = anc_div['AMR'].mean(axis=1)
mean_df['EUR'] = anc_div['EUR'].mean(axis=1)
mean_df['EAS'] = anc_div['EAS'].mean(axis=1)
mean_df['SAS'] = anc_div['SAS'].mean(axis=1)
mean_df['All'] = anc_div.mean(axis=1)

In [None]:
mean_df

In [None]:
top_10 = mean_df[mean_df.All >= (mean_df.All.quantile(.9))]

In [None]:
all_windows = pd.DataFrame(index=mean_df.index).reset_index()
all_windows.columns = ['chr','start']

In [None]:
all_windows['end'] = all_windows['start'] + 2**20

In [None]:
all_windows

In [None]:
windows = pd.DataFrame(index=top_10.index).reset_index()
windows.columns = ['chr','start']

In [None]:
all_bed = pybedtools.BedTool.from_dataframe(all_windows)

In [None]:
all_bed.saveas('/wynton/group/capra/projects/modern_human_3Dgenome/data/phenotype_enrichment_divergent_from_ancestral//data/all_windows.bed')

In [None]:
gene_df = genes.to_dataframe(disable_auto_names=True, header=None)

In [None]:
gene_df

In [None]:
windows

In [None]:
windows['end'] = windows['start'] + 2**20

In [None]:
# Remove only a leading "chr". str.strip('chr') treats its argument as a set of
# characters and strips any of c/h/r from BOTH ends, which only happens to work
# for purely numeric chromosome names.
windows['chr'] = windows['chr'].str.replace(r'^chr', '', regex=True)

In [None]:
windows

In [None]:
# NOTE(review): this rebinds `genes` to an annotation file under a different
# project directory (archaic_splicing) than the one loaded at the start of this
# section -- confirm both files are the same annotation.
genes = pybedtools.BedTool('/wynton/group/capra/projects/archaic_splicing/data/annotations/grch38_gene_annotations.bed')

In [None]:
window_bed = pybedtools.BedTool.from_dataframe(windows)

In [None]:
window_bed.saveas('/wynton/group/capra/projects/modern_human_3Dgenome/data/phenotype_enrichment/data/diff_windows.bed')

In [None]:
gene_overlap = window_bed.intersect(genes, wao=True).to_dataframe(names=['window_chrm','start','end','gene_chr','gene_start','gene_end','gene_name','overlap'])

In [None]:
gene_overlap.gene_name.unique().shape

In [None]:
gene_overlap

In [None]:
gene_map = gene_overlap.drop(columns=['gene_chr','gene_start','gene_end','overlap'])

In [None]:
gene_map['chr'] = 'chr' + gene_map['window_chrm'].astype('str')

In [None]:
gene_map = gene_map.drop(columns=['window_chrm'])

In [None]:
a = gene_map['gene_name'].unique()

In [None]:
pd.DataFrame(a).to_csv('/wynton/group/capra/projects/modern_human_3Dgenome/data/phenotype_enrichment/data/diff_window_genes_observed.txt', header=None, index=False)

In [None]:
# Gene names are strings: savetxt's default numeric format ('%.18e') raises a
# TypeError on them, so write them with '%s' (one name per line).
np.savetxt('diff_window_genes_observed.txt', a, fmt='%s', delimiter='\n')

In [None]:
gene_map.set_index(['chr','start'])

In [None]:
# NOTE(review): `pops_out` is not assigned in any visible cell; on a fresh
# kernel this presumably raises NameError -- restore its definition or drop the cell.
pops_out[pops_out.AFR_out].index

In [None]:
# NOTE(review): `popsgene_map` is not assigned in any visible cell (possibly a
# typo for `gene_map`) -- confirm before relying on this cell.
popsgene_map

In [None]:
def get_gene():
    """Unfinished placeholder: creates an empty local list and returns None."""
    gene_list = list()
    


In [None]:
# Scratch cell that repeats the enrichment steps above in a single block.
# NOTE(review): `pop_diffs`, `pops_out` and `popsgene_map` are not assigned in
# any visible cell, and `genes` / `windows` are used before this cell rebinds
# them, so it depends on state left over from earlier runs.
all_windows = pd.DataFrame(index=pop_diffs.index).reset_index()
all_windows.columns = ['chr','start']

all_windows['end'] = all_windows['start'] + 2**20

all_windows

all_bed = pybedtools.BedTool.from_dataframe(all_windows)

all_bed.saveas('/wynton/group/capra/projects/modern_human_3Dgenome/data/phenotype_enrichment/data/all_windows.bed')

gene_df = genes.to_dataframe(disable_auto_names=True, header=None)

# Random draw of as many windows as the current `windows` frame has (fixed seed).
shuffled = all_windows.sample(n=windows.shape[0], random_state=1)
shuffled_bed = pybedtools.BedTool.from_dataframe(shuffled)

genes = pybedtools.BedTool('/wynton/group/capra/projects/modern_human_3Dgenome/data/phenotype_enrichment/data/grch38_gene_annotations.bed')

intersect = all_bed.intersect(shuffled_bed, wo = True).intersect(genes, loj = True).to_dataframe(disable_auto_names=True, header=None)

intersect

windows = pd.DataFrame(index=pops_out.index).reset_index()
windows.columns = ['chr','start']

windows

windows['end'] = windows['start'] + 2**20

# Remove only a leading "chr"; str.strip('chr') strips any of the characters
# c/h/r from both ends rather than the literal prefix.
windows['chr'] = windows['chr'].str.replace(r'^chr', '', regex=True)

windows

genes = pybedtools.BedTool('/wynton/group/capra/projects/archaic_splicing/data/annotations/grch38_gene_annotations.bed')

window_bed = pybedtools.BedTool.from_dataframe(windows)

window_bed.saveas('/wynton/group/capra/projects/modern_human_3Dgenome/data/phenotype_enrichment/data/diff_windows.bed')

gene_overlap = window_bed.intersect(genes, wao=True).to_dataframe(names=['window_chrm','start','end','gene_chr','gene_start','gene_end','gene_name','overlap'])

gene_overlap.gene_name.unique().shape

gene_overlap

gene_map = gene_overlap.drop(columns=['gene_chr','gene_start','gene_end','overlap'])

gene_map['chr'] = 'chr' + gene_map['window_chrm'].astype('str')

gene_map = gene_map.drop(columns=['window_chrm'])

a = gene_map['gene_name'].unique()

pd.DataFrame(a).to_csv('/wynton/group/capra/projects/modern_human_3Dgenome/data/phenotype_enrichment/data/diff_window_genes_observed.txt', header=None, index=False)

# Gene names are strings: savetxt's default numeric format raises a TypeError
# on them, so write them with '%s' (one name per line).
np.savetxt('diff_window_genes_observed.txt', a, fmt='%s', delimiter='\n')

gene_map.set_index(['chr','start'])

pops_out[pops_out.AFR_out].index

popsgene_map

def get_gene():
    """Unfinished placeholder: creates an empty local list and returns None."""
    gene_list = list()
    


# Sequence

In [20]:
# Inputs for the sequence-level comparison.
list_file = os.path.join(DATA_PATH, "reference/lists/listOfPairwiseComps_hsmrca.txt")
base = 'hsmrca_ancestral'
# Rebinds the notebook-level COMP_PATH: any later call to comp_data_setup will
# read from this sequence directory as well.
COMP_PATH = os.path.join(DATA_PATH,"pairwise/sequence/1KGvsHSMRCA")

In [21]:
# Same individual selection as at the top of comp_data_setup: keep the rows of
# the list whose second column equals `base` (all rows for 'hg38_reference').
comp_list=pd.read_table(list_file, sep=' ', names=['1KG','ancestor'])
if base != 'hg38_reference':
    base_df = comp_list[comp_list.ancestor==base].set_index('1KG')
else:
    base_df = comp_list.set_index('1KG')

In [22]:
# Load every available sequence-comparison table (chrX rows removed) into
# comp_dict_seq, keyed by individual name.
comp_dict_seq = {}
print('first for')
for row in base_df.index:
    ind = str(base_df.loc[row].name)
    filename = 'SeqComps_%s_vs_%s.txt' % (ind,base)
    if os.path.exists('%s/%s' % (COMP_PATH,filename)):
        try:
            df = pd.read_table('%s/%s' % (COMP_PATH,filename))
            df = df[df.chrm != 'chrX']
            comp_dict_seq[ind] = df
        except Exception as err:
            # Still best-effort, but report which file failed and why; a bare
            # `except:` also swallowed kernel interrupts during this long loop.
            print('broken')
            print('%s: %r' % (filename, err))
print('done')

first for
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_ACB_female_HG01880_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_ACB_female_HG01883_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_ACB_female_HG01886_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_ACB_female_HG01889_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_ACB_female_HG01894_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_ACB_female_HG01896_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_ACB_female_HG01915_vs_hsmrca_ancestral.

/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_ACB_male_HG02255_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_ACB_male_HG02281_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_ACB_male_HG02283_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_ACB_male_HG02284_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_ACB_male_HG02307_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_ACB_male_HG02314_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_ACB_male_HG02317_vs_hsmrca_ancestral.txt
/wynton/group/capra/

/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_ASW_male_NA19711_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_ASW_male_NA19818_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_ASW_male_NA19834_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_ASW_male_NA19900_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_ASW_male_NA19904_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_ASW_male_NA19908_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_ASW_male_NA19916_vs_hsmrca_ancestral.txt
/wynton/group/capra/

/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_ESN_female_HG03369_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_ESN_female_HG03372_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_ESN_female_HG03499_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_ESN_female_HG03511_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_ESN_female_HG03514_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_ESN_female_HG03517_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_ESN_female_HG03520_vs_hsmrca_ancestral.txt
/wynto

/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_GWD_female_HG02574_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_GWD_female_HG02583_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_GWD_female_HG02586_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_GWD_female_HG02589_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_GWD_female_HG02595_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_GWD_female_HG02611_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_GWD_female_HG02614_vs_hsmrca_ancestral.txt
/wynto

/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_GWD_male_HG02582_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_GWD_male_HG02585_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_GWD_male_HG02588_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_GWD_male_HG02594_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_GWD_male_HG02610_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_GWD_male_HG02613_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_GWD_male_HG02620_vs_hsmrca_ancestral.txt
/wynton/group/capra/

/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_LWK_female_NA19377_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_LWK_female_NA19378_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_LWK_female_NA19379_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_LWK_female_NA19390_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_LWK_female_NA19391_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_LWK_female_NA19395_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_LWK_female_NA19399_vs_hsmrca_ancestral.txt
/wynto

/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_LWK_male_NA19429_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_LWK_male_NA19430_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_LWK_male_NA19443_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_LWK_male_NA19448_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_LWK_male_NA19451_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_LWK_male_NA19452_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_LWK_male_NA19454_vs_hsmrca_ancestral.txt
/wynton/group/capra/

/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_MSL_male_HG03225_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_MSL_male_HG03376_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_MSL_male_HG03382_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_MSL_male_HG03385_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_MSL_male_HG03388_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_MSL_male_HG03391_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_MSL_male_HG03394_vs_hsmrca_ancestral.txt
/wynton/group/capra/

/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_YRI_female_NA19190_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_YRI_female_NA19197_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_YRI_female_NA19201_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_YRI_female_NA19204_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_YRI_female_NA19206_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_YRI_female_NA19209_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AFR_YRI_female_NA19214_vs_hsmrca_ancestral.txt
/wynto

/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AMR_CLM_female_HG01134_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AMR_CLM_female_HG01137_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AMR_CLM_female_HG01140_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AMR_CLM_female_HG01149_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AMR_CLM_female_HG01251_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AMR_CLM_female_HG01254_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AMR_CLM_female_HG01257_vs_hsmrca_ancestral.txt
/wynto

/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AMR_CLM_male_HG01344_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AMR_CLM_male_HG01350_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AMR_CLM_male_HG01353_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AMR_CLM_male_HG01356_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AMR_CLM_male_HG01359_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AMR_CLM_male_HG01362_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AMR_CLM_male_HG01365_vs_hsmrca_ancestral.txt
/wynton/group/capra/

/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AMR_MXL_male_NA19658_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AMR_MXL_male_NA19661_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AMR_MXL_male_NA19664_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AMR_MXL_male_NA19670_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AMR_MXL_male_NA19676_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AMR_MXL_male_NA19679_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AMR_MXL_male_NA19682_vs_hsmrca_ancestral.txt
/wynton/group/capra/

/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AMR_PEL_female_HG02345_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AMR_PEL_female_HG02348_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AMR_PEL_female_HG02425_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AMR_PEL_male_HG01565_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AMR_PEL_male_HG01571_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AMR_PEL_male_HG01577_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AMR_PEL_male_HG01892_vs_hsmrca_ancestral.txt
/wynton/group/

/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AMR_PUR_female_HG01086_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AMR_PUR_female_HG01089_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AMR_PUR_female_HG01092_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AMR_PUR_female_HG01095_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AMR_PUR_female_HG01098_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AMR_PUR_female_HG01102_vs_hsmrca_ancestral.txt
/wynton/group/capra/projects/modern_human_3Dgenome/data/pairwise/sequence/1KGvsHSMRCA
SeqComps_AMR_PUR_female_HG01105_vs_hsmrca_ancestral.txt
/wynto

KeyboardInterrupt: 

In [19]:
# Inspect the comparison table stored for one individual.
# NOTE(review): the recorded run of this cell raised
# "NameError: name 'comp_dict_seq' is not defined", so `comp_dict_seq` must be
# built in an earlier cell before this lookup can work on a fresh kernel.
# TODO confirm which variable was intended, or delete this leftover cell.
comp_dict_seq['AFR_ACB_female_HG01880']

NameError: name 'comp_dict_seq' is not defined

In [None]:
# Build the comparison tables from the per-individual frames in `comp_dict`:
#   * genome-wide average mse / spearman per individual, written onto `base_df`
#   * window-by-window `mse` and `spear` tables, one column per individual
# Requires `comp_dict` (individual name -> DataFrame with 'chr',
# 'windowStartPos', 'mse', 'spearman' columns) and `base_df` (indexed by
# individual name) to already exist in the kernel.
indivs = list(comp_dict.keys())
# The first individual's windows define the shared row index.
# NOTE(review): assumes every individual covers the same (chr, windowStartPos)
# windows; rows missing from an individual become NaN on assignment - confirm.
df = comp_dict[indivs[0]].set_index(['chr','windowStartPos'])
windows = df.index
### Window by window
mse = pd.DataFrame(index = windows, columns = indivs)
spear = pd.DataFrame(index = windows, columns = indivs)

### Genome wide averages
# Columns start as '' (not NaN); presumably so individuals that never get a
# value can be filtered later with `!= ''` - verify against downstream cells.
base_df['genome_avg_mse']=''
base_df['genome_avg_spearman']=''
print('second for')
for ind, df in comp_dict.items():
    avg_mse = float(np.mean(df['mse']))
    avg_spearman = float(np.mean(df['spearman']))

    # Write with a single .loc[row, column] call. The previous chained form
    # (`base_df.genome_avg_mse.loc[ind] = ...`) assigned through an
    # intermediate Series, which raises SettingWithCopyWarning and does not
    # update `base_df` at all when pandas Copy-on-Write is active.
    # NOTE(review): assumes `ind` is already a row label of `base_df`;
    # otherwise .loc would append a new row - confirm.
    base_df.loc[ind, 'genome_avg_mse'] = avg_mse
    base_df.loc[ind, 'genome_avg_spearman'] = avg_spearman

    # Re-index by window so the column assignment aligns on (chr, windowStartPos).
    df = df.set_index(['chr','windowStartPos'])
    mse[ind] = df['mse']
    spear[ind] = df['spearman']
print('done')

# Individual names look like "<super_pop>_<sub_pop>_<sex>_<id>"; split them
# into separate metadata columns via a temporary copy of the index.
base_df['temp'] = base_df.index
new = base_df.temp.str.split("_", expand = True)
base_df['super_pop'] = new[0]
base_df['sub_pop'] = new[1]
base_df['sex'] = new[2]
base_df['id'] = new[3]

genome_wide = base_df.drop(columns=['temp'])

# Turn the flat "<super_pop>_<sub_pop>_<sex>_<id>" column labels into a MultiIndex.
mse.columns = mse.columns.str.split('_', expand=True)
spear.columns = spear.columns.str.split('_', expand=True)

# The results are left in the kernel as `genome_wide`, `mse` and `spear`.
# (A bare `return` is a SyntaxError outside a function, so it was removed.)