# Generate Dataframes

## Table of Contents

In [1]:
import pandas as pd

In [2]:
cd ../../data/comparisons

/wynton/group/capra/projects/pan_3d_genome/data/comparisons


## Load Pairwise Data and Generate New Dataframe

Let's load all of the pairwise HFF data to start. 

In [3]:
# Column names for the pairwise comparison files, which are read without a header row.
comparisons_header = [
    'ind1', 'ind2',
    'chr', 'window_start',
    'mse', 'spearman',
]

# Tab-delimited input; column names are supplied explicitly via `names`.
HFF_comparisons = pd.read_csv(
    'HFF/all_HFF_comparisons.txt',
    sep='\t',
    header=None,
    names=comparisons_header,
)
HFF_comparisons.head(5)

Unnamed: 0,ind1,ind2,chr,window_start,mse,spearman
0,Akwaya-Jean,Alfred,chr10,0,0.002914,0.990228
1,Akwaya-Jean,Alfred,chr10,524288,0.000705,0.998648
2,Akwaya-Jean,Alfred,chr10,1048576,0.000967,0.990023
3,Akwaya-Jean,Alfred,chr10,1572864,0.000168,0.987655
4,Akwaya-Jean,Alfred,chr10,2097152,0.000481,0.969809


In [4]:
# Number of rows (one per individual pair per window) that were loaded.
HFF_comparisons.shape[0]

13212745

Quick sanity check that this length is what we expect. Let's multiply the number of windows by the number of possible pairs.

In [5]:
# Expected row count = number of windows x number of unordered pairs of individuals.
N_WINDOWS = 5317
# 2485 pairs = 71 choose 2 (consistent with the 56 lineage-assigned plus
# 15 excluded individuals listed further down in this notebook).
N_INDIVIDUALS = 71
N_PAIRS = N_INDIVIDUALS * (N_INDIVIDUALS - 1) // 2
N_WINDOWS * N_PAIRS

13212745

Add two columns: one with the 'divergence' metric (1 - spearman) and another with a window label that concatenates the chromosome and window start position.

In [6]:
# Derive the divergence score (1 - spearman) and a per-window label (chr + '_' + start).
HFF_comparisons = HFF_comparisons.assign(
    divergence=1 - HFF_comparisons['spearman'],
    window=HFF_comparisons['chr'] + '_' + HFF_comparisons['window_start'].astype(str),
)

# Put the columns in a fixed order, with the window label next to its coordinates.
HFF_comparisons = HFF_comparisons.loc[
    :,
    ['ind1', 'ind2', 'chr', 'window_start', 'window', 'mse', 'spearman', 'divergence'],
]
HFF_comparisons.head(5)

Unnamed: 0,ind1,ind2,chr,window_start,window,mse,spearman,divergence
0,Akwaya-Jean,Alfred,chr10,0,chr10_0,0.002914,0.990228,0.009772
1,Akwaya-Jean,Alfred,chr10,524288,chr10_524288,0.000705,0.998648,0.001352
2,Akwaya-Jean,Alfred,chr10,1048576,chr10_1048576,0.000967,0.990023,0.009977
3,Akwaya-Jean,Alfred,chr10,1572864,chr10_1572864,0.000168,0.987655,0.012345
4,Akwaya-Jean,Alfred,chr10,2097152,chr10_2097152,0.000481,0.969809,0.030191


Let's filter out comparisons that meet any of the following criteria: 1) the window contains at least one 'N' in the reference genome, 2) either individual has variants that did not pass our quality filters, or 3) the window is on the X chromosome and either individual is male (males are hemizygous for the X chromosome).

In [7]:
# BED-style table of windows to drop; read without a header row.
excluded_header = ['chr', 'start', 'end', 'N_missing']
excluded = pd.read_csv(
    '../metadata/panTro6_excluded_windows.bed',
    sep='\t',
    header=None,
    names=excluded_header,
)

# Build the same chr_start label used for the comparison windows so the two can be matched.
excluded = excluded.assign(
    window=excluded['chr'] + '_' + excluded['start'].astype(str)
)
excluded.head(5)

Unnamed: 0,chr,start,end,N_missing,window
0,chr1,0,1048576,100,chr1_0
1,chr1,524288,1572864,100,chr1_524288
2,chr1,4194304,5242880,100,chr1_4194304
3,chr1,4718592,5767168,100,chr1_4718592
4,chr1,10485760,11534336,7505,chr1_10485760


In [8]:
# Plain Python list of the window labels to exclude.
excluded_windows = list(excluded['window'])

In [9]:
# Individuals removed from every comparison (used as criterion 2 of the filter below).
inds_to_exclude = [
    'Annie', 'Banyo', 'Basho', 'Catherine', 'Chipita',
    'Clara', 'Harriet', 'Kopongo', 'LB502', 'Mike',
    'Noemie', 'Padda', 'Paquita', 'Salonga', 'Tobi',
]

In [10]:
# Individuals treated as male; their chrX comparisons are dropped by the filter below.
males = [
    'Akwaya-Jean', 'Alfred', 'Athanga', 'Basho', 'Bono', 'Bosco',
    'Brigitta', 'Bwamble', 'Clint', 'Damian', 'Desmond', 'Gamin',
    'Koby', 'Koto', 'Mgbadolite', 'Mike', 'Padda', 'SeppToni',
    'Tongo', 'Vaillant', 'Vincent', 'Washu', 'Yogui',
]

In [11]:
# Criterion 1: the window is on the excluded list.
in_excluded_window = HFF_comparisons['window'].isin(excluded_windows)

# Criterion 3: a chrX window where either member of the pair is male.
male_on_chrX = (HFF_comparisons['chr'] == 'chrX') & (
    HFF_comparisons['ind1'].isin(males) | HFF_comparisons['ind2'].isin(males)
)

# Criterion 2: either member of the pair is an excluded individual.
has_excluded_ind = (
    HFF_comparisons['ind1'].isin(inds_to_exclude)
    | HFF_comparisons['ind2'].isin(inds_to_exclude)
)

# Keep only the rows that trip none of the three criteria.
HFF_comparisons = HFF_comparisons[~(in_excluded_window | male_on_chrX | has_excluded_ind)]
HFF_comparisons.head(5)

Unnamed: 0,ind1,ind2,chr,window_start,window,mse,spearman,divergence
3,Akwaya-Jean,Alfred,chr10,1572864,chr10_1572864,0.000168,0.987655,0.012345
4,Akwaya-Jean,Alfred,chr10,2097152,chr10_2097152,0.000481,0.969809,0.030191
5,Akwaya-Jean,Alfred,chr10,2621440,chr10_2621440,0.001675,0.996398,0.003602
6,Akwaya-Jean,Alfred,chr10,3145728,chr10_3145728,0.000323,0.997899,0.002101
7,Akwaya-Jean,Alfred,chr10,3670016,chr10_3670016,0.000143,0.996732,0.003268


In [12]:
# Rows remaining after filtering.
HFF_comparisons.shape[0]

6669390

Designate the dyad type for each comparison. We want to distinguish three kinds of dyads: within-lineage comparisons, comparisons between different chimpanzee lineages, and comparisons between bonobos and chimpanzees.

In [13]:
# Lineage membership lists used to label each dyad.
# 'ppn' is the bonobo group (bonobo-chimpanzee dyads are labelled 'ppn-pt' below);
# the four 'pt*' lists are the chimpanzee lineages
# (presumably subspecies abbreviations — confirm against sample metadata).
ppn = [
    'Bono', 'Desmond', 'Dzeeta', 'Hermien', 'Hortense',
    'Kombote', 'Kosana', 'Kumbuka', 'Natalie',
]
pte = [
    'Akwaya-Jean', 'Damian', 'Julie-LWC21', 'Koto', 'Taweh',
]
pts = [
    'Andromeda', 'Athanga', 'Bihati', 'Bwamble', 'Cindy-schwein', 'Cleo',
    'Coco-chimp', 'Frederike', 'Ikuru', 'Kidongo', 'Maya', 'Mgbadolite',
    'Nakuu', 'Tongo', 'Trixie', 'Vincent', 'Washu',
]
ptt = [
    'Alfred', 'Blanquita', 'Brigitta', 'Cindy-troglodytes', 'Doris', 'Gamin',
    'Julie-A959', 'Lara', 'Luky', 'Marlin', 'Mirinda', 'Negrita',
    'Tibe', 'Ula', 'Vaillant', 'Yogui',
]
ptv = [
    'Alice', 'Berta', 'Bosco', 'Cindy-verus', 'Clint',
    'Jimmie', 'Koby', 'Linda', 'SeppToni',
]

In [14]:
def _lineage_of(ind):
    """Return the label of the first lineage list containing `ind`, or None if none does."""
    # The lists are looked up at call time, so edits to them are picked up on the next call.
    for label, members in (('ppn', ppn), ('pte', pte), ('pts', pts), ('ptt', ptt), ('ptv', ptv)):
        if ind in members:
            return label
    return None


def dyad_type(HFF_comparisons):
    """Classify one pairwise comparison by the lineages of its two individuals.

    Parameters
    ----------
    HFF_comparisons : row-like
        A single comparison (despite the name, not the whole dataframe): any
        object supporting ``['ind1']`` and ``['ind2']`` lookups, e.g. the row
        Series passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    str
        * the lineage label ('ppn', 'pte', 'pts', 'ptt', 'ptv') when both
          individuals belong to the same lineage;
        * 'ppn-pt' when one individual is in ``ppn`` and the other is in any
          of the chimpanzee lineages;
        * 'pte-pts', 'pte-ptt', 'pte-ptv', 'pts-ptt', 'pts-ptv' or 'ptt-ptv'
          for two different chimpanzee lineages (order of ind1/ind2 does not
          matter);
        * 'other' when either individual is in none of the lineage lists.

    Notes
    -----
    Assumes the lineage lists are disjoint (they are as defined in this
    notebook); an individual listed twice is assigned to the first list in
    the order ppn, pte, pts, ptt, ptv.
    """
    lineage1 = _lineage_of(HFF_comparisons['ind1'])
    lineage2 = _lineage_of(HFF_comparisons['ind2'])

    # An individual missing from every lineage list cannot be classified.
    if lineage1 is None or lineage2 is None:
        return 'other'
    # Within-lineage dyad.
    if lineage1 == lineage2:
        return lineage1
    # Bonobo vs. any chimpanzee lineage collapses to a single label.
    if 'ppn' in (lineage1, lineage2):
        return 'ppn-pt'
    # Two different chimpanzee lineages: sort so the label is order-independent.
    return '-'.join(sorted((lineage1, lineage2)))

# dyad_type only reads row['ind1'] and row['ind2'], so pass it a small mapping built from
# those two columns instead of using DataFrame.apply(axis=1), which constructs a full
# Series for every one of the millions of rows. The list is assigned positionally, so
# each label lands on the row it was computed from.
HFF_comparisons['dyad_type'] = [
    dyad_type({'ind1': ind1, 'ind2': ind2})
    for ind1, ind2 in zip(HFF_comparisons['ind1'], HFF_comparisons['ind2'])
]

In [15]:
# Row count after labelling; adding a column should not change it.
HFF_comparisons.shape[0]

6669390

In [16]:
# Move dyad_type next to the pair of individuals it describes.
HFF_comparisons = HFF_comparisons.loc[
    :,
    [
        'ind1', 'ind2', 'dyad_type',
        'chr', 'window_start', 'window',
        'mse', 'spearman', 'divergence',
    ],
]
HFF_comparisons.head(5)

Unnamed: 0,ind1,ind2,dyad_type,chr,window_start,window,mse,spearman,divergence
3,Akwaya-Jean,Alfred,pte-ptt,chr10,1572864,chr10_1572864,0.000168,0.987655,0.012345
4,Akwaya-Jean,Alfred,pte-ptt,chr10,2097152,chr10_2097152,0.000481,0.969809,0.030191
5,Akwaya-Jean,Alfred,pte-ptt,chr10,2621440,chr10_2621440,0.001675,0.996398,0.003602
6,Akwaya-Jean,Alfred,pte-ptt,chr10,3145728,chr10_3145728,0.000323,0.997899,0.002101
7,Akwaya-Jean,Alfred,pte-ptt,chr10,3670016,chr10_3670016,0.000143,0.996732,0.003268


Let's add information on sequence divergence per window.

In [17]:
# Per-pair, per-window sequence difference counts; tab-delimited with no header row.
seqs_header = [
    'ind1', 'ind2',
    'chr', 'window_start', 'window_end',
    'seq_diff',
]
seqs = pd.read_csv(
    '../sequence_differences/all_sequence_differences.txt',
    sep='\t',
    header=None,
    names=seqs_header,
)
seqs.head(10)

Unnamed: 0,ind1,ind2,chr,window_start,window_end,seq_diff
0,Akwaya-Jean,Alfred,chr1,0,1048576,1705
1,Akwaya-Jean,Alfred,chr1,524288,1572864,1644
2,Akwaya-Jean,Alfred,chr1,1048576,2097152,1866
3,Akwaya-Jean,Alfred,chr1,1572864,2621440,2284
4,Akwaya-Jean,Alfred,chr1,2097152,3145728,3089
5,Akwaya-Jean,Alfred,chr1,2621440,3670016,3550
6,Akwaya-Jean,Alfred,chr1,3145728,4194304,3444
7,Akwaya-Jean,Alfred,chr1,3670016,4718592,3133
8,Akwaya-Jean,Alfred,chr1,4194304,5242880,2359
9,Akwaya-Jean,Alfred,chr1,4718592,5767168,2237


In [18]:
# Attach seq_diff to each comparison by matching on pair + window. This is pandas'
# default inner join, so only rows present in both tables are kept; window_end is
# dropped afterwards because only the window start is carried in this dataframe.
HFF_comparisons_with_seqs = (
    HFF_comparisons
    .merge(seqs, on=['ind1', 'ind2', 'chr', 'window_start'])
    .drop(columns='window_end')
)
HFF_comparisons_with_seqs.head(5)

Unnamed: 0,ind1,ind2,dyad_type,chr,window_start,window,mse,spearman,divergence,seq_diff
0,Akwaya-Jean,Alfred,pte-ptt,chr10,1572864,chr10_1572864,0.000168,0.987655,0.012345,2803
1,Akwaya-Jean,Alfred,pte-ptt,chr10,2097152,chr10_2097152,0.000481,0.969809,0.030191,2715
2,Akwaya-Jean,Alfred,pte-ptt,chr10,2621440,chr10_2621440,0.001675,0.996398,0.003602,2849
3,Akwaya-Jean,Alfred,pte-ptt,chr10,3145728,chr10_3145728,0.000323,0.997899,0.002101,2606
4,Akwaya-Jean,Alfred,pte-ptt,chr10,3670016,chr10_3670016,0.000143,0.996732,0.003268,2594


In [19]:
# Row count after the merge; compare with the pre-merge count above.
HFF_comparisons_with_seqs.shape[0]

6669390

Save the dataframe.

In [20]:
# Write the final HFF table as tab-delimited text with a header row and no index column.
HFF_comparisons_with_seqs.to_csv(
    '../dataframes/HFF_comparisons.txt',
    sep='\t',
    header=True,
    index=False,
)

## GM12878

Now let's generate a dataframe for predictions from the GM12878 cell line.

In [21]:
# Same header-less, tab-delimited layout as the HFF file, so the column names are reused.
GM12878_comparisons = pd.read_csv(
    'GM12878/all_GM12878_comparisons.txt',
    sep='\t',
    header=None,
    names=comparisons_header,
)
GM12878_comparisons.head(5)

Unnamed: 0,ind1,ind2,chr,window_start,mse,spearman
0,Akwaya-Jean,Alfred,chr10,0,0.001005,0.992669
1,Akwaya-Jean,Alfred,chr10,524288,0.000853,0.996613
2,Akwaya-Jean,Alfred,chr10,1048576,0.000537,0.984502
3,Akwaya-Jean,Alfred,chr10,1572864,5.7e-05,0.985524
4,Akwaya-Jean,Alfred,chr10,2097152,0.000183,0.966059


In [22]:
# Number of GM12878 comparison rows loaded.
GM12878_comparisons.shape[0]

13212745

Add the columns we need.

In [23]:
# Derive the divergence score (1 - spearman) and a per-window label (chr + '_' + start).
GM12878_comparisons = GM12878_comparisons.assign(
    divergence=1 - GM12878_comparisons['spearman'],
    window=GM12878_comparisons['chr'] + '_' + GM12878_comparisons['window_start'].astype(str),
)

# Put the columns in a fixed order, with the window label next to its coordinates.
GM12878_comparisons = GM12878_comparisons.loc[
    :,
    ['ind1', 'ind2', 'chr', 'window_start', 'window', 'mse', 'spearman', 'divergence'],
]
GM12878_comparisons.head(5)

Unnamed: 0,ind1,ind2,chr,window_start,window,mse,spearman,divergence
0,Akwaya-Jean,Alfred,chr10,0,chr10_0,0.001005,0.992669,0.007331
1,Akwaya-Jean,Alfred,chr10,524288,chr10_524288,0.000853,0.996613,0.003387
2,Akwaya-Jean,Alfred,chr10,1048576,chr10_1048576,0.000537,0.984502,0.015498
3,Akwaya-Jean,Alfred,chr10,1572864,chr10_1572864,5.7e-05,0.985524,0.014476
4,Akwaya-Jean,Alfred,chr10,2097152,chr10_2097152,0.000183,0.966059,0.033941


Repeat the filtering applied to the HFF comparisons above.

In [24]:
# Rows whose window is on the excluded list.
in_excluded_window = GM12878_comparisons['window'].isin(excluded_windows)

# chrX rows where either member of the pair is male.
male_on_chrX = (GM12878_comparisons['chr'] == 'chrX') & (
    GM12878_comparisons['ind1'].isin(males) | GM12878_comparisons['ind2'].isin(males)
)

# Rows where either member of the pair is an excluded individual.
has_excluded_ind = (
    GM12878_comparisons['ind1'].isin(inds_to_exclude)
    | GM12878_comparisons['ind2'].isin(inds_to_exclude)
)

# Keep only the rows that trip none of the three conditions.
GM12878_comparisons = GM12878_comparisons[
    ~(in_excluded_window | male_on_chrX | has_excluded_ind)
]
GM12878_comparisons.head(5)

Unnamed: 0,ind1,ind2,chr,window_start,window,mse,spearman,divergence
3,Akwaya-Jean,Alfred,chr10,1572864,chr10_1572864,5.7e-05,0.985524,0.014476
4,Akwaya-Jean,Alfred,chr10,2097152,chr10_2097152,0.000183,0.966059,0.033941
5,Akwaya-Jean,Alfred,chr10,2621440,chr10_2621440,0.000751,0.997055,0.002945
6,Akwaya-Jean,Alfred,chr10,3145728,chr10_3145728,0.000114,0.998515,0.001485
7,Akwaya-Jean,Alfred,chr10,3670016,chr10_3670016,5.6e-05,0.997281,0.002719


In [25]:
# Rows remaining after filtering.
GM12878_comparisons.shape[0]

6669390

Compare lengths from HFF and GM12878.

In [26]:
# Both cell types went through the same filters, so the row counts should agree.
HFF_comparisons.shape[0] == GM12878_comparisons.shape[0]

True

Save the dataframe.

In [27]:
# Write the GM12878 table as tab-delimited text with a header row and no index column.
# NOTE(review): unlike the HFF table saved earlier, this frame has no dyad_type or
# seq_diff column at this point — confirm that is intended.
GM12878_comparisons.to_csv(
    '../dataframes/GM12878_comparisons.txt',
    sep='\t',
    header=True,
    index=False,
)