In [120]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand
import cerberus
import pyranges as pr
import upsetplot

p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

from scripts.utils import *
from scripts.vcf_utils import *
from scripts.sm_utils import *
from scripts.plotting import *

In [121]:
# Load the project configuration with `load_config`, which is not defined in this
# notebook (presumably it comes from one of the `scripts.*` star-imports above).
# Later cells index the result like nested dicts, e.g. config['ref']['fa'].
config = load_config()
# Path prefix, empty here. Presumably meant to be passed as `od` to proc_cfg — TODO confirm.
od = ''

def proc_cfg(entry, od):
    """
    Re-root a config path.

    Every occurrence of '../../' is removed from `entry` and the result is
    prefixed with `od`.

    Parameters:
        entry (str): Path string to clean up
        od (str): Prefix placed in front of the cleaned path

    Returns:
        str: `od` followed by `entry` with all '../../' removed
    """
    return od + entry.replace('../../', '')

In [126]:
# sample information
df = load_meta()
# drop rows whose `sample` contains an underscore; .copy() makes the filtered
# table an independent frame so that adding `lab_sample` below does not
# trigger pandas' SettingWithCopyWarning
df = df.loc[~df['sample'].str.contains('_')].copy()
# lab sample label: <lab_number_sample>_<lab_sampleid>_<cell_line_id>
df['lab_sample'] = df['lab_number_sample'].astype(str)+'_'+\
                      df['lab_sampleid'].astype(str)+'_'+\
                      df['cell_line_id'].astype(str)
df = df[['cell_line_id', 'sample', 'hapmap_DNA_ID', 'lab_sample']].drop_duplicates()

# ids to keep, read from a headerless text file
temp_df = pd.read_csv('cell_line_ids.txt', header=None, names=['cell_line_id'])

# make a 1000g cell line id col
df['cell_line_id_1000g'] = df.cell_line_id

# rows whose own cell line id is not in the list fall back to their hapmap DNA id
inds = df.loc[~df.cell_line_id_1000g.isin(temp_df.cell_line_id.tolist())].index
df.loc[inds, 'cell_line_id_1000g'] = df.loc[inds, 'hapmap_DNA_ID']

# limit to just those in 1000g
df = df.loc[df.cell_line_id_1000g.isin(temp_df.cell_line_id.tolist())]
assert len(df.index) == 30, \
    'expected 30 samples after the 1000g filter, found '+str(len(df.index))

hap = ['hap1', 'hap2']

In [137]:
# genome information table: one row per (sample row, genome) combination
sqanti_genomes = ['hap1', 'hap2', 'hg38']
# give every sample row the full genome list, unpack that list into one row
# per genome, and renumber the rows from 0
sqanti_df = (
    df.copy(deep=True)
      .assign(sqanti_genome=[sqanti_genomes] * len(df))
      .explode('sqanti_genome')
      .reset_index(drop=True)
)
# sqanti_df.head()
# sqanti_df['sqanti_genome_fa'] = config['ref']['fa']
# for ind, entry in sqanti_df.iterrows():
#     if entry.sqanti_genome in hap:
#         entry.sqanti_genome_fa = expand(config['lr']['td_personal']['ref_fa'],
#                                         hap=entry.sqanti_genome,
#                                         cell_line_id=entry.cell_line_id_1000g)[0]
# # check no hap things in hg38 rows
# assert len(sqanti_df.loc[(~(sqanti_df.sqanti_genome=='hg38')&
#                           (sqanti_df.sqanti_genome_fa.str.contains('hg38')))]) == 0

In [138]:
def get_sqanti_ref_fa(wc):
    """
    Pick the reference fasta that matches the requested genome.

    Parameters:
        wc: Mapping with a 'sqanti_genome' entry and, for any genome other
            than 'hg38', a 'cell_line_id' entry

    Returns:
        config['ref']['fa'] when the genome is 'hg38'; otherwise the first
        path produced by expanding config['lr']['td_personal']['ref_fa']
        with the cell line id and the genome passed as `hap`
    """
    genome = wc['sqanti_genome']
    if genome != 'hg38':
        personal_fas = expand(config['lr']['td_personal']['ref_fa'],
                              cell_line_id=wc['cell_line_id'],
                              hap=genome)
        return personal_fas[0]
    return config['ref']['fa']
    

In [142]:
# print the reference fasta picked for each genome of one example cell line;
# every path is followed by an empty line
for wc in [{'cell_line_id': 'HG04217', 'sqanti_genome': 'hap1'},
           {'cell_line_id': 'HG04217', 'sqanti_genome': 'hap2'},
           {'cell_line_id': 'HG04217', 'sqanti_genome': 'hg38'}]:
    print(get_sqanti_ref_fa(wc), end='\n\n')

/gpfs/projects/bsc83/Projects/pantranscriptome/pclavell/08_allele_specifics/data/01_samples_vcf/HG04217_hap1.fa

/gpfs/projects/bsc83/Projects/pantranscriptome/pclavell/08_allele_specifics/data/01_samples_vcf/HG04217_hap2.fa

../../ref/hg38.fa



In [139]:
# preview the first rows of sqanti_df
sqanti_df.head()

Unnamed: 0,cell_line_id,sample,hapmap_DNA_ID,lab_sample,cell_line_id_1000g,sqanti_genome
0,HG04217,ITU5,no_hapmap,39_IN5_HG04217,HG04217,hap1
1,HG04217,ITU5,no_hapmap,39_IN5_HG04217,HG04217,hap2
2,HG04217,ITU5,no_hapmap,39_IN5_HG04217,HG04217,hg38
3,HG04216,ITU4,no_hapmap,38_IN4_HG04216,HG04216,hap1
4,HG04216,ITU4,no_hapmap,38_IN4_HG04216,HG04216,hap2


In [123]:
# df

In [124]:
# NOTE(review): this cell repeats the `# sample information` cell earlier in
# the notebook; consider keeping only one of the two.
df = load_meta()
# drop rows whose `sample` contains an underscore; .copy() makes the filtered
# table an independent frame so that adding `lab_sample` below does not
# trigger pandas' SettingWithCopyWarning
df = df.loc[~df['sample'].str.contains('_')].copy()
# lab sample label: <lab_number_sample>_<lab_sampleid>_<cell_line_id>
df['lab_sample'] = df['lab_number_sample'].astype(str)+'_'+\
                      df['lab_sampleid'].astype(str)+'_'+\
                      df['cell_line_id'].astype(str)
df = df[['cell_line_id', 'sample', 'hapmap_DNA_ID', 'lab_sample']].drop_duplicates()

# ids to keep, read from a headerless text file
temp_df = pd.read_csv('cell_line_ids.txt', header=None, names=['cell_line_id'])

# make a 1000g cell line id col
df['cell_line_id_1000g'] = df.cell_line_id

# rows whose own cell line id is not in the list fall back to their hapmap DNA id
inds = df.loc[~df.cell_line_id_1000g.isin(temp_df.cell_line_id.tolist())].index
df.loc[inds, 'cell_line_id_1000g'] = df.loc[inds, 'hapmap_DNA_ID']

# limit to just those in 1000g
df = df.loc[df.cell_line_id_1000g.isin(temp_df.cell_line_id.tolist())]
assert len(df.index) == 30, \
    'expected 30 samples after the 1000g filter, found '+str(len(df.index))

hap = ['hap1', 'hap2']

In [125]:
# 1000g cell line ids flagged as bad (the reason is not recorded in this cell)
bad_samples = ['NA19328']

# show the sample rows that belong to those cell lines
df[df['cell_line_id_1000g'].isin(bad_samples)]

Unnamed: 0,cell_line_id,sample,hapmap_DNA_ID,lab_sample,cell_line_id_1000g
27,GM19328,LWK2,NA19328,20_KE2_GM19328,NA19328


In [118]:
def get_df_val(df, col1, col_dict):
    """
    Look up the single value of `col1` among the rows of `df` that satisfy
    every filter in `col_dict`.

    Parameters:
        df (pandas DataFrame): Table to search; it is not modified
        col1 (str): Column to read the value from
        col_dict (dict): {column name: required value}; the filters are
            applied one after another

    Returns:
        The one distinct value of `col1` in the selected rows

    Raises:
        AssertionError: If the selected rows hold zero or more than one
            distinct value of `col1`
    """
    # boolean .loc filtering returns new objects and never writes to `df`,
    # so no deep copy of the whole table is needed
    temp = df

    for key, item in col_dict.items():
        temp = temp.loc[temp[key] == item]

    val = temp[col1].unique()
    assert len(val) == 1, \
        'expected exactly one value of {!r} for {!r}, found {}'.format(
            col1, col_dict, len(val))
    return val[0]
# example lookup: the lab_sample value of the row whose 1000g id is HG04217
get_df_val(df, 'lab_sample', {'cell_line_id_1000g': 'HG04217'})

'39_IN5_HG04217'

In [119]:
# fill the config['lr']['espresso']['gtf'] template with the lab_sample value
# looked up for 1000g id HG04217
expand(
    config['lr']['espresso']['gtf'],
    lab_rep=get_df_val(df, 'lab_sample', {'cell_line_id_1000g': 'HG04217'}),
)

['/gpfs/projects/bsc83/Projects/pantranscriptome/novelannotations/espresso/39_IN5_HG04217_espresso.gtf']

## Old

In [65]:
# Path of the config table. It is set here so this cell does not depend on
# `meta_file` leaking in from a cell further down the notebook, which is the
# only other place it is assigned (with this same value).
meta_file = '../config.tsv'
meta_df = parse_config(meta_file)
print(len(meta_df.index))
# rename without inplace=True so that re-running the cell stays predictable
meta_df = meta_df.rename({'sample':'cell_line_id'}, axis=1)
meta_df.head()

43


Unnamed: 0,lab_rep,cell_line_id,tech_rep_num,tech_rep
0,10_NI5_GM19117,GM19117,1,GM19117_1
1,11_NI6_GM19129,GM19129,1,GM19129_1
2,12_NI7_GM19240,GM19240,1,GM19240_1
3,13_CH1_GM18542,GM18542,1,GM18542_1
4,14_CH2_GM18561,GM18561,1,GM18561_1


In [47]:
# read cell_line_ids.txt as a headerless file and name its column cell_line_id
temp_df = pd.read_csv('cell_line_ids.txt', header=None, names=['cell_line_id'])

In [48]:
# reload the sample metadata and drop rows whose `sample` contains an underscore
meta_df_2 = load_meta()
meta_df_2 = meta_df_2.loc[~meta_df_2['sample'].str.contains('_')]
# NOTE(review): the result of the next line is never assigned, so it has no
# effect: meta_df_2 goes into the merge below with all of its rows and columns.
# Assign it back if the three-column, de-duplicated subset was intended
# (check first that nothing downstream needs the other columns).
meta_df_2[['cell_line_id', 'sample', 'hapmap_DNA_ID']].drop_duplicates()
# left merge: every meta_df row is kept and matched on cell_line_id
meta_df = meta_df.merge(meta_df_2, 
                        how='left',
                        on='cell_line_id')
# row count after the merge
len(meta_df.index)

43

In [49]:
# meta_df = meta_df.loc[meta_df.cell_line_id.isin(temp_df['cell_line_id'].tolist())]
# len(meta_df.index)

In [50]:
# number of distinct cell_line_id values in meta_df
len(meta_df['cell_line_id'].unique())

43

In [57]:
# keep only the rows tied to a listed id, either through their own
# cell_line_id or through their hapmap_DNA_ID
meta_df = meta_df.loc[
    meta_df[['cell_line_id', 'hapmap_DNA_ID']]
    .isin(temp_df.cell_line_id.tolist())
    .any(axis=1)
]

30

In [58]:
# sample information
# path of the config table handed to parse_config
meta_file = '../config.tsv'
# NOTE(review): this reassigns meta_df, so whatever a previously run cell
# stored in it (merges, filters) is replaced by the freshly parsed table
meta_df = parse_config(meta_file)
# meta_df.rename({'sample':'cell_line_id'}, axis=1, inplace=True)

# # info about the ref files
# temp_df = pd.read_csv('cell_line_ids.txt', header=None, names=['cell_line_id'])

# meta_df_2 = load_meta()
# meta_df_2 = meta_df_2.loc[~meta_df_2['sample'].str.contains('_')]
# meta_df_2[['cell_line_id', 'sample', 'hapmap_DNA_ID']].drop_duplicates()
# meta_df = meta_df.merge(meta_df_2,
#                         how='left',
#                         on='cell_line_id')

# # limit to 1000g samples
# meta_df = meta_df.loc[(meta_df.cell_line_id.isin(temp_df.cell_line_id.tolist()))|
#                 (meta_df.hapmap_DNA_ID.isin(temp_df.cell_line_id.tolist()))]

# hap = ['hap1', 'hap2']
# print(len(meta_df.index))

30


In [60]:
# two successive left merges against the id list: first on meta_df's own
# cell_line_id, then matching meta_df's hapmap_DNA_ID to the list's cell_line_id
meta_df = (
    meta_df
    .merge(temp_df, how='left', on='cell_line_id')
    .merge(temp_df,
           how='left',
           left_on='hapmap_DNA_ID',
           right_on='cell_line_id')
)

60

In [61]:
# preview the first rows of meta_df
meta_df.head()

Unnamed: 0,lab_rep,cell_line_id_x,tech_rep_num,tech_rep,lab_number_sample,lab_sampleid,sample,population,color_pop,ooa,...,two_runs,was_switched_with,relabeled,family_member,family,population_full_name,population _description,extra_info,coriell_link,cell_line_id_y
0,10_NI5_GM19117,GM19117,1,GM19117_1,10,NI5,YRI5,YRI,#DFBD00,AFR,...,False,False,False,Father,Y100,"Yoruba in Ibadan, Nigeria","Yoruba in Ibadan, Nigeria",Empty,https://www.coriell.org/0/Sections/Search/Samp...,NA19117
1,10_NI5_GM19117,GM19117,1,GM19117_1,10,NI5,YRI5,YRI,#DFBD00,AFR,...,False,False,False,Father,Y100,"Yoruba in Ibadan, Nigeria","Yoruba in Ibadan, Nigeria",Empty,https://www.coriell.org/0/Sections/Search/Samp...,NA19117
2,11_NI6_GM19129,GM19129,1,GM19129_1,11,NI6,YRI6,YRI,#DFBD00,AFR,...,False,False,False,Child,Y077,"Yoruba in Ibadan, Nigeria","Yoruba in Ibadan, Nigeria",Empty,https://www.coriell.org/0/Sections/Search/Samp...,NA19129
3,11_NI6_GM19129,GM19129,1,GM19129_1,11,NI6,YRI6,YRI,#DFBD00,AFR,...,False,False,False,Child,Y077,"Yoruba in Ibadan, Nigeria","Yoruba in Ibadan, Nigeria",Empty,https://www.coriell.org/0/Sections/Search/Samp...,NA19129
4,12_NI7_GM19240,GM19240,1,GM19240_1,12,NI7,YRI7,YRI,#DFBD00,AFR,...,False,False,False,Child,Y117,"Yoruba in Ibadan, Nigeria","Yoruba in Ibadan, Nigeria",Empty,https://www.coriell.org/0/Sections/Search/Samp...,NA19240


In [51]:
# print(len(meta_df.loc[meta_df['1000G']==True]['cell_line_id'].unique()))
# print(len(meta_df.loc[(meta_df['1000G']==True)&
#                       (meta_df['1000g_cell_line_id_x'].isnull())&\
#                       (meta_df['1000g_cell_line_id_y'].notnull())]['cell_line_id'].unique()))
# print(len(meta_df.loc[(meta_df['1000G']==True)&
#                       (meta_df['1000g_cell_line_id_y'].isnull())&\
#                       (meta_df['1000g_cell_line_id_x'].notnull())]['cell_line_id'].unique()))
# print(len(meta_df.loc[(meta_df['1000G']==True)&
#                       (meta_df['1000g_cell_line_id_y'].notnull())|\
#                       (meta_df['1000g_cell_line_id_x'].notnull())]['cell_line_id'].unique()))


In [55]:
def get_df_val(df, col1, col_dict):
    """
    Look up the single value of `col1` among the rows of `df` that satisfy
    every filter in `col_dict`.

    Parameters:
        df (pandas DataFrame): Table to search; it is not modified
        col1 (str): Column to read the value from
        col_dict (dict): Filters, applied one after another.
            A plain key keeps the rows where that column equals the value.
            A tuple key keeps the rows where ANY of the listed columns
            equals the value.

    Returns:
        The one distinct value of `col1` in the selected rows

    Raises:
        AssertionError: If the selected rows hold zero or more than one
            distinct value of `col1`
    """
    # boolean .loc filtering returns new objects and never writes to `df`,
    # so no deep copy of the whole table is needed
    temp = df

    for key, item in col_dict.items():
        if type(key) is tuple:
            # one row-wise OR over the listed columns; this replaces growing
            # a frame with pd.concat from an empty DataFrame inside a loop
            mask = temp[list(key)].eq(item).any(axis=1)
            temp = temp.loc[mask]
        else:
            temp = temp.loc[temp[key] == item]

    val = temp[col1].unique()
    assert len(val) == 1, \
        'expected exactly one value of {!r} for {!r}, found {}'.format(
            col1, col_dict, len(val))
    return val[0]

In [56]:
# For every distinct id in temp_df, print the `sample` of the meta_df rows that
# carry that id in either cell_line_id or hapmap_DNA_ID. This needs the
# tuple-aware get_df_val defined directly above; since get_df_val asserts that
# exactly one value is found, the loop stops with an AssertionError on any id
# that resolves to zero or several samples.
for thing in temp_df['cell_line_id'].unique():
    print(get_df_val(meta_df, 'sample', {('cell_line_id', 'hapmap_DNA_ID'):thing}))

PEL3
PEL5
YRI6
ITU3
ITU2
YRI5
CEU5
YRI3
LWK2
LWK1
YRI2
YRI1
HAC1
YRI7
LWK5
CEU1
ITU1
HAC3
LWK4
HAC6
PEL2
HAC2
CEU2
CEU3
PEL4
ITU5
PEL6
PEL1
ITU4
CEU4
