## Goal: make supplementary table for personal-GRCh38 assemblies

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand
import cerberus
import pyranges as pr
import upsetplot

p = os.path.dirname(os.getcwd())
sys.path.append(p)

from scripts.utils import *
from scripts.vcf_utils import *
from scripts.plotting import *

In [2]:
# project configuration, as returned by the load_config helper
config = load_config()
# prefix that proc_cfg prepends to config paths (the parent directory)
od = '../'

def proc_cfg(entry, od):
    """
    Re-root a config path under `od`.

    Every occurrence of '../../' is stripped from `entry`, and `od` is
    prepended to whatever remains.

    Parameters:
        entry (str): Path string taken from the config
        od (str): Prefix to put in front of the stripped path

    Returns:
        str: `od` followed by `entry` without any '../../'
    """
    return od + entry.replace('../../', '')

In [3]:
# Build the per-sample metadata table restricted to the cell lines listed
# in cell_line_ids.txt; the result is kept in `meta_df`.
df = load_meta()

# drop samples whose name contains an underscore; take an explicit copy so
# the column added below is written to an independent frame and not to a
# slice of the loaded metadata (avoids pandas' SettingWithCopyWarning)
df = df.loc[~df['sample'].str.contains('_')].copy()
df['lab_sample'] = (df['lab_number_sample'].astype(str) + '_'
                    + df['lab_sampleid'].astype(str) + '_'
                    + df['cell_line_id'].astype(str))
df = df[['cell_line_id', 'sample', 'hapmap_DNA_ID',
         'map_reads_assemblymap', 'lab_sample', 'population']].drop_duplicates()

# one cell line id per line, no header
temp_df = pd.read_csv('../snakemake/transcript_discovery_personal/cell_line_ids.txt', header=None, names=['cell_line_id'])
known_ids = temp_df.cell_line_id.tolist()

# make a 1000g cell line id col
# start from cell_line_id; rows whose id is not in the list fall back to
# their hapmap_DNA_ID instead
df['cell_line_id_1000g'] = df.cell_line_id
inds = df.loc[~df.cell_line_id_1000g.isin(known_ids)].index
df.loc[inds, 'cell_line_id_1000g'] = df.loc[inds, 'hapmap_DNA_ID']

# limit to just those in 1000g
df = df.loc[df.cell_line_id_1000g.isin(known_ids)]
assert len(df.index) == 30

# TODO bad sample that hasn't finished on espresso
# bad_samples = ['NA19328']
# df = df.loc[~df.cell_line_id_1000g.isin(bad_samples)]

hap = ['hap1', 'hap2']

meta_df = df.copy(deep=True)

In [4]:
# Load the intron chain (ic) summary and turn it into one row per
# ic / structural category / cell line, with a True/False column for each
# mapping genome.
file = proc_cfg(config['lr']['td_personal']['cerb']['ic_summary'],od)

# remove sqanti genome and drop dupes
# the sqanti genome SHOULD be irrelevant here
# ('Unnamed: 0' is dropped in the same call)
df = pd.read_csv(file).drop(columns=['Unnamed: 0', 'sqanti_genome'])
print(len(df.index))
df = df.drop_duplicates()
print(len(df.index))

# then make sure that there are no dupe. ic+structural cat+sample+map genome
assert not df.duplicated(subset=['ic_id', 'structural_category',
                                 'cell_line_id', 'map_genome'],
                         keep=False).any()

# transform to be t/f for each ic per genome
# every observed (ic, category, cell line, genome) combination becomes True;
# combinations that were never observed are filled in as False
df = df.drop(columns='transcript_id')
df_crosstab = (
    pd.crosstab(index=[df.ic_id, df.structural_category, df.cell_line_id],
                columns=df.map_genome,
                values=df.map_genome,
                aggfunc=lambda x: True)
    .fillna(False)
    .reset_index()
)

12993951
4331317


In [6]:
# work on a deep copy so that df_crosstab itself is left unmodified
df = df_crosstab.copy(deep=True)

In [10]:
# blank out the name of the column axis (pd.crosstab labels it after the
# series passed as `columns`), presumably so it is not shown in the preview
df.columns.name=''
df.head()

Unnamed: 0,ic_id,structural_category,cell_line_id,hap1,hap2,hg38
0,GL000008.2_+_83545-83926-84014-85456-85477-855...,full-splice_match,HG01567,True,True,True
1,GL000008.2_+_83545-83926-84014-85566-85625-129984,full-splice_match,HG03732,True,True,True
2,GL000008.2_+_83545-83926-84014-85566-85625-155...,full-splice_match,HG03732,True,True,True
3,GL000008.2_+_83545-83926-84014-85566-85625-199427,full-splice_match,HG03732,True,True,True
4,GL000008.2_+_83545-83926-84014-85566-85625-88635,full-splice_match,HG01567,True,True,True


In [12]:
# write the table as a tab-separated file, without the row index
df.to_csv('../supp_tables/17_personalized_grch38_ics.tsv', sep='\t', index=False)

In [13]:
# preview the first rows of the table
df.head()

Unnamed: 0,ic_id,structural_category,cell_line_id,hap1,hap2,hg38
0,GL000008.2_+_83545-83926-84014-85456-85477-855...,full-splice_match,HG01567,True,True,True
1,GL000008.2_+_83545-83926-84014-85566-85625-129984,full-splice_match,HG03732,True,True,True
2,GL000008.2_+_83545-83926-84014-85566-85625-155...,full-splice_match,HG03732,True,True,True
3,GL000008.2_+_83545-83926-84014-85566-85625-199427,full-splice_match,HG03732,True,True,True
4,GL000008.2_+_83545-83926-84014-85566-85625-88635,full-splice_match,HG01567,True,True,True
