In [2]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand
import cerberus
import pyranges as pr
import upsetplot

p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

from scripts.utils import *
from scripts.vcf_utils import *
from scripts.plotting import *

In [3]:
config = load_config()
od = ''

def proc_cfg(entry, od):
    entry = entry.replace('../../', '')
    entry = od+entry
    return entry

In [4]:
def clean_figure(ax):
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.tick_params(axis="x", rotation=45)

In [5]:
meta_file = 'filereport_read_run_PRJNA851328_tsv.txt'

df = pd.read_csv(meta_file, sep='\t')
df[['cell_line_id', 'batch', 'rep']] = df.experiment_alias.str.split('_', expand=True)
len(df.index)
len(df.cell_line_id.unique())

df[['r1_fq_link', 'r2_fq_link']] = df.fastq_ftp.str.split(';', expand=True)
df['r1_verify'] = df.r1_fq_link.str.endswith('_1.fastq.gz')
df['r2_verify'] = df.r2_fq_link.str.endswith('_2.fastq.gz')
assert len(df.loc[df.r1_verify==False].index)==0
assert len(df.loc[df.r2_verify==False].index)==0

df['sample'] = df['experiment_alias']

In [6]:
df['cell_line_id'] = df['sample'].str.split('_', expand=True)[0]
df[['sample', 'cell_line_id']].head()

Unnamed: 0,sample,cell_line_id
0,NA19704_batch11_rep1,NA19704
1,NA19332_batch14_rep1,NA19332
2,NA19317_batch15_rep1,NA19317
3,NA19312_batch05_rep1,NA19312
4,NA19323_batch11_rep1,NA19323


## Filter samples w/ replicates to one w/ highest read depth

In [None]:
df = pd.read_csv(exp_file, sep='\t')
df = df.set_index('transcript_id')
df = df.transpose()

# counts / sample
df = df.reset_index()
df = df.rename({'index':'sample'}, axis=1)
df['cell_line_id'] = df['sample'].str.split('_', expand=True)[0]
df = df.set_index(['cell_line_id', 'sample'])
df['total_counts'] = df.sum(axis=1)
df = df[['total_counts']].reset_index()

# dedupe on total counts
df = df.sort_values(by='total_counts', ascending=False)
print(len(df.index))
df = df.drop_duplicates(subset=['cell_line_id'], keep='first')
print(len(df.index))

df = df[['sample']]
df.to_csv('keep_samples.tsv', sep='\t', index=False)

In [23]:
exp_file = od+config['mage']['v47_kallisto']['merge_matrix_tsv']
df = pd.read_csv(exp_file, sep='\t')

In [24]:
df = df.set_index('transcript_id')
df = df.transpose()

In [25]:
# counts / sample
df = df.reset_index()
df = df.rename({'index':'sample'}, axis=1)
df['cell_line_id'] = df['sample'].str.split('_', expand=True)[0]
df = df.set_index(['cell_line_id', 'sample'])
df['total_counts'] = df.sum(axis=1)
df.head()

Unnamed: 0_level_0,transcript_id,ENST00000000233.10,ENST00000000412.8,ENST00000000442.11,ENST00000001008.6,ENST00000001146.7,ENST00000002125.9,ENST00000002165.11,ENST00000002501.11,ENST00000002596.6,ENST00000002829.8,...,ENST00000850835.1,ENST00000850836.1,ENST00000850837.1,ENST00000850838.1,ENST00000850839.1,ENST00000850840.1,ENST00000850841.1,ENST00000850842.1,ENST00000850843.1,total_counts
cell_line_id,sample,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
NA19704,NA19704_batch11_rep1,2187.66,4203.78,335.718,2064.24,0.0,322.11,992.572,915.726,15.0,20.0654,...,25.9881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24870250.0
NA19332,NA19332_batch14_rep1,2483.13,5302.25,603.101,1585.8,0.0,176.919,1536.97,158.61,39.5692,0.0,...,11.7222,1.41875,0.0,0.0,6.93641,0.0,0.0,0.0,0.921029,29440930.0
NA19317,NA19317_batch15_rep1,1081.26,3306.58,307.345,1706.69,0.0,169.047,757.023,614.355,60.2248,0.0,...,12.7007,2.52598,0.0,0.0,4.17823,0.0,1.19724,0.0,0.0,20113300.0
NA19312,NA19312_batch05_rep1,4053.85,6273.98,729.524,3544.72,0.0,314.616,1372.78,2229.53,31.6113,1.7321,...,70.3466,2.18983,4.43783,0.0,11.0699,0.0,8.75674,0.0,0.0,42380010.0
NA19323,NA19323_batch11_rep1,2051.01,3338.97,310.898,2224.32,0.0,218.433,539.937,81.6014,15.0,18.7386,...,25.7538,0.0,0.0,0.0,1.90098,0.0,0.0,0.0,0.0,23932700.0


In [26]:
df = df[['total_counts']].reset_index()

In [27]:
df = df.sort_values(by='total_counts', ascending=False)
print(len(df.index))

779


In [28]:
df.loc[df.cell_line_id=='NA19143']

transcript_id,cell_line_id,sample,total_counts
6,NA19143,NA19143_batch15_rep2,20299750.0
509,NA19143,NA19143_batch15_rep1,18466670.0
5,NA19143,NA19143_batch04_rep3,18120860.0


In [29]:
df = df.drop_duplicates(subset=['cell_line_id'], keep='first')
print(len(df.index))

731


In [30]:
df.loc[df.cell_line_id=='NA19143']

transcript_id,cell_line_id,sample,total_counts
6,NA19143,NA19143_batch15_rep2,20299750.0


In [31]:
df = df[['sample']]
df.to_csv('keep_samples.tsv', sep='\t', index=False)

In [33]:
# now filter
s_df = pd.read_csv('keep_samples.tsv', sep='\t')
df = pd.read_csv(exp_file, sep='\t')

In [34]:
samples = s_df['sample'].tolist()
samples[:5]

['HG00335_batch17_rep1',
 'HG03082_batch17_rep1',
 'HG01461_batch17_rep1',
 'HG03598_batch17_rep1',
 'NA19773_batch17_rep1']

In [36]:
ind_cols = [c for c in df.columns if 'id' in c]
df = df.set_index(ind_cols)

Unnamed: 0_level_0,NA19704_batch11_rep1,NA19332_batch14_rep1,NA19317_batch15_rep1,NA19312_batch05_rep1,NA19323_batch11_rep1,NA19143_batch04_rep3,NA19143_batch15_rep2,HG00326_batch08_rep1,NA18942_batch15_rep1,NA18915_batch11_rep1,...,NA19731_batch09_rep1,NA19732_batch14_rep1,HG00525_batch10_rep1,HG00473_batch03_rep1,NA21127_batch04_rep1,NA20342_batch15_rep1,NA20412_batch07_rep1,NA20318_batch07_rep1,NA20298_batch01_rep1,NA20320_batch05_rep1
transcript_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENST00000000233.10,2187.66,2483.13,1081.26,4053.85,2051.01,1898.63,1037.97,1923.43,945.648,1877.51,...,2843.35,2398.14,1717.92,1313.38,1464.37,760.503,1878.09,2689.8,2031.42,3365.31
ENST00000000412.8,4203.78,5302.25,3306.58,6273.98,3338.97,3420.26,3246.81,3352.52,3296.86,3734.27,...,4518.26,6763.18,3087.19,1953.76,3633.84,2991.08,3368.53,4747.02,3129.72,5863.28
ENST00000000442.11,335.718,603.101,307.345,729.524,310.898,441.57,396.765,378.063,170.085,368.509,...,788.457,770.24,395.389,311.986,427.172,72.0386,421.711,500.598,411.257,690.474
ENST00000001008.6,2064.24,1585.8,1706.69,3544.72,2224.32,3766.0,2172.18,1608.99,1572.02,2701.11,...,2378.49,4240.4,2167.06,1171.16,2734.65,1239.95,1806.9,3532.98,2550.03,3521.23
ENST00000001146.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.50477,0.0,0.0,0.0,0.0,0.0,6.0


In [38]:
df = df[samples]
df = df.reset_index()
df.head()

Unnamed: 0,transcript_id,HG00335_batch17_rep1,HG03082_batch17_rep1,HG01461_batch17_rep1,HG03598_batch17_rep1,NA19773_batch17_rep1,HG02471_batch17_rep1,HG02814_batch17_rep1,HG02697_batch17_rep1,HG01190_batch17_rep1,...,HG02597_batch08_rep1,NA12874_batch08_rep1,HG00740_batch16_rep1,HG00127_batch16_rep1,NA19682_batch10_rep1,HG00589_batch03_rep1,HG00327_batch03_rep1,NA12546_batch03_rep1,HG00260_batch03_rep1,HG00473_batch03_rep1
0,ENST00000000233.10,5635.67,5520.66,4941.54,5056.13,5451.79,6076.64,4721.5,5326.82,4802.56,...,1193.22,1678.95,1046.23,946.335,1253.29,1488.12,1365.79,1539.46,1268.33,1313.38
1,ENST00000000412.8,10935.5,10911.8,11334.4,8223.08,10318.0,9885.03,9299.14,10408.0,9563.69,...,2658.83,2077.18,2282.66,1887.34,2180.2,1875.96,2591.49,2089.25,2097.11,1953.76
2,ENST00000000442.11,1238.04,1121.47,1449.83,1375.36,1480.39,943.708,1457.29,1161.82,1457.87,...,239.693,240.742,312.71,254.255,258.383,263.724,180.086,191.682,338.016,311.986
3,ENST00000001008.6,9326.71,5263.39,8079.58,6188.96,9163.46,4161.08,6328.56,5460.25,8504.89,...,1917.02,1513.26,1208.86,1453.63,1283.07,1754.61,2536.3,1042.45,1202.44,1171.16
4,ENST00000001146.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.50477


In [39]:
df.to_csv('test', sep='\t', index=False)

## Filter genes / transcripts used to perform sQTL calling

- genes expressed ≥ 1 TPM in at least 80% of the samples
- with at least two isoforms and a minimum isoform expression of 0.1 TPM

In [40]:
g_exp_file = od+config['mage']['v47_kallisto']['gene_tpm_tsv']
exp_file = od+config['mage']['v47_kallisto']['merge_matrix_tpm_tsv']
t2g_file = od+config['ref']['v47_kallisto_short']['t2g']

min_samp_prop = 0.8
min_g_exp = 1
min_t_exp = 0.1
min_n_t = 2

In [41]:
# genes exp. >= 1 TPM in at least 80% of samples
df = pd.read_csv(g_exp_file, sep='\t')
n_samples = len([c for c in df.columns.tolist() if c not in ['gid']])
n_sample_cutoff = int(n_samples*min_samp_prop)

df = df.set_index('gid')
df = df>=min_g_exp
df['n_exp_samples'] = df.sum(axis=1)
print(len(df.index))
df = df.loc[df['n_exp_samples']>=n_sample_cutoff]
print(len(df.index))
gids = df.index.tolist()

78932
12225


In [42]:
# with at least two isoforms and
# minimum isoform expression of 0.1 TPM
# genes exp. >= 1 TPM in at least 80% of samples
df = pd.read_csv(exp_file, sep='\t')
df = df.set_index('transcript_id')
df = df>=min_t_exp
df['n_exp_samples'] = df.sum(axis=1)
print(len(df.index))
df = df.loc[df['n_exp_samples']>=1]
print(len(df.index))
tids = df.index.tolist()

387944
315312


In [43]:
# limit to tids and gids
t2g_df = pd.read_csv(t2g_file, sep='\t', header=None)
t2g_df = t2g_df[[0,1]]
t2g_df.columns = ['tid', 'gid']

print(len(t2g_df.index))
t2g_df = t2g_df.loc[(t2g_df.gid.isin(gids))&\
                    (t2g_df.tid.isin(tids))]
print(len(t2g_df.index))

387944
154491


In [44]:
# then count # isos / gene and filter one more time
t2g_df = t2g_df.groupby('gid').nunique().reset_index().rename({'tid':'n_tid'}, axis=1)
print(len(t2g_df.index))
t2g_df = t2g_df.loc[t2g_df.n_tid >= min_n_t]
print(len(t2g_df.index))

12225
11199


In [46]:
# finally, re-read the transcript tpm exp and filt.
df = pd.read_csv(exp_file, sep='\t')


## Get gene expression values (TPM and counts) 

In [27]:
# tpm_df = pd.read_csv(od+config['mage']['v47_kallisto']['merge_matrix_tpm_tsv'], 
#                         sep='\t')

In [48]:
exp_file = od+config['mage']['v47_kallisto']['merge_matrix_tsv']
t2g_file = od+config['ref']['v47_kallisto_short']['t2g']

In [49]:
exp_df = pd.read_csv(exp_file, sep='\t')
t2g = pd.read_csv(t2g_file, sep='\t', header=None)
t2g = t2g[[0,1]]
t2g.columns = ['transcript_id', 'gid']
t2g.head()

Unnamed: 0,transcript_id,gid
0,ENST00000456328.3,ENSG00000290825.2
1,ENST00000832823.1,ENSG00000290825.2
2,ENST00000832824.1,ENSG00000290825.2
3,ENST00000832825.1,ENSG00000290825.2
4,ENST00000832826.1,ENSG00000290825.2


In [50]:
# get gene assignments for each transcript using t2g file
exp_df = exp_df.merge(t2g[['gid', 'transcript_id']],
              how='left', 
              on='transcript_id')

In [51]:
assert len(exp_df.loc[exp_df.gid.isnull()].index) == 0

In [52]:
# drop tid, gb on gid and sum
exp_df.drop('transcript_id', axis=1, inplace=True)
exp_df = exp_df.groupby('gid').sum().reset_index()
exp_df.head()

Unnamed: 0,gid,NA19704_batch11_rep1,NA19332_batch14_rep1,NA19317_batch15_rep1,NA19312_batch05_rep1,NA19323_batch11_rep1,NA19143_batch04_rep3,NA19143_batch15_rep2,HG00326_batch08_rep1,NA18942_batch15_rep1,...,NA19731_batch09_rep1,NA19732_batch14_rep1,HG00525_batch10_rep1,HG00473_batch03_rep1,NA21127_batch04_rep1,NA20342_batch15_rep1,NA20412_batch07_rep1,NA20318_batch07_rep1,NA20298_batch01_rep1,NA20320_batch05_rep1
0,ENSG00000000003.16,31.0,5.0,0.0,14.000527,45.0,88.0,0.0,13.0,0.0,...,0.0,2.0,1.0,27.0,14.0,1.0,13.0,28.0,12.00002,12.99999
1,ENSG00000000005.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ENSG00000000419.14,2334.45539,1795.54352,1400.40445,2132.88953,2103.78422,1360.08493,1276.42764,1472.713011,1397.2174,...,1551.96421,2235.43711,1502.4821,1004.45766,962.10114,1229.1965,1735.5228,2432.45182,1369.26527,2413.30774
3,ENSG00000000457.14,225.56137,257.59018,165.3598,351.4602,188.856,212.4961,271.2117,215.61502,236.790578,...,340.2066,249.75316,162.365,112.53656,142.3209,265.5457,160.06109,239.53843,165.29459,344.41416
4,ENSG00000000460.17,309.438419,347.45108,207.64022,458.54023,310.14371,378.50431,267.78767,230.38537,230.20892,...,352.79262,597.24767,300.63499,108.46337,218.6791,150.45407,229.938904,226.46133,294.7055,326.58617


In [47]:
df.to_csv(ksdljfkl;ajsdkfl;ja)

Unnamed: 0,gid,NA19704_batch11_rep1,NA19332_batch14_rep1,NA19317_batch15_rep1,NA19312_batch05_rep1,NA19323_batch11_rep1,NA19143_batch04_rep3,NA19143_batch15_rep2,HG00326_batch08_rep1,NA18942_batch15_rep1,...,NA19731_batch09_rep1,NA19732_batch14_rep1,HG00525_batch10_rep1,HG00473_batch03_rep1,NA21127_batch04_rep1,NA20342_batch15_rep1,NA20412_batch07_rep1,NA20318_batch07_rep1,NA20298_batch01_rep1,NA20320_batch05_rep1
0,ENSG00000000003.16,31.0,5.0,0.0,14.000527,45.0,88.0,0.0,13.0,0.0,...,0.0,2.0,1.0,27.0,14.0,1.0,13.0,28.0,12.00002,12.99999
1,ENSG00000000005.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ENSG00000000419.14,2334.45539,1795.54352,1400.40445,2132.88953,2103.78422,1360.08493,1276.42764,1472.713011,1397.2174,...,1551.96421,2235.43711,1502.4821,1004.45766,962.10114,1229.1965,1735.5228,2432.45182,1369.26527,2413.30774
3,ENSG00000000457.14,225.56137,257.59018,165.3598,351.4602,188.856,212.4961,271.2117,215.61502,236.790578,...,340.2066,249.75316,162.365,112.53656,142.3209,265.5457,160.06109,239.53843,165.29459,344.41416
4,ENSG00000000460.17,309.438419,347.45108,207.64022,458.54023,310.14371,378.50431,267.78767,230.38537,230.20892,...,352.79262,597.24767,300.63499,108.46337,218.6791,150.45407,229.938904,226.46133,294.7055,326.58617


## try to intersect mage sample names w/ 1000G sample names

In [14]:
meta = pd.read_csv('../1000g/1000G_metadata.tsv', sep='\t', comment='#', header=None)
trios_meta = pd.read_csv('../1000g/1000G_trios_metadata.tsv', sep='\t', comment='#', header=None)

In [18]:
# check sample correspondence between mage and 1000g
df.loc[~(df['cell_line_id'].isin(meta[14].tolist()))&\
       ~(df['cell_line_id'].isin(trios_meta[14].tolist()))]

Unnamed: 0,run_accession,sample_accession,experiment_accession,study_accession,tax_id,scientific_name,instrument_platform,instrument_model,experiment_alias,fastq_bytes,...,bam_ftp,bam_bytes,cell_line_id,batch,rep,r1_fq_link,r2_fq_link,r1_verify,r2_verify,sample


In [None]:
# perfect