In [2]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand
import cerberus
import pyranges as pr
import upsetplot

p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

from scripts.utils import *
from scripts.vcf_utils import *
from scripts.plotting import *

In [81]:
# Prefix that later cells prepend to paths taken from the config; empty here.
od = ''
# Project configuration; load_config is provided by the star-imports in the
# import cell (presumably scripts.utils -- verify).
config = load_config()

def proc_cfg(entry, od):
    """
    Turn a config path into a path under `od`.

    Every occurrence of '../../' is removed from `entry`, and the result
    is prefixed with `od` by plain string concatenation (no separator
    is inserted).

    Parameters:
        entry (str): Path string taken from the config
        od (str): Prefix to prepend

    Returns:
        str: `od` followed by the stripped entry
    """
    return od + entry.replace('../../', '')

In [82]:
def clean_figure(ax):
    """
    Hide the right and top spines of `ax` and rotate its x tick labels
    by 45 degrees.

    Parameters:
        ax: Axes-like object exposing `spines` and `tick_params`
    """
    for side in ('right', 'top'):
        ax.spines[side].set_visible(False)
    ax.tick_params(axis="x", rotation=45)

In [83]:
# Read the tab-separated file report for PRJNA851328 into `df`
# (presumably one row per sequencing run -- verify against the file).
meta_file = 'filereport_read_run_PRJNA851328_tsv.txt'

df = pd.read_csv(meta_file, sep='\t')
# experiment_alias is split on '_' into exactly three parts:
# cell line id, batch, replicate.
df[['cell_line_id', 'batch', 'rep']] = df.experiment_alias.str.split('_', expand=True)
print(len(df.index))                   # number of rows
print(len(df.cell_line_id.unique()))   # number of distinct cell lines

# fastq_ftp holds two ';'-separated links: R1 first, then R2.
df[['r1_fq_link', 'r2_fq_link']] = df.fastq_ftp.str.split(';', expand=True)
# na=False makes a missing link count as a failed check; without it
# endswith() yields NaN for missing values and the check would pass silently.
df['r1_verify'] = df.r1_fq_link.str.endswith('_1.fastq.gz', na=False)
df['r2_verify'] = df.r2_fq_link.str.endswith('_2.fastq.gz', na=False)
assert df.r1_verify.all(), 'some R1 links are missing or do not end in _1.fastq.gz'
assert df.r2_verify.all(), 'some R2 links are missing or do not end in _2.fastq.gz'

# The sample name is the full experiment alias.
df['sample'] = df['experiment_alias']

In [84]:
# The cell line id is the part of the sample name before the first '_'.
sample_parts = df['sample'].str.split('_', expand=True)
df['cell_line_id'] = sample_parts[0]
df.loc[:, ['sample', 'cell_line_id']].head()

Unnamed: 0,sample,cell_line_id
0,NA19704_batch11_rep1,NA19704
1,NA19332_batch14_rep1,NA19332
2,NA19317_batch15_rep1,NA19317
3,NA19312_batch05_rep1,NA19312
4,NA19323_batch11_rep1,NA19323


## Filter genes / transcripts used to perform sQTL calling

- genes expressed ≥ 1 TPM in at least 80% of the samples
- with at least two isoforms, each expressed ≥ 0.1 TPM in at least one sample

In [110]:
# Filtering thresholds
min_samp_prop = 0.8   # fraction of samples in which a gene must pass min_g_exp
min_g_exp = 1         # gene-level TPM threshold
min_t_exp = 0.1       # transcript-level TPM threshold
min_n_t = 2           # minimum number of retained transcripts per gene

# Input files, each prefixed with `od`
t2g_file = od + config['ref']['v47_kallisto_short']['t2g']
exp_file = od + config['mage']['v47_kallisto']['merge_matrix_tpm_tsv']
g_exp_file = od + config['mage']['v47_kallisto']['gene_tpm_tsv']

In [98]:
# Gene-level filter: keep genes with TPM >= min_g_exp in at least
# n_sample_cutoff samples.
# The table is held under its own name rather than reusing `df`, because
# `df` carries the sample metadata that the 1000G sample check reads.
g_exp_df = pd.read_csv(g_exp_file, sep='\t').set_index('gid')

# Every remaining column is one sample.
n_samples = len(g_exp_df.columns)
# int() truncates, so the cutoff is the floor of n_samples * min_samp_prop.
n_sample_cutoff = int(n_samples*min_samp_prop)

# Per gene, count the samples passing the expression threshold.
g_n_exp_samples = (g_exp_df >= min_g_exp).sum(axis=1)
print(len(g_n_exp_samples.index))
gids = g_n_exp_samples.loc[g_n_exp_samples >= n_sample_cutoff].index.tolist()
print(len(gids))

78932
12225


In [103]:
# Transcript-level filter: keep transcripts with TPM >= min_t_exp
# in at least one sample.
# The table is held under its own name rather than reusing `df`, because
# `df` carries the sample metadata that the 1000G sample check reads.
t_exp_df = pd.read_csv(exp_file, sep='\t').set_index('transcript_id')

# Per transcript, count the samples passing the expression threshold.
t_n_exp_samples = (t_exp_df >= min_t_exp).sum(axis=1)
print(len(t_n_exp_samples.index))
tids = t_n_exp_samples.loc[t_n_exp_samples >= 1].index.tolist()
print(len(tids))

387944
315312


In [114]:
# Restrict the transcript-to-gene map to transcripts in `tids`
# whose gene is in `gids`. Only the first two columns of the
# headerless t2g file are used: transcript id, then gene id.
t2g_df = (pd.read_csv(t2g_file, sep='\t', header=None)
            .loc[:, [0, 1]]
            .rename(columns={0: 'tid', 1: 'gid'}))

print(len(t2g_df.index))
keep_gene = t2g_df['gid'].isin(gids)
keep_tx = t2g_df['tid'].isin(tids)
t2g_df = t2g_df.loc[keep_gene & keep_tx]
print(len(t2g_df.index))

387944
154491


In [115]:
# Count distinct retained transcripts per gene, then keep genes
# with at least min_n_t of them. After this cell t2g_df holds
# one row per gene with columns gid and n_tid.
t2g_df = (t2g_df.groupby('gid')['tid']
                .nunique()
                .rename('n_tid')
                .reset_index())
print(len(t2g_df.index))
enough_isos = t2g_df['n_tid'] >= min_n_t
t2g_df = t2g_df.loc[enough_isos]
print(len(t2g_df.index))

12225
11199


## Get gene expression values (TPM and counts) 

In [27]:
# tpm_df = pd.read_csv(od+config['mage']['v47_kallisto']['merge_matrix_tpm_tsv'], 
#                         sep='\t')

In [48]:
# Transcript-to-gene map (same config entry as in the filtering section)
t2g_file = od + config['ref']['v47_kallisto_short']['t2g']
# exp_file is re-pointed here at the 'merge_matrix_tsv' config entry
exp_file = od + config['mage']['v47_kallisto']['merge_matrix_tsv']

In [49]:
# Transcript-to-gene map: first two columns of the headerless file,
# named transcript_id and gid.
t2g = (pd.read_csv(t2g_file, sep='\t', header=None)
         .loc[:, [0, 1]]
         .rename(columns={0: 'transcript_id', 1: 'gid'}))
# Transcript-level expression matrix
exp_df = pd.read_csv(exp_file, sep='\t')
t2g.head()

Unnamed: 0,transcript_id,gid
0,ENST00000456328.3,ENSG00000290825.2
1,ENST00000832823.1,ENSG00000290825.2
2,ENST00000832824.1,ENSG00000290825.2
3,ENST00000832825.1,ENSG00000290825.2
4,ENST00000832826.1,ENSG00000290825.2


In [50]:
# Attach a gene id to every transcript row using the t2g map.
# validate='many_to_one' makes the merge raise if a transcript_id occurs
# more than once in t2g; such duplicates would otherwise duplicate
# expression rows and inflate the per-gene sums computed afterwards.
exp_df = exp_df.merge(t2g[['gid', 'transcript_id']],
                      how='left',
                      on='transcript_id',
                      validate='many_to_one')

In [51]:
# Every transcript row must have received a gene id from the merge.
assert not exp_df.gid.isnull().any()

In [52]:
# Collapse to gene level: discard the transcript id column,
# then sum every remaining column within each gid.
exp_df = (exp_df.drop(columns='transcript_id')
                .groupby('gid')
                .sum()
                .reset_index())
exp_df.head()

Unnamed: 0,gid,NA19704_batch11_rep1,NA19332_batch14_rep1,NA19317_batch15_rep1,NA19312_batch05_rep1,NA19323_batch11_rep1,NA19143_batch04_rep3,NA19143_batch15_rep2,HG00326_batch08_rep1,NA18942_batch15_rep1,...,NA19731_batch09_rep1,NA19732_batch14_rep1,HG00525_batch10_rep1,HG00473_batch03_rep1,NA21127_batch04_rep1,NA20342_batch15_rep1,NA20412_batch07_rep1,NA20318_batch07_rep1,NA20298_batch01_rep1,NA20320_batch05_rep1
0,ENSG00000000003.16,31.0,5.0,0.0,14.000527,45.0,88.0,0.0,13.0,0.0,...,0.0,2.0,1.0,27.0,14.0,1.0,13.0,28.0,12.00002,12.99999
1,ENSG00000000005.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ENSG00000000419.14,2334.45539,1795.54352,1400.40445,2132.88953,2103.78422,1360.08493,1276.42764,1472.713011,1397.2174,...,1551.96421,2235.43711,1502.4821,1004.45766,962.10114,1229.1965,1735.5228,2432.45182,1369.26527,2413.30774
3,ENSG00000000457.14,225.56137,257.59018,165.3598,351.4602,188.856,212.4961,271.2117,215.61502,236.790578,...,340.2066,249.75316,162.365,112.53656,142.3209,265.5457,160.06109,239.53843,165.29459,344.41416
4,ENSG00000000460.17,309.438419,347.45108,207.64022,458.54023,310.14371,378.50431,267.78767,230.38537,230.20892,...,352.79262,597.24767,300.63499,108.46337,218.6791,150.45407,229.938904,226.46133,294.7055,326.58617


In [47]:
# NOTE(review): this line is not valid Python (semicolons inside the call
# parentheses, placeholder names that are never defined), so the cell cannot run.
# It looks like a placeholder for writing a table to disk -- TODO replace with
# the intended frame and output path, or remove the cell.
df.to_csv(ksdljfkl;ajsdkfl;ja)

Unnamed: 0,gid,NA19704_batch11_rep1,NA19332_batch14_rep1,NA19317_batch15_rep1,NA19312_batch05_rep1,NA19323_batch11_rep1,NA19143_batch04_rep3,NA19143_batch15_rep2,HG00326_batch08_rep1,NA18942_batch15_rep1,...,NA19731_batch09_rep1,NA19732_batch14_rep1,HG00525_batch10_rep1,HG00473_batch03_rep1,NA21127_batch04_rep1,NA20342_batch15_rep1,NA20412_batch07_rep1,NA20318_batch07_rep1,NA20298_batch01_rep1,NA20320_batch05_rep1
0,ENSG00000000003.16,31.0,5.0,0.0,14.000527,45.0,88.0,0.0,13.0,0.0,...,0.0,2.0,1.0,27.0,14.0,1.0,13.0,28.0,12.00002,12.99999
1,ENSG00000000005.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ENSG00000000419.14,2334.45539,1795.54352,1400.40445,2132.88953,2103.78422,1360.08493,1276.42764,1472.713011,1397.2174,...,1551.96421,2235.43711,1502.4821,1004.45766,962.10114,1229.1965,1735.5228,2432.45182,1369.26527,2413.30774
3,ENSG00000000457.14,225.56137,257.59018,165.3598,351.4602,188.856,212.4961,271.2117,215.61502,236.790578,...,340.2066,249.75316,162.365,112.53656,142.3209,265.5457,160.06109,239.53843,165.29459,344.41416
4,ENSG00000000460.17,309.438419,347.45108,207.64022,458.54023,310.14371,378.50431,267.78767,230.38537,230.20892,...,352.79262,597.24767,300.63499,108.46337,218.6791,150.45407,229.938904,226.46133,294.7055,326.58617


## Intersect MAGE sample names with 1000G sample names

In [14]:
# Both 1000G metadata tables are read the same way: tab-separated,
# lines starting with '#' ignored, no header row (columns are integers).
read_opts = dict(sep='\t', comment='#', header=None)
meta = pd.read_csv('../1000g/1000G_metadata.tsv', **read_opts)
trios_meta = pd.read_csv('../1000g/1000G_trios_metadata.tsv', **read_opts)

In [18]:
# Show the MAGE rows whose cell line id appears in neither 1000G table.
# Column 14 of each table is used as the id column -- presumably the
# sample name; verify against the metadata files.
in_1000g = df['cell_line_id'].isin(meta[14].tolist())
in_trios = df['cell_line_id'].isin(trios_meta[14].tolist())
df.loc[~(in_1000g | in_trios)]

Unnamed: 0,run_accession,sample_accession,experiment_accession,study_accession,tax_id,scientific_name,instrument_platform,instrument_model,experiment_alias,fastq_bytes,...,bam_ftp,bam_bytes,cell_line_id,batch,rep,r1_fq_link,r2_fq_link,r1_verify,r2_verify,sample


In [None]:
# perfect