In [2]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand
import cerberus
import pyranges as pr
import upsetplot

p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

from scripts.utils import *
from scripts.vcf_utils import *
from scripts.plotting import *

In [3]:
# load_config is not defined in this notebook; presumably it comes from one of
# the `scripts.*` star imports above -- TODO confirm
config = load_config()
# prefix that later cells prepend to paths taken from `config` (empty here)
od = ''

def proc_cfg(entry, od):
    """
    Remove every '../../' occurrence from a path string and prefix it with `od`.

    Parameters:
        entry (str): Path string
        od (str): Prefix to put in front of the cleaned path

    Returns:
        (str): `od` followed by `entry` without any '../../'
    """
    return od + entry.replace('../../', '')

In [4]:
def clean_figure(ax):
    """
    Hide the right and top spines of `ax` and rotate its x tick labels by 45.

    Parameters:
        ax: Axes-like object exposing `spines` and `tick_params`
    """
    for side in ('right', 'top'):
        ax.spines[side].set_visible(False)
    ax.tick_params(axis="x", rotation=45)

In [5]:
# run-level report for PRJNA851328 (tab-separated)
meta_file = 'filereport_read_run_PRJNA851328_tsv.txt'
df = pd.read_csv(meta_file, sep='\t')

# experiment_alias is split on '_' into cell line, batch and replicate
df[['cell_line_id', 'batch', 'rep']] = df['experiment_alias'].str.split('_', expand=True)
len(df.index)
len(df.cell_line_id.unique())

# fastq_ftp carries two links separated by ';'
df[['r1_fq_link', 'r2_fq_link']] = df['fastq_ftp'].str.split(';', expand=True)
df['r1_verify'] = df['r1_fq_link'].str.endswith('_1.fastq.gz')
df['r2_verify'] = df['r2_fq_link'].str.endswith('_2.fastq.gz')
# the first link must be the _1 file and the second the _2 file for every row
assert not (df['r1_verify'] == False).any()
assert not (df['r2_verify'] == False).any()

# the sample name is the full experiment alias
df['sample'] = df['experiment_alias']

## Prepare metadata file

In [6]:
# NOTE(review): this re-reads the same report loaded by the earlier metadata
# cell and rebinds `df`, so the columns derived there (cell_line_id, batch, rep,
# r1/r2 links, sample) are not present on the frame produced here.
meta_file = 'filereport_read_run_PRJNA851328_tsv.txt'
df = pd.read_csv(meta_file, sep='\t')

## Annotate each t-g pair based on sQTL calling testability thresholds

In [7]:
def filt_sqtl_transcripts(t_tpm_tsv,
                          t2g,
                          min_samp_prop,
                          min_g_exp,
                          min_t_exp,
                          min_n_t,
                          ofile):
    """
    Annotate every transcript / sample pair with the sQTL testability
    filters and write the long-format table to `ofile`.

    Parameters:
        t_tpm_tsv (str): Tab-separated transcript TPM matrix with a
            `transcript_id` column; every other column is treated as a sample
        t2g (str): Headerless tab-separated file whose first two columns
            are read as transcript ID and gene ID
        min_samp_prop (float): Proportion of samples in which a gene must
            reach `min_g_exp`
        min_g_exp (float): Min. gene TPM, where gene TPM is the sum over the
            gene's transcripts that passed the transcript filter
        min_t_exp (float): Min. TPM a transcript must reach in at least
            one sample
        min_n_t (int): Min. number of passing transcripts per gene
        ofile (str): Path of the tab-separated output table

    Output columns: tid, sample, tpm, gid, t_passed_exp_filt,
    g_passed_n_t_filt, gene_tpm, g_passed_exp_filt,
    g_passed_exp_min_samples_filt. Row counts before / after each filter
    are printed along the way. Nothing is returned.
    """

    # read both inputs once and reuse them for every filter
    t_df = pd.read_csv(t_tpm_tsv, sep='\t').rename({'transcript_id':'tid'}, axis=1)
    t2g_df = pd.read_csv(t2g, sep='\t', header=None)[[0,1]]
    t2g_df.columns = ['tid', 'gid']

    # melted version of the table w/ tid, sample, tpm, gid
    # this is the main output table
    filt_df = (t_df.set_index('tid')
                   .melt(ignore_index=False, value_name='tpm', var_name='sample')
                   .reset_index())
    filt_df = filt_df.merge(t2g_df,
                            how='left',
                            on='tid')

    # isoform expressed >= min_t_exp in at least 1 sample
    n_t_exp_samples = (t_df.set_index('tid') >= min_t_exp).sum(axis=1)
    print(len(n_t_exp_samples.index))
    tids = n_t_exp_samples.loc[n_t_exp_samples >= 1].index.tolist()
    print(len(tids))
    filt_df['t_passed_exp_filt'] = filt_df.tid.isin(tids)

    # using these isoforms, which genes pass the n transcripts / gene
    print(len(t2g_df.index))
    n_t_df = t2g_df.loc[t2g_df.tid.isin(tids)]
    print(len(n_t_df.index))
    # count # isos / gene and filter one more time
    n_t_df = n_t_df.groupby('gid').nunique().reset_index().rename({'tid':'n_tid'}, axis=1)
    n_t_df = n_t_df.loc[n_t_df.n_tid >= min_n_t]
    print(len(n_t_df.index))
    filt_df['g_passed_n_t_filt'] = filt_df.gid.isin(n_t_df.gid.tolist())

    # gene x sample matrix: gene expression re-computed from the
    # transcripts that passed the expression filter
    g_df = t_df.loc[t_df.tid.isin(tids)].merge(t2g_df,
                                               how='left',
                                               on='tid')
    g_df = g_df.drop('tid', axis=1).groupby('gid').sum()

    # per-sample flag: is the re-computed gene expression >= min_g_exp
    g_long_df = g_df.melt(ignore_index=False, value_name='gene_tpm', var_name='sample').reset_index()
    g_long_df['g_passed_exp_filt'] = g_long_df.gene_tpm>=min_g_exp
    filt_df = filt_df.merge(g_long_df,
                            how='left',
                            on=['gid', 'sample'])

    # genes exp. >= min_g_exp in at least min_samp_prop of the samples
    # the number of samples is taken from the gene x sample matrix, whose
    # columns are exactly the samples; int() truncates the cutoff
    n_samples = len(g_df.columns)
    n_sample_cutoff = int(n_samples*min_samp_prop)
    n_g_exp_samples = (g_df >= min_g_exp).sum(axis=1)
    print(len(n_g_exp_samples.index))
    gids = n_g_exp_samples.loc[n_g_exp_samples >= n_sample_cutoff].index.tolist()
    print(len(gids))
    filt_df['g_passed_exp_min_samples_filt'] = filt_df.gid.isin(gids)
    filt_df.to_csv(ofile, sep='\t', index=False)



387944
315312
30599


In [11]:
# using just these isoforms, re-compute gene tpm 
# genes exp. >= 1 TPM in at least 80% of samples
df = pd.read_csv(t_tpm_tsv, sep='\t')
df.rename({'transcript_id':'tid'}, axis=1, inplace=True)
df = df.loc[df.tid.isin(tids)]
t2g_df = pd.read_csv(t2g_file, sep='\t', header=None)
t2g_df = t2g_df[[0,1]]
t2g_df.columns = ['tid', 'gid']
df = df.merge(t2g_df, 
              how='left',
              on='tid')
df = df.drop('tid', axis=1)
# gene x sample matrix indexed by gid
df = df.groupby('gid').sum()
# count the samples from the matrix built in this cell instead of from
# whatever `df` happened to hold before the cell ran
n_samples = len(df.columns)
n_sample_cutoff = int(n_samples*min_samp_prop)
df = df>=min_g_exp
df['n_exp_samples'] = df.sum(axis=1)
print(len(df.index))
df = df.loc[df['n_exp_samples']>=n_sample_cutoff]
print(len(df.index))
gids = df.index.tolist()
filt_df['g_passed_exp_min_samples_filt'] = filt_df.gid.isin(gids)
filt_df.to_csv(ofile, sep='\t', index=False)



68777
12218


FileNotFoundError: [Errno 2] No such file or directory: '../data/mage/thing.tsv'

In [12]:
filt_df.to_csv('../../data/mage/thing.tsv', sep='\t', index=False)


KeyboardInterrupt: 

In [13]:
filt_df.head()

Unnamed: 0,tid,sample,tpm,gid,t_passed_exp_filt,g_passed_n_t_filt,gene_tpm,g_passed_exp_filt,g_passed_exp_min_samples_filt
0,ENST00000000233.10,NA19704_batch11_rep1,106.12,ENSG00000004059.11,True,True,115.487841,True,True
1,ENST00000000412.8,NA19704_batch11_rep1,71.2595,ENSG00000003056.8,True,True,105.136776,True,True
2,ENST00000000442.11,NA19704_batch11_rep1,6.19072,ENSG00000173153.17,True,True,12.247247,True,True
3,ENST00000001008.6,NA19704_batch11_rep1,22.1415,ENSG00000004478.8,True,True,31.879552,True,True
4,ENST00000001146.7,NA19704_batch11_rep1,0.0,ENSG00000003137.9,True,True,0.150489,False,False


In [None]:
g_exp_file = od+config['mage']['v47_kallisto']['gene_tpm_tsv']
exp_file = od+config['mage']['v47_kallisto']['merge_matrix_tpm_tsv']
t2g_file = od+config['ref']['v47_kallisto_short']['t2g']

min_samp_prop = 0.8
min_g_exp = 1
min_t_exp = 0.1
min_n_t = 2

# filt_sqtl_transcripts takes 7 arguments, starting with the transcript TPM
# matrix; it derives gene expression itself, so g_exp_file is not passed
filt_sqtl_transcripts(exp_file,
                      t2g_file,
                      min_samp_prop,
                      min_g_exp,
                      min_t_exp,
                      min_n_t,
                      '../../data/mage/test.tsv')

# # get melted version of table w/ tid, gid, sample, t_exp, n_t (per gene), passed t exp filt, passed g exp filt
# # this is the main output tale
# filt_df = pd.read_csv(exp_file, sep='\t')
# filt_df.rename({'transcript_id':'tid'}, axis=1, inplace=True)
# filt_df = filt_df.set_index('tid').melt(ignore_index=False, value_name='tpm', var_name='sample').reset_index()
# t2g_df = pd.read_csv(t2g_file, sep='\t', header=None)
# t2g_df = t2g_df[[0,1]]
# t2g_df.columns = ['tid', 'gid']
# filt_df = filt_df.merge(t2g_df, 
#                         how='left',
#                         on='tid')

# # genes exp. >= 1 TPM in at least 80% of samples
# df = pd.read_csv(g_exp_file, sep='\t')
# n_samples = len([c for c in df.columns.tolist() if c not in ['gid']])
# n_sample_cutoff = int(n_samples*min_samp_prop)
# df = df.set_index('gid')
# df = df>=min_g_exp
# df['n_exp_samples'] = df.sum(axis=1)
# print(len(df.index))
# df = df.loc[df['n_exp_samples']>=n_sample_cutoff]
# print(len(df.index))
# gids = df.index.tolist()
# filt_df['g_passed_exp_filt'] = filt_df.gid.isin(gids)

# # isoform expressed in at least 1 sample >= 0.1 TPM
# df = pd.read_csv(exp_file, sep='\t')
# df = df.set_index('transcript_id')
# df = df>=min_t_exp
# df['n_exp_samples'] = df.sum(axis=1)
# print(len(df.index))
# df = df.loc[df['n_exp_samples']>=1]
# print(len(df.index))
# tids = df.index.tolist()
# filt_df['t_passed_exp_filt'] = filt_df.tid.isin(tids)

# # of these g/isoform pairs, which ones pass the n transcripts / gene
# t2g_df = pd.read_csv(t2g_file, sep='\t', header=None)
# t2g_df = t2g_df[[0,1]]
# t2g_df.columns = ['tid', 'gid']

# print(len(t2g_df.index))
# t2g_df = t2g_df.loc[(t2g_df.gid.isin(gids))&\
#                     (t2g_df.tid.isin(tids))]
# print(len(t2g_df.index))

# # then count # isos / gene and filter one more time
# t2g_df = t2g_df.groupby('gid').nunique().reset_index().rename({'tid':'n_tid'}, axis=1)
# print(len(t2g_df.index))
# t2g_df = t2g_df.loc[t2g_df.n_tid >= min_n_t]
# print(len(t2g_df.index))
# gids2 = t2g_df.gid.tolist()

# filt_df['g_passed_n_t_filt'] = filt_df.gid.isin(gids2)
# filt_df.head()

78932
12225
387944


387944
154491
12225
11199


Unnamed: 0,tid,sample,tpm,gid,g_passed_exp_filt,t_passed_exp_filt,g_passed_n_t_filt
0,ENST00000000233.10,NA19704_batch11_rep1,106.12,ENSG00000004059.11,True,True,True
1,ENST00000000412.8,NA19704_batch11_rep1,71.2595,ENSG00000003056.8,True,True,True
2,ENST00000000442.11,NA19704_batch11_rep1,6.19072,ENSG00000173153.17,True,True,True
3,ENST00000001008.6,NA19704_batch11_rep1,22.1415,ENSG00000004478.8,True,True,True
4,ENST00000001146.7,NA19704_batch11_rep1,0.0,ENSG00000003137.9,False,True,False


In [6]:
# cell line ID is the part of the sample name before the first '_'
df['cell_line_id'] = df['sample'].str.split('_').str[0]
df[['sample', 'cell_line_id']].head()

Unnamed: 0,sample,cell_line_id
0,NA19704_batch11_rep1,NA19704
1,NA19332_batch14_rep1,NA19332
2,NA19317_batch15_rep1,NA19317
3,NA19312_batch05_rep1,NA19312
4,NA19323_batch11_rep1,NA19323


## Format the file the way sqtlseeker wants
- expression of each transcript in each sample TPM
- columns trId and geneId

In [66]:
t2g_file = od+config['ref']['v47_kallisto_short']['t2g']

In [64]:
df = pd.read_csv('test.tsv', sep='\t')

In [65]:
df.head()

Unnamed: 0,transcript_id,HG03117,NA19785,HG00956,HG01809,HG01970,NA19308,HG01395,HG02941,HG01170,...,HG02574,NA19670,NA18878,HG00331,HG03091,HG00704,HG00705,NA19704,HG04100,HG01515
0,ENST00000000233.10,131.372,112.575,86.5855,100.433,119.743,129.113,82.8047,94.6886,74.2677,...,121.222,91.7148,81.5165,120.051,139.913,80.6047,97.9626,106.12,81.8979,70.5026
1,ENST00000000412.8,84.4788,65.6551,68.6408,62.5138,81.4771,81.6217,68.7051,75.7423,71.8525,...,71.6736,81.4719,68.1853,59.4231,74.7962,67.039,44.6339,71.2595,57.9072,68.5762
2,ENST00000000442.11,10.0302,6.90736,9.12874,12.6433,16.8596,8.20068,8.68647,7.85214,8.61134,...,9.56501,10.0081,8.45307,11.6978,7.45239,5.43539,6.17647,6.19072,8.32944,6.85148
3,ENST00000001008.6,29.0699,33.8503,20.2716,43.3731,32.3596,20.7628,38.8559,24.1062,32.5837,...,33.4447,32.5083,18.6325,38.4592,26.096,17.3489,25.5857,22.1415,23.0492,24.0624
4,ENST00000001146.7,0.0,0.0,0.0,0.0,0.238889,0.744964,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.013434,0.0,0.0,0.0


In [71]:
df.rename({'transcript_id':'trId'}, axis=1, inplace=True)

In [75]:
t2g_df.head()

Unnamed: 0,geneId,trId
0,ENST00000456328.3,ENSG00000290825.2
1,ENST00000832823.1,ENSG00000290825.2
2,ENST00000832824.1,ENSG00000290825.2
3,ENST00000832825.1,ENSG00000290825.2
4,ENST00000832826.1,ENSG00000290825.2


In [76]:
t2g_df = pd.read_csv(t2g_file, sep='\t', header=None)
t2g_df = t2g_df[[0,1]]
t2g_df.columns = ['trId', 'geneId']
# drop gene ID columns left behind by an earlier run of this cell; otherwise
# the merge suffixes them to geneId_x / geneId_y and the check below breaks
df = df.drop(columns=['geneId', 'geneId_x', 'geneId_y'], errors='ignore')
df = df.merge(t2g_df, how='left', 
              on='trId')
# every transcript must have been assigned a gene
assert df.geneId.notna().all()

In [79]:
# TODO(review): the original call had no output path (syntax error); the file
# name below is a placeholder -- confirm the intended location
sqtlseeker_tsv = 'test_sqtlseeker.tsv'
df.to_csv(sqtlseeker_tsv, sep='\t', index=False)

Unnamed: 0,trId,HG03117,NA19785,HG00956,HG01809,HG01970,NA19308,HG01395,HG02941,HG01170,...,NA18878,HG00331,HG03091,HG00704,HG00705,NA19704,HG04100,HG01515,geneId_x,geneId_y


## Filter samples w/ replicates to one w/ highest read depth

In [47]:
# rank replicates on the `merge_matrix_tsv` matrix explicitly rather than on
# whatever `exp_file` currently points to (other cells set it to the TPM matrix)
counts_file = od+config['mage']['v47_kallisto']['merge_matrix_tsv']
df = pd.read_csv(counts_file, sep='\t')
df = df.set_index('transcript_id').transpose()

# counts / sample
df = df.reset_index().rename({'index':'sample'}, axis=1)
df['cell_line_id'] = df['sample'].str.split('_', expand=True)[0]
df = df.set_index(['cell_line_id', 'sample'])
df['total_counts'] = df.sum(axis=1)
df = df[['total_counts']].reset_index()

# dedupe on total counts: after the descending sort, the first row of each
# cell line is its replicate with the highest total
df = df.sort_values(by='total_counts', ascending=False)
print(len(df.index))
df = df.drop_duplicates(subset=['cell_line_id'], keep='first')
print(len(df.index))

df = df[['sample']]
df.to_csv('keep_samples.tsv', sep='\t', index=False)

779
731


In [23]:
# point exp_file at the `merge_matrix_tsv` config entry (not
# `merge_matrix_tpm_tsv`) and load it
exp_file = od+config['mage']['v47_kallisto']['merge_matrix_tsv']
df = pd.read_csv(exp_file, sep='\t')

In [24]:
# flip to one row per sample, one column per transcript
df = df.set_index('transcript_id').T

In [25]:
# counts / sample
df = df.reset_index()
df = df.rename({'index':'sample'}, axis=1)
df['cell_line_id'] = df['sample'].str.split('_', expand=True)[0]
df = df.set_index(['cell_line_id', 'sample'])
df['total_counts'] = df.sum(axis=1)
df.head()

Unnamed: 0_level_0,transcript_id,ENST00000000233.10,ENST00000000412.8,ENST00000000442.11,ENST00000001008.6,ENST00000001146.7,ENST00000002125.9,ENST00000002165.11,ENST00000002501.11,ENST00000002596.6,ENST00000002829.8,...,ENST00000850835.1,ENST00000850836.1,ENST00000850837.1,ENST00000850838.1,ENST00000850839.1,ENST00000850840.1,ENST00000850841.1,ENST00000850842.1,ENST00000850843.1,total_counts
cell_line_id,sample,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
NA19704,NA19704_batch11_rep1,2187.66,4203.78,335.718,2064.24,0.0,322.11,992.572,915.726,15.0,20.0654,...,25.9881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24870250.0
NA19332,NA19332_batch14_rep1,2483.13,5302.25,603.101,1585.8,0.0,176.919,1536.97,158.61,39.5692,0.0,...,11.7222,1.41875,0.0,0.0,6.93641,0.0,0.0,0.0,0.921029,29440930.0
NA19317,NA19317_batch15_rep1,1081.26,3306.58,307.345,1706.69,0.0,169.047,757.023,614.355,60.2248,0.0,...,12.7007,2.52598,0.0,0.0,4.17823,0.0,1.19724,0.0,0.0,20113300.0
NA19312,NA19312_batch05_rep1,4053.85,6273.98,729.524,3544.72,0.0,314.616,1372.78,2229.53,31.6113,1.7321,...,70.3466,2.18983,4.43783,0.0,11.0699,0.0,8.75674,0.0,0.0,42380010.0
NA19323,NA19323_batch11_rep1,2051.01,3338.97,310.898,2224.32,0.0,218.433,539.937,81.6014,15.0,18.7386,...,25.7538,0.0,0.0,0.0,1.90098,0.0,0.0,0.0,0.0,23932700.0


In [26]:
df = df[['total_counts']].reset_index()

In [27]:
# highest total first, so the best replicate of each cell line comes first
df = df.sort_values('total_counts', ascending=False)
print(len(df))

779


In [28]:
df.loc[df.cell_line_id=='NA19143']

transcript_id,cell_line_id,sample,total_counts
6,NA19143,NA19143_batch15_rep2,20299750.0
509,NA19143,NA19143_batch15_rep1,18466670.0
5,NA19143,NA19143_batch04_rep3,18120860.0


In [29]:
# keep the first (= highest total) row of every cell line
df = df.drop_duplicates(subset='cell_line_id')
print(len(df))

731


In [30]:
df.loc[df.cell_line_id=='NA19143']

transcript_id,cell_line_id,sample,total_counts
6,NA19143,NA19143_batch15_rep2,20299750.0


In [31]:
# keep only the sample names and write them one per row (with a header line)
df = df[['sample']]
df.to_csv('keep_samples.tsv', sep='\t', index=False)

In [58]:
# now filter
s_df = pd.read_csv('keep_samples.tsv', sep='\t')
df = pd.read_csv(exp_file, sep='\t')

In [59]:
# sample names to retain, in the order stored in keep_samples.tsv
samples = s_df['sample'].tolist()
samples[:5]

['HG03117_batch07_rep1',
 'NA19785_batch07_rep1',
 'HG00956_batch11_rep1',
 'HG01809_batch09_rep1',
 'HG01970_batch02_rep1']

In [60]:
# every column whose name contains the substring 'id' goes into the index
# NOTE(review): a sample column whose name contains 'id' would be caught too
ind_cols = [c for c in df.columns if 'id' in c]
df = df.set_index(ind_cols)

In [61]:
# restrict to the retained samples, in that order
df = df.loc[:, samples]
# shorten each column name to the part before the first '_'
df.columns = df.columns.str.split('_').str[0]
df = df.reset_index()
df.head()

Unnamed: 0,transcript_id,HG03117,NA19785,HG00956,HG01809,HG01970,NA19308,HG01395,HG02941,HG01170,...,HG02574,NA19670,NA18878,HG00331,HG03091,HG00704,HG00705,NA19704,HG04100,HG01515
0,ENST00000000233.10,131.372,112.575,86.5855,100.433,119.743,129.113,82.8047,94.6886,74.2677,...,121.222,91.7148,81.5165,120.051,139.913,80.6047,97.9626,106.12,81.8979,70.5026
1,ENST00000000412.8,84.4788,65.6551,68.6408,62.5138,81.4771,81.6217,68.7051,75.7423,71.8525,...,71.6736,81.4719,68.1853,59.4231,74.7962,67.039,44.6339,71.2595,57.9072,68.5762
2,ENST00000000442.11,10.0302,6.90736,9.12874,12.6433,16.8596,8.20068,8.68647,7.85214,8.61134,...,9.56501,10.0081,8.45307,11.6978,7.45239,5.43539,6.17647,6.19072,8.32944,6.85148
3,ENST00000001008.6,29.0699,33.8503,20.2716,43.3731,32.3596,20.7628,38.8559,24.1062,32.5837,...,33.4447,32.5083,18.6325,38.4592,26.096,17.3489,25.5857,22.1415,23.0492,24.0624
4,ENST00000001146.7,0.0,0.0,0.0,0.0,0.238889,0.744964,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.013434,0.0,0.0,0.0


In [62]:
len(df.columns)

732

In [63]:
df.to_csv('test.tsv', sep='\t', index=False)

## Filter genes / transcripts used to perform sQTL calling

- genes expressed ≥ 1 TPM in at least 80% of the samples
- with at least two isoforms and a minimum isoform expression of 0.1 TPM

In [40]:
# input paths taken from the config
g_exp_file = od+config['mage']['v47_kallisto']['gene_tpm_tsv']
exp_file = od+config['mage']['v47_kallisto']['merge_matrix_tpm_tsv']
t2g_file = od+config['ref']['v47_kallisto_short']['t2g']

# proportion of samples in which a gene must reach min_g_exp
min_samp_prop = 0.8
# min. gene expression
min_g_exp = 1
# min. transcript expression, required in at least one sample
min_t_exp = 0.1
# min. number of transcripts per gene
min_n_t = 2

In [41]:
# genes exp. >= 1 TPM in at least 80% of samples
df = pd.read_csv(g_exp_file, sep='\t')
# every column other than gid counts as a sample
n_samples = sum(1 for col in df.columns if col != 'gid')
n_sample_cutoff = int(n_samples*min_samp_prop)

# boolean gene x sample matrix: expressed at >= min_g_exp or not
df = df.set_index('gid') >= min_g_exp
df['n_exp_samples'] = df.sum(axis=1)
print(len(df.index))
df = df[df['n_exp_samples'] >= n_sample_cutoff]
print(len(df.index))
gids = list(df.index)

78932
12225


In [42]:
# with at least two isoforms and
# minimum isoform expression of 0.1 TPM
# genes exp. >= 1 TPM in at least 80% of samples
# transcripts expressed at >= min_t_exp in at least one sample
# boolean transcript x sample matrix
df = pd.read_csv(exp_file, sep='\t').set_index('transcript_id') >= min_t_exp
df['n_exp_samples'] = df.sum(axis=1)
print(len(df.index))
df = df[df['n_exp_samples'] >= 1]
print(len(df.index))
tids = list(df.index)

387944
315312


In [43]:
# limit to tids and gids
# first two columns of the headerless t2g file: transcript ID, gene ID
t2g_df = pd.read_csv(t2g_file, sep='\t', header=None)[[0,1]]
t2g_df.columns = ['tid', 'gid']

print(len(t2g_df.index))
# keep pairs whose gene AND transcript both passed their filters
t2g_df = t2g_df[t2g_df['gid'].isin(gids) & t2g_df['tid'].isin(tids)]
print(len(t2g_df.index))

387944
154491


In [44]:
# then count # isos / gene and filter one more time
# number of distinct transcripts left per gene
t2g_df = (t2g_df.groupby('gid')
                .nunique()
                .reset_index()
                .rename(columns={'tid': 'n_tid'}))
print(len(t2g_df.index))
# keep genes with at least min_n_t of them
t2g_df = t2g_df[t2g_df['n_tid'] >= min_n_t]
print(len(t2g_df.index))

12225
11199


In [46]:
# finally, re-read the transcript tpm exp and filt.
df = pd.read_csv(exp_file, sep='\t')


## Get gene expression values (TPM and counts) 

In [27]:
# tpm_df = pd.read_csv(od+config['mage']['v47_kallisto']['merge_matrix_tpm_tsv'], 
#                         sep='\t')

In [48]:
# transcript-level matrix (`merge_matrix_tsv` entry) and transcript-to-gene map
exp_file = od+config['mage']['v47_kallisto']['merge_matrix_tsv']
t2g_file = od+config['ref']['v47_kallisto_short']['t2g']

In [49]:
exp_df = pd.read_csv(exp_file, sep='\t')
# first two columns of the headerless t2g file: transcript ID, gene ID
t2g = pd.read_csv(t2g_file, sep='\t', header=None)[[0,1]]
t2g.columns = ['transcript_id', 'gid']
t2g.head()

Unnamed: 0,transcript_id,gid
0,ENST00000456328.3,ENSG00000290825.2
1,ENST00000832823.1,ENSG00000290825.2
2,ENST00000832824.1,ENSG00000290825.2
3,ENST00000832825.1,ENSG00000290825.2
4,ENST00000832826.1,ENSG00000290825.2


In [50]:
# get gene assignments for each transcript using t2g file
# attach the gene ID of every transcript; rows of exp_df are all kept
exp_df = exp_df.merge(t2g[['gid', 'transcript_id']], on='transcript_id', how='left')

In [51]:
assert len(exp_df.loc[exp_df.gid.isnull()].index) == 0

In [52]:
# drop tid, gb on gid and sum
# gene-level values = sum over each gene's transcripts, per sample
exp_df = (exp_df.drop(columns='transcript_id')
                .groupby('gid')
                .sum()
                .reset_index())
exp_df.head()

Unnamed: 0,gid,NA19704_batch11_rep1,NA19332_batch14_rep1,NA19317_batch15_rep1,NA19312_batch05_rep1,NA19323_batch11_rep1,NA19143_batch04_rep3,NA19143_batch15_rep2,HG00326_batch08_rep1,NA18942_batch15_rep1,...,NA19731_batch09_rep1,NA19732_batch14_rep1,HG00525_batch10_rep1,HG00473_batch03_rep1,NA21127_batch04_rep1,NA20342_batch15_rep1,NA20412_batch07_rep1,NA20318_batch07_rep1,NA20298_batch01_rep1,NA20320_batch05_rep1
0,ENSG00000000003.16,31.0,5.0,0.0,14.000527,45.0,88.0,0.0,13.0,0.0,...,0.0,2.0,1.0,27.0,14.0,1.0,13.0,28.0,12.00002,12.99999
1,ENSG00000000005.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ENSG00000000419.14,2334.45539,1795.54352,1400.40445,2132.88953,2103.78422,1360.08493,1276.42764,1472.713011,1397.2174,...,1551.96421,2235.43711,1502.4821,1004.45766,962.10114,1229.1965,1735.5228,2432.45182,1369.26527,2413.30774
3,ENSG00000000457.14,225.56137,257.59018,165.3598,351.4602,188.856,212.4961,271.2117,215.61502,236.790578,...,340.2066,249.75316,162.365,112.53656,142.3209,265.5457,160.06109,239.53843,165.29459,344.41416
4,ENSG00000000460.17,309.438419,347.45108,207.64022,458.54023,310.14371,378.50431,267.78767,230.38537,230.20892,...,352.79262,597.24767,300.63499,108.46337,218.6791,150.45407,229.938904,226.46133,294.7055,326.58617


In [47]:
# FIXME(review): the argument below is placeholder text and the line is not
# valid Python; the intended output path (and which frame should be written)
# cannot be determined from this notebook.
df.to_csv(ksdljfkl;ajsdkfl;ja)

Unnamed: 0,gid,NA19704_batch11_rep1,NA19332_batch14_rep1,NA19317_batch15_rep1,NA19312_batch05_rep1,NA19323_batch11_rep1,NA19143_batch04_rep3,NA19143_batch15_rep2,HG00326_batch08_rep1,NA18942_batch15_rep1,...,NA19731_batch09_rep1,NA19732_batch14_rep1,HG00525_batch10_rep1,HG00473_batch03_rep1,NA21127_batch04_rep1,NA20342_batch15_rep1,NA20412_batch07_rep1,NA20318_batch07_rep1,NA20298_batch01_rep1,NA20320_batch05_rep1
0,ENSG00000000003.16,31.0,5.0,0.0,14.000527,45.0,88.0,0.0,13.0,0.0,...,0.0,2.0,1.0,27.0,14.0,1.0,13.0,28.0,12.00002,12.99999
1,ENSG00000000005.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ENSG00000000419.14,2334.45539,1795.54352,1400.40445,2132.88953,2103.78422,1360.08493,1276.42764,1472.713011,1397.2174,...,1551.96421,2235.43711,1502.4821,1004.45766,962.10114,1229.1965,1735.5228,2432.45182,1369.26527,2413.30774
3,ENSG00000000457.14,225.56137,257.59018,165.3598,351.4602,188.856,212.4961,271.2117,215.61502,236.790578,...,340.2066,249.75316,162.365,112.53656,142.3209,265.5457,160.06109,239.53843,165.29459,344.41416
4,ENSG00000000460.17,309.438419,347.45108,207.64022,458.54023,310.14371,378.50431,267.78767,230.38537,230.20892,...,352.79262,597.24767,300.63499,108.46337,218.6791,150.45407,229.938904,226.46133,294.7055,326.58617


## try to intersect mage sample names w/ 1000G sample names

In [14]:
# both tables are read without a header row, so their columns are addressed by
# position; lines starting with '#' are skipped
meta = pd.read_csv('../1000g/1000G_metadata.tsv', sep='\t', comment='#', header=None)
trios_meta = pd.read_csv('../1000g/1000G_trios_metadata.tsv', sep='\t', comment='#', header=None)

In [18]:
# check sample correspondence between mage and 1000g
# rows whose cell line ID is found in column 14 of neither 1000G table
# (presumably the sample ID column -- TODO confirm); an empty result means
# every cell line is accounted for
df.loc[~(df['cell_line_id'].isin(meta[14].tolist()) |
         df['cell_line_id'].isin(trios_meta[14].tolist()))]

Unnamed: 0,run_accession,sample_accession,experiment_accession,study_accession,tax_id,scientific_name,instrument_platform,instrument_model,experiment_alias,fastq_bytes,...,bam_ftp,bam_bytes,cell_line_id,batch,rep,r1_fq_link,r2_fq_link,r1_verify,r2_verify,sample


In [None]:
# perfect