In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand
import cerberus
import pyranges as pr
import upsetplot

# Append the parent of the current working directory to sys.path.
# Presumably this is what lets the `scripts.*` imports below resolve;
# it relies on the notebook being launched from its own folder -- TODO confirm.
p = os.path.dirname(os.getcwd())
sys.path.append(p)

from scripts.utils import *
from scripts.vcf_utils import *
from scripts.plotting import *

In [2]:
# Path prefix passed to proc_cfg when re-rooting config entries
od = '../'

# Project configuration; indexed below as a nested mapping of file paths
config = load_config()

def proc_cfg(entry, od):
    """Re-root a config path.

    Every occurrence of '../../' is removed from `entry` and the result is
    prefixed with `od`.

    Parameters:
        entry (str): Path string as stored in the config
        od (str): Prefix to prepend

    Returns:
        str: `od` followed by the stripped `entry`
    """
    return od + entry.replace('../../', '')

In [3]:
def clean_figure(ax):
    """Hide the top and right spines of `ax` and rotate its x tick labels by 45 degrees.

    Parameters:
        ax: Axes-like object exposing `spines` and `tick_params`
    """
    for side in ('right', 'top'):
        ax.spines[side].set_visible(False)
    ax.tick_params(axis="x", rotation=45)

## Median # expressed transcripts / sample

In [34]:
# Keep the metadata rows flagged merged_run_mode and build a
# cell_line_id -> sample lookup from them
meta = load_meta()
meta = meta[meta['merged_run_mode'] == True]
sample_d = dict(zip(meta['cell_line_id'], meta['sample']))

# Merged kallisto quantification matrix; its path comes from the config
f = expand(proc_cfg(config['lr']['kallisto']['quant']['merge_matrix_tsv'], od))[0]
df = pd.read_csv(f, sep='\t')

# Column headers: keep the text before the first '_' (except transcript_id),
# then translate the remaining headers to sample names via sample_d
df.columns = [
    col if col == 'transcript_id' else col.split('_')[0]
    for col in df.columns
]
df = df.rename(columns={'transcript_id': 'tid'}).rename(columns=sample_d)

# Long format: one row per (tid, sample) with its count
df = df.melt(id_vars=['tid'], var_name='sample', value_name='counts')

# Attach the gene ID of every transcript from the master table
mt_df = pd.read_csv('../data/05_mastertable/poder_master_table_fixed_genics.tsv', sep='\t')
mt_df['pop_spec_t'] = (mt_df['population_sharing'] == 1) & (mt_df['sample_sharing'] > 1)
df = df.merge(
    mt_df[['isoform', 'geneid.v']],
    how='left',
    left_on='tid',
    right_on='isoform',
)

In [35]:
# Keep only (transcript, sample) rows whose count is at least 1
df = df[df['counts'] >= 1]

In [36]:
# Number of distinct transcripts remaining in each sample
temp = (
    df.loc[:, ['tid', 'sample']]
      .groupby('sample')
      .nunique()
      .reset_index()
      .rename(columns={'tid': 'n_exp_t'})
)

In [37]:
# Median across samples of the per-sample transcript count
temp['n_exp_t'].median()

63011.0

## Median # expressed genes / sample

In [None]:
# NOTE(review): the saved notebook shows no execution count for this cell,
# while the cells after it continue the numbering of the previous section.
# Confirm the gene-level outputs below were produced after re-running this
# load; otherwise they may derive from the already count-filtered table.

# Keep the metadata rows flagged merged_run_mode and build a
# cell_line_id -> sample lookup from them
meta = load_meta()
meta = meta[meta['merged_run_mode'] == True]
sample_d = dict(zip(meta['cell_line_id'], meta['sample']))

# Merged kallisto quantification matrix; its path comes from the config
f = expand(proc_cfg(config['lr']['kallisto']['quant']['merge_matrix_tsv'], od))[0]
df = pd.read_csv(f, sep='\t')

# Column headers: keep the text before the first '_' (except transcript_id),
# then translate the remaining headers to sample names via sample_d
df.columns = [
    col if col == 'transcript_id' else col.split('_')[0]
    for col in df.columns
]
df = df.rename(columns={'transcript_id': 'tid'}).rename(columns=sample_d)

# Long format: one row per (tid, sample) with its count
df = df.melt(id_vars=['tid'], var_name='sample', value_name='counts')

# Attach the gene ID of every transcript from the master table
mt_df = pd.read_csv('../data/05_mastertable/poder_master_table_fixed_genics.tsv', sep='\t')
mt_df['pop_spec_t'] = (mt_df['population_sharing'] == 1) & (mt_df['sample_sharing'] > 1)
df = df.merge(
    mt_df[['isoform', 'geneid.v']],
    how='left',
    left_on='tid',
    right_on='isoform',
)

In [38]:
# Collapse to one summed count per (gene, sample) pair
df = (
    df.loc[:, ['sample', 'geneid.v', 'counts']]
      .groupby(['geneid.v', 'sample'])
      .sum()
      .reset_index()
)

In [39]:
# Keep only (gene, sample) rows whose summed count is at least 1
df = df[df['counts'] >= 1]

In [41]:
# Number of distinct genes remaining in each sample
temp = (
    df.loc[:, ['geneid.v', 'sample']]
      .groupby('sample')
      .nunique()
      .reset_index()
      .rename(columns={'geneid.v': 'n_exp_g'})
)

In [44]:
# Median across samples of the per-sample gene count
temp['n_exp_g'].median()

16247.0

## % of expressed transcripts that are protein coding (PC)

In [46]:
# Keep the metadata rows flagged merged_run_mode and build a
# cell_line_id -> sample lookup from them
meta = load_meta()
meta = meta[meta['merged_run_mode'] == True]
sample_d = dict(zip(meta['cell_line_id'], meta['sample']))

# Merged kallisto quantification matrix; its path comes from the config
f = expand(proc_cfg(config['lr']['kallisto']['quant']['merge_matrix_tsv'], od))[0]
df = pd.read_csv(f, sep='\t')

# Column headers: keep the text before the first '_' (except transcript_id),
# then translate the remaining headers to sample names via sample_d
df.columns = [
    col if col == 'transcript_id' else col.split('_')[0]
    for col in df.columns
]
df = df.rename(columns={'transcript_id': 'tid'}).rename(columns=sample_d)

# Long format: one row per (tid, sample) with its count
df = df.melt(id_vars=['tid'], var_name='sample', value_name='counts')

# Attach gene ID and gene biotype of every transcript from the master table
mt_df = pd.read_csv('../data/05_mastertable/poder_master_table_fixed_genics.tsv', sep='\t')
mt_df['pop_spec_t'] = (mt_df['population_sharing'] == 1) & (mt_df['sample_sharing'] > 1)
df = df.merge(
    mt_df[['isoform', 'geneid.v', 'associated_gene_biotype']],
    how='left',
    left_on='tid',
    right_on='isoform',
)

In [47]:
# Keep only (transcript, sample) rows whose count is at least 1
df = df[df['counts'] >= 1]

In [49]:
# Number of distinct transcripts remaining in each sample
temp = (
    df.loc[:, ['tid', 'sample']]
      .groupby('sample')
      .nunique()
      .reset_index()
      .rename(columns={'tid': 'n_exp_t'})
)
temp.head()

Unnamed: 0,sample,n_exp_t
0,AJI1,56339
1,AJI2,60238
2,AJI3,67098
3,AJI4,54866
4,AJI5,71210


In [50]:
# Number of distinct transcripts per (sample, gene biotype) pair
temp2 = (
    df[['tid', 'associated_gene_biotype', 'sample']]
    .groupby(['sample', 'associated_gene_biotype'])
    .nunique()
    .reset_index()
    .rename(columns={'tid': 'n_exp_biotype_t'})
)
# A bare assignment displays nothing; end the cell with the preview so the
# table saved under this cell is reproduced on a fresh run.
temp2.head()

Unnamed: 0,sample,associated_gene_biotype,n_exp_biotype_t
0,AJI1,Novel/Ambiguous Gene,1310
1,AJI1,Protein Coding,45118
2,AJI1,lncRNA,9911
3,AJI2,Novel/Ambiguous Gene,1581
4,AJI2,Protein Coding,46931


In [51]:
# Attach the per-biotype transcript counts to the per-sample totals.
# The left side is reduced to its per-sample columns (one row per sample)
# before merging, so running this cell again does not create `_x`/`_y`
# suffixed duplicates of the biotype columns. On a first run the result is
# the same as merging `temp` directly.
temp = (
    temp[['sample', 'n_exp_t']]
    .drop_duplicates()
    .merge(temp2, how='outer', on='sample')
)

In [53]:
# Share of each sample's transcripts that fall in each biotype, in percent
temp['perc'] = 100 * (temp['n_exp_biotype_t'] / temp['n_exp_t'])
temp.head()

Unnamed: 0,sample,n_exp_t,associated_gene_biotype,n_exp_biotype_t,perc
0,AJI1,56339,Novel/Ambiguous Gene,1310,2.32521
1,AJI1,56339,Protein Coding,45118,80.083069
2,AJI1,56339,lncRNA,9911,17.591722
3,AJI2,60238,Novel/Ambiguous Gene,1581,2.624589
4,AJI2,60238,Protein Coding,46931,77.909293


In [54]:
# Keep only the 'Protein Coding' row of every sample
temp = temp[temp['associated_gene_biotype'] == 'Protein Coding']
temp.head()

Unnamed: 0,sample,n_exp_t,associated_gene_biotype,n_exp_biotype_t,perc
1,AJI1,56339,Protein Coding,45118,80.083069
4,AJI2,60238,Protein Coding,46931,77.909293
7,AJI3,67098,Protein Coding,53873,80.290024
10,AJI4,54866,Protein Coding,43822,79.870958
13,AJI5,71210,Protein Coding,55948,78.567617


In [55]:
# Median across samples of the protein-coding percentage
temp['perc'].median()

79.19000069827527