In [4]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import gseapy as gp
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand

p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

from proc_revisions.utils import *
from proc_revisions.plotting import *
# from proc_revisions.mane_utils import *

In [5]:
od = '../../proc_revisions/'
config_file = f'{od}/config.yml'
with open(config_file) as f:
    config = yaml.safe_load(f)

In [34]:
ab = od+expand(config['lr']['talon']['fusion_fix']['ab'], species='human')[0]
filt_ab = od+expand(config['lr']['cerberus']['filt_ab'], species='human')[0]
read_annot = od+expand(config['lr']['talon']['full_annot'], species='human')[0]
t_metadata = od+expand(config['ref']['cerberus']['new_gtf_t_info'], species='human')[0]
lib_meta = od+expand(config['lr']['meta'], species='human')[0]
swan_file = od+expand(config['lr']['swan']['sg'], species='human')[0]
cerberus_h5 = od+expand(config['lr']['cerberus']['ca_triplets'], species='human', obs_col='sample')[0]
cerb_t_metadata = od+expand(config['lr']['cerberus']['gtf_t_info'], species='human')[0]
major_isos = od+expand(config['lr']['analysis']['major_isos'], species='human', obs_col='sample')[0]
pi_tpm_table = od+expand(config['lr']['mane']['pi_tpm']['triplet'], species='human', obs_col='sample')[0]

ref_t_metadata = od+expand(config['ref']['new_gtf_t_info'], species='human')[0]
ref_g_metadata = od+expand(config['ref']['new_gtf_g_info'], species='human')[0]
read_annot = od+expand(config['lr']['talon']['full_annot'], species='human')[0]

talon_ref_gtf = od+expand(config['ref']['talon']['gtf'], species='human')[0]
# pp_summary = '../'+expand(config['data']['p_pred']['summary'], species='human')[0]

ver = 'v40_cerberus'
min_tpm = 1
gene_subset = 'polya'
obs_col = 'sample'
go_gene_subset = 'protein_coding'
predom_iso_subset = 'protein_coding'

m_lib_meta = od+expand(config['lr']['meta'], species='mouse')[0]

## Which libraries have spike ins, and which spikeins do they have?

In [8]:
meta_df = pd.read_csv(lib_meta, sep='\t')
meta_df[['dataset', 'spikein_name']].groupby('spikein_name').nunique().reset_index()

Unnamed: 0,spikein_name,dataset
0,"ERCC,SIRV",7
1,"ERCC,SIRV_4",22
2,"SIRV,ERCC",57
3,"SIRV_4,ERCC",23
4,"SIRV_4,LRGASP ERCC",3


## Subset our ref. GTF on the spike ins

In [16]:
# gtf_df = pr.read_gtf(talon_ref_gtf).df

In [17]:
# print(gtf_df.loc[gtf_df.Chromosome.str.contains('ERCC')].Chromosome.unique().tolist()[:10])
# gtf_df.loc[gtf_df.Chromosome.str.contains('SIRV')].Chromosome.unique().tolist()[:10]
# gtf_df = gtf_df.loc[(gtf_df.Chromosome.str.contains('ERCC'))|(gtf_df.Chromosome.str.contains('SIRV'))]
# pr.PyRanges(gtf_df).to_gtf('spikeins.gtf')

In [61]:
spike_df = pr.read_gtf('spikeins.gtf')
spike_df = spike_df.df[['gene_id', 'transcript_id', 'Chromosome']].drop_duplicates()
spike_df = spike_df.loc[spike_df.transcript_id.notnull()]

spike_df['spike_type'] = False
spike_df.loc[spike_df.Chromosome.str.contains('ERCC'), 'spike_type'] = 'ERCC'
spike_df.loc[spike_df.Chromosome.str.contains('SIRV'), 'spike_type'] = 'SIRV'
spike_df.loc[(spike_df.Chromosome.str.contains('00'))&(spike_df.spike_type=='SIRV'), 'spike_type'] = 'SIRV4'

In [62]:
total_spikes = spike_df[['transcript_id', 'spike_type']].groupby('spike_type').nunique().reset_index().rename({'transcript_id':'n_spikes'}, axis=1)
spike_df = spike_df.merge(total_spikes, how='left', on='spike_type')
total_spikes

Unnamed: 0,spike_type,n_spikes
0,ERCC,96
1,SIRV,69
2,SIRV4,15


In [63]:
spike_df.head()

Unnamed: 0,gene_id,transcript_id,Chromosome,spike_type,n_spikes
0,gSpikein_ERCC-00002,tSpikein_ERCC-00002,ERCC-00002,ERCC,96
1,gSpikein_ERCC-00003,tSpikein_ERCC-00003,ERCC-00003,ERCC,96
2,gSpikein_ERCC-00004,tSpikein_ERCC-00004,ERCC-00004,ERCC,96
3,gSpikein_ERCC-00007,tSpikein_ERCC-00007,ERCC-00007,ERCC,96
4,gSpikein_ERCC-00009,tSpikein_ERCC-00009,ERCC-00009,ERCC,96


In [70]:
# read from the unfilt. ab file because it doesn't make sense to use post-LAPA,
# post-Cerb results for the spikes
# ab_df = pd.read_csv(ab, sep='\t')
# ab_df, tids = get_det_table(ab_df, 
#                          how='iso',
#                          min_tpm=min_tpm,
#                          gene_subset=None)
# ab_df.reset_index(inplace=True)
# ab_df.rename({'annot_transcript_id':'transcript_id'}, axis=1, inplace=True)
# ab_df = ab_df.merge(spike_df, how='inner', on='transcript_id')
# ab_df.to_csv('spike_iso_ab.tsv', sep='\t')


In [73]:
ab_df = pd.read_csv('spike_iso_ab.tsv', sep='\t')
ab_df.head()

Unnamed: 0.1,Unnamed: 0,index,transcript_id,brodmann_area_46_3_1,calu3_1_2,h1_1_2,h9_de_1_1,hct116_1_1,hepg2_2_2,hl60_m1_24hr_1_1,...,hl60_m2_72hr_1_2,k562_3_2,lower_lobe_of_right_lung_1_1,ovary_2_1,pgp1_endo_1_1,right_cardiac_atrium_3_1,gene_id,Chromosome,spike_type,n_spikes
0,0,72046,tSpikein_ERCC-00002,16485.117491,2844.978394,4088.946063,10.165424,1071.827607,0.0,209.0306,...,5.893429,115.911172,203.549884,0.0,0.0,21.097598,gSpikein_ERCC-00002,ERCC-00002,ERCC,96
1,1,72047,tSpikein_ERCC-00003,891.114402,121.267279,234.310392,0.406617,26.705469,0.0,5.972303,...,0.0,6.481875,4.581509,0.0,0.0,1.265856,gSpikein_ERCC-00003,ERCC-00003,ERCC,96
2,2,72048,tSpikein_ERCC-00004,3845.862156,456.253129,679.95957,0.406617,140.745039,0.0,9.555685,...,0.0,24.402352,24.216546,0.0,0.0,0.843904,gSpikein_ERCC-00004,ERCC-00004,ERCC,96
3,3,72049,tSpikein_ERCC-00013,0.997888,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,gSpikein_ERCC-00013,ERCC-00013,ERCC,96
4,4,72050,tSpikein_ERCC-00019,16.964104,1.200666,0.0,0.0,0.0,0.0,0.0,...,0.0,0.381287,0.0,0.0,0.0,0.0,gSpikein_ERCC-00019,ERCC-00019,ERCC,96


In [91]:
# loop through each dataset and count # det spikes / dataset 
temp2 = pd.DataFrame()
for d in list(set(get_datasets(species='human'))):
    temp = ab_df[['transcript_id', 'gene_id', 'Chromosome', 'spike_type', 'n_spikes', d]]
    temp.head()
    
    # get detected ones
    temp = temp.loc[temp[d]>=min_tpm]
    temp = temp[['n_spikes', 'spike_type', 'transcript_id']].groupby(['n_spikes', 'spike_type']).nunique().reset_index()
    temp.rename({'transcript_id':'n_det_spikes'}, axis=1, inplace=True)
    temp['dataset'] = d
    temp2 = pd.concat([temp2, temp], axis=0)

In [93]:
temp2.loc[temp2.spike_type.isin(['SIRV', 'SIRV4'])]

Unnamed: 0,n_spikes,spike_type,n_det_spikes,dataset


In [76]:
# ab_df = ab_df.transpose()

In [77]:
# ab_df.head()

In [94]:
# which datasets were supposed to have these spikes in the first place?
df = pd.read_csv(lib_meta, sep='\t')
df2 = pd.read_csv(m_lib_meta, sep='\t')
df = pd.concat([df, df2], axis=0)
df.reset_index(drop=True, inplace=True)
df['spikein_name'] = df.spikein_name.fillna('')

# get melted version
up_df = upsetplot.from_memberships(df.spikein_name.str.split(','), data=df).reset_index()
up_df = up_df.drop_duplicates()
up_df = up_df[['ERCC', 'LRGASP ERCC', 'SIRV', 'SIRV4', 'dataset']]
up_df.head()

Unnamed: 0,ERCC,LRGASP ERCC,SIRV,SIRV4,dataset
0,True,False,False,True,a673_1_1
4,True,False,False,True,a673_1_2
8,False,False,False,False,adrenal_gland_1_1
9,True,False,False,True,adrenal_gland_2_1
10,False,False,False,False,adrenal_gland_3_1


In [95]:
up_df.loc[up_df.dataset=='wtc11_1_1']

Unnamed: 0,ERCC,LRGASP ERCC,SIRV,SIRV4,dataset
235,False,True,False,True,wtc11_1_1
