In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import gseapy as gp
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand

p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

from scripts.utils import *
from scripts.plotting import *

In [2]:
# Parse the Snakemake YAML configuration; later cells index into `config`
# to look up data and reference path templates.
config_file = '../snakemake/config.yml'
with open(config_file, mode='r') as cfg_handle:
    config = yaml.safe_load(cfg_handle)

In [9]:
def _from_notebook_dir(template, **wildcards):
    """Fill in a config path template and prefix it with '../'.

    `template` is a path string from `config`; `wildcards` are forwarded to
    snakemake's `expand`. Only the first expanded path is returned.
    """
    return '../' + expand(template, **wildcards)[0]


_data_cfg = config['data']
_ref_cfg = config['ref']

# human paths taken from the 'data' and 'ref' sections of the config
ab = _from_notebook_dir(_data_cfg['ab'], species='human')
talon_ab = _from_notebook_dir(_data_cfg['talon_filt_ab'], species='human')
lapa_ab = _from_notebook_dir(_data_cfg['lapa_filt_ab'], species='human')
filt_ab = _from_notebook_dir(_data_cfg['filt_ab'], species='human')
read_annot = _from_notebook_dir(_data_cfg['read_annot'], species='human')
t_metadata = _from_notebook_dir(_ref_cfg['cerberus']['t_info'], species='human')
lib_meta = _from_notebook_dir(_data_cfg['meta'], species='human')
swan_file = _from_notebook_dir(_data_cfg['sg'], species='human')
cerberus_h5 = _from_notebook_dir(_data_cfg['cerb_annot'], species='human')
cerb_t_metadata = _from_notebook_dir(_data_cfg['t_info'], species='human')
major_isos = _from_notebook_dir(_data_cfg['major_isos'], species='human', obs_col='sample')
pp_summary = _from_notebook_dir(_data_cfg['p_pred']['summary'], species='human')
ref_t_metadata = _from_notebook_dir(_ref_cfg['t_info'], species='human')
ref_g_metadata = _from_notebook_dir(_ref_cfg['g_info'], species='human')

# entries under config['data']['spikes'] are used as-is (no wildcard expansion)
_spike_cfg = _data_cfg['spikes']
spike_filt_ab = '../' + _spike_cfg['ca_ab']
spike_ca = '../' + _spike_cfg['ca_annot']
gene_spike_ab = '../' + _spike_cfg['gene_ab']

# analysis settings
ver = 'v40_cerberus'
min_tpm = 1
gene_subset = 'protein_coding'
obs_col = 'sample'
go_gene_subset = 'protein_coding'

# mouse library metadata
m_lib_meta = _from_notebook_dir(_data_cfg['meta'], species='mouse')

## SIRV novel isoform detection

In [10]:
# Read the spike abundance table and hand it to the project helper
# `get_tpm_table` (how='ic', species='spikes', min_tpm=1). The helper returns
# the TPM table together with the ids that were kept.
df, ic_ids = get_tpm_table(
    pd.read_csv(spike_filt_ab, sep='\t'),
    how='ic',
    species='spikes',
    min_tpm=1,
)

Calculating ic TPM values
Enforcing minimum TPM
Total # ics detected: 214
# ics >= 1 tpm: 213
Number of ics reported: 213


  df[tpm_col] = (df[d]*1000000)/df[total_col]
  df[total_col] = df[d].sum()


In [14]:
# add novelty info: keep only the intron chains from the cerberus annotation
# whose Name is among the ids returned by get_tpm_table
ca = cerberus.read(spike_ca)
ic = ca.ic.loc[ca.ic['Name'].isin(ic_ids)].copy(deep=True)

In [17]:
# only the sirvs (rows whose gene_id contains 'SIRV')
ic = ic[ic['gene_id'].str.contains('SIRV')]

In [19]:
# number of intron chains per novelty category
ic.groupby('novelty')[['Name']].count()

Unnamed: 0_level_0,Name
novelty,Unnamed: 1_level_1
ISM,5
Known,79
NIC,3
NNC,38


In [20]:
# peek at the first few intron chains labelled NNC
ic[ic['novelty'].eq('NNC')].head()

Unnamed: 0,Chromosome,Strand,Coordinates,Name,source,novelty,gene_id,ic
178,SIRV2,-,5788-4800-4687-4479-4338-4094-3963-3825-3665-3...,SIRV2_6,lapa,NNC,SIRV2,6
179,SIRV3,+,2005-4568-4779-6057-7986-8124-8207-8755,SIRV3_12,lapa,NNC,SIRV3,12
180,SIRV4,+,1343-1678-1885-2389,SIRV4_8,lapa,NNC,SIRV4,8
181,SIRV4,+,1346-1678-1885-2385,SIRV4_9,lapa,NNC,SIRV4,9
183,SIRV5,+,1087-3298-3404-3483-3643-5380-5450-5543-5626-6...,SIRV5_13,lapa,NNC,SIRV5,13


In [136]:
# Tally transcripts per novelty category. No novelty filter is applied here,
# so 'Known' transcripts are included in the counts.
# NOTE(review): relies on `df` already carrying `annot_transcript_id` and
# `transcript_novelty` columns from an earlier cell — confirm execution order.
temp = (
    df[['annot_transcript_id', 'transcript_novelty']]
    .groupby('transcript_novelty')
    .count()
)
temp

Unnamed: 0_level_0,annot_transcript_id
transcript_novelty,Unnamed: 1_level_1
ISM_rescue,11
Known,255
NIC,21
NNC,14


In [105]:
# Compute per-dataset transcript TPM from the count table in `df`, then keep
# transcripts that reach `min_tpm` in at least one dataset.
# NOTE(review): `df` must be the raw count table here (it needs the
# `annot_transcript_id` column plus the '<dataset>_labeled_<species>' count
# columns). This cell rebinds `df`, so reload the table before re-running it.

# one row per dataset, human first then mouse
meta_df = pd.concat(
    [pd.DataFrame({'dataset': get_datasets(sp)}).assign(species=sp)
     for sp in ('human', 'mouse')],
    axis=0)
meta_df['dataset'] = meta_df['dataset']+'_labeled_'+meta_df['species']

dataset_cols = meta_df.dataset.tolist()
print(len(dataset_cols))
# keep only the datasets that are columns of df; de-duplicate while preserving
# order so the resulting column order is the same on every run
available_cols = set(df.columns)
dataset_cols = [d for d in dict.fromkeys(dataset_cols) if d in available_cols]
print(len(dataset_cols))

id_col = 'annot_transcript_id'

# counts indexed by transcript id; built as a fresh frame rather than by
# adding columns to a slice of df (avoids SettingWithCopyWarning)
counts = df.set_index(id_col)[dataset_cols]

# compute TPM: counts * 1e6 / column total, one '<dataset>_tpm' column each
tpm_cols = ['{}_tpm'.format(d) for d in dataset_cols]
df = counts.mul(1000000).div(counts.sum(axis=0), axis='columns')
df.columns = tpm_cols

# enforce tpm threshold
if min_tpm:
    print('Enforcing minimum TPM')
    print('Total # {} detected: {}'.format('isoforms', len(df.index)))
    df = df.loc[(df >= min_tpm).any(axis=1)]
    print('# {} >= {} tpm: {}'.format('isoforms', min_tpm, len(df.index)))


264
226
Enforcing minimum TPM
Total # isoformss detected: 467
# isoformss >= 1 tpm: 446




In [106]:
# df.head()

In [107]:
# Re-read the spike abundance table, keeping only the id / novelty / gene
# columns. NOTE: this rebinds `ab`, which the path-configuration cell set to a
# file path string.
ab = pd.read_csv(spike_filt_ab, sep='\t').loc[
    :, ['annot_transcript_id', 'transcript_novelty', 'annot_gene_id']
]

In [108]:
# attach novelty and gene id to each transcript: df's index is matched against
# ab's annot_transcript_id column, keeping every row of df
df = pd.merge(
    df,
    ab,
    how='left',
    left_index=True,
    right_on='annot_transcript_id',
)

In [109]:
# sanity check: last few gene ids after the merge
df['annot_gene_id'].tail()

462    gSpikein_ERCC-00165
463    gSpikein_ERCC-00168
464    gSpikein_ERCC-00170
465    gSpikein_ERCC-00170
466    gSpikein_ERCC-00171
Name: annot_gene_id, dtype: object

In [113]:
# transcripts per novelty category, restricted to genes whose id contains 'SIRV'
temp = (
    df.loc[df['annot_gene_id'].str.contains('SIRV')]
    .groupby('transcript_novelty')[['annot_transcript_id']]
    .count()
)
temp

Unnamed: 0_level_0,annot_transcript_id
transcript_novelty,Unnamed: 1_level_1
ISM_rescue,12
Known,242
NIC,31
NNC,20


In [115]:
# ic_id = the part of the transcript id before its first '#'
df['ic_id'] = df['annot_transcript_id'].str.split('#', n=1).str[0]

In [116]:
# distinct ic_ids per novelty category, restricted to genes whose id contains 'SIRV'
temp = (
    df.loc[df['annot_gene_id'].str.contains('SIRV')]
    .groupby('transcript_novelty')[['ic_id']]
    .nunique()
)
temp

Unnamed: 0_level_0,ic_id
transcript_novelty,Unnamed: 1_level_1
ISM_rescue,12
Known,82
NIC,21
NNC,14


In [111]:
# rows whose transcript id contains 'ERCC'
df[df['annot_transcript_id'].str.contains('ERCC')]

Unnamed: 0,right_cardiac_atrium_3_1_labeled_human_tpm,gastroc_4d_f_2_labeled_mouse_tpm,mesenteric_fat_pad_1_1_labeled_human_tpm,psoas_muscle_2_1_labeled_human_tpm,adrenal_4d_m_1_labeled_mouse_tpm,adrenal_25d_f_2_labeled_mouse_tpm,hl60_m1_12hr_1_2_labeled_human_tpm,hl60_m2_24hr_1_2_labeled_human_tpm,hippocampus_5x_m_1_2_labeled_mouse_tpm,k562_3_2_labeled_human_tpm,...,brodmann_area_46_2_1_labeled_human_tpm,adrenal_4d_f_2_labeled_mouse_tpm,hippocampus_5x_f_1_1_labeled_mouse_tpm,brodmann_area_46_3_1_labeled_human_tpm,ovary_1_1_labeled_human_tpm,heart_right_ventricle_3_1_labeled_human_tpm,h9_panc_progen_1_1_labeled_human_tpm,annot_transcript_id,transcript_novelty,annot_gene_id
326,70921.985816,224322.781196,106743.271848,211453.744493,186820.169701,215034.373943,163531.669866,154126.213592,147679.324895,161666.666667,...,186400.318080,225219.102128,29268.292683,176291.598604,96418.732782,203551.912568,47945.205479,tSpikein_ERCC-00002,Known,gSpikein_ERCC-00002
327,0.000000,317.198503,0.000000,6456.602885,149.515942,1070.663812,383.877159,809.061489,0.000000,7222.222222,...,65.538530,1681.273475,2439.024390,85.412596,0.000000,8587.041374,0.000000,tSpikein_ERCC-00002#0,Known,gSpikein_ERCC-00002
328,0.000000,0.000000,0.000000,100.884420,0.000000,169.052181,0.000000,404.530744,0.000000,1111.111111,...,34.953883,107.315328,0.000000,32.029724,0.000000,97.580016,0.000000,tSpikein_ERCC-00003,Known,gSpikein_ERCC-00003
329,4255.319149,7485.884667,5140.610826,5380.502404,6503.943483,6874.788685,2111.324376,2022.653722,7434.197308,6666.666667,...,8288.439440,5151.135754,0.000000,9352.679286,2066.115702,4488.680718,4109.589041,tSpikein_ERCC-00003#0,Known,gSpikein_ERCC-00003
330,0.000000,0.000000,0.000000,33.628140,37.378986,0.000000,0.000000,0.000000,0.000000,0.000000,...,4.369235,35.771776,0.000000,10.676575,0.000000,0.000000,0.000000,tSpikein_ERCC-00003#1,Known,gSpikein_ERCC-00003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
462,0.000000,190.319102,302.388872,437.165820,224.273913,394.455088,191.938580,0.000000,602.772755,555.555556,...,384.492710,357.717761,0.000000,320.297236,688.705234,487.900078,0.000000,tSpikein_ERCC-00165,Known,gSpikein_ERCC-00165
463,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,4.369235,0.000000,0.000000,21.353149,0.000000,0.000000,0.000000,tSpikein_ERCC-00168,Known,gSpikein_ERCC-00168
464,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,13.107706,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,tSpikein_ERCC-00170,Known,gSpikein_ERCC-00170
465,0.000000,63.439701,0.000000,100.884420,74.757971,112.701454,0.000000,0.000000,200.924252,0.000000,...,139.815531,0.000000,0.000000,128.118894,0.000000,97.580016,0.000000,tSpikein_ERCC-00170#0,Known,gSpikein_ERCC-00170


## 230720 ok so the sirv "genes" overlap, but how many genes do we call per sirv "chromosome"?

In [41]:
# load the tab-separated table at `lapa_ab` (rebinds `df`)
df = pd.read_csv(filepath_or_buffer=lapa_ab, sep='\t')

In [40]:
# Transcripts per novelty category.
# These look like they have only been filtered for reproducibility and to
# remove the genomic category. Following the breadcrumb trail confirmed that
# they have indeed been filtered for reproducibility:
# https://sandbox.zenodo.org/record/1095599
df.groupby('transcript_novelty')[['annot_transcript_id']].count()

Unnamed: 0_level_0,annot_transcript_id
transcript_novelty,Unnamed: 1_level_1
Antisense,23316
ISM,80937
Intergenic,1803
Known,193913
NIC,79168
NNC,23323


In [None]:
## 230720 ok so the sirv "genes" overlap, but what do they look like on the browser?

## 230720 Is there only one sirv / ercc gene per "chromosome"? 

In [43]:
# Reference GTF to inspect (the file name suggests GENCODE v29 plus SIRV and
# ERCC spike-ins). The default is a machine-specific absolute path, so it can
# be overridden by setting the SPIKE_GTF environment variable before starting
# the kernel.
spike_gtf = os.environ.get(
    'SPIKE_GTF',
    '/Users/fairliereese/Documents/programming/mortazavi_lab/data/rnawg/refs/gencode_v29_sirv4_ercc.gtf',
)
# read with pyranges and convert to a plain DataFrame (rebinds `df`)
df = pr.read_gtf(spike_gtf, duplicate_attr=True).as_df()

In [46]:
# Keep only GTF rows on ERCC or SIRV "chromosomes" and store Chromosome as
# plain strings. `.assign` builds a new frame, so nothing is written onto a
# filtered slice of the original (which would trigger SettingWithCopyWarning).
df = (
    df.loc[df.Chromosome.str.contains('ERCC|SIRV')]
    .assign(Chromosome=lambda spikes: spikes.Chromosome.astype('str'))
)
df.Chromosome.unique()

array(['ERCC-00002', 'ERCC-00003', 'ERCC-00004', 'ERCC-00009',
       'ERCC-00012', 'ERCC-00013', 'ERCC-00014', 'ERCC-00016',
       'ERCC-00017', 'ERCC-00019', 'ERCC-00022', 'ERCC-00024',
       'ERCC-00025', 'ERCC-00028', 'ERCC-00031', 'ERCC-00033',
       'ERCC-00034', 'ERCC-00035', 'ERCC-00039', 'ERCC-00040',
       'ERCC-00041', 'ERCC-00042', 'ERCC-00043', 'ERCC-00044',
       'ERCC-00046', 'ERCC-00048', 'ERCC-00051', 'ERCC-00053',
       'ERCC-00054', 'ERCC-00057', 'ERCC-00058', 'ERCC-00059',
       'ERCC-00060', 'ERCC-00061', 'ERCC-00062', 'ERCC-00067',
       'ERCC-00069', 'ERCC-00071', 'ERCC-00073', 'ERCC-00074',
       'ERCC-00075', 'ERCC-00076', 'ERCC-00077', 'ERCC-00078',
       'ERCC-00079', 'ERCC-00081', 'ERCC-00083', 'ERCC-00084',
       'ERCC-00085', 'ERCC-00086', 'ERCC-00092', 'ERCC-00095',
       'ERCC-00096', 'ERCC-00097', 'ERCC-00098', 'ERCC-00099',
       'ERCC-00104', 'ERCC-00108', 'ERCC-00109', 'ERCC-00111',
       'ERCC-00112', 'ERCC-00113', 'ERCC-00116', 'ERCC-

In [48]:
# distinct chromosome names containing 'SIRV'
df['Chromosome'][df['Chromosome'].str.contains('SIRV')].unique()

array(['SIRV1', 'SIRV2', 'SIRV3', 'SIRV4', 'SIRV5', 'SIRV6', 'SIRV7',
       'SIRV4001', 'SIRV4002', 'SIRV4003', 'SIRV6001', 'SIRV6002',
       'SIRV6003', 'SIRV8001', 'SIRV8002', 'SIRV8003', 'SIRV10001',
       'SIRV10002', 'SIRV10003', 'SIRV12001', 'SIRV12002', 'SIRV12003'],
      dtype=object)

In [49]:
# distinct chromosome names containing 'ERCC'
df['Chromosome'][df['Chromosome'].str.contains('ERCC')].unique()

array(['ERCC-00002', 'ERCC-00003', 'ERCC-00004', 'ERCC-00009',
       'ERCC-00012', 'ERCC-00013', 'ERCC-00014', 'ERCC-00016',
       'ERCC-00017', 'ERCC-00019', 'ERCC-00022', 'ERCC-00024',
       'ERCC-00025', 'ERCC-00028', 'ERCC-00031', 'ERCC-00033',
       'ERCC-00034', 'ERCC-00035', 'ERCC-00039', 'ERCC-00040',
       'ERCC-00041', 'ERCC-00042', 'ERCC-00043', 'ERCC-00044',
       'ERCC-00046', 'ERCC-00048', 'ERCC-00051', 'ERCC-00053',
       'ERCC-00054', 'ERCC-00057', 'ERCC-00058', 'ERCC-00059',
       'ERCC-00060', 'ERCC-00061', 'ERCC-00062', 'ERCC-00067',
       'ERCC-00069', 'ERCC-00071', 'ERCC-00073', 'ERCC-00074',
       'ERCC-00075', 'ERCC-00076', 'ERCC-00077', 'ERCC-00078',
       'ERCC-00079', 'ERCC-00081', 'ERCC-00083', 'ERCC-00084',
       'ERCC-00085', 'ERCC-00086', 'ERCC-00092', 'ERCC-00095',
       'ERCC-00096', 'ERCC-00097', 'ERCC-00098', 'ERCC-00099',
       'ERCC-00104', 'ERCC-00108', 'ERCC-00109', 'ERCC-00111',
       'ERCC-00112', 'ERCC-00113', 'ERCC-00116', 'ERCC-

In [50]:
# all GTF rows on one example ERCC chromosome
df[df['Chromosome'].eq('ERCC-00002')]

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,transcript_support_level,tag,havana_transcript,exon_number,exon_id,ont,protein_id,ccdsid,gene_biotype,exon_assignment
0,ERCC-00002,ERCC,exon,0,1045,0,+,.,ERCC-00002A,,...,,,,,,,,,ERCC_spike_in,


In [26]:
# number of distinct gene_ids per chromosome, keeping only chromosomes with more than one
temp = (
    df[['Chromosome', 'gene_id']]
    .drop_duplicates()
    .groupby('Chromosome')
    .nunique()
    .reset_index()
    .rename(columns={'gene_id': 'n_genes'})
)
temp = temp[temp['n_genes'] > 1]
# so sirvs 1-6 have more than one "gene" per chromosome, but where are they
# located? are they overlapping?
temp

Unnamed: 0,Chromosome,n_genes
92,SIRV1,2
99,SIRV2,3
100,SIRV3,4
101,SIRV4,3
105,SIRV5,2
106,SIRV6,3


In [30]:
# For every multi-gene chromosome in `temp`, print the smallest and largest
# Start/End coordinate of each of its genes.
for c in temp.Chromosome.unique():
    print()
    print(f'Chromosome {c}')
    for g in df.loc[df.Chromosome == c, 'gene_id'].unique():
        # all Start/End values recorded for this gene_id
        gene_coords = df.loc[df.gene_id == g, ['Start', 'End']]
        min_coord = gene_coords.min().min()
        max_coord = gene_coords.max().max()
        print(f'Gene {g} min coord: {min_coord}')
        print(f'Gene {g} max coord: {max_coord}')

# they do overlap


Chromosome SIRV1
Gene SIRV1B min coord: 10582
Gene SIRV1B max coord: 11643
Gene SIRV1A min coord: 1000
Gene SIRV1A max coord: 11643

Chromosome SIRV2
Gene SIRV2B min coord: 1108
Gene SIRV2B max coord: 1631
Gene SIRV2C min coord: 4033
Gene SIRV2C max coord: 4457
Gene SIRV2A min coord: 1000
Gene SIRV2A max coord: 5911

Chromosome SIRV3
Gene SIRV3A min coord: 1944
Gene SIRV3A max coord: 8939
Gene SIRV3B min coord: 1000
Gene SIRV3B max coord: 1982
Gene SIRV3C min coord: 8759
Gene SIRV3C max coord: 9943
Gene SIRV3D min coord: 4601
Gene SIRV3D max coord: 4762

Chromosome SIRV4
Gene SIRV4C min coord: 1000
Gene SIRV4C max coord: 3403
Gene SIRV4A min coord: 8322
Gene SIRV4A max coord: 15122
Gene SIRV4B min coord: 3637
Gene SIRV4B max coord: 5158

Chromosome SIRV5
Gene SIRV5A min coord: 1000
Gene SIRV5A max coord: 13606
Gene SIRV5B min coord: 2177
Gene SIRV5B max coord: 2406

Chromosome SIRV6
Gene SIRV6A min coord: 1000
Gene SIRV6A max coord: 11837
Gene SIRV6B min coord: 1544
Gene SIRV6B max co

## 230720 are SIRVs / ERCCs in our unfiltered matrix?


In [7]:
# load the tab-separated table at `talon_ab` (rebinds `df`)
df = pd.read_csv(filepath_or_buffer=talon_ab, sep='\t')

In [8]:
# preview the first five rows
df.head(n=5)

Unnamed: 0,gene_ID,transcript_ID,annot_gene_id,annot_transcript_id,annot_gene_name,annot_transcript_name,n_exons,length,gene_novelty,transcript_novelty,...,calu3_1_2,brodmann_area_46_1_1,brodmann_area_46_7_1,brodmann_area_46_5_1,brodmann_area_46_6_1,brodmann_area_46_8_1,brodmann_area_46_4_1,brodmann_area_46_2_1,brodmann_area_46_9_1,brodmann_area_46_3_1
0,3,4,ENSG00000278267.1,ENST00000619216.1,MIR6859-1,MIR6859-1-201,1,68,Known,Known,...,0,0,0,0,0,0,0,0,0,0
1,4,6,ENSG00000243485.5,ENST00000469289.1,MIR1302-2HG,MIR1302-2HG-201,2,535,Known,Known,...,0,0,0,0,0,0,0,0,0,0
2,6,8,ENSG00000237613.2,ENST00000417324.1,FAM138A,FAM138A-201,3,1187,Known,Known,...,0,0,0,0,0,0,0,0,0,0
3,6,9,ENSG00000237613.2,ENST00000461467.1,FAM138A,FAM138A-202,2,590,Known,Known,...,0,0,0,0,0,0,0,0,0,0
4,10,19,ENSG00000238009.6,ENST00000453576.2,AL627309.1,AL627309.1-204,2,336,Known,Known,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# rows whose annot_gene_id contains 'SIRV'
# NOTE(review): the saved output under this cell shows GTF-style columns
# (Chromosome, Source, Feature, ...) rather than the abundance-table columns
# previewed just above, so it looks stale — re-run to confirm.
df[df['annot_gene_id'].str.contains('SIRV')]

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,transcript_support_level,tag,havana_transcript,exon_number,exon_id,ont,protein_id,ccdsid,gene_biotype,exon_assignment
98,SIRV1,LexogenSIRVData,exon,1000,1484,0,-,.,SIRV1A,,...,,,,,,,,,,SIRV101_0
99,SIRV1,LexogenSIRVData,exon,6337,6473,0,-,.,SIRV1A,,...,,,,,,,,,,SIRV101_1
100,SIRV1,LexogenSIRVData,exon,6560,6813,0,-,.,SIRV1A,,...,,,,,,,,,,SIRV101_2
101,SIRV1,LexogenSIRVData,exon,7552,7814,0,-,.,SIRV1A,,...,,,,,,,,,,SIRV101_3
102,SIRV1,LexogenSIRVData,exon,10282,10366,0,-,.,SIRV1A,,...,,,,,,,,,,SIRV101_4
103,SIRV1,LexogenSIRVData,exon,10444,10786,0,-,.,SIRV1A,,...,,,,,,,,,,SIRV101_5
104,SIRV1,LexogenSIRVData,exon,1006,1484,0,-,.,SIRV1A,,...,,,,,,,,,,SIRV102_0
105,SIRV1,LexogenSIRVData,exon,6337,6813,0,-,.,SIRV1A,,...,,,,,,,,,,SIRV102_1
106,SIRV1,LexogenSIRVData,exon,7552,7814,0,-,.,SIRV1A,,...,,,,,,,,,,SIRV102_2
107,SIRV1,LexogenSIRVData,exon,10282,10366,0,-,.,SIRV1A,,...,,,,,,,,,,SIRV102_3
