In [2]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import gseapy as gp
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand

# Append the directory two levels above the current working directory to the
# import path (presumably so the `scripts` imports that follow can resolve).
parent_dir = os.path.dirname(os.getcwd())
p = os.path.dirname(parent_dir)
sys.path.append(p)

from scripts.utils import *
from scripts.plotting import *

In [3]:
# Parse the Snakemake YAML configuration into a plain Python object.
config_file = '../snakemake/config.yml'
with open(config_file, 'r') as f:
    config = yaml.safe_load(f)

In [5]:
# Human input / reference files. Each config entry is a template that
# `expand` fills in; the first expansion is prefixed with '../'.
ab = f"../{expand(config['data']['ab'], species='human')[0]}"
talon_ab = f"../{expand(config['data']['talon_filt_ab'], species='human')[0]}"
lapa_ab = f"../{expand(config['data']['lapa_filt_ab'], species='human')[0]}"
filt_ab = f"../{expand(config['data']['filt_ab'], species='human')[0]}"
read_annot = f"../{expand(config['data']['read_annot'], species='human')[0]}"
t_metadata = f"../{expand(config['ref']['cerberus']['t_info'], species='human')[0]}"
lib_meta = f"../{expand(config['data']['meta'], species='human')[0]}"
swan_file = f"../{expand(config['data']['sg'], species='human')[0]}"
cerberus_h5 = f"../{expand(config['data']['cerb_annot'], species='human')[0]}"
cerb_t_metadata = f"../{expand(config['data']['t_info'], species='human')[0]}"
major_isos = f"../{expand(config['data']['major_isos'], species='human', obs_col='sample')[0]}"
pp_summary = f"../{expand(config['data']['p_pred']['summary'], species='human')[0]}"
ref_t_metadata = f"../{expand(config['ref']['t_info'], species='human')[0]}"
ref_g_metadata = f"../{expand(config['ref']['g_info'], species='human')[0]}"

# Spike-in tables: these config entries are used as-is (no wildcard expansion).
spike_filt_ab = f"../{config['data']['spikes']['filt_ab']}"
gene_spike_ab = f"../{config['data']['spikes']['gene_ab']}"

# Analysis parameters.
ver = 'v40_cerberus'
min_tpm = 1
gene_subset = 'protein_coding'
obs_col = 'sample'
go_gene_subset = 'protein_coding'

# Mouse library metadata (same config entry as `lib_meta`, species='mouse').
m_lib_meta = f"../{expand(config['data']['meta'], species='mouse')[0]}"

## SIRV novel isoform detection

In [8]:
# Display the resolved path of the filtered spike-in abundance table.
spike_filt_ab

'../data/spike/talon/annot_talon_abundance_filtered.tsv'

In [9]:
# Load the tab-separated filtered spike-in abundance table.
df = pd.read_table(spike_filt_ab)

In [19]:
# Preview the last rows of the frame currently bound to `df`.
df.tail()

Unnamed: 0,gene_ID,transcript_ID,annot_gene_id,annot_transcript_id,annot_gene_name,annot_transcript_name,n_exons,length,gene_novelty,transcript_novelty,...,cortex_wt_f_1_1_labeled_mouse,cortex_wt_f_1_2_labeled_mouse,cortex_wt_m_2_1_labeled_mouse,cortex_wt_m_2_2_labeled_mouse,hippocampus_wt_f_1_1_labeled_mouse,hippocampus_wt_f_1_2_labeled_mouse,hippocampus_5x_m_1_1_labeled_mouse,hippocampus_5x_m_1_2_labeled_mouse,cortex_5x_m_1_1_labeled_mouse,cortex_5x_m_1_2_labeled_mouse
350,11,3633,SIRV6.1,spikesT000003633,spikesG000000011,spikesT000003633,3,788,Known,ISM,...,0,0,0,0,0,0,0,0,0,3
351,11,3637,SIRV6.1,spikesT000003637,spikesG000000011,spikesT000003637,2,658,Known,ISM,...,0,0,0,0,0,0,0,0,0,1
352,11,3677,SIRV6.1,spikesT000003677,spikesG000000011,spikesT000003677,2,1356,Known,ISM,...,0,0,1,2,0,0,0,1,1,0
353,11,3680,SIRV6.1,spikesT000003680,spikesG000000011,spikesT000003680,1,302,Known,ISM,...,0,0,0,0,0,0,0,0,0,1
354,196,3735,spikesG000000196,spikesT000003735,spikesG000000196,spikesT000003735,1,729,Antisense,Antisense,...,0,0,0,1,0,0,0,0,0,0


In [15]:
# Count transcripts per novelty category, leaving out the 'Known' ones.
temp = (
    df.loc[df.transcript_novelty != 'Known', ['annot_transcript_id', 'transcript_novelty']]
    .groupby('transcript_novelty')
    .count()
)
temp

Unnamed: 0_level_0,annot_transcript_id
transcript_novelty,Unnamed: 1_level_1
Antisense,9
ISM,141
NIC,21
NNC,14


## 230720 ok so the sirv "genes" overlap, but how many genes do we call per sirv "chromosome"?

In [41]:
# Load the tab-separated table at `lapa_ab`.
df = pd.read_table(lapa_ab)

In [40]:
# Number of transcripts in each novelty category.
# These look like they have only been filtered for reproducibility and to remove
# genomic transcripts (presumably the 'Genomic' novelty category).
# Following the breadcrumb trail confirms the reproducibility filter:
# https://sandbox.zenodo.org/record/1095599
df.groupby('transcript_novelty')[['annot_transcript_id']].count()

Unnamed: 0_level_0,annot_transcript_id
transcript_novelty,Unnamed: 1_level_1
Antisense,23316
ISM,80937
Intergenic,1803
Known,193913
NIC,79168
NNC,23323


In [None]:
## 230720 ok so the sirv "genes" overlap, but what do they look like on the browser?

## 230720 Is there only one sirv / ercc gene per "chromosome"? 

In [43]:
# Annotation GTF that includes the SIRV / ERCC spike-in records.
# The original cell hard-coded one user's absolute path, which only resolves on
# that machine. Set the SPIKE_GTF environment variable to point at a local copy;
# the original location is kept as the default so existing runs are unchanged.
# TODO: ideally this path would come from the Snakemake config like the others.
spike_gtf = os.environ.get(
    'SPIKE_GTF',
    '/Users/fairliereese/Documents/programming/mortazavi_lab/data/rnawg/refs/gencode_v29_sirv4_ercc.gtf',
)
# NOTE(review): `pr` is not imported explicitly in the import cell; presumably it
# is pyranges brought in by one of the star-imports — confirm.
df = pr.read_gtf(spike_gtf, duplicate_attr=True).as_df()

In [46]:
# Keep only records on ERCC / SIRV spike-in chromosomes and cast the chromosome
# column to plain strings (presumably it is categorical coming out of the GTF
# reader — confirm).
# The filtered frame and the cast column are built in one expression with
# `.assign`, so no column is assigned onto a boolean-filtered slice (the pattern
# that raises SettingWithCopyWarning).
df = (
    df.loc[df.Chromosome.str.contains('ERCC') | df.Chromosome.str.contains('SIRV')]
    .assign(Chromosome=lambda spikes: spikes.Chromosome.astype('str'))
)
df.Chromosome.unique()

array(['ERCC-00002', 'ERCC-00003', 'ERCC-00004', 'ERCC-00009',
       'ERCC-00012', 'ERCC-00013', 'ERCC-00014', 'ERCC-00016',
       'ERCC-00017', 'ERCC-00019', 'ERCC-00022', 'ERCC-00024',
       'ERCC-00025', 'ERCC-00028', 'ERCC-00031', 'ERCC-00033',
       'ERCC-00034', 'ERCC-00035', 'ERCC-00039', 'ERCC-00040',
       'ERCC-00041', 'ERCC-00042', 'ERCC-00043', 'ERCC-00044',
       'ERCC-00046', 'ERCC-00048', 'ERCC-00051', 'ERCC-00053',
       'ERCC-00054', 'ERCC-00057', 'ERCC-00058', 'ERCC-00059',
       'ERCC-00060', 'ERCC-00061', 'ERCC-00062', 'ERCC-00067',
       'ERCC-00069', 'ERCC-00071', 'ERCC-00073', 'ERCC-00074',
       'ERCC-00075', 'ERCC-00076', 'ERCC-00077', 'ERCC-00078',
       'ERCC-00079', 'ERCC-00081', 'ERCC-00083', 'ERCC-00084',
       'ERCC-00085', 'ERCC-00086', 'ERCC-00092', 'ERCC-00095',
       'ERCC-00096', 'ERCC-00097', 'ERCC-00098', 'ERCC-00099',
       'ERCC-00104', 'ERCC-00108', 'ERCC-00109', 'ERCC-00111',
       'ERCC-00112', 'ERCC-00113', 'ERCC-00116', 'ERCC-

In [48]:
# Distinct chromosome names that contain 'SIRV'.
df.Chromosome[df.Chromosome.str.contains('SIRV')].unique()

array(['SIRV1', 'SIRV2', 'SIRV3', 'SIRV4', 'SIRV5', 'SIRV6', 'SIRV7',
       'SIRV4001', 'SIRV4002', 'SIRV4003', 'SIRV6001', 'SIRV6002',
       'SIRV6003', 'SIRV8001', 'SIRV8002', 'SIRV8003', 'SIRV10001',
       'SIRV10002', 'SIRV10003', 'SIRV12001', 'SIRV12002', 'SIRV12003'],
      dtype=object)

In [49]:
# Distinct chromosome names that contain 'ERCC'.
df.Chromosome[df.Chromosome.str.contains('ERCC')].unique()

array(['ERCC-00002', 'ERCC-00003', 'ERCC-00004', 'ERCC-00009',
       'ERCC-00012', 'ERCC-00013', 'ERCC-00014', 'ERCC-00016',
       'ERCC-00017', 'ERCC-00019', 'ERCC-00022', 'ERCC-00024',
       'ERCC-00025', 'ERCC-00028', 'ERCC-00031', 'ERCC-00033',
       'ERCC-00034', 'ERCC-00035', 'ERCC-00039', 'ERCC-00040',
       'ERCC-00041', 'ERCC-00042', 'ERCC-00043', 'ERCC-00044',
       'ERCC-00046', 'ERCC-00048', 'ERCC-00051', 'ERCC-00053',
       'ERCC-00054', 'ERCC-00057', 'ERCC-00058', 'ERCC-00059',
       'ERCC-00060', 'ERCC-00061', 'ERCC-00062', 'ERCC-00067',
       'ERCC-00069', 'ERCC-00071', 'ERCC-00073', 'ERCC-00074',
       'ERCC-00075', 'ERCC-00076', 'ERCC-00077', 'ERCC-00078',
       'ERCC-00079', 'ERCC-00081', 'ERCC-00083', 'ERCC-00084',
       'ERCC-00085', 'ERCC-00086', 'ERCC-00092', 'ERCC-00095',
       'ERCC-00096', 'ERCC-00097', 'ERCC-00098', 'ERCC-00099',
       'ERCC-00104', 'ERCC-00108', 'ERCC-00109', 'ERCC-00111',
       'ERCC-00112', 'ERCC-00113', 'ERCC-00116', 'ERCC-

In [50]:
# All records on one example ERCC chromosome.
df[df.Chromosome == 'ERCC-00002']

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,transcript_support_level,tag,havana_transcript,exon_number,exon_id,ont,protein_id,ccdsid,gene_biotype,exon_assignment
0,ERCC-00002,ERCC,exon,0,1045,0,+,.,ERCC-00002A,,...,,,,,,,,,ERCC_spike_in,


In [26]:
# Number of distinct gene_ids per chromosome, keeping only chromosomes that
# carry more than one.
temp = (
    df[['Chromosome', 'gene_id']]
    .drop_duplicates()
    .groupby('Chromosome')
    .nunique()
    .reset_index()
    .rename({'gene_id': 'n_genes'}, axis=1)
    .loc[lambda per_chrom: per_chrom.n_genes > 1]
)
# So SIRVs 1-6 have more than one "gene" per chromosome, but where are they
# located? Are they overlapping?
temp

Unnamed: 0,Chromosome,n_genes
92,SIRV1,2
99,SIRV2,3
100,SIRV3,4
101,SIRV4,3
105,SIRV5,2
106,SIRV6,3


In [30]:
# For each chromosome listed in `temp`, print the smallest and largest
# Start/End coordinate of every gene found on it.
for c in temp.Chromosome.unique():
    temp2 = df.loc[df.Chromosome == c]
    print()
    print(f'Chromosome {c}')
    for g in temp2.gene_id.unique():
        # Coordinates come from every row of `df` with this gene_id, not only
        # from the rows on chromosome `c`.
        gene_coords = df.loc[df.gene_id == g, ['Start', 'End']]
        min_coord = gene_coords.min().min()
        max_coord = gene_coords.max().max()
        print(f'Gene {g} min coord: {min_coord}')
        print(f'Gene {g} max coord: {max_coord}')

# Conclusion from the printout: the genes do overlap.


Chromosome SIRV1
Gene SIRV1B min coord: 10582
Gene SIRV1B max coord: 11643
Gene SIRV1A min coord: 1000
Gene SIRV1A max coord: 11643

Chromosome SIRV2
Gene SIRV2B min coord: 1108
Gene SIRV2B max coord: 1631
Gene SIRV2C min coord: 4033
Gene SIRV2C max coord: 4457
Gene SIRV2A min coord: 1000
Gene SIRV2A max coord: 5911

Chromosome SIRV3
Gene SIRV3A min coord: 1944
Gene SIRV3A max coord: 8939
Gene SIRV3B min coord: 1000
Gene SIRV3B max coord: 1982
Gene SIRV3C min coord: 8759
Gene SIRV3C max coord: 9943
Gene SIRV3D min coord: 4601
Gene SIRV3D max coord: 4762

Chromosome SIRV4
Gene SIRV4C min coord: 1000
Gene SIRV4C max coord: 3403
Gene SIRV4A min coord: 8322
Gene SIRV4A max coord: 15122
Gene SIRV4B min coord: 3637
Gene SIRV4B max coord: 5158

Chromosome SIRV5
Gene SIRV5A min coord: 1000
Gene SIRV5A max coord: 13606
Gene SIRV5B min coord: 2177
Gene SIRV5B max coord: 2406

Chromosome SIRV6
Gene SIRV6A min coord: 1000
Gene SIRV6A max coord: 11837
Gene SIRV6B min coord: 1544
Gene SIRV6B max co

## 230720 are SIRVs / ERCCs in our unfiltered matrix?


In [7]:
# Load the tab-separated table at `talon_ab`.
df = pd.read_table(talon_ab)

In [8]:
# Preview the first rows of the frame currently bound to `df`.
df.head()

Unnamed: 0,gene_ID,transcript_ID,annot_gene_id,annot_transcript_id,annot_gene_name,annot_transcript_name,n_exons,length,gene_novelty,transcript_novelty,...,calu3_1_2,brodmann_area_46_1_1,brodmann_area_46_7_1,brodmann_area_46_5_1,brodmann_area_46_6_1,brodmann_area_46_8_1,brodmann_area_46_4_1,brodmann_area_46_2_1,brodmann_area_46_9_1,brodmann_area_46_3_1
0,3,4,ENSG00000278267.1,ENST00000619216.1,MIR6859-1,MIR6859-1-201,1,68,Known,Known,...,0,0,0,0,0,0,0,0,0,0
1,4,6,ENSG00000243485.5,ENST00000469289.1,MIR1302-2HG,MIR1302-2HG-201,2,535,Known,Known,...,0,0,0,0,0,0,0,0,0,0
2,6,8,ENSG00000237613.2,ENST00000417324.1,FAM138A,FAM138A-201,3,1187,Known,Known,...,0,0,0,0,0,0,0,0,0,0
3,6,9,ENSG00000237613.2,ENST00000461467.1,FAM138A,FAM138A-202,2,590,Known,Known,...,0,0,0,0,0,0,0,0,0,0
4,10,19,ENSG00000238009.6,ENST00000453576.2,AL627309.1,AL627309.1-204,2,336,Known,Known,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Rows whose annotated gene id contains 'SIRV'.
# NOTE(review): the saved output under this cell lists GTF-style columns
# (Chromosome, Source, Feature, ...) and no `annot_gene_id`, so it looks stale
# relative to this code — re-run to confirm.
df[df.annot_gene_id.str.contains('SIRV')]

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,transcript_support_level,tag,havana_transcript,exon_number,exon_id,ont,protein_id,ccdsid,gene_biotype,exon_assignment
98,SIRV1,LexogenSIRVData,exon,1000,1484,0,-,.,SIRV1A,,...,,,,,,,,,,SIRV101_0
99,SIRV1,LexogenSIRVData,exon,6337,6473,0,-,.,SIRV1A,,...,,,,,,,,,,SIRV101_1
100,SIRV1,LexogenSIRVData,exon,6560,6813,0,-,.,SIRV1A,,...,,,,,,,,,,SIRV101_2
101,SIRV1,LexogenSIRVData,exon,7552,7814,0,-,.,SIRV1A,,...,,,,,,,,,,SIRV101_3
102,SIRV1,LexogenSIRVData,exon,10282,10366,0,-,.,SIRV1A,,...,,,,,,,,,,SIRV101_4
103,SIRV1,LexogenSIRVData,exon,10444,10786,0,-,.,SIRV1A,,...,,,,,,,,,,SIRV101_5
104,SIRV1,LexogenSIRVData,exon,1006,1484,0,-,.,SIRV1A,,...,,,,,,,,,,SIRV102_0
105,SIRV1,LexogenSIRVData,exon,6337,6813,0,-,.,SIRV1A,,...,,,,,,,,,,SIRV102_1
106,SIRV1,LexogenSIRVData,exon,7552,7814,0,-,.,SIRV1A,,...,,,,,,,,,,SIRV102_2
107,SIRV1,LexogenSIRVData,exon,10282,10366,0,-,.,SIRV1A,,...,,,,,,,,,,SIRV102_3
