In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import gseapy as gp

# Make the directory two levels above the current working directory
# importable, so that the project-local `scripts` package resolves.
p = os.path.normpath(os.path.join(os.getcwd(), os.pardir, os.pardir))
sys.path.append(p)

from scripts.utils import *
from scripts.plotting import *

In [15]:
# Load the TALON abundance table and hand it to the project helper
# `get_tpm_table` (star-imported from `scripts`), requesting isoform-level
# values for the brain sample, polyA genes, and a minimum of 1 TPM.
# The helper's printed log reports the TPM calculation and the filters applied.
df = pd.read_csv('human_talon_abundance.tsv', sep='\t')
df, inds = get_tpm_table(
    df,
    how='iso',
    min_tpm=1,
    sample='brain',
    gene_subset='polya',
)

Calculating iso TPM values
Subsetting for polya genes
Enforcing minimum TPM
Total # isos detected: 5401754
# isos >= 1 tpm: 235794
Applying gene type and novelty subset
Number of isos reported: 151058


In [16]:
# Transcript-ID -> gene-name lookup table.
# Only two columns are needed, so let the parser skip the rest via `usecols`
# instead of loading the whole abundance table and discarding most of it.
gene_cols = ['annot_gene_name', 'annot_transcript_id']
gene_df = pd.read_csv('human_talon_abundance.tsv', sep='\t', usecols=gene_cols)
# `usecols` keeps the file's own column order; re-select to pin the order.
gene_df = gene_df[gene_cols]

In [17]:
# Display the transcript-ID -> gene-name lookup table (pandas truncates the
# middle rows of a long frame when rendering it).
gene_df

Unnamed: 0,annot_gene_name,annot_transcript_id
0,MIR6859-1,ENST00000619216.1
1,MIR1302-2HG,ENST00000469289.1
2,FAM138A,ENST00000417324.1
3,FAM138A,ENST00000461467.1
4,AL627309.1,ENST00000453576.2
...,...,...
5401749,ZNF692,ENCODEHT005511812
5401750,ENCODEHG000173225,ENCODEHT005511813
5401751,PGBD2,ENCODEHT005511814
5401752,ENCODEHG000614552,ENCODEHT005511815


In [19]:
# Attach gene names to the TPM table with a left join: the TPM table's index
# (presumably transcript IDs -- confirm against get_tpm_table) is matched
# against the lookup's `annot_transcript_id` column.
# NOTE: re-running this cell on the already-merged frame will not reproduce
# the same result, because `df` is overwritten here.
df = pd.merge(
    df,
    gene_df,
    how='left',
    left_index=True,
    right_on='annot_transcript_id',
)

In [20]:
# Peek at the first five rows of the merged TPM / gene-name table.
df.head(n=5)

Unnamed: 0,brodmann_area_46_1_1,brodmann_area_46_7_1,brodmann_area_46_5_1,brodmann_area_46_6_1,brodmann_area_46_8_1,brodmann_area_46_4_1,brodmann_area_46_2_1,brodmann_area_46_9_1,brodmann_area_46_3_1,annot_gene_name,annot_transcript_id
8,1.778265,2.903556,7.091299,4.497588,7.131835,8.274235,5.213049,4.32838,4.989442,FO538757.1,ENST00000623083.4
17,0.0,0.967852,0.393961,1.037905,0.44574,0.0,1.158455,1.082095,0.997888,MTND1P23,ENST00000416931.1
18,1.778265,0.0,2.363766,0.0,0.891479,0.752203,0.0,0.360698,0.997888,MTND2P28,ENST00000457540.1
19,3.556529,1.451778,2.757727,1.383873,1.337219,1.128305,0.579228,0.721397,0.997888,MTCO1P12,ENST00000414273.1
22,3.556529,0.0,0.393961,0.345968,0.0,0.0,0.579228,1.082095,1.995777,MTATP6P1,ENST00000514057.1


In [21]:
# Show every row of the merged table whose gene name is SCN1A.
df[df['annot_gene_name'] == 'SCN1A']

Unnamed: 0,brodmann_area_46_1_1,brodmann_area_46_7_1,brodmann_area_46_5_1,brodmann_area_46_6_1,brodmann_area_46_8_1,brodmann_area_46_4_1,brodmann_area_46_2_1,brodmann_area_46_9_1,brodmann_area_46_3_1,annot_gene_name,annot_transcript_id
12980,1.778265,2.41963,1.575844,2.07581,5.348876,1.504406,3.475366,3.246285,0.997888,SCN1A,ENST00000507401.2
749578,0.0,0.483926,0.393961,1.037905,1.782959,1.880508,2.31691,1.082095,0.997888,SCN1A,ENCODEHT000859641
749579,1.778265,0.0,0.0,0.345968,0.0,0.376102,0.0,0.0,0.0,SCN1A,ENCODEHT000859642
1582123,1.778265,0.967852,0.787922,1.729842,2.674438,2.25661,1.158455,2.16419,1.995777,SCN1A,ENCODEHT001692186
2145776,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.442793,0.0,SCN1A,ENCODEHT002255839
3061026,5.334794,2.903556,2.757727,1.383873,2.674438,1.504406,4.633821,2.524888,5.987331,SCN1A,ENCODEHT003171089
3061033,0.0,0.0,0.0,0.345968,0.44574,0.0,0.0,1.082095,0.0,SCN1A,ENCODEHT003171096
4066560,1.778265,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SCN1A,ENCODEHT004176623
5349421,0.0,0.483926,0.787922,1.037905,1.337219,0.752203,0.579228,0.721397,0.997888,SCN1A,ENCODEHT005459484
5349442,1.778265,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SCN1A,ENCODEHT005459505


In [22]:
# Load the filtered TALON abundance table for comparison with the unfiltered one.
t_df = pd.read_csv('human_talon_abundance_filtered.tsv', delimiter='\t')

In [23]:
# Show the SCN1A rows that are present in the filtered abundance table.
t_df[t_df['annot_gene_name'] == 'SCN1A']

Unnamed: 0,gene_ID,transcript_ID,annot_gene_id,annot_transcript_id,annot_gene_name,annot_transcript_name,n_exons,length,gene_novelty,transcript_novelty,...,calu3_1_2,brodmann_area_46_1_1,brodmann_area_46_7_1,brodmann_area_46_5_1,brodmann_area_46_6_1,brodmann_area_46_8_1,brodmann_area_46_4_1,brodmann_area_46_2_1,brodmann_area_46_9_1,brodmann_area_46_3_1
12980,8024,27920,ENSG00000144285.19,ENST00000507401.2,SCN1A,SCN1A-207,3,711,Known,Known,...,0,1,5,4,6,12,4,6,9,1
178364,8024,1692186,ENSG00000144285.19,ENCODEHT001692186,SCN1A,ENCODEHT001692186,1,3263,Known,ISM,...,0,1,2,2,5,6,6,2,6,2


In [24]:
# Reload the unfiltered abundance table and keep only the SCN1A rows.
# From here on `df` is this SCN1A subset, not the TPM table built earlier.
df = (
    pd.read_csv('human_talon_abundance.tsv', sep='\t')
    .loc[lambda tbl: tbl['annot_gene_name'] == 'SCN1A']
)

In [25]:
# Peek at the first five SCN1A rows of the unfiltered abundance table.
df.head(n=5)

Unnamed: 0,gene_ID,transcript_ID,annot_gene_id,annot_transcript_id,annot_gene_name,annot_transcript_name,n_exons,length,gene_novelty,transcript_novelty,...,calu3_1_2,brodmann_area_46_1_1,brodmann_area_46_7_1,brodmann_area_46_5_1,brodmann_area_46_6_1,brodmann_area_46_8_1,brodmann_area_46_4_1,brodmann_area_46_2_1,brodmann_area_46_9_1,brodmann_area_46_3_1
12980,8024,27920,ENSG00000144285.19,ENST00000507401.2,SCN1A,SCN1A-207,3,711,Known,Known,...,0,1,5,4,6,12,4,6,9,1
749565,8024,859628,ENSG00000144285.19,ENCODEHT000859628,SCN1A,ENCODEHT000859628,17,3383,Known,NIC,...,0,0,0,0,0,0,0,0,0,0
749567,8024,859630,ENSG00000144285.19,ENCODEHT000859630,SCN1A,ENCODEHT000859630,1,793,Known,Genomic,...,0,0,0,0,0,0,0,0,0,0
749575,8024,859638,ENSG00000144285.19,ENCODEHT000859638,SCN1A,ENCODEHT000859638,1,1175,Known,Genomic,...,0,0,0,0,0,0,0,0,0,0
749578,8024,859641,ENSG00000144285.19,ENCODEHT000859641,SCN1A,ENCODEHT000859641,3,1194,Known,ISM,...,0,0,1,1,3,4,5,4,3,1


In [28]:
# Names of the per-sample columns whose label contains 'brodmann'.
brain_cols = list(filter(lambda col_name: 'brodmann' in col_name, df.columns))

In [31]:
# Per-novelty-category totals over the 'brodmann' sample columns.
# Selecting the sample columns on the groupby keeps the string name column out
# of the sum without slicing `df` and mutating that slice in place
# (the earlier `set_index(..., inplace=True)` served only that purpose).
temp = df.groupby('transcript_novelty')[brain_cols].sum()
# Collapse across samples: one total per novelty category.
temp.sum(axis=1)


transcript_novelty
Genomic     44
ISM        155
Known       48
NIC         24
NNC          5
dtype: int64

In [32]:
# Grand total across all novelty categories and 'brodmann' sample columns,
# computed from the data instead of re-typing the per-category totals by hand.
int(df.groupby('transcript_novelty')[brain_cols].sum().to_numpy().sum())

276

In [33]:
# Fraction of the 'brodmann' sample total that falls in the 'Known' novelty
# category, computed from the data instead of hand-copied numbers.
novelty_counts = df.groupby('transcript_novelty')[brain_cols].sum().sum(axis=1)
float(novelty_counts['Known'] / novelty_counts.sum())

0.17391304347826086

In [34]:
# First five rows of the SCN1A subset (same view as shown earlier).
df.head(n=5)

Unnamed: 0,gene_ID,transcript_ID,annot_gene_id,annot_transcript_id,annot_gene_name,annot_transcript_name,n_exons,length,gene_novelty,transcript_novelty,...,calu3_1_2,brodmann_area_46_1_1,brodmann_area_46_7_1,brodmann_area_46_5_1,brodmann_area_46_6_1,brodmann_area_46_8_1,brodmann_area_46_4_1,brodmann_area_46_2_1,brodmann_area_46_9_1,brodmann_area_46_3_1
12980,8024,27920,ENSG00000144285.19,ENST00000507401.2,SCN1A,SCN1A-207,3,711,Known,Known,...,0,1,5,4,6,12,4,6,9,1
749565,8024,859628,ENSG00000144285.19,ENCODEHT000859628,SCN1A,ENCODEHT000859628,17,3383,Known,NIC,...,0,0,0,0,0,0,0,0,0,0
749567,8024,859630,ENSG00000144285.19,ENCODEHT000859630,SCN1A,ENCODEHT000859630,1,793,Known,Genomic,...,0,0,0,0,0,0,0,0,0,0
749575,8024,859638,ENSG00000144285.19,ENCODEHT000859638,SCN1A,ENCODEHT000859638,1,1175,Known,Genomic,...,0,0,0,0,0,0,0,0,0,0
749578,8024,859641,ENSG00000144285.19,ENCODEHT000859641,SCN1A,ENCODEHT000859641,3,1194,Known,ISM,...,0,0,1,1,3,4,5,4,3,1


In [2]:
# Write the gene_ID / transcript_ID pairs of `df` as a headerless,
# index-free CSV.
# NOTE(review): this cell needs `df` from an earlier cell (presumably the SCN1A
# subset); the recorded NameError came from running it in a fresh kernel.
# NOTE(review): the output path is an absolute, machine-specific location.
df = df.loc[:, ['gene_ID', 'transcript_ID']]
df.to_csv(
    '/Users/fairliereese/mortazavi_lab/data/rnawg/lr_bulk/check_scn1a/scn1a_pass_list.csv',
    index=False,
    header=None,
)

NameError: name 'df' is not defined

In [1]:
# import swan_vis as swan

In [2]:
# annot = '../../refs/gencode_v29_sirv4_ercc.gtf'
# gtf = 'scn1a_talon.gtf'

# # sg = swan.SwanGraph()
# # sg.add_annotation(annot)
# # sg.save_graph('swan')
# sg = swan.read('swan.p')
# sg.add_transcriptome(gtf)

In [3]:
# tids = sg.t_df.loc[sg.t_df.gname == 'SCN1A', 'tid'].tolist()
# print(len(tids))
# # sg.plot_each_transcript(tids, prefix='figures/scn1a', indicate_novel=True)