In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import gseapy as gp
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand

# Append the directory two levels above the current working directory to the
# module search path; presumably this is what lets the `scripts` package
# imported below resolve.
p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

# NOTE(review): a later cell calls `pr.read_gtf`, yet `pr` is never imported
# explicitly in this cell; presumably it arrives through one of these star
# imports. Confirm, and prefer an explicit import.
from scripts.utils import *
from scripts.plotting import *

In [14]:
# Parse the project configuration once; later cells look up file-path
# entries in the resulting `config` mapping.
config_file = '../config.yml'
with open(config_file, mode='r') as f:
    config = yaml.safe_load(f)

In [15]:
# Resolve the mouse ("m_") inputs used by the rest of the notebook.
# `expand` substitutes `species` into each config entry and returns a list,
# of which the first element is taken; '../' is prepended to each path.
#
# The GTF has to be requested for the same species as the abundance files.
# It previously asked for species='human', which contradicts the `m_` prefix,
# the two sibling paths, and the mouse (ENSMUSG...) annotation that the
# downstream cells display.
m_gtf = '../'+expand(config['ref']['talon']['gtf'], species='mouse')[0]
m_ab_file = '../'+expand(config['lr']['talon']['ab'], species='mouse')[0]
m_filt_ab_file = '../'+expand(config['lr']['talon']['filt_ab'], species='mouse')[0]

In [12]:
# Filtered abundance table: number of transcript ids per gene novelty class
df2 = pd.read_csv(
    m_filt_ab_file,
    sep='\t',
)
df2.groupby('gene_novelty')[['annot_transcript_id']].count()

Unnamed: 0_level_0,annot_transcript_id
gene_novelty,Unnamed: 1_level_1
Antisense,1886
Fusion,342
Intergenic,885
Known,118994


In [16]:
# Read the reference GTF and keep it as a plain DataFrame (`.df`).
# duplicate_attr=True is passed so repeated attributes (e.g. several `tag`
# entries on one record) are handled rather than collapsed; see pyranges docs.
gtf_df = pr.read_gtf(
    m_gtf,
    duplicate_attr=True,
).df

In [22]:
# Quick look at the annotation: every record located on chr1
gtf_df[gtf_df['Chromosome'] == 'chr1']

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,transcript_type,transcript_name,transcript_support_level,tag,havana_transcript,exon_number,exon_id,protein_id,ccdsid,ont
772,chr1,HAVANA,gene,3073252,3074322,.,+,.,ENSMUSG00000102693.1,TEC,...,,,,,,,,,,
773,chr1,HAVANA,transcript,3073252,3074322,.,+,.,ENSMUSG00000102693.1,TEC,...,TEC,4933401J01Rik-201,,basic,OTTMUST00000127109.1,,,,,
774,chr1,HAVANA,exon,3073252,3074322,.,+,.,ENSMUSG00000102693.1,TEC,...,TEC,4933401J01Rik-201,,basic,OTTMUST00000127109.1,1,ENSMUSE00001343744.1,,,
775,chr1,ENSEMBL,gene,3102015,3102125,.,+,.,ENSMUSG00000064842.1,snRNA,...,,,,,,,,,,
776,chr1,ENSEMBL,transcript,3102015,3102125,.,+,.,ENSMUSG00000064842.1,snRNA,...,snRNA,Gm26206-201,,basic,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116270,chr1,ENSEMBL,transcript,195240909,195241007,.,-,.,ENSMUSG00000099208.1,misc_RNA,...,misc_RNA,Gm27940-201,,basic,,,,,,
116271,chr1,ENSEMBL,exon,195240909,195241007,.,-,.,ENSMUSG00000099208.1,misc_RNA,...,misc_RNA,Gm27940-201,,basic,,1,ENSMUSE00001316215.1,,,
116272,chr1,HAVANA,gene,195259298,195259848,.,-,.,ENSMUSG00000104297.1,TEC,...,,,,,,,,,,
116273,chr1,HAVANA,transcript,195259298,195259848,.,-,.,ENSMUSG00000104297.1,TEC,...,TEC,Gm38046-201,,basic,OTTMUST00000128998.1,,,,,


In [24]:
# are all "overlapping_locus" tags just genes?
# na=False makes records without any tag count as non-matching explicitly,
# instead of relying on how `&` coerces missing values inside the mask.
gtf_df.loc[
    gtf_df.tag.str.contains('overlapping_locus', na=False)
    & (gtf_df.Feature != 'gene')
]
# yes

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,transcript_type,transcript_name,transcript_support_level,tag,havana_transcript,exon_number,exon_id,protein_id,ccdsid,ont


In [30]:
# what entries have the "overlapping_locus" tag?
# na=False: in top-to-bottom order this cell runs before `tag` is cast to
# str, so the column still holds missing values; without it the mask contains
# NaN and boolean indexing with `.loc` raises. Records without a tag are
# simply treated as not carrying the tag.
temp = gtf_df.loc[
    gtf_df.tag.str.contains('overlapping_locus', na=False)
].copy(deep=True)
temp[['tag', 'Feature']].groupby('Feature').count()

Unnamed: 0_level_0,tag
Feature,Unnamed: 1_level_1
gene,4859


In [28]:
# Cast `tag` to str so missing tags become ordinary strings and the
# substring searches below yield clean boolean masks.
gtf_df['tag'] = gtf_df['tag'].astype(str)

# there are no entries with the "readthrough_gene" tag
gtf_df[gtf_df['tag'].str.contains('readthrough_gene')]

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_type,...,transcript_type,transcript_name,transcript_support_level,tag,havana_transcript,exon_number,exon_id,protein_id,ccdsid,ont


In [29]:
# Which feature types carry the "readthrough_transcript" tag, and how many
# records of each are there?
temp = gtf_df[gtf_df['tag'].str.contains('readthrough_transcript')].copy(deep=True)
temp.groupby('Feature')[['tag']].count()

Unnamed: 0_level_0,tag
Feature,Unnamed: 1_level_1
CDS,2265
UTR,1425
exon,3948
start_codon,285
stop_codon,299
transcript,508


In [31]:
# Gene ids of every record carrying the overlapping_locus tag
o_genes = (
    gtf_df.loc[gtf_df['tag'].str.contains('overlapping_locus'), 'gene_id']
    .unique()
    .tolist()
)

# How many records belonging to those genes also carry the
# 'readthrough_transcript' tag? Restrict to the genes first, then to the tag.
temp = gtf_df[gtf_df['gene_id'].isin(o_genes)].copy(deep=True)
temp = temp[temp['tag'].str.contains('readthrough_transcript')].copy(deep=True)
temp.groupby('Feature')[['tag']].count()

# So there are a few transcripts with the readthrough_transcript tag that
# don't belong to the overlapping_locus genes.

Unnamed: 0_level_0,tag
Feature,Unnamed: 1_level_1
CDS,2110
UTR,1368
exon,3731
start_codon,269
stop_codon,283
transcript,477


In [38]:
# More thorough check of the concordance between the overlapping_locus tag
# (set on gene records) and the readthrough_transcript tag (set on
# transcript records).

# One row per transcript, flagged by whether it carries readthrough_transcript
temp = (
    gtf_df.loc[
        gtf_df.Feature == 'transcript',
        ['gene_id', 'gene_name', 'tag', 'transcript_id'],
    ]
    .assign(readthrough_transcript=lambda d: d['tag'].str.contains('readthrough_transcript'))
    .drop(columns='tag')
    .drop_duplicates()
)

# One row per gene, flagged by whether it carries overlapping_locus
temp2 = (
    gtf_df.loc[gtf_df.Feature == 'gene', ['gene_id', 'tag']]
    .assign(overlapping_locus=lambda d: d['tag'].str.contains('overlapping_locus'))
    .drop(columns='tag')
    .drop_duplicates()
)

# Attach the gene-level flag to every transcript of that gene
temp = temp.merge(temp2, how='outer', on='gene_id')

In [39]:
# Non-null entries per column for each combination of the two flags
(
    temp
    .groupby(['readthrough_transcript', 'overlapping_locus'])
    .count()
    .reset_index()
)

Unnamed: 0,readthrough_transcript,overlapping_locus,gene_id,gene_name,transcript_id
0,False,False,125121,124941,125121
1,False,True,16413,16413,16413
2,True,False,31,31,31
3,True,True,477,477,477


In [40]:
# Readthrough transcripts whose gene does NOT carry the overlapping_locus tag
temp[
    (temp['readthrough_transcript'] == True)
    & (temp['overlapping_locus'] == False)
].head()

Unnamed: 0,gene_id,gene_name,transcript_id,readthrough_transcript,overlapping_locus
20181,ENSMUSG00000106407.1,Gm43439,ENSMUST00000138710.2,True,False
20702,ENSMUSG00000051777.6,Iqcj,ENSMUST00000063263.4,True,False
20703,ENSMUSG00000102422.1,Iqschfp,ENSMUST00000182006.3,True,False
25730,ENSMUSG00000105103.1,Gm43191,ENSMUST00000140672.2,True,False
35006,ENSMUSG00000105617.4,Gm43809,ENSMUST00000200694.3,True,False


In [41]:
# Transcripts of overlapping_locus genes that are NOT tagged as readthrough
temp[
    (temp['readthrough_transcript'] == False)
    & (temp['overlapping_locus'] == True)
].head()

Unnamed: 0,gene_id,gene_name,transcript_id,readthrough_transcript,overlapping_locus
193,ENSMUSG00000025903.14,Lypla1,ENSMUST00000134384.7,False,True
194,ENSMUSG00000025903.14,Lypla1,ENSMUST00000027036.10,False,True
195,ENSMUSG00000025903.14,Lypla1,ENSMUST00000150971.7,False,True
196,ENSMUSG00000025903.14,Lypla1,ENSMUST00000119612.8,False,True
197,ENSMUSG00000025903.14,Lypla1,ENSMUST00000137887.7,False,True


In [4]:
# Unfiltered abundance table (tab-separated)
df = pd.read_csv(
    m_ab_file,
    sep='\t',
)

In [6]:
# Distinct gene novelty categories present in the unfiltered table
df['gene_novelty'].unique()

array(['Known', 'Antisense', 'Intergenic', 'Fusion'], dtype=object)

In [7]:
# Number of rows (one per transcript entry) in each gene novelty category
(
    df
    .groupby('gene_novelty')[['annot_gene_id']]
    .count()
    .reset_index()
)

Unnamed: 0,gene_novelty,annot_gene_id
0,Antisense,222873
1,Fusion,43465
2,Intergenic,64034
3,Known,2552024


In [8]:
# Inspect the entries whose gene is classified as a Fusion
df[df['gene_novelty'] == 'Fusion']

Unnamed: 0,gene_ID,transcript_ID,annot_gene_id,annot_transcript_id,annot_gene_name,annot_transcript_name,n_exons,length,gene_novelty,transcript_novelty,...,adrenal_gland_1_2,cortex_5x_f_1_2,cortex_14d_f_2,cortex_2mo_m_2,gastroc_14d_f_2,gastroc_25d_m_2,gastroc_4d_f_2,heart_18-20mo_m_1,hippocampus_wt_m_2_1,hippocampus_18-20mo_f_2
64257,55774,142467,TALONG000055774,TALONT000142467,TALONG000055774,TALONT000142467,13,1934,Fusion,Fusion,...,0,0,0,0,0,0,0,0,0,0
64383,55808,142593,TALONG000055808,TALONT000142593,TALONG000055808,TALONT000142593,9,3331,Fusion,Fusion,...,0,1,1,0,0,0,0,0,2,1
64754,55869,142964,TALONG000055869,TALONT000142964,TALONG000055869,TALONT000142964,8,2352,Fusion,Fusion,...,0,2,0,0,0,0,0,0,1,0
64757,55869,142967,TALONG000055869,TALONT000142967,TALONG000055869,TALONT000142967,9,2518,Fusion,NIC,...,0,3,0,0,0,0,0,0,0,0
64759,55869,142969,TALONG000055869,TALONT000142969,TALONG000055869,TALONT000142969,4,1035,Fusion,NIC,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2882153,127991,2960363,TALONG000127991,TALONT002960363,TALONG000127991,TALONT002960363,1,1207,Fusion,Genomic,...,1,0,0,0,1,0,0,0,0,0
2882155,127991,2960365,TALONG000127991,TALONT002960365,TALONG000127991,TALONT002960365,6,1507,Fusion,NNC,...,0,0,0,0,0,0,0,0,1,0
2882291,180891,2960501,TALONG000180891,TALONT002960501,TALONG000180891,TALONT002960501,18,3855,Fusion,NIC,...,0,0,0,1,0,0,0,0,0,0
2882323,180891,2960533,TALONG000180891,TALONT002960533,TALONG000180891,TALONT002960533,1,1894,Fusion,Genomic,...,1,0,0,0,0,0,0,0,0,0
