In [2]:
import pandas as pd
import pyranges as pr
import upsetplot
import seaborn as sns
import matplotlib.pyplot as plt
import os
import sys

# Make the `scripts` package importable: add the directory two levels above
# the current working directory to the module search path.
# NOTE(review): presumably that directory is the repository root — confirm.
p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

# NOTE(review): star imports hide where names come from; `cerberus` and
# `get_gtf_info`, used further down without an explicit import, presumably
# arrive through one of these — confirm and import them by name.
from scripts.utils import *
from scripts.plotting import *

In [47]:
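# # download data
# # (each `!` line runs in its own subshell, so a shell variable set on one
# # line is not visible to the next; pass the URLs to wget directly)
# !wget https://storage.googleapis.com/isoform.io/iso_all_v1.2.csv
# !wget https://storage.googleapis.com/isoform.io/chess_structure_v1.2.1_sorted.gtf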
# !wget https://github.com/chess-genome/chess/releases/download/v.3.0/chess3.0.gtf.gz
# !gunzip chess3.0.gtf.gz

## 1/3/2023 -- chess 3

In [55]:
# Load the CHESS 3.0 GTF and keep only the transcript-level records.
gtf = 'chess3.0.gtf'
df = pr.read_gtf(gtf, duplicate_attr=True).as_df()
# .copy() makes `df` an independent frame, so the column assignments made in
# later cells modify it directly instead of raising SettingWithCopyWarning on
# a filtered slice of the full GTF table.
df = df.loc[df.Feature == 'transcript'].copy()

In [57]:
# preview the first transcript rows and the attribute columns parsed from the GTF
df.head()

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,transcript_id,gene_id,gene_type,gene_name,db_xref,num_samples,max_tpm,assembly_id,tag,comment
0,chr1,BestRefSeq,transcript,11873,14409,.,+,.,CHS.1.1,CHS.1,transcribed_pseudogene,DDX11L1,"RefSeq:NR_046018.2,GENCODE:ENST00000456328.2",,,,,
4,chr1,HAVANA,transcript,29553,31097,.,+,.,CHS.3.1,CHS.3,lncRNA,,GENCODE:ENST00000473358.1,903.0,9.496638,ALL_00000008,,
8,chr1,BestRefSeq,transcript,30365,30503,.,+,.,CHS.166735.1,CHS.166735,miRNA,MIR1302-2,RefSeq:NR_036051.1,,,,,
10,chr1,BestRefSeq,transcript,30437,30458,.,+,.,CHS.166735.2,CHS.166735,miRNA,MIR1302-2,,,,,,
12,chr1,Curated Genomic,transcript,52452,53396,.,+,.,CHS.144069.1,CHS.144069,pseudogene,OR4G4P,"RefSeq:OR4G4P,GENCODE:ENST00000606857.1",,,,,


In [63]:
# does each chess gene id have a gencode transcript id associated with it
# Pull the ID that follows 'GENCODE:' in db_xref (everything up to the next
# comma); rows without a GENCODE cross-reference get NaN.
# str.extract is used instead of split(..., expand=True)[1] because the split
# form raises KeyError when no row contains the tag (column 1 never exists),
# and assign() returns a new frame rather than writing into a filtered slice.
df = df.assign(
    gencode_tid=df.db_xref.str.extract(r'GENCODE:([^,]*)', expand=False)
)
# flag transcripts that carry a GENCODE transcript id
df['has_gencode_tid'] = df.gencode_tid.notnull()


In [64]:
# check the new gencode_tid column: filled where db_xref lists a GENCODE entry, empty otherwise
df.head()

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,transcript_id,gene_id,gene_type,gene_name,db_xref,num_samples,max_tpm,assembly_id,tag,comment,gencode_tid
0,chr1,BestRefSeq,transcript,11873,14409,.,+,.,CHS.1.1,CHS.1,transcribed_pseudogene,DDX11L1,"RefSeq:NR_046018.2,GENCODE:ENST00000456328.2",,,,,,ENST00000456328.2
4,chr1,HAVANA,transcript,29553,31097,.,+,.,CHS.3.1,CHS.3,lncRNA,,GENCODE:ENST00000473358.1,903.0,9.496638,ALL_00000008,,,ENST00000473358.1
8,chr1,BestRefSeq,transcript,30365,30503,.,+,.,CHS.166735.1,CHS.166735,miRNA,MIR1302-2,RefSeq:NR_036051.1,,,,,,
10,chr1,BestRefSeq,transcript,30437,30458,.,+,.,CHS.166735.2,CHS.166735,miRNA,MIR1302-2,,,,,,,
12,chr1,Curated Genomic,transcript,52452,53396,.,+,.,CHS.144069.1,CHS.144069,pseudogene,OR4G4P,"RefSeq:OR4G4P,GENCODE:ENST00000606857.1",,,,,,ENST00000606857.1


In [74]:
# Count transcripts per (gene, has-GENCODE-tid) pair ...
counts_by_status = (
    df[['gene_id', 'has_gencode_tid', 'transcript_id']]
    .groupby(['gene_id', 'has_gencode_tid'])
    .count()
    .reset_index()
)
# ... then spread the two statuses into columns, one row per gene.
# A gene with no transcripts in a given status gets NaN in that column.
temp = (
    counts_by_status
    .pivot(index='gene_id', columns='has_gencode_tid', values='transcript_id')
    .reset_index()
    .rename(columns={False: 'n_no_gencode', True: 'n_gencode'})
)
temp.head()

has_gencode_tid,gene_id,n_no_gencode,n_gencode
0,CHS.1,,1.0
1,CHS.100,1.0,3.0
2,CHS.10000,31.0,7.0
3,CHS.10001,,1.0
4,CHS.10002,,2.0


In [99]:
# total number of genes in the per-gene table
print(temp.shape[0])
# number of genes with no GENCODE-linked transcript at all
print(int(temp.n_gencode.isnull().sum()))
# temp.loc[temp.gene_id=='CHS.166735']
# conclusion: NOT every gene has a transcript with a matching GENCODE tid

63774
23648


## 1/5/2023 - chess3

In [91]:
# gtf = 'chess3.0.gtf'
# df = pr.read_gtf(gtf, duplicate_attr=True).as_df()
# df = df.loc[df.Feature == 'transcript']
# Pull the ID that follows 'GENCODE:' in db_xref (up to the next comma); rows
# without a GENCODE cross-reference get NaN. str.extract avoids the KeyError
# that split(..., expand=True)[1] raises when no row contains the tag, and
# assign() returns a new frame instead of writing into a filtered slice.
df = df.assign(
    gencode_tid=df.db_xref.str.extract(r'GENCODE:([^,]*)', expand=False)
)
# NOTE(review): get_stable_gid presumably strips the version suffix from the
# ID — confirm against the cerberus helper.
df['gencode_stable_tid'] = cerberus.get_stable_gid(df, col='gencode_tid')

In [86]:
# merge with gencode
# NOTE(review): no merge happens in this cell; it only builds the GENCODE-side
# table. `get_gtf_info` is not imported by name — presumably it comes from the
# star imports and returns the v29 transcript table first; confirm.
gtf_df, _, _ = get_gtf_info(how='iso', ver='v29')
# add gene / transcript id columns computed by cerberus.get_stable_gid
# (in the displayed output these are the ids without the '.N' version suffix)
gtf_df['gid_stable'] = cerberus.get_stable_gid(gtf_df, col='gid')
gtf_df['tid_stable'] = cerberus.get_stable_gid(gtf_df, col='tid')

In [94]:
# limit to tid / gid pairings
# (this rebinds `df` without db_xref, so the cell that extracts gencode_tid
# from df.db_xref cannot be re-run after this one)
id_pair_columns = ['gene_id', 'gencode_stable_tid']
df = df.loc[:, id_pair_columns]

In [93]:
# preview the GENCODE table, including the gid_stable / tid_stable columns
gtf_df.head()

Unnamed: 0,tid,t_len,gid,gname,biotype,biotype_category,tf,gid_stable,tid_stable
0,ENST00000000233.9,1103,ENSG00000004059.10,ARF5,protein_coding,protein_coding,False,ENSG00000004059,ENST00000000233
1,ENST00000000412.7,2756,ENSG00000003056.7,M6PR,protein_coding,protein_coding,False,ENSG00000003056,ENST00000000412
2,ENST00000000442.10,2215,ENSG00000173153.13,ESRRA,protein_coding,protein_coding,True,ENSG00000173153,ENST00000000442
3,ENST00000001008.5,3732,ENSG00000004478.7,FKBP4,protein_coding,protein_coding,False,ENSG00000004478,ENST00000001008
4,ENST00000001146.6,4732,ENSG00000003137.8,CYP26B1,protein_coding,protein_coding,False,ENSG00000003137,ENST00000001146


In [96]:
# spot-check one gene: show the rows (and their stable GENCODE tids) for CHS.166735
is_target_gene = df['gene_id'].eq('CHS.166735')
df[is_target_gene]

Unnamed: 0,gene_id,gencode_stable_tid
8,CHS.166735,
10,CHS.166735,


## earlier

In [48]:
# local file names for the older CHESS structure GTF and the isoform metadata CSV
# NOTE: `gtf` is also used for the chess3 file in the section above, so running
# the notebook top to bottom rebinds it here.
gtf = 'chess_structure_v1.2.1_sorted.gtf'
meta = 'iso_all_v1.2.csv'

In [7]:
# read the GTF (still a PyRanges object at this point) and the metadata table
# NOTE: `gtf_df` is also the name given to the GENCODE table in the chess3
# section above; running the notebook top to bottom rebinds it here.
gtf_df = pr.read_gtf(gtf, duplicate_attr=True)
meta_df = pd.read_csv(meta)

In [9]:
# Convert the PyRanges object returned by pr.read_gtf into a pandas DataFrame.
# The isinstance guard keeps the cell re-runnable: once `gtf_df` is already a
# DataFrame it has no .as_df() method and a second run would raise
# AttributeError.
if not isinstance(gtf_df, pd.DataFrame):
    gtf_df = gtf_df.as_df()

In [11]:
# Keep only transcript-level records. .copy() gives an independent frame so
# the later column assignments on `t_df` do not raise SettingWithCopyWarning
# on a filtered slice of `gtf_df`.
t_df = gtf_df.loc[gtf_df.Feature == 'transcript'].copy()

In [23]:
# look at the first few old_gene_id values (only the last expression of a cell is displayed)
t_df['old_gene_id'].head()

0      ENSG00000186092.7
3                    NaN
12                   NaN
13    ENSG00000187634.13
16                   NaN
Name: old_gene_id, dtype: object

In [29]:
# transcripts without / with an old_gene_id
old_gid_missing = t_df.old_gene_id.isnull()
print(int(old_gid_missing.sum()))
print(int((~old_gid_missing).sum()))

219915
17360


In [24]:
# fix the transcript id section and make the gene id correct as well
# - gene_id is replaced by old_gene_id
# - transcript_id keeps only the text before the first ';' (some entries have
#   ';gene_id ...' concatenated onto the transcript id)
# assign() returns a new frame, so nothing is written into a filtered slice of
# gtf_df (no SettingWithCopyWarning), and n=1 splits once instead of
# expanding every ';'-separated piece into its own column.
# NOTE(review): old_gene_id is null for most transcripts in the counts shown
# in this notebook, so this blanks gene_id for those rows — confirm intended.
t_df = t_df.assign(
    gene_id=t_df['old_gene_id'],
    transcript_id=t_df.transcript_id.str.split(';', n=1).str[0],
)

In [27]:
# transcripts without / with a gene_id
gene_id_missing = t_df.gene_id.isnull()
print(int(gene_id_missing.sum()))
print(int((~gene_id_missing).sum()))

219915
17360


In [33]:
# transcripts without / with a gene_name
gene_name_missing = t_df.gene_name.isnull()
print(int(gene_name_missing.sum()))
print(int((~gene_name_missing).sum()))

128323
108952


In [None]:
# Among transcripts that have a db_xref entry, count those that mention
# GENCODE and those that do not.
has_xref = t_df.db_xref.notnull()
# na=False makes missing db_xref values count as "no match"; without it
# str.contains returns NaN for them and applying `~` to that object-dtype
# result raises TypeError.
mentions_gencode = t_df.db_xref.str.contains('GENCODE', na=False)
print(int((has_xref & mentions_gencode).sum()))
print(int((has_xref & ~mentions_gencode).sum()))


In [43]:
# this column does not contain gencode gene IDs
# t_df.loc[~t_df.Dbxref.isnull(), 'Dbxref'].head().values
# t_df.loc[~(t_df.Dbxref.isnull())&(t_df.Dbxref.str.contains('ENSG')), 'Dbxref'].head().values

# print(len(t_df.loc[~(t_df.Dbxref.isnull())&(t_df.Dbxref.str.contains('GENCODE'))]))
# print(len(t_df.loc[~(t_df.Dbxref.isnull())&~(t_df.Dbxref.str.contains('GENCODE'))]))


array([], dtype=object)

In [32]:
# transcripts without / with a GENCODE_ID, then a sample of those that have one
gencode_id_missing = t_df.GENCODE_ID.isnull()
print(int(gencode_id_missing.sum()))
print(int((~gencode_id_missing).sum()))
print(t_df.loc[~gencode_id_missing].head())

194663
42612
    Chromosome     Source     Feature   Start     End Score Strand Frame  \
48        chr1  StringTie  transcript  925941  944153     .      +     .   
70        chr1  StringTie  transcript  930311  944575     .      +     .   
167       chr1  StringTie  transcript  941075  942994     .      +     .   
214       chr1  StringTie  transcript  942165  942892     .      +     .   
394       chr1  StringTie  transcript  961448  962478     .      +     .   

     transcript_id old_transcript_id  ... CDS_inference end_range start_range  \
48   CHS.39.alt104               NaN  ...           NaN       NaN         NaN   
70   CHS.39.alt105               NaN  ...           NaN       NaN         NaN   
167  CHS.39.alt109               NaN  ...           NaN       NaN         NaN   
214  CHS.39.alt110               NaN  ...           NaN       NaN         NaN   
394  CHS.42.alt110               NaN  ...           NaN       NaN         NaN   

    MANE_gid MANE_tid model_evidence standa

In [18]:
# look up a single transcript in the metadata table by its CHESS ID
# meta_df.loc[meta_df['CHESS ID'].str.contains('gene_id')]
is_target_transcript = meta_df['CHESS ID'].eq('CHS.39.alt101')
meta_df[is_target_transcript]

Unnamed: 0,CHESS ID,GENE,RefSeq ID,GENCODE ID,pLDDT,introns in mouse
137610,CHS.39.alt101,SAMD11,NM_001385641.1,ENST00000616016.5,46.4,False


In [21]:
# rows whose transcript_id still has ';...' text concatenated onto it
# (evaluated but not shown: only the last expression of a cell is displayed)
t_df.loc[t_df.transcript_id.str.contains(';'), ['transcript_id', 'gene_id']]
# first few transcript_id / gene_id pairs
t_df[['transcript_id', 'gene_id']].head()


Unnamed: 0,transcript_id,gene_id
0,CHS.131169.alt100;gene_id CHS.131169,
3,CHS.131169.0,CHS.131169
12,CHS.39.7,CHS.39
13,CHS.39.alt101;gene_id CHS.39,
16,CHS.39.5,CHS.39


In [19]:

# fix entries with the gene id concatenated


Unnamed: 0,transcript_id,gene_id
0,CHS.131169.alt100;gene_id CHS.131169,
13,CHS.39.alt101;gene_id CHS.39,
21,CHS.39.alt100;gene_id CHS.39,
23,CHS.39.alt102;gene_id CHS.39,
44,CHS.39.alt103;gene_id CHS.39,
...,...,...
4357738,CHS.59348.alt101;gene_id CHS.59348,
4357747,CHS.59356.alt100;gene_id CHS.59356,
4357749,CHS.59356.alt101;gene_id CHS.59356,
4357753,CHS.59356.alt102;gene_id CHS.59356,
