In [93]:
import pandas as pd
import pyranges
import os
import sys
import seaborn as sns

# Put the directory two levels above the current working directory on the
# import path so the `scripts.utils` import below can resolve
# (assumes the `scripts` package lives there — TODO confirm).
p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

# NOTE(review): `pr` and `get_gtf_info`, used in later cells, are never bound
# explicitly in this notebook (only `import pyranges` appears above), so they
# are presumably supplied by this star import — confirm, or import by name.
from scripts.utils import *

In [94]:
## gencode annotated transcripts
# Parse the reference GTF with pyranges and keep only its DataFrame view,
# which is what the filtering cells below operate on.
gtf_path = '../../refs/gencode_v29_sirv4_ercc.gtf'
df = pr.read_gtf(gtf_path).df

In [95]:
# get rid of sirvs and erccs
# (any row whose chromosome name contains 'ERCC' or 'SIRV' is dropped)
is_spike_in = (df['Chromosome'].str.contains('ERCC')
               | df['Chromosome'].str.contains('SIRV'))
df = df.loc[~is_spike_in]

# limit protein coding genes
# get_gtf_info comes from scripts.utils; only the first element of its
# return value is used here.
gene_df = get_gtf_info(how='gene')[0]
gene_df = gene_df.loc[gene_df['biotype_category'] == 'protein_coding']
pc_genes = gene_df['gid'].tolist()
print(len(pc_genes))

# keep only annotation rows belonging to those genes, then count the
# surviving gene records as a sanity check against the number printed above
df = df.loc[df['gene_id'].isin(pc_genes)]
len(df.loc[df['Feature'] == 'gene'])

19969


19969

In [96]:
# df.loc[df.transcript_id == 'ENST00000641515.2']

In [97]:
# temp = df.loc[df.Feature == 'UTR']
# temp = temp[['Feature', 'transcript_id', 'Source', 'Start']]
# temp = temp.groupby(['Feature', 'Source', 'transcript_id']).count().reset_index()
# sns.displot(data=temp, kind='kde', x='Start', hue='Source')

In [98]:
# tids = temp.loc[temp.Start > 1, 'transcript_id'].tolist()
# temp.head()

In [99]:
# nov_df = pr.read_gtf('../talon/human_known_nic_nnc_talon.gtf').df
# nov_df = nov_df.loc[~nov_df.Chromosome.str.contains('SIRV')]
# nov_df = nov_df.loc[~nov_df.Chromosome.str.contains('ERCC')]
# nov_df = nov_df.loc[~nov_df.Feature.str.contains('gene')]

In [100]:
# nov_df.loc[~nov_df.transcript_id.str.contains('ENCODE')]

In [101]:
# nov_df.loc[nov_df.transcript_id.isin(tids), 'transcript_id'].head()

In [102]:
# # a transcript that is expressed in the long read data
# temp.loc[temp.transcript_id == 'ENST00000342066.7']

In [103]:
# nov_df.loc[nov_df.transcript_id == 'ENST00000342066.7'].tail()

In [104]:
# df.loc[df.transcript_id == 'ENST00000342066.7']

In [105]:
# df.loc[df.Feature == 'stop_codon']
# df.loc[(df.Feature == 'stop_codon')&(df.Strand == '-')]

In [106]:
# utr_df = utr_df.df
# utr_df.loc[utr_df.Name == 'ENST00000342066.7']

In [107]:
# Build one 3' UTR interval per annotated transcript: the stretch between the
# stop codon and the transcript's 3' end, using the coordinates of the
# filtered annotation table `df`.
t_df = df.loc[df.Feature == 'transcript'].copy(deep=True)
stop_df = df.loc[df.Feature == 'stop_codon', ['transcript_id', 'Start', 'End']]

# A transcript can carry more than one stop_codon row (GTF allows a stop
# codon to be split across exons). Merging those rows as-is would duplicate
# the transcript and emit an extra, wrong UTR for it, so collapse them to the
# outermost coordinates first: exactly one stop-codon row per transcript.
stop_df = (stop_df.groupby('transcript_id', as_index=False, observed=True)
           .agg({'Start': 'min', 'End': 'max'}))

# add stop codon coords to transcript df
# (inner merge: transcripts without any stop_codon row are dropped)
t_df = t_df.merge(stop_df, on='transcript_id',
                  suffixes=('', '_stop_codon'))

# split into fwd and rev
fwd = t_df.loc[t_df.Strand == '+'].copy(deep=True)
rev = t_df.loc[t_df.Strand == '-'].copy(deep=True)

# + strand: UTR runs from the end of the stop codon to the transcript end
fwd['3_utr_start'] = fwd.End_stop_codon
fwd['3_utr_end'] = fwd.End

# - strand: UTR runs from the transcript start up to the stop codon start
rev['3_utr_end'] = rev.Start_stop_codon
# NOTE(review): this +1 has no counterpart on the + strand; if the table uses
# 0-based half-open coordinates it trims one base off the UTR — confirm the
# coordinate convention before relying on minus-strand starts.
rev['3_utr_start'] = rev.Start+1

# Reduce to a BED-like table with the column names PyRanges expects
# (Chromosome / Start / End / Strand) plus the transcript ID as Name.
df = pd.concat([fwd, rev])
df = df[['Chromosome', '3_utr_start', '3_utr_end',
         'Strand', 'transcript_id']].rename(columns={'3_utr_start': 'Start',
                                                     '3_utr_end': 'End',
                                                     'transcript_id': 'Name'})

In [108]:
# Wrap the 3' UTR table (Chromosome / Start / End / Strand / Name) in a PyRanges object
utr_df = pr.PyRanges(df)

In [109]:
# Preview the first rows of the annotated-transcript 3' UTR intervals
utr_df.head()

Unnamed: 0,Chromosome,Start,End,Strand,Name
0,chr1,70009,71585,+,ENST00000641515.2
1,chr1,70009,70108,+,ENST00000335137.4
2,chr1,944154,944575,+,ENST00000342066.7
3,chr1,944154,944581,+,ENST00000618181.4
4,chr1,944154,944581,+,ENST00000622503.4
5,chr1,942856,944581,+,ENST00000618323.4
6,chr1,942696,944581,+,ENST00000616016.4
7,chr1,944154,944581,+,ENST00000618779.4


In [28]:
## novel transcripts

In [146]:
# Load the per-transcript CDS table from a headerless, tab-separated file,
# keeping only the column positions named below.
# NOTE(review): column 0 (presumably the chromosome in a BED file) is not
# loaded, so the resulting table has no chromosome column — confirm intended.
bed_columns = {1: 'Start', 2: 'Stop', 3: 'fields',
               5: 'Strand', 6: 'CDS_Start', 7: 'CDS_Stop'}
df = pd.read_csv('human_cds.bed', sep='\t',
                 header=None, usecols=list(bed_columns))
df.columns = list(bed_columns.values())

# the transcript ID is the second ';'-separated entry of the name field
df['tid'] = df['fields'].str.split(';', expand=True)[1]

In [147]:
# Preview the parsed CDS table, including the extracted transcript ID column
df.head()

Unnamed: 0,Start,Stop,fields,Strand,CDS_Start,CDS_Stop,tid
0,14403,29570,ENSG00000227232.5;ENCODEHT000207057;NA;NA;none...,-,17327,17687,ENCODEHT000207057
1,14403,29570,ENSG00000227232.5;ENCODEHT000207076;NA;NA;ENSP...,-,17735,24886,ENCODEHT000207076
2,14403,29570,ENSG00000227232.5;ENCODEHT001150534;NA;NA;ENSP...,-,17735,24886,ENCODEHT001150534
3,14408,29570,ENSG00000227232.5;ENCODEHT000207067;NA;NA;none...,-,16747,17310,ENCODEHT000207067
4,17368,17436,ENSG00000278267.1;ENST00000619216.1;NA;NA;none...,-,17371,17434,ENST00000619216.1


In [148]:
# check if transcript is novel
# (flagged True when the transcript ID contains 'ENCODE')
df['novel_transcript'] = df['tid'].str.contains('ENCODE')

In [149]:
# check if transcript is nmd
# (flagged True for every row whose name field does not contain 'prot_ok')
has_prot_ok = df['fields'].str.contains('prot_ok')
df['nmd'] = ~has_prot_ok

In [150]:
# limit to just novel transcripts that do not have NMD
is_novel = df['novel_transcript'] == True
not_nmd = df['nmd'] == False
df = df.loc[is_novel & not_nmd]

In [151]:
# number of rows (transcripts) currently in df
len(df.index)

24867

In [152]:
# for transcripts without NMD, get the 3' UTR
on_plus = df['Strand'] == '+'
on_minus = df['Strand'] == '-'

# fwd strand: from 3 nt past the CDS end up to the transcript end
# (the 3 nt offset presumably steps over the stop codon — confirm whether the
# CDS end in this file already includes it)
fwd = df.loc[on_plus].assign(**{
    '3_utr_start': lambda d: d['CDS_Stop'] + 3,
    '3_utr_end': lambda d: d['Stop'],
})

# rev strand: from the transcript start up to 3 nt before the CDS start
rev = df.loc[on_minus].assign(**{
    '3_utr_start': lambda d: d['Start'],
    '3_utr_end': lambda d: d['CDS_Start'] - 3,
})

# recombine, + strand rows first; rows with any other strand value are dropped
df = pd.concat([fwd, rev])


In [145]:
# # toy 
# cds_start = 1
# cds_stop = 4
# print((cds_stop - cds_start)/3)


In [None]:
# - strand

In [140]:
# # + strand
# # tid = ENCODEHT001155508
# start = 1308566
# stop = 1311676

# cds_start = 1309690
# cds_stop = 1311376

# print((cds_stop - cds_start)/3)

# print('CDS length: {}'.format(cds_stop-cds_start))

# rel_cds_start = cds_start - start
# rel_cds_stop = cds_stop - start

# seq = 'GTGACCAGGCCGCGTCCGCGCGCGCGCAGGATTCCTGCGCTGGAGGCCGCCTCTGACGCCACCGGCTGGGCTCCGCCATGAGTTCGGCGCCGGCCTCAGGCTCCGTGCGCGCGCGCTATCTTGTGTACTTCCAGTACGTGGGCACCGACTTTAAGAGGCCGCCGAGCGGCTGAATTCCGTGGAGCCGGTCAGGTTCACCATCTCCAGCCGCACGGACGCCGGGGTCCACGCCCTGAGCAACGCGGCGCACCTGGACGTCCAGCGCCGCTCAGGCCGGCCGCCCTTCCCGCCCGAGGTCCTGGCCGAGGCCCTCAACACACACCTGCGGCACCCGGCCATCAGGGTCCTGCGGGCCTTCCGAGTGCCCAGCGACTTCCACGCTCGTCACGCAGCCACGTCCCGGACCTACCTGTACCGCCTGGCCACTGGCTGTCACCGGCGTGATGAGCTGCCGGTGTTTGAACGCAACCTATGCTGGACTCTCCCGGCAGACTGCCTGGATATGGTCGCCATGCAGGAAGCCGCCCAGCACCTCCTCGGCACACACGACTTCAGCGCCTTCCAGTCCGCTGGCAGCCCGGTGCCGAGCCCCGTGCGAACGCTGCGCCGGGTCTCCGTTTCCCCAGGCCAAGCCAGCCCCTTGGTCACCCCCGAGGAGAGCAGGAAGCTGCGGTTCTGGAACCTGGAGTTTGAGAGCCAGTCTTTCCTGTATAGACAGGTACGGAGGATGACGGCTGTGCTGGTGGCCGTGGGGCTGGGGGCTTTGGCACCTGCCCAGGTGAAGACGATTCTGGAGAGCCAAGATCCCCTGGGCAAGCACCAGACACGTGTAGCCCCAGCCCACGGCTTATTCCTCAAGTCAGTGCTGTACGGGAACCTCGGTGCTGCCTCCTGCACCCTGCAGGGGCCACAGTTCGGGAGCCACGGATGACCCTGGACACTCAAGCCAAAGTTAGGCCACACCAGGCCCAACCCTGTGCTGGTCAAGCCAGGGCAGTCACAGCTGCTTGGGGCCCACAGCACTGCTGCCTGGTCTCCACAGTAGCCTCCCTGCCCGGGTCCCAGCACCCTGGATGCCCGTCTCTGTCCCAGGCGGGATGGGGCACAGTGCAGGACACAGCCATGTACACCAAGAAGAGAGTACCAAGTAGTCTTTTGTTCAGCTTTTACTGGAAACTGCTGTCTAGGACCACCTGCCCTAACCAGGAATAAAGGCAAGACAGCCTGG'
# print(len(seq))
# print()
# print(rel_cds_start)
# print(rel_cds_stop)


# # # start codon
# # print(seq[rel_cds_start:rel_cds_start+3])

# # # stop codon
# # print(seq[(-1*rel_cds_stop)-3:(-1*rel_cds_stop)])

In [141]:
# 944150 - 923927

In [142]:
# df.head(10)

In [143]:
# df.loc[df.Strand == '-']

In [144]:
# (3261-219)/3