In [35]:
import pandas as pd
import pyranges as pr
import numpy as np
import sys
import os

# Make the directory two levels above the current working directory importable
# so that the `scripts` package imported below can be resolved.
p = os.path.dirname(os.path.dirname(os.getcwd()))
# Guard the append: re-running this cell must not keep adding the same entry
# to sys.path.
if p not in sys.path:
    sys.path.append(p)

from scripts.utils import *
from scripts.plotting import *

In [36]:
# Read the BED intervals with pyranges, then work with them as a plain DataFrame
utr_ranges = pr.read_bed('human_3_utr.bed')
df = utr_ranges.df

In [37]:
# Preview the first rows of the intervals loaded from human_3_utr.bed
df.head()

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand
0,chr1,70008,71585,ENST00000641515.2,.,+
1,chr1,70008,70108,ENST00000335137.4,.,+
2,chr1,944153,944575,ENST00000342066.7,.,+
3,chr1,944153,944581,ENST00000618181.4,.,+
4,chr1,944153,944581,ENST00000622503.4,.,+


In [38]:
# Load the filtered TALON abundance table; its transcript / gene columns are
# what get merged onto the UTR intervals further down.
t_df = pd.read_csv(
    '../talon/human_talon_abundance_filtered.tsv',
    sep='\t',
)

In [39]:
# Shorten the annotation column names and keep only the three ID / name columns
column_aliases = {
    'annot_transcript_id': 'tid',
    'annot_gene_id': 'gid',
    'annot_gene_name': 'gname',
}
t_df = t_df.rename(columns=column_aliases)[['tid', 'gid', 'gname']]

In [40]:
# Inspect the transcript ID / gene ID / gene name lookup table
t_df

Unnamed: 0,tid,gid,gname
0,ENST00000619216.1,ENSG00000278267.1,MIR6859-1
1,ENST00000469289.1,ENSG00000243485.5,MIR1302-2HG
2,ENST00000417324.1,ENSG00000237613.2,FAM138A
3,ENST00000461467.1,ENSG00000237613.2,FAM138A
4,ENST00000453576.2,ENSG00000238009.6,AL627309.1
...,...,...,...
194498,ENCODEHT005508511,ENSG00000136628.17,EPRS
194499,ENCODEHT005508701,ENCODEHG000614080,ENCODEHG000614080
194500,ENCODEHT005509391,ENSG00000143774.16,GUK1
194501,ENCODEHT005510213,ENSG00000059588.9,TARBP1


In [41]:
# Fetch annotation through the star-imported project helper; only the first of
# the three returned objects is kept.
# NOTE(review): presumably how='iso' selects transcript-level rows — confirm
# against the helper's definition.
annot_t_df, _, _ = get_gtf_info(how='iso')

In [42]:
# Restrict the annotation to the same three columns carried by t_df
annot_t_df = annot_t_df.loc[:, ['tid', 'gid', 'gname']]

In [43]:
# Combine both transcript tables; the outer join keeps rows that occur in
# either one, and rows identical in all three columns are matched together.
t_df = pd.merge(
    t_df,
    annot_t_df,
    on=['tid', 'gid', 'gname'],
    how='outer',
)

In [44]:
# Preview the lookup table after the outer merge with the annotation
t_df.head()

Unnamed: 0,tid,gid,gname
0,ENST00000619216.1,ENSG00000278267.1,MIR6859-1
1,ENST00000469289.1,ENSG00000243485.5,MIR1302-2HG
2,ENST00000417324.1,ENSG00000237613.2,FAM138A
3,ENST00000461467.1,ENSG00000237613.2,FAM138A
4,ENST00000453576.2,ENSG00000238009.6,AL627309.1


In [45]:
# Attach gene ID / gene name to each interval by matching the BED Name column
# against the transcript ID; intervals without a match are kept (left join).
df = pd.merge(
    df,
    t_df,
    left_on='Name',
    right_on='tid',
    how='left',
)

In [46]:
# Check that the gid / gname columns were attached to the intervals
df.head()

Unnamed: 0,Chromosome,Start,End,Name,Score,Strand,tid,gid,gname
0,chr1,70008,71585,ENST00000641515.2,.,+,ENST00000641515.2,ENSG00000186092.6,OR4F5
1,chr1,70008,70108,ENST00000335137.4,.,+,ENST00000335137.4,ENSG00000186092.6,OR4F5
2,chr1,944153,944575,ENST00000342066.7,.,+,ENST00000342066.7,ENSG00000187634.11,SAMD11
3,chr1,944153,944581,ENST00000618181.4,.,+,ENST00000618181.4,ENSG00000187634.11,SAMD11
4,chr1,944153,944581,ENST00000622503.4,.,+,ENST00000622503.4,ENSG00000187634.11,SAMD11


In [47]:
# The merge key from the right-hand table is no longer needed once the gene
# columns are attached.
df = df.drop(columns=['tid'])

In [48]:
# Pick the strand-aware end coordinate of every interval: End for '+' rows,
# Start for '-' rows. Rows with any other strand value stay NaN.
on_plus = df.Strand == '+'
on_minus = df.Strand == '-'

df['utr_end'] = np.nan
df.loc[on_plus, 'utr_end'] = df.loc[on_plus, 'End']
df.loc[on_minus, 'utr_end'] = df.loc[on_minus, 'Start']

# Keep one row per distinct (end coordinate, gene) combination
df = df.loc[:, ['utr_end', 'gid', 'gname']].drop_duplicates()

In [49]:
# get maximum utr difference per gene
#
# The largest pairwise |end_i - end_j| within a gene is simply
# max(end) - min(end), so aggregate once per gene instead of self-joining df
# to enumerate every pair of ends (quadratic in the number of ends per gene).
# The self-join also paired up all rows whose gid/gname are NaN, because
# pandas merge matches null keys to each other; those rows are dropped by the
# groupby either way, so the resulting table is unchanged.
# `df` is left as the de-duplicated (utr_end, gid, gname) frame.
end_range = df.groupby(['gid', 'gname'])['utr_end'].agg(['min', 'max'])
temp = (end_range['max'] - end_range['min']).rename('utr_dist').reset_index()

In [52]:
# Save the per-gene table as a tab-separated file; the row index is written as
# the first (unnamed) column, which is the pandas default made explicit here.
temp.to_csv('max_3_utr_dists.tsv', sep='\t', index=True)

In [54]:
# Show the last rows of the per-gene maximum distance table
temp.tail()

Unnamed: 0,gid,gname,utr_dist
20601,ENSG00000285975.1,AC134684.11,0.0
20602,ENSG00000285976.1,AL135905.2,0.0
20603,ENSG00000285978.1,AC113348.2,0.0
20604,ENSG00000285982.1,AC012213.5,0.0
20605,ENSG00000285991.1,AL355312.5,64950.0
