In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import gseapy as gp
import swan_vis as swan 
from scipy import sparse

# Put the directory two levels above the current working directory on the
# module search path (the `scripts` imports below presumably rely on it).
cwd_parent = os.path.dirname(os.getcwd())
p = os.path.dirname(cwd_parent)
sys.path.append(p)

from scripts.utils import *
from scripts.plotting import *

## Format abundance matrix

In [12]:
# Load the RSEM abundance table, lowercase every column name, then drop the
# gene_id column so that transcript_id plus the dataset columns remain.
abundance_path = '../rsem/human_bru_abundance.tsv'
df = (
    pd.read_csv(abundance_path, sep='\t')
    .rename(columns=str.lower)
    .drop(columns='gene_id')
)

In [13]:
# Inspect the column names that remain after gene_id was dropped.
df.columns

Index(['transcript_id', 'a673_0h', 'a673_2h', 'a673_6h', 'caco2_0h',
       'caco2_2h', 'caco2_6h', 'calu3_0h', 'calu3_2h', 'calu3_6h',
       'gm12878_0h', 'gm12878_2h', 'gm12878_6h', 'hct116_0h', 'hct116_2h',
       'hct116_6h', 'hepg2_0h', 'hepg2_2h', 'hepg2_6h', 'hmec_0h', 'hmec_2h',
       'hmec_6h', 'huvec_0h', 'huvec_2h', 'huvec_6h', 'imr90_0h', 'imr90_2h',
       'imr90_6h', 'k562_0h', 'k562_2h', 'k562_6h'],
      dtype='object')

In [14]:
# Write the reformatted abundance table as a tab-separated file, omitting
# the row index.
swan_abundance_path = 'human_bru_abundance_swan.tsv'
df.to_csv(swan_abundance_path, sep='\t', index=False)

## Create SwanGraph

In [15]:
# Input files for the SwanGraph, listed in the order they are added below.
# annot: GTF passed to add_annotation
annot = '../../refs/gencode_v29_sirv4_ercc.gtf'
# gtf: GTF passed to add_transcriptome
gtf = '../ljungman_known_nic_nnc_talon.gtf'
# ab: abundance table passed to add_abundance
ab = 'human_bru_abundance_swan.tsv'

In [16]:
# Create an empty SwanGraph and populate it in a fixed order:
# annotation first, then the transcriptome, then the abundance table.
sg = swan.SwanGraph()
for loader, path in (
    (sg.add_annotation, annot),
    (sg.add_transcriptome, gtf),
    (sg.add_abundance, ab),
):
    loader(path)


Adding annotation to the SwanGraph

Adding transcriptome to the SwanGraph

Adding abundance for datasets a673_0h, a673_2h, a673_6h, caco2_0h, caco2_2h... (and 25 more) to SwanGraph
Calculating transcript TPM...
Calculating PI...
Calculating edge usage...




Calculating TSS usage...
Calculating TES usage...


In [17]:
# Persist the SwanGraph; the logged output shows it is written as swan.p.
swan_prefix = 'swan'
sg.save_graph(swan_prefix)

Saving graph as swan.p


## Get metadata

In [7]:
# Reload the saved graph and derive a metadata table from its observations:
# each dataset name is split on '_' into a sample and a timepoint column.
sg = swan.read('swan.p')
meta = sg.adata.obs.copy(deep=True)
sample_timepoint = meta['dataset'].str.split('_', expand=True)
meta[['sample', 'timepoint']] = sample_timepoint
meta.to_csv('swan_metadata.tsv', sep='\t', index=False)

Read in graph from swan.p


## Add metadata 

In [10]:
# Attach the metadata table to the SwanGraph.
# The path is defined once and reused, so the variable and the argument
# passed to add_metadata cannot drift apart.
meta = 'swan_metadata.tsv'
sg.add_metadata(meta)

In [11]:
# get relevant colors

# cell line: start from the full biosample palette and discard every entry
# whose key is not one of the samples present in this SwanGraph
samples = sg.adata.obs['sample'].unique().tolist()
c_dict, order = get_biosample_colors()
pop_keys = list(set(c_dict) - set(samples))
# the loop variable is deliberately not called `p`, which already holds the
# path appended to sys.path in the import cell
for unused_sample in pop_keys:
    c_dict.pop(unused_sample)
    order.remove(unused_sample)
sg.set_metadata_colors('sample', c_dict)

# timepoint: the palette is used as returned
c_dict, order = get_bru_timept_colors()
sg.set_metadata_colors('timepoint', c_dict)

In [12]:
# Save the graph again so the stored copy reflects the current state of `sg`.
graph_prefix = 'swan'
sg.save_graph(graph_prefix)

Saving graph as swan.p


## Reports

In [3]:
# Load the pickled SwanGraph that the report cells below operate on.
saved_graph = 'swan.p'
sg = swan.read(saved_graph)

Read in graph from swan.p


In [18]:
# Report for ELN: 'pi' layer, browser=True, annotated by sample and timepoint.
eln_report_opts = dict(
    prefix='figures/eln_cell_line_tpm',
    layer='pi',
    cmap='magma',
    novelty=True,
    transcript_col='tname',
    metadata_cols=['sample', 'timepoint'],
    browser=True,
)
sg.gen_report('ELN', **eln_report_opts)

                     a673_0h    a673_2h    a673_6h  caco2_0h  caco2_2h  \
tid                                                                      
ENCODEHT002323608   0.000000  26.408451  11.021234       NaN       0.0   
ENCODEHT002323706   0.000000   0.000000   0.000000       NaN       0.0   
ENCODEHT002323748   0.000000   9.154929   8.291203       NaN       0.0   
ENCODEHT002323610   0.000000   0.000000  13.245704       NaN       0.0   
ENCODEHT002323712  16.393442   6.338028   0.000000       NaN       0.0   

                   caco2_6h  calu3_0h  calu3_2h  calu3_6h  gm12878_0h  ...  \
tid                                                                    ...   
ENCODEHT002323608       0.0       0.0       0.0       0.0         NaN  ...   
ENCODEHT002323706       0.0       0.0       0.0       0.0         NaN  ...   
ENCODEHT002323748       0.0       0.0       0.0       0.0         NaN  ...   
ENCODEHT002323610       0.0       0.0       0.0       0.0         NaN  ...   
ENCODEHT00232

In [19]:
# Report for MEF2C: 'pi' layer, browser=True, annotated by sample and timepoint.
mef2c_report_opts = dict(
    prefix='figures/mef2c_cell_line_tpm',
    layer='pi',
    cmap='magma',
    novelty=True,
    transcript_col='tname',
    metadata_cols=['sample', 'timepoint'],
    browser=True,
)
sg.gen_report('MEF2C', **mef2c_report_opts)

                    a673_0h    a673_2h    a673_6h   caco2_0h   caco2_2h  \
tid                                                                       
ENST00000637481.1   0.00000   0.000000  37.888199  45.454548  40.000000   
ENST00000636998.1   0.00000  26.016258   0.000000   0.000000   0.000000   
ENST00000504921.6  63.15789  13.821138   3.726708  36.363636   0.000000   
ENST00000510942.5   0.00000  35.772358  32.298134   0.000000   0.000000   
ENST00000508569.5   0.00000  13.821138   9.316770   0.000000   8.571429   

                    caco2_6h   calu3_0h   calu3_2h   calu3_6h  gm12878_0h  \
tid                                                                         
ENST00000637481.1   5.555556  33.333336  23.648647   0.377358    8.946898   
ENST00000636998.1   0.000000   0.000000  11.486486  31.698114   17.179831   
ENST00000504921.6   0.000000  55.555557   5.630630   5.283019   47.411873   
ENST00000510942.5  16.666668   0.000000  36.711712  33.584904    0.000000   
ENST00000508

In [20]:
# Report for RPS16: 'pi' layer, browser=True, annotated by sample and timepoint.
rps16_report_opts = dict(
    prefix='figures/rps16_cell_line_tpm',
    layer='pi',
    cmap='magma',
    novelty=True,
    transcript_col='tname',
    metadata_cols=['sample', 'timepoint'],
    browser=True,
)
sg.gen_report('RPS16', **rps16_report_opts)

                     a673_0h    a673_2h    a673_6h   caco2_0h   caco2_2h  \
tid                                                                        
ENST00000251453.7  27.004105  47.253239  71.882385  47.576611  66.642120   
ENST00000601390.1  34.473324  33.541759  16.303492  19.340212  17.869415   
ENCODEHT000769052  36.990425  16.569897   7.009714  30.831770  10.260186   
ENST00000601655.5   1.121751   1.741849   2.415332   1.297686   2.650957   
ENST00000339471.8   0.054720   0.357302   0.577579   0.062539   0.957290   

                    caco2_6h   calu3_0h   calu3_2h   calu3_6h  gm12878_0h  \
tid                                                                         
ENST00000251453.7  84.032921  25.218912  72.668037  80.711540   39.261196   
ENST00000601390.1   5.587563  31.523642  12.654321   5.755145   22.654461   
ENCODEHT000769052   2.780064  41.009922   8.058985   4.918033   35.550835   
ENST00000601655.5   4.106081   1.342674   3.737997   5.476108    2.026806   
ENST0

In [21]:
# Report for ETV3: 'pi' layer, browser=True, annotated by sample and timepoint.
etv3_report_opts = dict(
    prefix='figures/etv3_cell_line_tpm',
    layer='pi',
    cmap='magma',
    novelty=True,
    transcript_col='tname',
    metadata_cols=['sample', 'timepoint'],
    browser=True,
)
sg.gen_report('ETV3', **etv3_report_opts)

                     a673_0h    a673_2h    a673_6h   caco2_0h   caco2_2h  \
tid                                                                        
ENST00000368192.8  83.196724  95.998451  92.507935  80.949593  96.964882   
ENST00000326786.4  15.368852   3.937211   7.365079  17.116060   2.999824   
ENST00000460850.1   1.434426   0.064334   0.126984   1.934349   0.035292   

                    caco2_6h   calu3_0h   calu3_2h   calu3_6h  gm12878_0h  \
tid                                                                         
ENST00000368192.8  90.957443  76.190475  96.031342  90.163933   74.547699   
ENST00000326786.4   8.966565  21.635611   3.900528   9.764790   21.833881   
ENST00000460850.1   0.075988   2.173913   0.068131   0.071276    3.618421   

                   ...    hmec_6h   huvec_0h   huvec_2h   huvec_6h   imr90_0h  \
tid                ...                                                          
ENST00000368192.8  ...  91.144638  74.010696  95.515572  91.062805  82.

In [22]:
# Report for NDUFS5: 'pi' layer, browser=True, annotated by sample and timepoint.
ndufs5_browser_opts = dict(
    prefix='figures/ndufs5_cell_line_tpm',
    layer='pi',
    cmap='magma',
    novelty=True,
    transcript_col='tname',
    metadata_cols=['sample', 'timepoint'],
    browser=True,
)
sg.gen_report('NDUFS5', **ndufs5_browser_opts)

                     a673_0h   a673_2h    a673_6h   caco2_0h   caco2_2h  \
tid                                                                       
ENST00000372967.3  48.762375  45.50898  53.465347  98.496239  98.165138   
ENST00000372969.7  51.237625  54.49102  46.534653   1.503759   1.834862   

                    caco2_6h   calu3_0h   calu3_2h   calu3_6h  gm12878_0h  \
tid                                                                         
ENST00000372967.3  95.297806  47.761192  47.163120  47.368423   99.425285   
ENST00000372969.7   4.702194  52.238804  52.836876  52.631580    0.574713   

                   ...    hmec_6h   huvec_0h   huvec_2h   huvec_6h   imr90_0h  \
tid                ...                                                          
ENST00000372967.3  ...  81.638222  88.520409  84.146339  86.836937  46.733669   
ENST00000372969.7  ...  18.361774  11.479591  15.853659  13.163064  53.266335   

                    imr90_2h   imr90_6h    k562_0h    k562_2h    

In [23]:
# Report for NDUFS5 using the 'pi' layer. No `browser` argument is passed
# here, so gen_report falls back to its own default for that option.
ndufs5_report_opts = dict(
    prefix='figures/ndufs5_cell_line_tpm',
    layer='pi',
    cmap='magma',
    novelty=True,
    transcript_col='tname',
    metadata_cols=['sample', 'timepoint'],
)
sg.gen_report('NDUFS5', **ndufs5_report_opts)

                     a673_0h   a673_2h    a673_6h   caco2_0h   caco2_2h  \
tid                                                                       
ENST00000372967.3  48.762375  45.50898  53.465347  98.496239  98.165138   
ENST00000372969.7  51.237625  54.49102  46.534653   1.503759   1.834862   

                    caco2_6h   calu3_0h   calu3_2h   calu3_6h  gm12878_0h  \
tid                                                                         
ENST00000372967.3  95.297806  47.761192  47.163120  47.368423   99.425285   
ENST00000372969.7   4.702194  52.238804  52.836876  52.631580    0.574713   

                   ...    hmec_6h   huvec_0h   huvec_2h   huvec_6h   imr90_0h  \
tid                ...                                                          
ENST00000372967.3  ...  81.638222  88.520409  84.146339  86.836937  46.733669   
ENST00000372969.7  ...  18.361774  11.479591  15.853659  13.163064  53.266335   

                    imr90_2h   imr90_6h    k562_0h    k562_2h    

In [86]:
# Report for NDUFS2: 'pi' layer, browser=True. Unlike the other report cells,
# transcripts are labelled with the 'tid' column rather than 'tname'.
ndufs2_report_opts = dict(
    prefix='figures/ndufs2_cell_line_tpm',
    layer='pi',
    cmap='magma',
    novelty=True,
    transcript_col='tid',
    metadata_cols=['sample', 'timepoint'],
    browser=True,
)
sg.gen_report('NDUFS2', **ndufs2_report_opts)

                     a673_0h    a673_2h    a673_6h   caco2_0h   caco2_2h  \
tid                                                                        
ENCODEHT001061373  59.925556  43.631557  75.961533  58.241760  34.095745   
ENCODEHT001061382   4.342432  13.177611   2.911325   8.669109  20.265957   
ENST00000392179.4  13.523573  19.479946   4.273504   0.000000  12.340425   
ENST00000496133.5   2.357320   7.227854   7.665598   4.517704   8.989362   
ENST00000478866.5   1.861042   3.657999   1.869658   5.982906  11.595745   

                    caco2_6h   calu3_0h   calu3_2h   calu3_6h  gm12878_0h  \
tid                                                                         
ENCODEHT001061373  67.552406  35.175880  34.302326  73.837639   68.875740   
ENCODEHT001061382   4.605846   0.000000  19.694767   6.383764    8.402367   
ENST00000392179.4   4.517272   0.000000   9.520349   0.000000    0.000000   
ENST00000496133.5   6.879244   0.000000   2.252907   3.911439    4.852071   
ENST0

In [4]:
# Report for BRCA1: 'pi' layer, browser=True, annotated by sample and timepoint.
brca1_report_opts = dict(
    prefix='figures/brca1_cell_line_tpm',
    layer='pi',
    cmap='magma',
    novelty=True,
    transcript_col='tname',
    metadata_cols=['sample', 'timepoint'],
    browser=True,
)
sg.gen_report('BRCA1', **brca1_report_opts)


Plotting transcripts for ENSG00000012048.21
Saving transcript path graph for ENST00000357654.8 as figures/brca1_cell_line_tpm_browser_ENST00000357654.8_path.png
Saving transcript path graph for ENST00000354071.7 as figures/brca1_cell_line_tpm_browser_ENST00000354071.7_path.png
Saving transcript path graph for ENST00000461221.5 as figures/brca1_cell_line_tpm_browser_ENST00000461221.5_path.png
Saving transcript path graph for ENST00000618469.1 as figures/brca1_cell_line_tpm_browser_ENST00000618469.1_path.png
Saving transcript path graph for ENST00000634433.1 as figures/brca1_cell_line_tpm_browser_ENST00000634433.1_path.png
Saving transcript path graph for ENST00000493795.5 as figures/brca1_cell_line_tpm_browser_ENST00000493795.5_path.png
Saving transcript path graph for ENST00000468300.5 as figures/brca1_cell_line_tpm_browser_ENST00000468300.5_path.png
Saving transcript path graph for ENST00000491747.6 as figures/brca1_cell_line_tpm_browser_ENST00000491747.6_path.png
Saving transcript p

## Get corresponding long read data pi values

In [24]:
# Rebind `sg` to the SwanGraph stored under the lr_bulk directory; the cells
# in this section work on that graph rather than the one built above.
lr_graph_path = '../../lr_bulk/swan/swan.p'
sg = swan.read(lr_graph_path)

Read in graph from ../../lr_bulk/swan/swan.p


In [29]:
# Read the dataset list without a header row, naming its column 'dataset'.
datasets_path = '../ljungman_datasets.tsv'
datasets = pd.read_csv(datasets_path, names=['dataset'], header=None)

In [30]:
# calc_pi returns a pair; the first element is kept as `df` and the second
# is bound to `_` and not used further in the visible cells.
pi_outputs = swan.calc_pi(sg.adata, sg.t_df)
df, _ = pi_outputs

  df.reset_index(inplace=True)


In [33]:
# Replace missing values with 0.
df = df.fillna(0)

In [34]:
# Preview the first rows of the table after missing values were set to 0.
df.head()

tid,ENST00000619216.1,ENST00000469289.1,ENST00000417324.1,ENST00000461467.1,ENST00000453576.2,ENST00000442987.3,ENST00000494149.2,ENST00000410691.1,ENST00000623083.4,ENST00000450734.1,...,ENCODEHT005112837,ENCODEHT005112880,ENCODEHT005112885,ENCODEHT005112887,ENCODEHT005119645,ENCODEHT005119762,ENCODEHT005127235,ENCODEHT005127238,ENCODEHT005148946,ENCODEHT005149625
gm12878_1_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,55.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
gm12878_1_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
gm12878_1_3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
gm12878_1_4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
gm12878_3_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.181818,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
# limit to datasets that match deeply profiled cell lines
keep_datasets = datasets['dataset'].tolist()
df = df.loc[keep_datasets]

In [41]:
# Swap rows and columns of the table.
df = df.T

In [43]:
# Move the index into a regular column so it is written out with the
# table (the export below uses index=False).
df = df.reset_index()

In [45]:
# Save the table as a tab-separated file without the row index.
lr_pi_path = 'lr_bulk_pi.tsv'
df.to_csv(lr_pi_path, sep='\t', index=False)

## Get exon / intron information for each transcript

In [46]:
# Path of the GTF read in the next cell; this is the same path that was
# passed to sg.add_transcriptome when the SwanGraph was built.
gtf = '../ljungman_known_nic_nnc_talon.gtf'

In [47]:
# Read the GTF with `pr.read_gtf`; the result is used below through its
# `.df` and `.features.introns(...)` accessors.
# NOTE(review): `pr` is not imported explicitly in this notebook. It is
# presumably pyranges exposed through one of the `scripts` star imports;
# confirm, and prefer an explicit import in the imports cell.
df = pr.read_gtf(gtf)

In [72]:
# Build one table holding the exon and intron intervals of every transcript.
# Both source frames are reduced to the same columns and get a `Name` of the
# form <transcript_id>_<Feature>.
edge_cols = ['Chromosome', 'Start', 'End', 'Score',
             'Strand', 'Feature', 'transcript_id']
e_dfs = [df.df.loc[df.df.Feature == 'exon'],
         df.features.introns(by='transcript').df]
# `.assign` returns a new frame, so no column is written into a slice of
# another frame (avoids pandas' SettingWithCopyWarning), and a single concat
# replaces growing `edge_df` from an empty DataFrame inside a loop.
edge_df = pd.concat([
    e_df[edge_cols].assign(
        Name=e_df['transcript_id'].astype(str) + '_' + e_df['Feature'].astype(str))
    for e_df in e_dfs
])

# edge_df = pr.PyRanges(edge_df)

  res = method(*args, **kwargs)


In [73]:
# sort based on coordinates: within each transcript, '+' strand features go
# from lowest to highest Start and '-' strand features from highest to lowest
sort_cols = ['transcript_id', 'Start']
fwd = edge_df[edge_df['Strand'] == '+'].sort_values(
    by=sort_cols, ascending=[True, True])
rev = edge_df[edge_df['Strand'] == '-'].sort_values(
    by=sort_cols, ascending=[True, False])

In [75]:
# Stack the sorted '+' strand rows on top of the sorted '-' strand rows.
strand_frames = [fwd, rev]
edge_df = pd.concat(strand_frames)

In [76]:
# temp = df.features.introns(by='transcript').df

In [77]:
# len(temp.index)

In [78]:
# temp.transcript_id.head()

In [79]:
# len(df.df.loc[df.df.Feature == 'exon'].index)

In [80]:
# formatting: drop the helper columns, wrap the table with pr.PyRanges and
# write it out as a BED file
bed_df = edge_df.drop(columns=['transcript_id', 'Feature'])
edge_df = pr.PyRanges(bed_df)
edge_df.to_bed('intron_exon.bed')