# Analysis of long-read transcriptomes

### Data pre-processing

Run Minimap2, TranscriptClean, and TALON using `run_talon_tc.sh`

Create TALON abundance file
```bash 
# TALON database to read from
db=talon/pgp1.db

# Export the abundance table; talon/pgp1 is passed as the output prefix
# (presumably producing talon/pgp1_talon_abundance.tsv, which is read later on).
talon_abundance \
    --db "${db}" \
    -a gencode_v29 \
    -b hg38 \
    --o talon/pgp1
```

Filter novel transcripts for reproducibility
```bash
# TALON database to read from
db=talon/pgp1.db

# Write the list of transcripts that pass the reproducibility filter.
# The thresholds below are handed straight to talon_filter_transcripts;
# see the TALON documentation for their exact semantics.
talon_filter_transcripts \
    --db "${db}" \
    -a gencode_v29 \
    --maxFracA=0.5 \
    --minCount=5 \
    --minDatasets=2 \
    --o talon/pgp1_pass_list.csv
```

### TSS

Isolate reads that represent more confident 5' ends.

In [8]:
import os

import pandas as pd
import swan_vis as swan

  from pandas.core.index import RangeIndex


In [5]:
# path to the read annotation table under talon/ (tab-separated; loaded in the next cell)
annot = 'talon/pgp1_talon_read_annot.tsv'

In [7]:
# limit the reads to those that represent putative 5' ends
df = pd.read_csv(annot, sep='\t')

# pandas >= 2.0 treats the literal string 'None' as a missing value when
# parsing, so the 'None' ISM subtype may arrive as NaN. Normalise it back to
# the string before filtering, otherwise every non-ISM read is dropped.
# NOTE(review): assumes ISM_subtype has no genuinely empty entries - confirm.
ism_subtype = df.ISM_subtype.fillna('None')

# keep reads of the accepted novelty classes whose ISM subtype is
# 'None', 'Prefix' or 'Both'
keep_novelty = df.transcript_novelty.isin(['Known', 'NIC', 'NNC', 'ISM'])
keep_subtype = ism_subtype.isin(['None', 'Prefix', 'Both'])
tss_df = df.loc[keep_novelty & keep_subtype]
tss_reads = tss_df.read_name.tolist()

# tss: write one read name per line
fname = 'tss_read_names.txt'
with open(fname, 'w') as ofile:
    ofile.writelines(r + '\n' for r in tss_reads)

Create a SAM file containing all of the TSS reads from the merged BAM with Picard tools, using `isolate_tss_reads.sh`.

Call TSSs using Diane's script
```bash
# directory holding the TSS-calling script (trailing slash is part of the value)
tss_dir=~/mortazavi_lab/bin/tss-annotation/long_read/

# Call TSSs from the TSS read BAM; writes unfilt_tss.bed plus the
# rev_tss.bw / fwd_tss.bw tracks named below.
python "${tss_dir}pacbio_to_tss.py" \
    -i tss_reads.bam \
    --window-size=50 \
    --expression-threshold=2 \
    -o unfilt_tss.bed \
    -r \
    -n rev_tss.bw \
    -p fwd_tss.bw
```

### Swan

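Build the SwanGraph from the reference annotation GTF and the datasets listed in the Swan config file (`swan_config.tsv`), then save it.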

In [17]:
# reference annotation GTF, resolved relative to the current user's home
# directory instead of a machine-specific absolute path
# NOTE: this rebinds `annot`, which an earlier cell used for the read annotation TSV
annot = os.path.expanduser(
    '~/mortazavi_lab/ref/gencode.v29/gencode.v29.SIRV.ERCC.annotation.gtf')

# initialize SwanGraph
sg = swan.SwanGraph()

# add annotation GTF for reference
sg.add_annotation(annot)

# add transcript models from each dataset listed in the config file
sg.add_datasets('swan_config.tsv')

# save the graph under the 'swan' prefix
sg.save_graph('swan')


Adding dataset annotation to the SwanGraph

Adding dataset astro_1 to the SwanGraph

Adding dataset astro_2 to the SwanGraph

Adding dataset excite_neuron_1 to the SwanGraph

Adding dataset excite_neuron_2 to the SwanGraph

Adding dataset pgp1_1 to the SwanGraph

Adding dataset pgp1_2 to the SwanGraph
Saving graph as swan.p


### TSS switching

### Isoform switching

In [18]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scanpy as sc
import numpy as np
import anndata
import scipy.stats as st
import statsmodels.stats as stm
from statsmodels.stats.multitest import multipletests

In [166]:
# inputs: filtered-transcript pass list and abundance table
pass_list = 'talon/pgp1_pass_list.csv'
ab_file = 'talon/pgp1_talon_abundance.tsv'

# condition -> names of the datasets that belong to it
cond_map = {
    'Astrocytes': ['astro_1', 'astro_2'],
    'Excitatory neurons': ['excite_neuron_1', 'excite_neuron_2'],
    'PGP1': ['pgp1_1', 'pgp1_2'],
}

# NOTE: make_adata is defined in a later cell of this notebook; that cell
# must be run before this one.
adata = make_adata(ab_file, pass_list, cond_map)

Transforming to str index.
Transforming to str index.


In [165]:
def make_adata(ab_file, pass_list, cond_map):
    """
    Build an AnnData object (datasets x transcripts) from an abundance table.

    Parameters:
        ab_file (str): Path to a tab-separated abundance file. It must
            contain the transcript metadata columns listed in `var_cols`
            below plus one count column per dataset named in `cond_map`.
        pass_list (str): Path to a headerless CSV whose two columns are read
            as gene_id and transcript_id; only transcripts whose
            transcript_ID appears in it are kept.
        cond_map (dict): Maps each condition name to the list of dataset
            names that belong to it.

    Returns:
        anndata.AnnData: `obs` has one row per dataset (columns `condition`
            and `dataset`), `var` has one row per retained transcript, and
            the rows of `X` follow the row order of `obs`.

    Raises:
        ValueError: If a dataset named in `cond_map` has no column in
            `ab_file`.
    """

    # filter talon ab file based on pass list
    ab_df = pd.read_csv(ab_file, sep='\t')
    pass_df = pd.read_csv(pass_list, header=None,
                          names=['gene_id', 'transcript_id'])
    ab_df = ab_df.loc[ab_df.transcript_ID.isin(pass_df.transcript_id)]

    # obs table: one (condition, dataset) row per dataset
    obs = pd.DataFrame.from_dict(cond_map, orient='index').reset_index()
    obs = obs.melt(id_vars=['index'], value_vars=obs.columns[1:])
    obs = obs.drop('variable', axis=1)
    obs = obs.rename({'index': 'condition', 'value': 'dataset'}, axis=1)
    # conditions with fewer datasets than others are padded with missing
    # values by from_dict; drop those placeholder rows
    obs = obs.dropna(subset=['dataset']).reset_index(drop=True)

    # var table (copied so it is independent of the abundance table)
    var_cols = ['annot_transcript_id', 'annot_gene_id',
                'gene_ID', 'transcript_ID', 'transcript_novelty',
                'ISM_subtype']
    var = ab_df[var_cols].copy()

    # X table: pick the dataset columns in obs order, then transpose.
    # Selecting the count columns before transposing keeps X numeric;
    # transposing the whole mixed-type table would yield object dtype.
    datasets = obs['dataset'].tolist()
    missing = [d for d in datasets if d not in ab_df.columns]
    if missing:
        raise ValueError(
            'datasets not found in abundance file: {}'.format(missing))
    X = ab_df[datasets].T.to_numpy()

    adata = anndata.AnnData(obs=obs, var=var, X=X)

    return adata