In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import gseapy as gp
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand

p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

from proc_revisions.utils import *
from proc_revisions.plotting import *

In [2]:
od = '../../proc_revisions/'
config_file = f'{od}/config.yml'
with open(config_file) as f:
    config = yaml.safe_load(f)

In [3]:
ab = od+expand(config['lr']['talon']['fusion_fix']['ab'], species='human')[0]
unfilt_ab = od+expand(config['lr']['cerberus']['ab'], species='human')[0] 
filt_ab = od+expand(config['lr']['cerberus']['filt_ab'], species='human')[0]
read_annot = od+expand(config['lr']['talon']['full_annot'], species='human')[0]
t_metadata = od+expand(config['ref']['cerberus']['new_gtf_t_info'], species='human')[0]
lib_meta = od+expand(config['lr']['meta'], species='human')[0]
swan_file = od+expand(config['lr']['swan']['sg'], species='human')[0]
cerberus_h5 = od+expand(config['lr']['cerberus']['ca_triplets'], species='human', obs_col='sample')[0]
cerb_t_metadata = od+expand(config['lr']['cerberus']['gtf_t_info'], species='human')[0]
major_isos = od+expand(config['lr']['analysis']['major_isos'], species='human', obs_col='sample')[0]
pi_tpm_table = od+expand(config['lr']['mane']['pi_tpm']['triplet'], species='human', obs_col='sample')[0]

ref_t_metadata = od+expand(config['ref']['new_gtf_t_info'], species='human')[0]
ref_g_metadata = od+expand(config['ref']['new_gtf_g_info'], species='human')[0]

ver = 'v40_cerberus'
min_tpm = 1
gene_subset = 'polya'
obs_col = 'sample'
go_gene_subset = 'protein_coding'
predom_iso_subset = 'protein_coding'

m_lib_meta = od+expand(config['lr']['meta'], species='mouse')[0]
m_cerberus_h5 = od+expand(config['lr']['cerberus']['ca_triplets'], species='mouse', obs_col='sample')[0]
m_swan_file = od+expand(config['lr']['swan']['sg'], species='mouse')[0]
m_filt_ab = od+expand(config['lr']['cerberus']['filt_ab'], species='mouse')[0]



One of the reviewers is worried about how our observations that many genes only have one transcript 
called might be affected by sequencing depth. To double check this, we'll take the library that the reviewer called out, and ask how often transcripts that only had one transcript in this library also have only one transcript in replicates for the same sample * condition, as well as from the same sample type (brain).

In [10]:
ca = cerberus.read(m_cerberus_h5)
sg = swan.read(m_swan_file)
filt_ab_df = pd.read_csv(m_filt_ab, sep='\t')
obs_col = 'dataset'
min_tpm = 1
dataset = 'cortex_14d_f_1'

Read in graph from ../../proc_revisions/data/mouse/lr/swan/swan_graph.p


In [5]:
# library-level observed triplets
df = ca.get_expressed_triplets(sg,
                               obs_col=obs_col,
                               min_tpm=min_tpm,
                               source='dataset_det')

In [20]:
# get just the stuff with cortex_14d_f_1
subset_df = df.loc[df.dataset==dataset].copy(deep=True)
subset_df.head()

# which genes only had one isoform?
gids = subset_df.loc[subset_df.n_iso==1, 'gid'].tolist()

In [21]:
# now get the replicates for this sample * condition
# actually, I should be doing this using the sample-level triplets that I already started calculating
meta = pd.read_csv(m_lib_meta, sep='\t')
sample = meta.loc[meta.dataset == dataset, 'sample'].values[0]
datasets = meta.loc[meta['sample'] == sample, 'dataset'].tolist()
datasets = list(set(datasets) - set([dataset]))

rep_df = ca.triplets.loc[ca.triplets.dataset.isin(datasets)].copy(deep=True)
rep_df.dataset.unique().tolist()

AttributeError: 'DataFrame' object has no attribute 'dataset'

In [22]:
rep_df.head()

Unnamed: 0,gid,dataset,n_iso,n_tss,n_ic,n_tes,splicing_ratio,tss_ratio,tes_ratio,spl_ratio,sector,gene_tpm,gname,source
456886,ENSMUSG00000000001,cortex_14d_f_2,1,1,1,1,1.0,0.333333,0.333333,0.333333,simple,47.220791,Gnai3,dataset_det
456887,ENSMUSG00000000028,cortex_14d_f_2,2,2,1,1,0.666667,0.545455,0.272727,0.181818,tss,3.935066,Cdc45,dataset_det
456888,ENSMUSG00000000056,cortex_14d_f_2,4,1,3,2,2.0,0.2,0.4,0.4,mixed,106.24678,Narf,dataset_det
456889,ENSMUSG00000000058,cortex_14d_f_2,1,1,1,1,1.0,0.333333,0.333333,0.333333,simple,106.24678,Cav2,dataset_det
456890,ENSMUSG00000000078,cortex_14d_f_2,2,1,1,2,0.666667,0.272727,0.545455,0.181818,tes,41.974037,Klf6,dataset_det


In [23]:
# limit to the gids where we only had one isoform in the lowly-sequenced library
print(len(rep_df.index))
rep_df = rep_df.loc[rep_df.gid.isin(gids)]
print(len(rep_df.index))

39371
8810


In [None]:
# count the genes that still have 1 transcript / gene and those that now have more 
# when upscaling like this