In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import gseapy as gp
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand

# Put the directory two levels above the current working directory on the
# import path (presumably so the `proc_revisions` package imported just below
# resolves). The membership check keeps re-runs of this cell from appending
# the same entry again and again.
p = os.path.dirname(os.path.dirname(os.getcwd()))
if p not in sys.path:
    sys.path.append(p)

from proc_revisions.utils import *
from proc_revisions.plotting import *

In [2]:
# Base directory that all configured paths are resolved against, relative to
# the notebook's working directory. The trailing slash is kept on purpose:
# later path strings are built by plain concatenation onto `od`.
od = '../../proc_revisions/'
# os.path.join avoids the doubled separator that f'{od}/config.yml' produced.
config_file = os.path.join(od, 'config.yml')
with open(config_file) as f:
    # safe_load only builds plain Python objects from the YAML document.
    config = yaml.safe_load(f)

In [16]:
# Analysis parameters. They are defined before the paths so that `obs_col`
# feeds the path templates instead of a repeated 'sample' literal.
ver = 'v40_cerberus'
min_tpm = 1
gene_subset = 'polya'
obs_col = 'sample'
go_gene_subset = 'protein_coding'
predom_iso_subset = 'protein_coding'


def _cfg_path(template, **wildcards):
    """Fill a path template from `config` and prefix it with `od`.

    Parameters
    ----------
    template : entry taken from `config`, passed as the pattern to `expand`.
    **wildcards : keyword values forwarded to `expand` (e.g. species, obs_col).

    Returns
    -------
    str : `od` concatenated with the first path produced by `expand`.
    """
    return od+expand(template, **wildcards)[0]


# human inputs
ab = _cfg_path(config['lr']['talon']['fusion_fix']['ab'], species='human')
unfilt_ab = _cfg_path(config['lr']['cerberus']['ab'], species='human')
filt_ab = _cfg_path(config['lr']['cerberus']['filt_ab'], species='human')
read_annot = _cfg_path(config['lr']['talon']['full_annot'], species='human')
t_metadata = _cfg_path(config['ref']['cerberus']['new_gtf_t_info'], species='human')
lib_meta = _cfg_path(config['lr']['meta'], species='human')
swan_file = _cfg_path(config['lr']['swan']['sg'], species='human')
cerberus_h5 = _cfg_path(config['lr']['cerberus']['ca_triplets'], species='human', obs_col=obs_col)
cerb_t_metadata = _cfg_path(config['lr']['cerberus']['gtf_t_info'], species='human')
major_isos = _cfg_path(config['lr']['analysis']['major_isos'], species='human', obs_col=obs_col)
pi_tpm_table = _cfg_path(config['lr']['mane']['pi_tpm']['triplet'], species='human', obs_col=obs_col)

# human reference annotation metadata
ref_t_metadata = _cfg_path(config['ref']['new_gtf_t_info'], species='human')
ref_g_metadata = _cfg_path(config['ref']['new_gtf_g_info'], species='human')

# mouse inputs (prefixed with `m_`)
m_lib_meta = _cfg_path(config['lr']['meta'], species='mouse')
m_cerberus_h5 = _cfg_path(config['lr']['cerberus']['ca_triplets'], species='mouse', obs_col=obs_col)
m_swan_file = _cfg_path(config['lr']['swan']['sg'], species='mouse')
m_filt_ab = _cfg_path(config['lr']['cerberus']['filt_ab'], species='mouse')
m_read_annot = _cfg_path(config['lr']['talon']['full_annot'], species='mouse')




## Recalculate the number of post-TALON reads for each library and add it to the metadata

In [4]:
# human
# Count reads per library from the read annotation table. Only the first two
# columns are loaded; the steps below rely on them being named `read_name`
# and `dataset`. `count()` tallies non-null `read_name` values per dataset.
df = (
    pd.read_csv(read_annot, sep='\t', usecols=[0,1])
    .groupby('dataset')
    .count()
    .reset_index()
    .rename({'read_name':'reads_post_talon'}, axis=1)
)

meta = pd.read_csv(lib_meta, sep='\t')
# Discard any stale count before merging the fresh one. errors='ignore' keeps
# this cell from raising KeyError when the metadata file has no
# `reads_post_talon` column yet.
meta = meta.drop(columns='reads_post_talon', errors='ignore')
# Left merge keeps every library in the metadata, including any that have no
# rows in the read annotation table (their count is then NaN).
meta = meta.merge(df, how='left', on='dataset')
# NOTE: this overwrites the metadata file that was just read.
meta.to_csv(lib_meta, index=False, sep='\t')

In [17]:
# mouse
# Count reads per library from the read annotation table. Only the first two
# columns are loaded; the steps below rely on them being named `read_name`
# and `dataset`. `count()` tallies non-null `read_name` values per dataset.
df = (
    pd.read_csv(m_read_annot, sep='\t', usecols=[0,1])
    .groupby('dataset')
    .count()
    .reset_index()
    .rename({'read_name':'reads_post_talon'}, axis=1)
)

meta = pd.read_csv(m_lib_meta, sep='\t')
# Discard any stale count before merging the fresh one. errors='ignore' keeps
# this cell from raising KeyError when the metadata file has no
# `reads_post_talon` column yet.
meta = meta.drop(columns='reads_post_talon', errors='ignore')
# Left merge keeps every library in the metadata, including any that have no
# rows in the read annotation table (their count is then NaN).
meta = meta.merge(df, how='left', on='dataset')
# NOTE: this overwrites the metadata file that was just read.
# NOTE(review): the preview below shows an `Unnamed: 0` column, which suggests
# an index column was saved into this file by an earlier writer - confirm
# whether it should be dropped.
meta.to_csv(m_lib_meta, index=False, sep='\t')

In [18]:
# Preview the first rows of `meta` (last assigned from the mouse library
# metadata) to check that the merged `reads_post_talon` column is present.
meta.head()

Unnamed: 0,ENCODE_experiment_id,dataset,sample,sample_display,general_tissue_cell_type,fig1_tissue_label,age,sex,genotype,tissue_or_cell_line,...,matching_human_samples,ENCODE_alignments_id,ENCODE_reads_id,ENCODE_unfiltered_alignments_id,document_urls,document_labels,platform,RIN,spikeins,reads_post_talon
0,ENCSR665XLM,adrenal_10d_f_1,adrenal_10d,Adrenal gland PND10,adrenal gland,adrenal gland,10d,f,b6/cast,tissue,...,['adrenal_gland'],ENCFF550NPJ,ENCFF321AJK,ENCFF756SOZ,https://www.encodeproject.org/documents/3baa46...,"'PacBio libraries v3 (October, 2020) Protocol ...",Pacific Biosciences Sequel II,8.4,True,606677
1,ENCSR665XLM,adrenal_10d_f_2,adrenal_10d,Adrenal gland PND10,adrenal gland,adrenal gland,10d,f,b6/cast,tissue,...,['adrenal_gland'],ENCFF218PAQ,ENCFF863AGD,ENCFF276HLF,https://www.encodeproject.org/documents/3baa46...,"'PacBio libraries v3 (October, 2020) Protocol ...",Pacific Biosciences Sequel II,8.4,True,788654
2,ENCSR698DOB,adrenal_10d_m_2,adrenal_10d,Adrenal gland PND10,adrenal gland,adrenal gland,10d,m,b6/cast,tissue,...,['adrenal_gland'],ENCFF261QFC,ENCFF856RHM,ENCFF674HLT,https://www.encodeproject.org/documents/3baa46...,"'PacBio libraries v3 (October, 2020) Protocol ...",Pacific Biosciences Sequel II,8.9,True,1407511
3,ENCSR698DOB,adrenal_10d_m_1,adrenal_10d,Adrenal gland PND10,adrenal gland,adrenal gland,10d,m,b6/cast,tissue,...,['adrenal_gland'],ENCFF336YOF,ENCFF303OLU,ENCFF676ASW,https://www.encodeproject.org/documents/3baa46...,"'PacBio libraries v3 (October, 2020) Protocol ...",Pacific Biosciences Sequel II,8.9,True,971827
4,ENCSR530ZLE,adrenal_14d_f_2,adrenal_14d,Adrenal gland PND14,adrenal gland,adrenal gland,14d,f,b6/cast,tissue,...,['adrenal_gland'],ENCFF402PHJ,ENCFF435KPO,ENCFF704NBV,https://www.encodeproject.org/documents/3baa46...,"'PacBio libraries v3 (October, 2020) Protocol ...",Pacific Biosciences Sequel II,8.5,True,991731
