In [3]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import gseapy as gp
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand

p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

from proc_revisions.utils import *
from proc_revisions.plotting import *

In [4]:
od = '../../proc_revisions/'
config_file = f'{od}/config.yml'
with open(config_file) as f:
    config = yaml.safe_load(f)

In [5]:
ab = od+expand(config['lr']['talon']['fusion_fix']['ab'], species='human')[0]
unfilt_ab = od+expand(config['lr']['cerberus']['ab'], species='human')[0] 
filt_ab = od+expand(config['lr']['cerberus']['filt_ab'], species='human')[0]
read_annot = od+expand(config['lr']['talon']['full_annot'], species='human')[0]
t_metadata = od+expand(config['ref']['cerberus']['new_gtf_t_info'], species='human')[0]
lib_meta = od+expand(config['lr']['meta'], species='human')[0]
swan_file = od+expand(config['lr']['swan']['sg'], species='human')[0]
cerberus_h5 = od+expand(config['lr']['cerberus']['ca_triplets'], species='human', obs_col='sample')[0]
cerb_t_metadata = od+expand(config['lr']['cerberus']['gtf_t_info'], species='human')[0]
major_isos = od+expand(config['lr']['analysis']['major_isos'], species='human', obs_col='sample')[0]
pi_tpm_table = od+expand(config['lr']['mane']['pi_tpm']['triplet'], species='human', obs_col='sample')[0]

ref_t_metadata = od+expand(config['ref']['new_gtf_t_info'], species='human')[0]
ref_g_metadata = od+expand(config['ref']['new_gtf_g_info'], species='human')[0]

ver = 'v40_cerberus'
min_tpm = 1
gene_subset = 'polya'
obs_col = 'sample'
go_gene_subset = 'protein_coding'
predom_iso_subset = 'protein_coding'

m_lib_meta = od+expand(config['lr']['meta'], species='mouse')[0]

gtex_cerb_gtf = od+expand(config['gtex']['cerberus']['gtf'], species='human')[0]
gtex_filt_ab = od+expand(config['gtex']['cerberus']['ab'], species='human')[0]

## Add indication that sample matches one in the GTEx dataset

In [6]:
df = pd.read_csv(lib_meta, sep='\t')

In [7]:
samples = get_gtex_match_samples()
df['matching_gtex'] = False
df.loc[df['sample'].isin(samples), 'matching_gtex'] = True

In [None]:
# df.to_csv(lib_meta, sep='\t', index=False)

## Add the new spike in stuff that Diane pulled for me

In [53]:
df = pd.read_csv(lib_meta, sep='\t')
df2 = pd.read_csv('human_report.tsv', sep='\t')
df2.rename({'experiment': 'ENCODE_experiment_id'}, axis=1, inplace=True)
df2 = df2[['ENCODE_experiment_id', 'spikein_reference', 'spikein_name']]
df = df.merge(df2, how='left', on='ENCODE_experiment_id')

In [56]:
df.loc[df['ENCODE_experiment_id'] == 'ENCSR838WFC'].head()

Unnamed: 0,ENCODE_experiment_id,dataset,sample,sample_display,general_tissue_cell_type,fig1_tissue_label,health_status,tissue_or_cell_line,sample_color_hex_code,matching_mouse_samples,...,ENCODE_unfiltered_alignments_id,document_urls,document_labels,platform,RIN,spikeins,reads_post_talon,matching_gtex,spikein_reference,spikein_name
47,ENCSR838WFC,gm12878_2_1,gm12878,GM12878,gm12878,blood,,cell_line,#0798c8,,...,ENCFF407TMX,https://www.encodeproject.org/documents/77db75...,"'PacBio LongRead 2.0', 'ENCODE Long Read RNA-S...",Pacific Biosciences Sequel II,,True,2026295,False,"ENCSR089MWE,ENCSR156CIL","SIRV,ERCC"
48,ENCSR838WFC,gm12878_2_1,gm12878,GM12878,gm12878,blood,,cell_line,#0798c8,,...,ENCFF407TMX,https://www.encodeproject.org/documents/77db75...,"'PacBio LongRead 2.0', 'ENCODE Long Read RNA-S...",Pacific Biosciences Sequel II,,True,2026295,False,"ENCSR089MWE,ENCSR156CIL","SIRV,ERCC"
49,ENCSR838WFC,gm12878_2_2,gm12878,GM12878,gm12878,blood,,cell_line,#0798c8,,...,ENCFF592BQN,https://www.encodeproject.org/documents/77db75...,"'PacBio LongRead 2.0', 'ENCODE Long Read RNA-S...",Pacific Biosciences Sequel II,,True,2428259,False,"ENCSR089MWE,ENCSR156CIL","SIRV,ERCC"
50,ENCSR838WFC,gm12878_2_2,gm12878,GM12878,gm12878,blood,,cell_line,#0798c8,,...,ENCFF592BQN,https://www.encodeproject.org/documents/77db75...,"'PacBio LongRead 2.0', 'ENCODE Long Read RNA-S...",Pacific Biosciences Sequel II,,True,2428259,False,"ENCSR089MWE,ENCSR156CIL","SIRV,ERCC"


In [59]:
df[['ENCODE_experiment_id', 'spikeins', 'spikein_name']].groupby(['spikeins', 'spikein_name'], dropna=False).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,ENCODE_experiment_id
spikeins,spikein_name,Unnamed: 2_level_1
False,,54
True,"ERCC,SIRV",7
True,"ERCC,SIRV_4",54
True,"SIRV,ERCC",81
True,"SIRV_4,ERCC",39
True,"SIRV_4,LRGASP ERCC",9


In [61]:
# df.to_csv(lib_meta, sep='\t', index=False)

In [62]:
df = pd.read_csv(m_lib_meta, sep='\t')
df2 = pd.read_csv('mouse_report.tsv', sep='\t')
df2.rename({'experiment': 'ENCODE_experiment_id'}, axis=1, inplace=True)
df2 = df2[['ENCODE_experiment_id', 'spikein_reference', 'spikein_name']]
df = df.merge(df2, how='left', on='ENCODE_experiment_id')

In [63]:
df[['ENCODE_experiment_id', 'spikeins', 'spikein_name']].groupby(['spikeins', 'spikein_name'], dropna=False).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,ENCODE_experiment_id
spikeins,spikein_name,Unnamed: 2_level_1
False,,22
True,"SIRV,ERCC",40
True,"SIRV_4,ERCC",181
True,"SIRV_4,LRGASP ERCC",9
True,,1


In [64]:
# df.to_csv(m_lib_meta, sep='\t', index=False)