In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import gseapy as gp
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand
import itertools
import glob

p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

from proc_revisions.utils import *
from proc_revisions.plotting import *

In [2]:
# Root directory that every config-derived path below is prefixed with.
od = '../../proc_revisions/'
config_file = f'{od}/config.yml'
# Parse the YAML config; later cells index into it (e.g. config['lr'], config['ref']).
with open(config_file) as f:
    config = yaml.safe_load(f)

In [3]:
# ---- Human inputs -----------------------------------------------------------
# Each path is a config template filled in with snakemake's `expand` and
# prefixed with the `od` root directory.
ab = od+expand(config['lr']['talon']['fusion_fix']['ab'], species='human')[0]
filt_ab = od+expand(config['lr']['cerberus']['filt_ab'], species='human')[0]
read_annot = od+expand(config['lr']['talon']['full_annot'], species='human')[0]
t_metadata = od+expand(config['ref']['cerberus']['new_gtf_t_info'], species='human')[0]
lib_meta = od+expand(config['lr']['meta'], species='human')[0]
swan_file = od+expand(config['lr']['swan']['sg'], species='human')[0]
cerberus_h5 = od+expand(config['lr']['cerberus']['ca_triplets'], species='human', obs_col='sample')[0]
cerb_t_metadata = od+expand(config['lr']['cerberus']['gtf_t_info'], species='human')[0]
major_isos = od+expand(config['lr']['analysis']['major_isos'], species='human', obs_col='sample')[0]
pi_tpm_table = od+expand(config['lr']['mane']['pi_tpm']['triplet'], species='human', obs_col='sample')[0]
pp_summary = od+expand(config['lr']['protein_pred']['summary'], species='human')[0]

ref_t_metadata = od+expand(config['ref']['new_gtf_t_info'], species='human')[0]
ref_g_metadata = od+expand(config['ref']['new_gtf_g_info'], species='human')[0]

sr_ab = '../'+config['sr']['ab']

# ---- Analysis settings ------------------------------------------------------
ver = 'v40_cerberus'
min_tpm = 1
gene_subset = 'protein_coding'
obs_col = 'sample'
go_gene_subset = 'protein_coding'

# ---- Mouse inputs (same config keys as above, `m_` prefix) --------------------
m_ab = od+expand(config['lr']['talon']['fusion_fix']['ab'], species='mouse')[0]
m_filt_ab = od+expand(config['lr']['cerberus']['filt_ab'], species='mouse')[0]
m_read_annot = od+expand(config['lr']['talon']['full_annot'], species='mouse')[0]
m_t_metadata = od+expand(config['ref']['cerberus']['new_gtf_t_info'], species='mouse')[0]
m_lib_meta = od+expand(config['lr']['meta'], species='mouse')[0]
m_swan_file = od+expand(config['lr']['swan']['sg'], species='mouse')[0]
m_cerberus_h5 = od+expand(config['lr']['cerberus']['ca_triplets'], species='mouse', obs_col='sample')[0]
# This assignment previously reused the name `cerb_t_metadata`, which silently
# replaced the human path set above with the mouse one. It now follows the
# `m_` convention used by every other mouse variable in this cell.
m_cerb_t_metadata = od+expand(config['lr']['cerberus']['gtf_t_info'], species='mouse')[0]
m_major_isos = od+expand(config['lr']['analysis']['major_isos'], species='mouse', obs_col='sample')[0]
m_pi_tpm_table = od+expand(config['lr']['mane']['pi_tpm']['triplet'], species='mouse', obs_col='sample')[0]
m_pp_summary = od+expand(config['lr']['protein_pred']['summary'], species='mouse')[0]

m_ref_t_metadata = od+expand(config['ref']['new_gtf_t_info'], species='mouse')[0]
m_ref_g_metadata = od+expand(config['ref']['new_gtf_g_info'], species='mouse')[0]

# ---- Cross-species reference --------------------------------------------------
orth_table = '../../proc_revisions/ref/biomart_human_to_mouse.tsv'
mouse_ver = 'vM25_cerberus'

In [4]:
files = glob.glob('../../proc_revisions/data/mouse/lr/du/*du*tsv')

# filter: keep rows with |dpi| >= `dpi` and adj_p_val <= `p`
p = 0.01
dpi = 10


def make_add_temp(f, df, rev=False):
    """
    Read one DU results table, label it, and optionally stack it on top of `df`.

    The labels (`tissue`, `timept`, `timept2`, `feat`) are read from the
    notebook-level variables set by the file loop below, so this must be
    called after they have been parsed for `f`.

    Parameters:
        f (str): Path to a tab-separated DU results file
        df (pandas DataFrame or None): Table to append below the newly read
            rows; pass None to get just the newly read rows
        rev (bool): If True, swap the time1 / time2 labels.
            NOTE(review): only the labels are swapped; dpi and the pos/neg
            isoform columns are kept exactly as read -- confirm this is intended

    Returns:
        pandas DataFrame: Newly read rows (first), followed by the rows of `df`
    """
    temp = pd.read_csv(f, sep='\t')
    temp['tissue'] = tissue
    if not rev:
        temp['time1'] = timept
        temp['time2'] = timept2
    else:
        temp['time1'] = timept2
        temp['time2'] = timept
    temp['feat'] = feat
    temp['fname'] = f
    if df is None:
        return temp
    return pd.concat([temp, df], axis=0)


# Collect one labelled table per (file, direction) and concatenate once at the
# end, rather than growing a DataFrame inside the loop.
frames = []
for f in files:
    # Labels are parsed from the file's basename; the slicing below requires
    # the tissue name to occur twice in it, with '_vs_' between the two halves.
    base = f.rsplit('/', maxsplit=1)[1]
    tissue = base.split('_', maxsplit=1)[0]
    timept = base.split('_vs_', maxsplit=1)[0].split('_', maxsplit=1)[1]
    timept2 = base.split(tissue, maxsplit=2)[2].split('_', maxsplit=2)[1]
    feat = f.rsplit('_', maxsplit=1)[1].rsplit('.tsv', maxsplit=1)[0]

    frames.append(make_add_temp(f, None))
    frames.append(make_add_temp(f, None, rev=True))

if frames:
    # Reversed so the most recently read table comes first, matching the row
    # order produced by repeatedly prepending to the accumulator.
    df = pd.concat(frames[::-1], axis=0)
    df = df.loc[(df.dpi.abs()>=dpi)&(df.adj_p_val<=p)]
else:
    # No files matched the glob: keep an empty table instead of failing.
    df = pd.DataFrame()
    

In [None]:
# For each tissue and feature, get the associated GO terms.

In [6]:
# Write the filtered table as a tab-separated file, leaving out the row index.
df.to_csv('mouse_tc_du.tsv', index=False, sep='\t')

In [11]:
# Display the `df` table (pandas truncates the middle rows in the rendered output).
df

Unnamed: 0.1,Unnamed: 0,gid,p_val,dpi,pos_iso_1,pos_iso_2,pos_iso_1_dpi,pos_iso_2_dpi,neg_iso_1,neg_iso_2,neg_iso_1_dpi,neg_iso_2_dpi,adj_p_val,gname,tissue,time1,time2,feat,fname
318,318,ENSMUSG00000006676,2.748285e-07,19.625010,ENSMUSG00000006676_2,,19.625010,,ENSMUSG00000006676_1,,-19.625008,,1.274380e-04,Usp19,gastroc,36d,25d,tes,../../proc_revisions/data/mouse/lr/du/gastroc_...
647,647,ENSMUSG00000019787,7.634857e-30,14.930023,ENSMUSG00000019787_1,ENSMUSG00000019787_8,5.232929,3.791228,ENSMUSG00000019787_4,ENSMUSG00000019787_3,-12.354973,-2.575050,3.540283e-26,Trdn,gastroc,36d,25d,tes,../../proc_revisions/data/mouse/lr/du/gastroc_...
862,862,ENSMUSG00000020849,7.342589e-24,14.840004,ENSMUSG00000020849_1,ENSMUSG00000020849_4,14.778123,0.061881,ENSMUSG00000020849_6,,-14.840004,,1.702379e-20,Ywhae,gastroc,36d,25d,tes,../../proc_revisions/data/mouse/lr/du/gastroc_...
1655,1655,ENSMUSG00000026179,2.223188e-05,11.532897,ENSMUSG00000026179_1,,11.532897,,ENSMUSG00000026179_2,ENSMUSG00000026179_4,-10.659004,-0.852273,6.064072e-03,Pnkd,gastroc,36d,25d,tes,../../proc_revisions/data/mouse/lr/du/gastroc_...
2040,2040,ENSMUSG00000028396,4.637256e-06,20.045031,ENSMUSG00000028396_1,,20.045031,,ENSMUSG00000028396_2,,-20.045025,,1.535925e-03,2310002L09Rik,gastroc,36d,25d,tes,../../proc_revisions/data/mouse/lr/du/gastroc_...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6251,6251,ENSMUSG00000092471,6.031704e-04,13.074879,ENSMUSG00000092471_2,,13.074879,,ENSMUSG00000092471_1,,-13.074879,,8.581714e-03,Cyp21a2-ps,adrenal,10d,18-20mo,tes,../../proc_revisions/data/mouse/lr/du/adrenal_...
6266,6266,ENSMUSG00000095041,1.547803e-07,22.447186,ENSMUSG00000095041_2,,22.447186,,ENSMUSG00000095041_1,ENSMUSG00000095041_3,-16.872063,-3.164124,5.907604e-06,AC149090.1,adrenal,10d,18-20mo,tes,../../proc_revisions/data/mouse/lr/du/adrenal_...
6267,6267,ENSMUSG00000095098,9.898400e-05,19.555444,ENSMUSG00000095098_1,,19.555444,,ENSMUSG00000095098_2,,-19.555443,,1.912534e-03,Ccdc85b,adrenal,10d,18-20mo,tes,../../proc_revisions/data/mouse/lr/du/adrenal_...
6292,6292,ENSMUSG00000097305,7.549652e-05,72.756412,ENSMUSG00000097305_2,ENSMUSG00000097305_3,41.987180,30.769232,ENSMUSG00000097305_1,ENSMUSG00000097305_4,-38.141028,-34.615384,1.532531e-03,Gm17276,adrenal,10d,18-20mo,tes,../../proc_revisions/data/mouse/lr/du/adrenal_...


In [15]:
def do_go_thing(df, tissue, feat):
    """
    Run an Enrichr query on the gene names in `df` for one tissue / feature pair.

    Parameters:
        df (pandas DataFrame): Table with a `gname` column of gene names
        tissue (str): Tissue label; used in the query description and output dir
        feat (str): Feature label; used in the query description and output dir

    Returns:
        The object returned by `gp.enrichr`, or None when `df` holds no
        usable gene names (no query is sent in that case)
    """
    dbs = ['GO_Biological_Process_2021',
       'GO_Cellular_Component_2021',
       'GO_Molecular_Function_2021',
       'KEGG_2019_Mouse',
       'Pfam_Domains_2019']
    s = f'{tissue}_{feat}'
    desc = '{} genes'.format(s)
    odir = '{}_du_genes_GO'.format(s)

    # Missing gene names cannot be queried; drop them before building the list.
    gnames = df.gname.dropna().tolist()
    if not gnames:
        # Nothing to enrich for this tissue / feature combination.
        return None

    go = gp.enrichr(gene_list=gnames,
                    gene_sets=dbs,
                    organism='Mouse',
                    description=desc,
                    outdir=odir,
                    cutoff=0.5)
    # Hand the result back so callers can inspect it; it used to be discarded.
    return go

In [16]:
feats = ['tss', 'ic', 'tes', 'iso']
# Loop over the distinct tissue labels in the table. Iterating over the
# leftover string `tissue` would walk its characters, and filtering on it
# would select the same tissue on every pass.
for t in df.tissue.unique():
    for feat in feats:
        temp = df.loc[(df.feat==feat)&\
                      (df.tissue==t)]
        # Skip combinations with no rows; there is nothing to query.
        if temp.empty:
            continue
        do_go_thing(temp, t, feat)

In [13]:
# Display `temp`.
# NOTE(review): this cell's execution count (13) is lower than that of the loop
# cell above it (16), so the output shown may predate that loop's latest run.
temp

Unnamed: 0.1,Unnamed: 0,gid,p_val,dpi,pos_iso_1,pos_iso_2,pos_iso_1_dpi,pos_iso_2_dpi,neg_iso_1,neg_iso_2,neg_iso_1_dpi,neg_iso_2_dpi,adj_p_val,gname,tissue,time1,time2,feat,fname
72,72,ENSMUSG00000001508,4.505519e-09,15.080744,ENSMUSG00000001508_2,ENSMUSG00000001508_4,14.856528,0.224215,ENSMUSG00000001508_1,,-15.080742,,2.947897e-07,Sgca,gastroc,2mo,14d,tss,../../proc_revisions/data/mouse/lr/du/gastroc_...
109,109,ENSMUSG00000002107,2.451001e-04,15.477374,ENSMUSG00000002107_1,,15.477373,,ENSMUSG00000002107_10,ENSMUSG00000002107_3,-14.725494,-0.751880,5.873852e-03,Celf2,gastroc,2mo,14d,tss,../../proc_revisions/data/mouse/lr/du/gastroc_...
141,141,ENSMUSG00000002768,4.141569e-05,26.400001,ENSMUSG00000002768_6,ENSMUSG00000002768_1,24.800001,1.600000,ENSMUSG00000002768_3,ENSMUSG00000002768_5,-25.599998,-0.800000,1.255261e-03,Mea1,gastroc,2mo,14d,tss,../../proc_revisions/data/mouse/lr/du/gastroc_...
157,157,ENSMUSG00000002944,6.102885e-11,22.948716,ENSMUSG00000002944_1,,22.948716,,ENSMUSG00000002944_2,ENSMUSG00000002944_3,-14.846157,-5.484850,4.932567e-09,Cd36,gastroc,2mo,14d,tss,../../proc_revisions/data/mouse/lr/du/gastroc_...
178,178,ENSMUSG00000003308,1.615092e-05,21.964027,ENSMUSG00000003308_1,,21.964027,,ENSMUSG00000003308_2,ENSMUSG00000003308_3,-19.350555,-2.613472,5.501992e-04,Keap1,gastroc,2mo,14d,tss,../../proc_revisions/data/mouse/lr/du/gastroc_...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3251,3251,ENSMUSG00000057003,0.000000e+00,30.153408,ENSMUSG00000057003_7,ENSMUSG00000057003_1,22.712410,3.980694,ENSMUSG00000057003_9,,-30.153408,,0.000000e+00,Myh4,gastroc,25d,36d,tss,../../proc_revisions/data/mouse/lr/du/gastroc_...
3339,3339,ENSMUSG00000060261,3.662058e-07,23.008131,ENSMUSG00000060261_1,ENSMUSG00000060261_7,18.455280,4.390244,ENSMUSG00000060261_18,ENSMUSG00000060261_4,-22.926830,-0.081301,5.167571e-05,Gtf2i,gastroc,25d,36d,tss,../../proc_revisions/data/mouse/lr/du/gastroc_...
3466,3466,ENSMUSG00000064363,2.083535e-07,11.632961,ENSMUSG00000064363_6,ENSMUSG00000064363_5,8.540443,1.699206,ENSMUSG00000064363_1,,-11.632961,,3.053180e-05,mt-Nd4,gastroc,25d,36d,tss,../../proc_revisions/data/mouse/lr/du/gastroc_...
3534,3534,ENSMUSG00000070934,2.929438e-08,17.873823,ENSMUSG00000070934_1,,17.873823,,ENSMUSG00000070934_3,,-17.873821,,4.852677e-06,Rraga,gastroc,25d,36d,tss,../../proc_revisions/data/mouse/lr/du/gastroc_...
