In [2]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import gseapy as gp
import matplotlib.pyplot as plt
import swan_vis as swan
from pandarallel import pandarallel

# Add the directory two levels above the current working directory to the
# module search path; presumably this is the repository root that holds the
# `scripts` package imported right after this.
p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

from scripts.utils import *
from scripts.plotting import *

In [1]:
# --- Human inputs (all paths are relative to the working directory) ---
ab = '../../lr_bulk/talon/human_talon_abundance.tsv'
filt_ab = '../../lr_bulk/cerberus/cerberus_filtered_abundance.tsv'
read_annot = '../../lr_bulk/talon/human_talon_read_annot.tsv'
t_metadata = '../../refs/cerberus/v40_transcript_metadata.tsv'
t_orig_metadata = '../../refs/v40_transcript_metadata.tsv'
swan_file = '../../lr_bulk/cerberus/swan/swan_meta.p'
# read with cerberus.read() in the sankey / centroid cells below
cerberus_h5 = '../../lr_bulk/cerberus/triplets/cerberus_annot_triplets.h5'
major_isos = '../../lr_bulk/cerberus/swan/major_isos.tsv'
mane_file = '../../refs/v40_gene_metadata.tsv'

# version label handed to get_centroids() for the human data
ver = 'v40_cerberus'

# --- Analysis parameters ---
# NOTE(review): min_tpm and obs_col are not referenced in the cells visible
# here; gene_subset is passed to human_v_mouse_sectors() and get_centroids().
min_tpm = 1
gene_subset = 'protein_coding'
obs_col = 'sample'

# --- Mouse inputs ---
mouse_ab = '../../mouse/lr_bulk/talon/mouse_talon_abundance.tsv'
mouse_filt_ab = '../../mouse/lr_bulk/cerberus/cerberus_filtered_abundance.tsv'
mouse_read_annot = '../../mouse/lr_bulk/talon/mouse_talon_read_annot.tsv'
mouse_swan_file = '../../mouse/lr_bulk/swan/swan.p'
mouse_meta = '../../mouse/lr_bulk/lr_mouse_library_data_summary.tsv'
mouse_t_metadata = '../../mouse/refs/cerberus/vM25_transcript_metadata.tsv'
# version label handed to get_centroids() for the mouse data
mouse_ver = 'vM25_cerberus'
mouse_h5 = '../../mouse/lr_bulk/triplets/cerberus_annot_triplets.h5'

# human <-> mouse ortholog table; read by get_human_mouse_gid_table(), which
# expects the columns 'Gene stable ID' and 'Mouse gene stable ID'
orth_table = '../../refs/biomart_human_to_mouse.tsv'

## Human vs mouse sector sankey

In [None]:
# Compare human and mouse sectors using the major-isoform triplet sources:
# the mouse-matched source on the human side, the observed source on the
# mouse side. The figure is written to the `ofile` path.
h_source = 'obs_mm_major'
m_source = 'obs_major'

df = human_v_mouse_sectors(
    cerberus_h5,
    mouse_h5,
    h_source,
    m_source,
    gene_subset=[gene_subset],
    ofile='figures/obs_major_mouse_v_human_triplets_sankey.pdf',
)

## Simplex plots for human and mouse ARF4 / Arf4

In [3]:
# Human: of the sample-level ('sample_det') triplets, keep only those whose
# sample is in the mouse-match sample list, then relabel what remains as
# 'sample_mm_det' and compute centroids for that source.
# NOTE(review): `cerberus` is not imported by name in the import cell, so it
# presumably arrives via one of the `from scripts... import *` lines.
ca = cerberus.read(cerberus_h5)
samples = get_mouse_match_samples()
inds = ca.triplets.loc[
    (ca.triplets.source == 'sample_det')
    & ~ca.triplets['sample'].isin(samples)
].index
ca.triplets.drop(index=inds, inplace=True)
# the source mask is rebuilt here because the rows above have just been dropped
ca.triplets.loc[ca.triplets.source == 'sample_det', 'source'] = 'sample_mm_det'
ca = get_centroids(
    ca,
    source='sample_mm_det',
    gene_subset=gene_subset,
    ver=ver,
)

# Mouse: tag the sample-level source with a '_mouse' suffix so it stays
# distinguishable from the human source, then compute its centroids.
m_ca = cerberus.read(mouse_h5)
m_ca.triplets.loc[m_ca.triplets.source == 'sample_det', 'source'] = 'sample_det_mouse'
m_ca = get_centroids(
    m_ca,
    source='sample_det_mouse',
    gene_subset=gene_subset,
    ver=mouse_ver,
)

In [4]:
# merge in gids for orthologs
def get_human_mouse_gid_table(fname):
    """
    Read a tab-separated ortholog table and return its unique human / mouse
    gene ID pairs.

    Parameters:
        fname (str): Path to a TSV file containing at least the columns
            'Gene stable ID' and 'Mouse gene stable ID'

    Returns:
        df (pandas DataFrame): The two ID columns only, without rows where
            either ID is missing and without duplicated pairs. The index
            labels of the surviving rows are those assigned when the file
            was read (the index is not reset).
    """
    id_cols = ['Gene stable ID', 'Mouse gene stable ID']

    orth = pd.read_csv(fname, sep='\t')

    # a pair is only usable when both the human and the mouse ID are present
    pairs = orth[id_cols].dropna(subset=id_cols, how='any')

    return pairs.drop_duplicates()

In [5]:
# Unique (human gene ID, mouse gene ID) pairs read from the ortholog table.
hm_gids = get_human_mouse_gid_table(orth_table)

In [6]:
# Attach ortholog IDs to the human triplets, matching on the human gene ID.
# The join is inner, so genes with no entry in hm_gids are dropped, and a gene
# that appears in more than one ortholog pair yields one row per pair.
ca.triplets = ca.triplets.merge(
    hm_gids,
    how='inner',
    left_on='gid',
    right_on='Gene stable ID',
)

# Same for the mouse triplets, matching on the mouse gene ID.
m_ca.triplets = m_ca.triplets.merge(
    hm_gids,
    how='inner',
    left_on='gid',
    right_on='Mouse gene stable ID',
)

In [7]:
# Stack the mouse triplets underneath the human ones so both species live in
# ca.triplets. The index is not reset, so index labels may repeat.
# NOTE(review): this cell is not idempotent -- running it a second time
# appends the mouse rows again.
ca.triplets = pd.concat([ca.triplets, m_ca.triplets], axis=0)

In [8]:
# ca.triplets['gid'] = ca.triplets['Gene stable ID'] 
# ca.triplets['gname'] = 'temp'

In [9]:
# Sanity check: list the triplet sources present after the concatenation
# (both '*_centroid' sources used for the distance computation should appear).
ca.triplets.source.unique()

array(['lapa', 'gtex', 'sample_mm_det', 'sample_major',
       'sample_mm_det_centroid', 'v29', 'obs_mm_det', 'obs_mm_major',
       'v40', 'obs_det', 'obs_major', 'all', 'vM25', 'vM21',
       'sample_det_mouse', 'tissue_det', 'tissue_major',
       'tissue_adult_det', 'tissue_adult_major',
       'sample_det_mouse_centroid'], dtype=object)

In [10]:
# Compare the human and mouse centroids gene by gene, pairing orthologs on the
# human gene ID column. The displayed result carries 'dist' and 'z_score'
# columns. Note that this rebinds `df`, replacing the sankey table from above.
df = compute_dists(
    ca,
    'sample_mm_det_centroid',
    'sample_det_mouse_centroid',
    gene_merge=['Gene stable ID'],
)

In [13]:
# Preview the first rows of the human-vs-mouse centroid distance table.
df.head()

Unnamed: 0,source_sample_mm_det_centroid,gid_sample_mm_det_centroid,n_tss_sample_mm_det_centroid,n_tes_sample_mm_det_centroid,n_ic_sample_mm_det_centroid,n_iso_sample_mm_det_centroid,splicing_ratio_sample_mm_det_centroid,tss_ratio_sample_mm_det_centroid,tes_ratio_sample_mm_det_centroid,spl_ratio_sample_mm_det_centroid,...,spl_ratio_sample_det_mouse_centroid,sector_sample_det_mouse_centroid,gname_sample_det_mouse_centroid,sample_sample_det_mouse_centroid,gene_tpm_sample_det_mouse_centroid,Mouse gene stable ID_sample_det_mouse_centroid,tissue_sample_det_mouse_centroid,tissue_adult_sample_det_mouse_centroid,dist,z_score
0,sample_mm_det_centroid,ENSG00000109667,1.0,1.0,1.0,1.0,1.0,0.333333,0.333333,0.333333,...,0.321476,mixed,Slc2a9,,,ENSMUSG00000005107,,,0.009765,-1.272098
1,sample_mm_det_centroid,ENSG00000109805,1.0,2.428571,2.285714,3.571429,1.261905,0.242615,0.470387,0.286999,...,0.293995,mixed,Ncapg,,,ENSMUSG00000015880,,,0.053128,-0.611812
2,sample_mm_det_centroid,ENSG00000143340,1.0,1.0,1.0,1.0,1.0,0.333333,0.333333,0.333333,...,0.568421,splicing,Fam163a,,,ENSMUSG00000015484,,,0.167865,1.135293
3,sample_mm_det_centroid,ENSG00000149633,2.333333,2.0,2.333333,3.0,1.066667,0.415259,0.361495,0.223246,...,0.333333,simple,D630003M21Rik,,,ENSMUSG00000037813,,,0.089504,-0.057906
4,sample_mm_det_centroid,ENSG00000149636,1.0,1.0,5.666667,5.666667,5.666667,0.145326,0.145326,0.709347,...,0.350068,mixed,Dsn1,,,ENSMUSG00000027635,,,0.263908,2.597731


In [None]:
# Plot settings and subsetting for the mouse-matched samples.
c_dict, order = get_biosample_colors()
# give rows without a sample (NaN key) an entry too; 'k' is presumably a
# matplotlib colour code -- TODO confirm how c_dict is consumed
c_dict[np.nan] = 'k'

mm_tissues = get_mouse_match_samples()
print(mm_tissues)
# NaN is added so that rows with no sample value are presumably kept by the
# `.isin(mm_tissues)` filter below. `+=` extends the returned object in place.
mm_tissues += [np.nan]
# marker symbol per source
# NOTE(review): not used in the lines visible here; the name also matches the
# stdlib `mmap` module, which would be shadowed if that were ever imported.
mmap = {'v40': '*', 'v29': 'x', 'obs_mm_det': '^', 'cerberus': '2'}
# NOTE(review): 'sample_det' was relabelled earlier in this notebook (to
# 'sample_mm_det' for human and 'sample_det_mouse' for mouse) and does not
# appear in the recorded `ca.triplets.source.unique()` output, so this list
# probably selects no sample-level rows -- confirm the intended source name.
subset = {'source': ['v40', 'obs_mm_det', 'sample_det']}

# Two sequential filters (source AND sample) are applied by hand because the
# desired subsetting mixes "and" and "or" logic.
# NOTE(review): both filters overwrite ca.triplets, so the unfiltered table is
# gone after this cell runs.
ca.triplets = ca.triplets.loc[ca.triplets.source.isin(subset['source'])]
ca.triplets = ca.triplets.loc[ca.triplets['sample'].isin(mm_tissues)]