In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st
import seaborn as sns
import sys
import os
import gseapy as gp
import matplotlib.pyplot as plt
import swan_vis as swan
import yaml
from snakemake.io import expand
import itertools
import glob

# Put the directory two levels above the current working directory on
# sys.path so that the `proc_revisions` package imported below can be found.
p = os.path.dirname(os.path.dirname(os.getcwd()))
sys.path.append(p)

from proc_revisions.utils import *
from proc_revisions.plotting import *

In [2]:
# Root of the processing directory; kept with a trailing slash because the
# path variables defined later are built as `od + <relative path>`.
od = '../../proc_revisions/'

# Build the config path with os.path.join so the trailing slash on `od` does
# not produce a doubled separator ('proc_revisions//config.yml').
config_file = os.path.join(od, 'config.yml')
with open(config_file) as f:
    # safe_load only builds plain Python objects from the YAML file.
    config = yaml.safe_load(f)

In [3]:
# ---------------------------------------------------------------------------
# Human input files.
# Each config entry is a path template; `expand` fills in the wildcards and
# returns a list, of which the single resulting path is taken and prefixed
# with the processing directory `od`.
# ---------------------------------------------------------------------------
ab = od+expand(config['lr']['talon']['fusion_fix']['ab'], species='human')[0]
filt_ab = od+expand(config['lr']['cerberus']['filt_ab'], species='human')[0]
read_annot = od+expand(config['lr']['talon']['full_annot'], species='human')[0]
t_metadata = od+expand(config['ref']['cerberus']['new_gtf_t_info'], species='human')[0]
lib_meta = od+expand(config['lr']['meta'], species='human')[0]
swan_file = od+expand(config['lr']['swan']['sg'], species='human')[0]
cerberus_h5 = od+expand(config['lr']['cerberus']['ca_triplets'], species='human', obs_col='sample')[0]
cerb_t_metadata = od+expand(config['lr']['cerberus']['gtf_t_info'], species='human')[0]
major_isos = od+expand(config['lr']['analysis']['major_isos'], species='human', obs_col='sample')[0]
pi_tpm_table = od+expand(config['lr']['mane']['pi_tpm']['triplet'], species='human', obs_col='sample')[0]
pp_summary = od+expand(config['lr']['protein_pred']['summary'], species='human')[0]

ref_t_metadata = od+expand(config['ref']['new_gtf_t_info'], species='human')[0]
ref_g_metadata = od+expand(config['ref']['new_gtf_g_info'], species='human')[0]

# NOTE(review): this path is built relative to '../' rather than `od`, unlike
# every other input here -- confirm that this is intended.
sr_ab = '../'+config['sr']['ab']


# Analysis parameters.
ver = 'v40_cerberus'
min_tpm = 1
gene_subset = 'protein_coding'
obs_col = 'sample'
go_gene_subset = 'protein_coding'

# ---------------------------------------------------------------------------
# Mouse input files (same config entries as above, prefixed with `m_`).
# ---------------------------------------------------------------------------
m_ab = od+expand(config['lr']['talon']['fusion_fix']['ab'], species='mouse')[0]
m_filt_ab = od+expand(config['lr']['cerberus']['filt_ab'], species='mouse')[0]
m_read_annot = od+expand(config['lr']['talon']['full_annot'], species='mouse')[0]
m_t_metadata = od+expand(config['ref']['cerberus']['new_gtf_t_info'], species='mouse')[0]
m_lib_meta = od+expand(config['lr']['meta'], species='mouse')[0]
m_swan_file = od+expand(config['lr']['swan']['sg'], species='mouse')[0]
m_cerberus_h5 = od+expand(config['lr']['cerberus']['ca_triplets'], species='mouse', obs_col='sample')[0]
# Previously assigned to `cerb_t_metadata`, which silently replaced the human
# path defined above with the mouse one; it now gets its own `m_` name.
m_cerb_t_metadata = od+expand(config['lr']['cerberus']['gtf_t_info'], species='mouse')[0]
m_major_isos = od+expand(config['lr']['analysis']['major_isos'], species='mouse', obs_col='sample')[0]
m_pi_tpm_table = od+expand(config['lr']['mane']['pi_tpm']['triplet'], species='mouse', obs_col='sample')[0]
m_pp_summary = od+expand(config['lr']['protein_pred']['summary'], species='mouse')[0]


m_ref_t_metadata = od+expand(config['ref']['new_gtf_t_info'], species='mouse')[0]
m_ref_g_metadata = od+expand(config['ref']['new_gtf_g_info'], species='mouse')[0]


# Human-to-mouse orthology table and the mouse annotation version label.
orth_table = '../../proc_revisions/ref/biomart_human_to_mouse.tsv'
mouse_ver = 'vM25_cerberus'

In [4]:
def get_flagship_timepoint_colors():
    """
    Get the colour palette used for the timepoint (age) labels.

    Returns:
        c_dict (dict of str): Hex colour code keyed by timepoint label
        order (list of str): Timepoint labels, listed from '4d' to '18-20mo'
    """
    # (label, hex colour) pairs, listed in display order
    palette = [
        ('4d', '#fcfdff'),
        ('10d', '#e8f0fa'),
        ('14d', '#d3e3f2'),
        ('25d', '#accfe6'),
        ('36d', '#75b2dc'),
        ('2mo', '#4693c8'),
        ('18-20mo', '#164894'),
    ]
    c_dict = dict(palette)
    order = [label for label, _ in palette]
    return c_dict, order

def get_flagship_tissue_colors():
    """
    Get the colour palette used for the tissue labels.

    Returns:
        c_dict (dict of str): Hex colour code keyed by tissue name
        order (list of str): Tissue names in plotting order
    """
    # (tissue, hex colour) pairs, listed in display order
    palette = [
        ('adrenal', '#ff9458'),
        ('cortex', '#ffe578'),
        ('hippocampus', '#967add'),
        ('gastroc', '#34b0d8'),
        ('heart', '#ff9aaf'),
    ]
    c_dict = dict(palette)
    order = [tissue for tissue, _ in palette]
    return c_dict, order

In [37]:
# Load the saved mouse SwanGraph from the path configured in `m_swan_file`.
sg = swan.read(m_swan_file)

Read in graph from ../../proc_revisions/data/mouse/lr/swan/swan_graph.p


In [38]:
# Keep only the timecourse libraries, i.e. those whose genotype is 'b6/cast'.
# Only the transcript-level table (sg.adata) is subset here.
is_timecourse = sg.adata.obs.genotype=='b6/cast'
inds = sg.adata.obs[is_timecourse].index
sg.adata = sg.adata[inds, :]
sg.adata

View of AnnData object with n_obs × n_vars = 92 × 152164
    obs: 'dataset', 'total_counts', 'ENCODE_experiment_id', 'sample', 'sample_display', 'general_tissue_cell_type', 'fig1_tissue_label', 'age', 'sex', 'genotype', 'tissue_or_cell_line', 'sample_color_hex_code', 'matching_human_samples', 'reads_post_talon', 'ENCODE_alignments_id', 'ENCODE_reads_id', 'ENCODE_unfiltered_alignments_id', 'document_urls', 'document_labels', 'platform', 'RIN', 'spikeins'
    var: 'tid'
    uns: 'sample_colors', 'sample_dict'
    layers: 'counts', 'tpm', 'pi'

In [39]:
# Colour the 'age' metadata with the timepoint palette.
c_dict, order = get_flagship_timepoint_colors()
sg.set_metadata_colors('age', c_dict)

# Add a 'tissue' column (the part of 'sample' before the first '_') to the
# obs table of every AnnData held by the SwanGraph.
for _adata in (sg.adata, sg.tss_adata, sg.tes_adata, sg.ic_adata, sg.gene_adata):
    _adata.obs['tissue'] = _adata.obs['sample'].str.split('_', expand=True)[0]

# Colour the new 'tissue' metadata with the tissue palette.
c_dict, order = get_flagship_tissue_colors()
sg.set_metadata_colors('tissue', c_dict)

In [40]:
# Sanity check: list the obs metadata columns, which should now include the
# 'tissue' column.
sg.adata.obs.columns

Index(['dataset', 'total_counts', 'ENCODE_experiment_id', 'sample',
       'sample_display', 'general_tissue_cell_type', 'fig1_tissue_label',
       'age', 'sex', 'genotype', 'tissue_or_cell_line',
       'sample_color_hex_code', 'matching_human_samples', 'reads_post_talon',
       'ENCODE_alignments_id', 'ENCODE_reads_id',
       'ENCODE_unfiltered_alignments_id', 'document_urls', 'document_labels',
       'platform', 'RIN', 'spikeins', 'tissue'],
      dtype='object')

In [41]:
# Ordered list of the timepoints that actually occur in the loaded data.
c_dict, order = get_flagship_timepoint_colors()
# Collect the observed ages once, instead of rebuilding the list for every
# timepoint tested in the comprehension below.
ages_present = set(sg.adata.obs.age.tolist())
o2 = [o for o in order if o in ages_present]
o2

['4d', '10d', '14d', '25d', '36d', '2mo', '18-20mo']

In [42]:
# Register a combined tissue + age grouping. The call returns the name of the
# new grouping ('tissue_age'), which the Aldoa report passes as `groupby`.
sg.add_multi_groupby(['tissue', 'age'])

'tissue_age'

In [44]:
# Aldoa in gastroc across the timecourse.
g = 'Aldoa'
# Disabled: report variant with novel transcripts indicated (viridis colormap).
# sg.gen_report(g,
#               f'figures/{g}',
#               novelty=True,
#               cmap='viridis',
#               indicate_novel=True,
#               transcript_col='tname',
#               metadata_cols=['tissue', 'age'],
#               datasets={'tissue': ['gastroc'],
#                         'age': o2})
# Browser-style report using the 'pi' layer, grouped by the combined
# 'tissue_age' grouping and restricted to gastroc at the ages listed in `o2`.
# Output files are written with the prefix 'figures/Aldoa_gb'.
sg.gen_report(g,
              f'figures/{g}_gb',
              novelty=True,
              cmap='magma',
              transcript_col='tname',
              display_numbers=True,
              browser=True,
              layer='pi',
              groupby='tissue_age',
              metadata_cols=['tissue', 'age'],
              datasets={'tissue': ['gastroc'],
                        'age': o2})
# Note: Myl4 in gastroc is plotted in its own cell.


Plotting transcripts for ENSMUSG00000030695
Saving transcript path graph for ENSMUSG00000030695[5,35,1] as figures/Aldoa_gb_browser_ENSMUSG00000030695[5,35,1]_path.png
Saving transcript path graph for ENSMUSG00000030695[1,1,1] as figures/Aldoa_gb_browser_ENSMUSG00000030695[1,1,1]_path.png
Saving transcript path graph for ENSMUSG00000030695[2,35,1] as figures/Aldoa_gb_browser_ENSMUSG00000030695[2,35,1]_path.png
Saving transcript path graph for ENSMUSG00000030695[2,2,1] as figures/Aldoa_gb_browser_ENSMUSG00000030695[2,2,1]_path.png
Saving transcript path graph for ENSMUSG00000030695[5,36,1] as figures/Aldoa_gb_browser_ENSMUSG00000030695[5,36,1]_path.png
Saving transcript path graph for ENSMUSG00000030695[1,3,1] as figures/Aldoa_gb_browser_ENSMUSG00000030695[1,3,1]_path.png
Saving transcript path graph for ENSMUSG00000030695[6,39,1] as figures/Aldoa_gb_browser_ENSMUSG00000030695[6,39,1]_path.png
Saving transcript path graph for ENSMUSG00000030695[5,32,1] as figures/Aldoa_gb_browser_ENSMU

In [10]:
# Myl4 in gastroc across the timecourse: one report with novel transcripts
# indicated (viridis), then one browser-style report on the 'pi' layer (magma).
g = 'Myl4'
for _view_opts in (
    dict(cmap='viridis', indicate_novel=True),
    dict(cmap='magma', display_numbers=True, browser=True, layer='pi'),
):
    # The dataset filter is rebuilt for every call so that each report gets
    # its own dict, exactly as when the two calls are written out separately.
    sg.gen_report(
        g,
        f'figures/{g}',
        novelty=True,
        transcript_col='tname',
        metadata_cols=['tissue', 'age'],
        datasets={'tissue': ['gastroc'], 'age': o2},
        **_view_opts,
    )


Plotting transcripts for ENSMUSG00000061086
Saving transcript path graph for ENSMUSG00000061086[9,19,1] as figures/Myl4_novel_ENSMUSG00000061086[9,19,1]_path.png
Saving transcript path graph for ENSMUSG00000061086[3,3,1] as figures/Myl4_novel_ENSMUSG00000061086[3,3,1]_path.png
Saving transcript path graph for ENSMUSG00000061086[8,9,1] as figures/Myl4_novel_ENSMUSG00000061086[8,9,1]_path.png
Saving transcript path graph for ENSMUSG00000061086[2,2,1] as figures/Myl4_novel_ENSMUSG00000061086[2,2,1]_path.png
Saving transcript path graph for ENSMUSG00000061086[3,17,1] as figures/Myl4_novel_ENSMUSG00000061086[3,17,1]_path.png
Saving transcript path graph for ENSMUSG00000061086[3,18,1] as figures/Myl4_novel_ENSMUSG00000061086[3,18,1]_path.png
Saving transcript path graph for ENSMUSG00000061086[4,2,1] as figures/Myl4_novel_ENSMUSG00000061086[4,2,1]_path.png
Saving transcript path graph for ENSMUSG00000061086[2,11,1] as figures/Myl4_novel_ENSMUSG00000061086[2,11,1]_path.png
Saving transcript p

In [16]:
# Gria2 in cortex and hippocampus across the timecourse: one report with novel
# transcripts indicated (viridis), then one browser-style report on the 'pi'
# layer (magma).
g = 'Gria2'
for _view_opts in (
    dict(cmap='viridis', indicate_novel=True),
    dict(cmap='magma', display_numbers=True, browser=True, layer='pi'),
):
    # The dataset filter is rebuilt for every call so that each report gets
    # its own dict, exactly as when the two calls are written out separately.
    sg.gen_report(
        g,
        f'figures/{g}',
        novelty=True,
        transcript_col='tname',
        metadata_cols=['tissue', 'age'],
        datasets={'tissue': ['cortex', 'hippocampus'], 'age': o2},
        **_view_opts,
    )


Plotting transcripts for ENSMUSG00000033981
Saving transcript path graph for ENSMUSG00000033981[1,2,2] as figures/Gria2_novel_ENSMUSG00000033981[1,2,2]_path.png
Saving transcript path graph for ENSMUSG00000033981[1,2,12] as figures/Gria2_novel_ENSMUSG00000033981[1,2,12]_path.png
Saving transcript path graph for ENSMUSG00000033981[1,1,12] as figures/Gria2_novel_ENSMUSG00000033981[1,1,12]_path.png
Saving transcript path graph for ENSMUSG00000033981[1,1,2] as figures/Gria2_novel_ENSMUSG00000033981[1,1,2]_path.png
Saving transcript path graph for ENSMUSG00000033981[1,1,1] as figures/Gria2_novel_ENSMUSG00000033981[1,1,1]_path.png
Saving transcript path graph for ENSMUSG00000033981[1,1,3] as figures/Gria2_novel_ENSMUSG00000033981[1,1,3]_path.png
Saving transcript path graph for ENSMUSG00000033981[1,2,3] as figures/Gria2_novel_ENSMUSG00000033981[1,2,3]_path.png
Saving transcript path graph for ENSMUSG00000033981[1,13,12] as figures/Gria2_novel_ENSMUSG00000033981[1,13,12]_path.png
Saving tran

In [18]:
# Dnm3 in cortex and hippocampus across the timecourse: one report with novel
# transcripts indicated (viridis), then one browser-style report on the 'pi'
# layer (magma).
g = 'Dnm3'
for _view_opts in (
    dict(cmap='viridis', indicate_novel=True),
    dict(cmap='magma', display_numbers=True, browser=True, layer='pi'),
):
    # The dataset filter is rebuilt for every call so that each report gets
    # its own dict, exactly as when the two calls are written out separately.
    sg.gen_report(
        g,
        f'figures/{g}',
        novelty=True,
        transcript_col='tname',
        metadata_cols=['tissue', 'age'],
        datasets={'tissue': ['cortex', 'hippocampus'], 'age': o2},
        **_view_opts,
    )


Plotting transcripts for ENSMUSG00000040265
Saving transcript path graph for ENSMUSG00000040265[9,5,4] as figures/Dnm3_novel_ENSMUSG00000040265[9,5,4]_path.png
Saving transcript path graph for ENSMUSG00000040265[1,14,4] as figures/Dnm3_novel_ENSMUSG00000040265[1,14,4]_path.png
Saving transcript path graph for ENSMUSG00000040265[1,13,3] as figures/Dnm3_novel_ENSMUSG00000040265[1,13,3]_path.png
Saving transcript path graph for ENSMUSG00000040265[1,14,12] as figures/Dnm3_novel_ENSMUSG00000040265[1,14,12]_path.png
Saving transcript path graph for ENSMUSG00000040265[1,1,3] as figures/Dnm3_novel_ENSMUSG00000040265[1,1,3]_path.png
Saving transcript path graph for ENSMUSG00000040265[3,5,2] as figures/Dnm3_novel_ENSMUSG00000040265[3,5,2]_path.png
Saving transcript path graph for ENSMUSG00000040265[1,13,1] as figures/Dnm3_novel_ENSMUSG00000040265[1,13,1]_path.png
Saving transcript path graph for ENSMUSG00000040265[1,1,1] as figures/Dnm3_novel_ENSMUSG00000040265[1,1,1]_path.png
Saving transcript

In [17]:
# Hippocampus-only browser report for Gria2 on the 'pi' layer.
# The gene is set explicitly so that this cell does not depend on whichever
# gene an earlier cell last assigned to `g`. The saved output of this cell is
# for Gria2, but in notebook order the preceding cell sets g = 'Dnm3'.
g = 'Gria2'
sg.gen_report(g,
              f'figures/{g}',
              novelty=True,
              cmap='magma',
              transcript_col='tname',
              display_numbers=True,
              browser=True,
              layer='pi',
              metadata_cols=['tissue', 'age'],
              datasets={'tissue': ['hippocampus'],
                        'age': o2})


Plotting transcripts for ENSMUSG00000033981
Saving transcript path graph for ENSMUSG00000033981[1,2,2] as figures/Gria2_browser_ENSMUSG00000033981[1,2,2]_path.png
Saving transcript path graph for ENSMUSG00000033981[1,2,12] as figures/Gria2_browser_ENSMUSG00000033981[1,2,12]_path.png
Saving transcript path graph for ENSMUSG00000033981[1,1,12] as figures/Gria2_browser_ENSMUSG00000033981[1,1,12]_path.png
Saving transcript path graph for ENSMUSG00000033981[1,1,2] as figures/Gria2_browser_ENSMUSG00000033981[1,1,2]_path.png
Saving transcript path graph for ENSMUSG00000033981[1,1,1] as figures/Gria2_browser_ENSMUSG00000033981[1,1,1]_path.png
Saving transcript path graph for ENSMUSG00000033981[1,1,3] as figures/Gria2_browser_ENSMUSG00000033981[1,1,3]_path.png
Saving transcript path graph for ENSMUSG00000033981[1,2,3] as figures/Gria2_browser_ENSMUSG00000033981[1,2,3]_path.png
Saving transcript path graph for ENSMUSG00000033981[1,13,12] as figures/Gria2_browser_ENSMUSG00000033981[1,13,12]_path