# scQuint Differential Splicing

In [1]:
!date/

/bin/bash: date/: No such file or directory


In [2]:
%env CONDA_PREFIX

'/c4/home/derek/miniconda3/envs/scquint_4'

In [3]:
#analysis approach from:
#https://github.com/songlab-cal/scquint/blob/main/differential_splicing_example.ipynb

In [4]:
import anndata

import pandas as pd
import scanpy as sc
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

pd.set_option('display.max_columns', None)
sns.set_style("white")

In [5]:
from scquint.differential_splicing import run_differential_splicing
from scquint.data import calculate_PSI

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
from scquint.data import calculate_PSI
from scquint.differential_splicing import run_differential_splicing, run_differential_splicing_for_each_group, find_marker_introns, mask_PSI
from scquint.dimensionality_reduction.pca import run_pca

In [7]:
input_dir = '/nowakowskilab/data1/derek/data_scSLR/prenatal_brain/'

In [8]:
output_dir = '/c4/home/derek/data1/derek/scSLR/notebooks/fig3/plots/'
os.makedirs(output_dir, exist_ok=True)

In [9]:
adata_gene = sc.read_h5ad(input_dir+'scANVI_label.h5ad')

In [10]:
adata_gene

AnnData object with n_obs × n_vars = 74327 × 36385
    obs: 'n_counts', 'batch', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_mito', '_scvi_batch', '_scvi_labels', 'leiden', 'scANVI_simple', 'tech', 'C_scANVI', 'C_scANVI_simple'
    var: 'mito', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
    uns: 'C_scANVI_simple_colors', '_scvi_manager_uuid', '_scvi_uuid', 'batch_colors', 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'umap'
    obsm: 'X_scVI', 'X_umap'
    obsp: 'connectivities', 'distances'

In [12]:
# adata SCVI batch corrected expression matrix as layer

exp_layer = sc.read_h5ad(input_dir+'exp_layers_annot.h5ad')

# Select the cells by name so the rows come out in adata_gene's order. An isin()
# filter keeps exp_layer's own row order, which could attach values to the wrong
# cells without any error (only the shape is checked on assignment).
exp_layer = exp_layer[adata_gene.obs_names]

# NOTE(review): columns are assumed to already match adata_gene.var_names — confirm.
adata_gene.layers['nb_sample'] = exp_layer.layers['nb_sample']

In [None]:
# UMAP of the gene-level object, one panel per RBFOX gene, coloured from the
# 'nb_sample' layer.
plt.rcParams['figure.figsize'] = (4, 4)

# other colour keys tried here: 'scANVI_simple', 'leiden', 'CELF2', 'RBM20'
# (palette='Spectral_r' was also tried)
sc.pl.umap(
    adata_gene,
    color=['RBFOX1', 'RBFOX2', 'RBFOX3'],
    layer='nb_sample',
    cmap='viridis',
    vmax='p99.9',
    size=30,
    alpha=1,
    ncols=3,
)

In [None]:
# NOTE(review): `break` outside a loop is a SyntaxError; presumably a manual stop
# so that "Run All" halts here — confirm before removing.
break

In [None]:
model_dir = '/c4/home/derek/data1/derek/data_scSLR/reference/rbpnet_models/models/'

# one name per entry in model_dir: the text before the first underscore
RBPs = [entry.split('_')[0] for entry in os.listdir(model_dir)]

In [None]:
# number of entries listed from model_dir (the list built in the cell above is
# `RBPs`; `RBPS` is only assigned further down)
len(RBPs)

In [None]:
RBPS = ['EFTUD2', 'LIN28B', 'AGGF1', 'HNRNPL', 'SND1', 'GTF2F1', 'EIF4G2',
       'TIA1', 'TARDBP', 'FXR2', 'HNRNPM', 'IGF2BP1', 'PUM2', 'FAM120A',
       'DDX3X', 'MATR3', 'FUS', 'GRWD1', 'PABPC4', 'U2AF2', 'AKAP8L', 'METAP2',
       'SMNDC1', 'GEMIN5', 'HNRNPK', 'SLTM', 'SRSF1', 'FMR1', 'SAFB2',
       'DROSHA', 'RPS3', 'IGF2BP2', 'ILF3', 'RBFOX2', 'QKI', 'PCBP1', 'ZNF800',
       'PUM1', 'BUD13', 'PTBP1', 'DDX24', 'EWSR1', 'RBM15', 'SF3B4', 'YBX3',
       'UCHL5', 'KHSRP', 'ZNF622', 'NONO', 'EXOSC5', 'PRPF8', 'CSTF2T', 'AQR',
       'UPF1', 'MTPAP', 'RBM22', 'DHX30', 'DDX6', 'DDX55', 'TRA2A', 'XRN2',
       'U2AF1', 'LSM11', 
        #'ZC3H11A', 
        'NOLC1', 'KHDRBS1', 'GPKOW', 'DGCR8',
       'AKAP1', 'FXR1', 'DDX52', 'AATF']

In [None]:
RBPS = [
    'RBM22', 'GRSF1', 'CDC40', 'NOLC1', 'FKBP4', 'DGCR8', 'XRN2', 'SLTM', 'DDX55', 'TIA1', 'SRSF1', 'U2AF1', 'RBM15',
'LSM11', 'NKRF', 'SUB1', 'NCBP2', 'UCHL5', 'LIN28B', 'IGF2BP3', 'SF3A3', 'AGGF1', 'DROSHA', 'DDX59', 'CSTF2', 'DKC1', 'EIF3H', 'FUBP3', 'SFPQ', 'HNRNPC', 'ILF3', 'TIAL1', 'HLTF', 'ZNF800', 'PABPN1', 'YBX3', 'FXR2',
'GTF2F1', 'IGF2BP1', 'HNRNPK', 'XPO5', 'RPS3', 'SF3B4', 'LARP4', 'BUD13', 'SND1', 'G3BP1', 'AKAP1', 'KHSRP',
'DDX3X', 'PCBP2', 'FAM120A', 'HNRNPL', 'RBFOX2', 'PTBP1', 'MATR3', 'EFTUD2', 'PRPF4', 'UPF1',
'GRWD1', 'PRPF8', 'PPIG', 'CSTF2T', 'QKI', 'U2AF2', 'SUGP2', 'HNRNPM', 'AQR', 'BCLAF1'
]

In [None]:
RBP_list = set(['MBNL1', 'P53_NONO', 'PUM2', 'QKI', 'AGO3', 'FUS', 'TAF15', 'ZFP36', 'DICER1', 'EIF3A', 'EIF3D', 'EIF3G', 'SSB', 'PAPD5', 'CPSF4','CPSF3', 'RTCB', 'FXR1', 'NOP58', 'NOP56', 'FBL', 'LIN28A', 'LIN28B', 'UPF1', 'G35', 'G45', 'XPO5','TARDBP', 'ELAVL2', 'ELAVL3', 'ELAVL4', 'RBM20', 'IGF2BP1', 'IGF2BP2', 'IGF2BP3', 'EWSR1', 'HNRNPD', 'RBPMS', 'SRRM4', 'AGO2', 'NUDT21', 'FIP1L1', 'CAPRIN1', 'FMR1iso7', 'FXR2', 'AGO1', 'L1RE1', 'ORF1','DND1', 'CPSF7', 'CPSF6', 'CPSF1', 'CSTF2', 'CSTF2T', 'ZC3H7B', 'FMR1iso1', 'RBM10', 'MOV10', 'ELAVL1','RBM22', 'GRSF1', 'CDC40', 'NOLC1', 'FKBP4', 'DGCR8', 'ZC3H11A', 'XRN2', 'SLTM', 'DDX55', 'TIA1', 'SRSF1', 'U2AF1', 'RBM15','LSM11', 'NKRF', 'SUB1', 'NCBP2', 'UCHL5', 'LIN28B', 'IGF2BP3', 'SF3A3', 'AGGF1', 'DROSHA', 'DDX59', 'CSTF2', 'DKC1', 'EIF3H', 'FUBP3','SFPQ', 'HNRNPC', 'ILF3', 'TIAL1', 'HLTF', 'ZNF800', 'PABPN1', 'YBX3', 'FXR2','GTF2F1', 'IGF2BP1', 'HNRNPK', 'XPO5', 'RPS3', 'SF3B4', 'LARP4', 'BUD13', 'SND1', 'G3BP1', 'AKAP1', 'KHSRP','DDX3X', 'PCBP2', 'FAM120A', 'HNRNPL', 'RBFOX2', 'PTBP1', 'MATR3', 'EFTUD2', 'PRPF4', 'UPF1','GRWD1', 'PRPF8', 'PPIG', 'CSTF2T', 'QKI', 'U2AF2', 'SUGP2', 'HNRNPM', 'AQR', 'BCLAF1','MTPAP', 'RBM22', 'DHX30', 'DDX6', 'DDX55', 'TRA2A', 'XRN2', 'U2AF1', 'LSM11', 'ZC3H11A', 'NOLC1', 'KHDRBS1', 'GPKOW', 'DGCR8', 'AKAP1', 'FXR1', 'DDX52', 'AATF','U2AF2', 'AKAP8L', 'METAP2', 'SMNDC1', 'GEMIN5', 'HNRNPK', 'SLTM', 'SRSF1', 'FMR1', 'SAFB2', 'DROSHA', 'RPS3', 'IGF2BP2', 'ILF3', 'RBFOX2', 'QKI', 'PCBP1', 'ZNF800', 'PUM1','EFTUD2', 'LIN28B', 'AGGF1', 'HNRNPL', 'SND1', 'GTF2F1', 'EIF4G2', 'TIA1', 'TARDBP', 'FXR2', 'HNRNPM', 'IGF2BP1', 'PUM2', 'FAM120A', 'DDX3X', 'MATR3', 'FUS', 'GRWD1', 'PABPC4','BUD13', 'PTBP1', 'DDX24', 'EWSR1', 'RBM15','SF3B4', 'YBX3', 'UCHL5', 'KHSRP', 'ZNF622', 'NONO', 'EXOSC5', 'PRPF8', 'CSTF2T', 'AQR', 'UPF1'])

In [None]:
# Keep only the RBPs that exist as genes in adata_gene. RBP_list is a set, so
# iterate it in sorted order to get the same column order on every run.
RBP_list_ = [gene for gene in sorted(RBP_list) if gene in adata_gene.var_names]

In [None]:
# per-cell values for every retained RBP, one column per gene
genedf = sc.get.obs_df(adata_gene, keys=list(RBP_list_))


In [None]:
genedf.sum(axis=0)

In [None]:
# adata_spl_bac = adata_spl.copy()

In [None]:
# adata_spl = sc.read_h5ad(
#     '/nowakowskilab/data1/derek/scSLR/notebooks/fig3/singlets_spl_.h5ad')

adata_spl = sc.read_h5ad('/nowakowskilab/data1/derek/data_scSLR/prenatal_brain/adata_spl_equal.h5ad')

In [None]:
# preds = pd.read_csv('../fig3/solo_preds.csv',index_col=0)

# ##fix obs names
# preds.index = preds.index.str.replace(
#     'GW16_1','0').str.replace(
#     'GW16_2','1').str.replace(
#     'GW17_1','2').str.replace(
#     'GW21_1','3').str.replace(
#     'GW21_2','4').str.replace(
#     'GW23_1','5')


# preds = preds[preds.index.isin(adata_spl.obs_names)]

# adata_spl.obs['doublet_soft'] = preds['doublet']
# adata_spl.obs['singlet_soft'] = preds['singlet']
# adata_spl.obs['solo_prediction'] = preds['prediction']

In [None]:
cat_list = adata_spl.obs.C_scANVI_simple.astype('category').cat.categories.tolist()

In [None]:
dict(zip(cat_list, cat_list))

In [None]:
simplify_dict = {'Astrocyte': 'Astrocyte',
 'EN-PFC': 'EN-PFC',
 'EN-V1': 'EN-V1',
 'Endothelial': 'Endothelial',
 'Glyc': 'Glyc',
 'IN-CTX-CGE': 'IN-CTX-CGE',
 'IN-CTX-MGE': 'IN-CTX-MGE',
 'IN-STR': 'IN-STR',
 'IPC-div': 'IPC-nEN',
 'IPC-nEN': 'IPC-nEN',
 'MGE-IPC': 'MGE-IPC',
 'MGE-RG': 'MGE-RG',
 'MGE-div': 'MGE-RG',
 'Microglia': 'Microglia',
 'Mural': 'Mural',
 'OPC': 'OPC',
 'RG': 'RG',
 'RG-div': 'RG',
 'RBC': 'RBC',
 'nEN': 'nEN',
 'nIN': 'nIN'}

# simplify_dict = {'Astrocyte': 'RG',
#  'EN-PFC': 'EN',
#  'EN-V1': 'EN',
#  'Endothelial': 'Endothelial',
#  'Glyc': 'Glyc',
#  'IN-CTX-CGE': 'IN',
#  'IN-CTX-MGE': 'IN',
#  'IN-STR': 'IN',
#  'IPC-div': 'IPC-nEN',
#  'IPC-nEN': 'IPC-nEN',
#  'MGE-IPC': 'MGE-IPC',
#  'MGE-RG': 'RG',
#  'MGE-div': 'RG',
#  'Microglia': 'Microglia',
#  'Mural': 'Mural',
#  'OPC': 'RG',
#  'RG': 'RG',
#  'RG-div': 'RG',
#  'RBC': 'RBC',
#  'nEN': 'EN',
#  'nIN': 'nIN'}

In [None]:
adata_spl.obs['C_scANVI_simple'] = adata_spl.obs.C_scANVI_simple.map(simplify_dict)
adata_gene.obs['C_scANVI_simple'] = adata_gene.obs.C_scANVI_simple.map(simplify_dict)

In [None]:
adata_spl.obs['C_scANVI_simple']

In [None]:
# NOTE(review): `break` outside a loop is a SyntaxError; presumably a manual stop
# so that "Run All" halts here — confirm before removing.
break

In [None]:
adata_gene.obs['C_scANVI_simple'] = adata_gene.obs.C_scANVI_simple.map(simplify_dict)

In [None]:
adata_gene.obs['C_scANVI_simple'] = adata_spl.obs['C_scANVI_simple']

In [None]:
adata_gene.obs['C_scANVI_simple'].value_counts()

In [None]:
## fix celltype palette

# Column '0' is used below as the original cell-type label and column '1' as its
# colour (inferred from usage — TODO confirm against celltype_colors.csv).
cmap = pd.read_csv('celltype_colors.csv')

# collapse the labels with the same mapping that is applied to obs['C_scANVI_simple']
cmap['celltype_simple'] = cmap['0'].map(simplify_dict)

# several original labels share one simplified label; keep the first colour of each
cmap = cmap.drop_duplicates(subset='celltype_simple', keep='first')

# swap the rows labelled 16 and 18 (both right-hand sides are read before assignment)
# NOTE(review): presumably this matches the row order to the category order — confirm.
cmap.loc[16], cmap.loc[18] = cmap.loc[18].copy(), cmap.loc[16].copy()

# Store plain lists rather than a pandas Series: after drop_duplicates the Series
# index has gaps, so positional access by plotting code would be label-based, and
# a Series is not a standard .uns value when writing to h5ad.
adata_gene.uns['C_scANVI_simple_colors'] = cmap['1'].tolist()
adata_spl.uns['C_scANVI_simple_colors'] = cmap['1'].tolist()

In [None]:
# UMAP of the splicing object coloured by simplified cell type
plt.rcParams['figure.figsize'] = (4, 4)

# palette='Spectral_r' was also tried
sc.pl.umap(adata_spl, color=['C_scANVI_simple'], size=20, ncols=1)

In [None]:
# UMAP of the gene-expression object coloured by simplified cell type.
# The obs column is still called 'C_scANVI_simple' (the rename to
# 'SCANVI_cell_types_simplified' is commented out further down), so colour by
# the name that actually exists.
plt.rcParams['figure.figsize'] = 4,4
sc.pl.umap(adata_gene,
           color=['C_scANVI_simple'],
           #palette='Spectral_r',
           size=20,
           ncols=1,
          )

In [None]:
adata_gene.write_h5ad('UCSC_CB_scSLR_gene_exp.h5ad')

In [None]:
adata_spl.var

In [None]:
#del adata_spl.uns['C_scANVI_simple_colors']

In [None]:
adata_gene.obs.leiden.cat.categories

In [None]:
leiden_dict = {'0': 'nEN',
 '1': 'EN-1', #
 '2': 'EN-1',
 '3': 'EN-2',
 '4': 'IN-CTX-CGE',
 '5': 'IN-CTX-MGE',
 '6': 'IPC-nEN',
 '7': 'EN-2',
 '8': 'EN-3',
 '9': 'RG',
 '10': 'MGE-RG',
 '11': 'EN-3',
 '12': 'EN-2',
 '13': 'IPC-nEN',
 '14': 'EN-2',
 '15': 'IN-CTX-MGE',
 '16': 'EN-2',
 '17': 'IN-CTX-CGE',
 '18': 'nIN',
 '19': 'Astrocyte',
 '20': 'IN-STR',
 '21': 'OPC',
 '22': 'IN-STR',
 '24': 'IN-CTX-CGE',
 '25': 'EN-2',
 '26': 'Microglia',
 '28': 'Endothelial',
 '29': 'RBC',}

In [None]:
# Translate Leiden cluster ids into cell-type labels; ids that are not keys of
# leiden_dict map to NaN.
adata_spl.obs['leiden_celltypes'] = adata_spl.obs.leiden.map(leiden_dict)
# Label adata_gene from its own leiden column (the categories inspected just
# before leiden_dict was written) instead of borrowing adata_spl's column.
adata_gene.obs['leiden_celltypes'] = adata_gene.obs.leiden.map(leiden_dict)

In [None]:
plt.rcParams['figure.figsize'] = 4,4
sc.pl.umap(adata_spl,
          color=['leiden_celltypes'],
           size=20,
           ncols=1,
           palette='Greys'
          )

In [None]:
# singlets = pd.read_csv('singlets.csv',index_col=0)['0']

# singlets = singlets.str.replace(
#     'GW16_1','0').str.replace(
#     'GW16_2','1').str.replace(
#     'GW17_1','2').str.replace(
#     'GW21_1','3').str.replace(
#     'GW21_2','4').str.replace(
#     'GW23_1','5')



In [None]:
# singlets

In [None]:
# adata_spl = adata_spl[adata_spl.obs_names.isin(singlets)]

In [None]:
adata_spl

In [None]:
features = ["C_scANVI_simple", "age", "batch"]

In [None]:
# # groupby = "C_scANVI_simple"
# # adata_spl.obs[groupby].value_counts()

# groupby = "leiden_celltypes"
# adata_spl.obs[groupby].value_counts()

In [None]:
groupby = "C_scANVI_simple"
adata_spl.obs[groupby].value_counts()

In [None]:
#groups_test = adata_spl.obs[groupby].value_counts().index.values.astype(str).tolist()

groups_test = adata_spl.obs[groupby].value_counts()[:13].index.values.astype(str).tolist()


In [None]:
# NOTE(review): this overrides the list built in the previous cell. 'EN' is not a
# value produced by the active simplify_dict (only by its commented-out variant),
# so this looks like a leftover from the EN-vs-RG run — confirm before running.
groups_test = ['EN','RG']

In [None]:
groupby

In [None]:
%%time
#~
# Run scquint's per-group differential splicing over the labels in `groups_test`,
# grouping cells by obs[groupby]. The call returns two objects, unpacked as the
# intron-group-level and the intron-level results.
# Both names are re-assigned from CSV files in later cells.
diff_spl_intron_groups, diff_spl_introns = run_differential_splicing_for_each_group(
    adata_spl, 
    groupby,
    groups=groups_test, 
    subset_to_groups=True,
    min_cells_per_intron_group=10, 
    min_total_cells_per_intron=10,
    n_jobs=15,
)

In [None]:
# NOTE(review): `break` outside a loop is a SyntaxError; presumably a manual stop
# so that "Run All" halts here — confirm before removing.
break

In [None]:
diff_spl_introns

In [None]:
#break

In [None]:
#diff_spl_introns.to_csv(input_dir+'diff_spliced_introns_shared_CB.csv')
#diff_spl_intron_groups.to_csv(input_dir+'diff_spliced_intron_groups_shared_CB.csv')

# diff_spl_introns = pd.read_csv(input_dir+ 'diff_spliced_introns_shared_CB.csv',index_col=0)
# diff_spl_intron_groups = pd.read_csv(input_dir+'diff_spliced_intron_groups_shared_CB.csv',index_col=0)

In [None]:
# diff_spl_introns.to_csv(input_dir+'diff_spliced_introns_EN_RG.csv')
# diff_spl_intron_groups.to_csv(input_dir+'diff_spliced_intron_groups_EN_RG.csv')

diff_spl_introns = pd.read_csv(input_dir+ 'diff_spliced_introns_EN_RG.csv',index_col=0)
diff_spl_intron_groups = pd.read_csv(input_dir+'diff_spliced_intron_groups_EN_RG.csv',index_col=0)

In [None]:
diff_spl_introns = pd.read_csv(input_dir+ 'diff_spliced_introns_all_equal.csv',index_col=0)
diff_spl_intron_groups = pd.read_csv(input_dir+'diff_spliced_intron_groups_all_equal.csv',index_col=0)

In [None]:
# diff_spl_introns_ = pd.read_csv(input_dir+ 'pseudotime_diff_spliced_introns_all_equal.csv',index_col=0)
# diff_spl_intron_groups_ = pd.read_csv(input_dir+'pseudotime_diff_spliced_intron_groups_all_equal.csv',index_col=0)

In [None]:
diff_spl_introns

In [None]:
# intron groups whose adjusted p-value is below 0.05
sig_diff_spl_intron_groups = diff_spl_intron_groups[
    diff_spl_intron_groups['p_value_adj'] < 0.05
]

# intron-level rows that belong to one of those intron groups
sig_diff_spl_introns = diff_spl_introns.loc[
    diff_spl_introns['intron_group'].isin(sig_diff_spl_intron_groups['name'])
]

print(f'unique introns: {sig_diff_spl_introns["name"].nunique()}')
print(f'intron groups: {sig_diff_spl_intron_groups.shape}')


In [None]:
# diff_spl_introns.to_csv(input_dir+'dif_RG_EN.csv')
# diff_spl_intron_groups.to_csv(input_dir+'dif_intron_groups_RG_EN.csv')

In [None]:
# sig_diff_spl_introns.to_csv(input_dir+'sig_dif_RG_EN.csv')

In [None]:
# sig_diff_spl_intron_groups.to_csv(input_dir+'sig_dif_intron_groups_RG_EN.csv')

In [None]:
input_dir

In [None]:

# sig_dif = sig_diff_spl_introns[sig_diff_spl_introns.abs_delta_psi >= 0.1]
#sig_dif = sig_diff_spl_introns[sig_diff_spl_introns.abs_delta_psi >= 0.15]

In [None]:
# sig_dif.to_csv('sig_dif_RG_EN.csv')

In [None]:
# unsig_dif = sig_diff_spl_introns[sig_diff_spl_introns.abs_delta_psi < 0.1]

In [None]:
# unsig_dif.to_csv('unsig_dif_RG_EN.csv')

In [None]:
# Restrict the splicing object to introns from significant intron groups.
# Copy explicitly: the slice is a view, and assigning to a view would otherwise
# force an implicit copy (with a warning).
adata_spl_ = adata_spl[:,sig_diff_spl_introns.name.unique()].copy()
adata_spl_.uns = adata_spl.uns

In [None]:
adata_spl_.obs

In [None]:
adata_gene

In [None]:
%%time
#calculate PSI per batch separately for memory cap 
adatas_spl = {}

adata_spl_.obs.batch = adata_spl_.obs.batch.astype('category')

for batch in adata_spl_.obs.batch.cat.categories:
    print(f'processing {batch}')
    adata_batch = adata_spl_[adata_spl_.obs['batch'] == batch]
    
    adata_batch.layers["PSI_raw"] = calculate_PSI(adata_batch, smooth = False)
    
    adatas_spl[batch] = adata_batch

adata_spl_ = anndata.concat(adatas_spl, index_unique="-", merge="same")
adata_spl_

In [None]:
dict(zip(adata_spl.obs.batch.cat.categories, ['GW16','GW16','GW17','GW21','GW21','GW23']))

In [None]:
# Batches, taken in category order, are labelled with these gestational weeks;
# the same labels are applied to the gene and the splicing object.
gw_labels = ['GW16','GW16','GW17','GW21','GW21','GW23']

for _adata in (adata_gene, adata_spl_):
    # cast first so this also works when `batch` is not categorical
    _batch = _adata.obs['batch'].astype('category')
    # zip() would silently truncate on a length mismatch and leave NaN labels
    assert len(_batch.cat.categories) == len(gw_labels), 'unexpected number of batches'
    _adata.obs['gestational_week'] = _batch.map(dict(zip(_batch.cat.categories, gw_labels)))

In [None]:
adata_gene.write_h5ad(input_dir+'/UCSC_CB_scSLR_gene_exp.h5ad')
adata_spl_.write_h5ad(input_dir+'/UCSC_CB_scSLR_spl.h5ad')

In [None]:
adata_spl_.var

In [None]:
# adata_spl_.obs = adata_spl_.obs.rename({'C_scANVI_simple':'SCANVI_cell_types_simplified',
#                       'C_scANVI':'SCANVI_cell_types'}, axis=1).drop('scANVI_simple', axis=1)


# Store the palette under the new key as a plain list. Read it from adata_spl:
# anndata.concat (default uns_merge=None) leaves adata_spl_.uns empty, so the old
# key may not exist on adata_spl_ at this point.
adata_spl_.uns['SCANVI_cell_types_simplified_colors'] = list(adata_spl.uns['C_scANVI_simple_colors'])

# drop the old key if it is present; no error when it is already gone
adata_spl_.uns.pop('C_scANVI_simple_colors', None)

In [None]:
# display the concatenated splicing object
adata_spl_

In [None]:
# add Microglia back to groups_list (guarded so re-running the cell does not
# append it a second time)
if 'Microglia' not in groups_test:
    groups_test = groups_test + ['Microglia']

In [None]:
# order = ['MGE-RG','MGE-IPC','OPC','Astrocyte','IPC-nEN','RG','nIN','IN-STR','EN-PFC','EN-V1','IN-CTX-CGE','IN-CTX-MGE','nEN']

order = ['RG','MGE-RG','OPC','Astrocyte','MGE-IPC','IPC-nEN','nEN','nIN','IN-STR','EN-PFC','EN-V1','IN-CTX-CGE','IN-CTX-MGE', 'Microglia']

# order = ['RG', 'MGE-RG', 'OPC', 'Astrocyte', 'MGE-IPC','IPC-nEN','nIN', 'nEN', 'IN-STR', 'EN-PFC', 'EN-V1', 'IN-CTX-CGE', 'IN-CTX-MGE'] 

In [None]:
'INTU_chr4:127623492-127643520'

In [None]:
target = 'INTU_chr4:127623492-127643520'

# Exclude 'Glyc' by comparing labels for equality. Note that `x not in ('Glyc')`
# is a substring test, because ('Glyc') is the string 'Glyc' and not a tuple.
barplot_groups = [g for g in groups_test if g != 'Glyc']

df_barplot = sc.get.obs_df(
        adata_spl_[adata_spl_.obs['C_scANVI_simple'].isin(barplot_groups)],
        layer = 'PSI_raw',
        keys=["C_scANVI_simple", target]
    )

# one row (the target intron) by one column per cell type: mean of the raw PSI layer
df_barplot = pd.DataFrame(
    df_barplot.groupby('C_scANVI_simple')[target].mean()).T

In [None]:
df_barplot.shape

In [None]:
# Horizontal bar chart of mean PSI per cell type for the INTU intron.
sns.set(font_scale=1)
sns.set_style('white')

fig, ax = plt.subplots(figsize=(2, 10))

ax.legend([], frameon=False)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.set_ylabel('')

sns.barplot(
    data=df_barplot,
    order=order,
    orient='h',
    width=0.9,
    color='powderblue',
    ax=ax,
)

# no tick labels or axis label on y; x runs over the full PSI range
ax.set_yticklabels([])
ax.set_ylabel(None)
ax.set_xlim(0, 1)
ax.set_xlabel('Mean ψ')

ax.grid(visible=True, which='both', axis='x')

fig.savefig('./plots/INTU_PSI_bar.pdf')

In [None]:
# Horizontal violin plot drawn from `KIF3A_exp`, using the same group order and
# axis styling as the bar charts in the neighbouring cells.
# NOTE(review): `KIF3A_exp` is not assigned anywhere in the visible notebook, so
# this cell raises NameError on a fresh kernel — confirm where it should come from.
sns.set(font_scale=1)
sns.set_style('white')

fig, ax = plt.subplots(1, 1, figsize=(2, 10))

ax.legend([],frameon=False)


ax.spines.right.set_visible(False)
ax.spines.top.set_visible(False)

ax.set_ylabel('')


sns.violinplot(data=KIF3A_exp,
            orient='h',
            order=order,
            color='powderblue',
            width=0.9
        )

# hide the y tick labels and the y axis label
ax.set(yticklabels=[])
ax.set(ylabel=None)

# ax.set_xlim(0,1)
# ax.set(xlabel='Mean ψ')

ax.grid(visible=True, which ='both', axis='x')

In [None]:
target = 'KIF3A_chr5:132708979-132710958'

# raw PSI of the target intron for every tested group except Glyc and Microglia
df = sc.get.obs_df(
    adata_spl_[
        adata_spl_.obs['C_scANVI_simple'].isin(
            [group for group in groups_test if group not in ('Glyc', 'Microglia')]
        )
    ],
    keys=["C_scANVI_simple", target],
    layer='PSI_raw',
)

# a list selector keeps the result a one-column frame; transpose to
# one row (the intron) by one column per cell type
df = pd.DataFrame(df.groupby('C_scANVI_simple')[[target]].mean()).T

In [None]:
order = ['RG',
 'MGE-RG',
 'OPC',
 'Astrocyte',
 'MGE-IPC',
 'IPC-nEN',
 'nEN',
 'nIN',
 'IN-STR',
 'EN-PFC',
 'EN-V1',
 'IN-CTX-CGE',
 'IN-CTX-MGE']

In [None]:
# Horizontal bar chart of mean PSI per cell type for the intron in `target`
# (the table `df` built in the cell above).
sns.set(font_scale=1)
sns.set_style('white')

fig, ax = plt.subplots(1, 1, figsize=(2, 10))

ax.legend([],frameon=False)


ax.spines.right.set_visible(False)
ax.spines.top.set_visible(False)

ax.set_ylabel('')


sns.barplot(data=df,
            orient='h',
            order=order,
            color='powderblue',
            width=0.9,
            ax=ax,
        )

ax.set(yticklabels=[])
ax.set(ylabel=None)

ax.set_xlim(0,1)
ax.set(xlabel='Mean ψ')

ax.grid(visible=True, which ='both', axis='x')

# Name the file after the plotted gene (the text before the first underscore of
# `target`) so this figure does not overwrite the INTU bar chart saved earlier.
fig.savefig(f"./plots/{target.split('_')[0]}_PSI_bar.pdf")

In [None]:
groups_test

In [None]:
%%time
marker_introns = find_marker_introns(
                                    diff_spl_intron_groups,
                                    diff_spl_introns,
                                    n=50,
                                    max_p_value_adj=0.05,
                                    min_delta_psi=0.2,
)
marker_introns = {c: marker_introns[c] for c in groups_test} # just reordering
# marker_introns

In [None]:
[i for i in adata_spl_.var_names if i.startswith('SCRIB')]

In [None]:
# intron id kept for reference; quoted so the cell displays it instead of
# raising NameError on an undefined name
'chr9_129915966_129915980_ENSG00000187239'

In [None]:
# raw PSI on the UMAP for every intron whose name starts with 'SCRIB'
# (alternative colour key: marker_introns['EN-V1'])
sc.pl.umap(
    adata_spl_,
    color=[name for name in adata_spl_.var_names if name.startswith('SCRIB')],
    layer="PSI_raw",
    cmap='coolwarm',
    vmin=0,
    vmax=1,
    size=100,
    sort_order=True,
    ncols=5,
)

In [None]:
marker_introns['EN-V1']

In [None]:
import matplotlib.pyplot as plt

# other events plotted with this cell:
#   'GOPC_chr6:117575353-117578899'
#   'MTUS1_chr8:17715902-17723671'
#   'DNM3_chr1:172093726-172131174'
event = 'SCRIB_chr8:143791441-143791665'

# figure background with alpha 0
plt.rcParams.update({"figure.facecolor": (0.0, 0.0, 0.0, 0)})

fig, ax = plt.subplots(figsize=(5, 5))

# first pass: every cell, small markers
sc.pl.umap(
    adata_spl_,
    color=event,
    layer="PSI_raw",
    cmap='coolwarm',
    vmin=0,
    vmax=1,
    size=50,
    sort_order=False,
    colorbar_loc=None,
    ax=ax,
    show=False,
)

# second pass on the same axes: only cells with a non-missing PSI for the event,
# drawn with larger markers
sc.pl.umap(
    adata_spl_[~adata_spl_.to_df(layer='PSI_raw')[event].isna()],
    color=event,
    layer="PSI_raw",
    cmap='coolwarm',
    vmin=0,
    vmax=1,
    size=120,
    sort_order=False,
    colorbar_loc=None,
    ax=ax,
    show=False,
)

# fig.savefig(f'./plots/{event}_PSI_umap.pdf')

In [None]:
# INTU expression ('nb_sample' layer) on the gene-level UMAP, saved without a colour bar
fig, ax = plt.subplots(figsize=(5, 5))

# other colour key tried here: 'C_scANVI_simple' (palette='Spectral_r' also tried)
sc.pl.umap(
    adata_gene,
    color=['INTU'],
    layer='nb_sample',
    cmap='viridis',
    vmax='p99.9',
    size=50,
    alpha=1,
    ncols=2,
    colorbar_loc=None,
    ax=ax,
    show=False,
)

fig.savefig('./plots/INTU_exp_umap.pdf')

In [None]:
# `event` is a single intron name (a string), so show it whole; event[0] would
# only be its first character
event

In [None]:
# cells with a non-missing raw PSI for `event`; index the column by the full
# intron name (event[0] is a single character and not a column)
adata_spl_[~adata_spl_.to_df(layer='PSI_raw')[event].isna()]

In [None]:
# cell type -> colour: the categories paired positionally with the stored palette
col_color_dict = dict(
    zip(
        adata_spl.obs['C_scANVI_simple'].cat.categories,
        adata_spl.uns['C_scANVI_simple_colors'],
    )
)

# restrict the palette to the groups under test, in groups_test order
col_color_dict_filtered = {group: col_color_dict[group] for group in groups_test}

sns.palplot(col_color_dict_filtered.values())

In [None]:
groups_test

In [None]:
# adata_spl = adata_spl_bac.copy()

In [None]:
# Two horizontal count plots sharing the y axis: cells per cell type (left) and
# significant intron groups per test group (right, log-scaled x axis).
sns.set(font_scale=1)
sns.set_style('white')

fig, axes = plt.subplots(1, 2, figsize=(5, 10), sharey=True)

celltype_anot = 'C_scANVI_simple'

## cell type counts
sns.countplot(
    data=adata_spl[adata_spl.obs.C_scANVI_simple.isin(groups_test)].obs,
    y=celltype_anot,
    hue=celltype_anot,
    order=groups_test,
    palette=col_color_dict_filtered,
    saturation=1,
    width=0.95,
    dodge=False,
    ax=axes[0],
)

# axes[0].set_xscale("log")

axes[0].set_title('cell counts \n per celltype', fontsize=16)
axes[0].legend([], frameon=False)
axes[0].set_ylabel('')
axes[0].spines['right'].set_visible(False)
axes[0].spines['top'].set_visible(False)

## number of DS events
sns.countplot(
    data=sig_diff_spl_intron_groups,
    y="test_group",
    hue="test_group",
    order=groups_test,
    palette=col_color_dict_filtered,
    saturation=1,
    width=0.95,
    dodge=False,
    ax=axes[1],
)

axes[1].set_xscale("log")

axes[1].set_title('sig events \n per celltype', fontsize=16)
axes[1].legend([], frameon=False)
axes[1].set_ylabel('')
axes[1].spines['right'].set_visible(False)
axes[1].spines['top'].set_visible(False)

# fig.savefig(output_dir+'sig_events_percell.pdf',dpi=600)

In [None]:
# Count of significant intron groups per test group, without Microglia and Glyc.
# NOTE(review): the x label mentions Δψ > 0.1, but no Δψ filter is applied in this
# cell — confirm the label matches the data being counted.
sns.set(font_scale=1)
sns.set_style('white')

fig, axes = plt.subplots(1, 1, figsize=(5, 8))

## number of DS events
sns.countplot(
    data=sig_diff_spl_intron_groups[
        ~sig_diff_spl_intron_groups.test_group.isin(['Microglia', 'Glyc'])
    ],
    y="test_group",
    hue="test_group",
    palette=col_color_dict_filtered,
    saturation=1,
    # order=groups_test,
    width=0.90,
    dodge=False,
    ax=axes,
)

axes.set_xscale("symlog")
axes.set_xlim(15, 12000)
# axes.set_xticks(range())

# axes.set_title('sig events \n per celltype',fontsize=16)
axes.legend([], frameon=False)
axes.set_ylabel('')

axes.set_xlabel('significant splicing \n events (Δψ > 0.1)', fontsize=18)
axes.tick_params(axis='both', which='major', labelsize=16)

axes.spines['right'].set_visible(False)
axes.spines['top'].set_visible(False)

# fig.savefig(output_dir+'sig_events_percell.pdf',dpi=600)

In [None]:
df = sig_diff_spl_intron_groups[~sig_diff_spl_intron_groups.test_group.isin(['Microglia','Glyc'])]

In [None]:
df.head()

In [None]:
# Lollipop plot: number of significant intron groups per cell type (x), with the
# marker size encoding the number of cells of that type.
sns.set(font_scale=1)
sns.set_style('white')

fig, ax = plt.subplots(1, 1, figsize=(5, 8))

# One explicit group order drives x, y, hue, size and the stem colours, so every
# value is paired with its own label. Pairing value_counts() output with
# unique() output is only correct when both happen to come out in the same order.
plot_groups = groups_test[:-2]
event_counts = df['test_group'].value_counts().reindex(plot_groups).fillna(0).astype(int)
cell_counts = adata_spl_.obs['C_scANVI_simple'].value_counts().reindex(plot_groups).fillna(0).astype(int)

sns.scatterplot(x=event_counts.to_numpy(),
                y=plot_groups,
                hue=plot_groups,
                palette=col_color_dict_filtered,
                size=cell_counts.to_numpy(),
                sizes=(100, 750),
                ax=ax,
           )

# stems from x=0 to each count, coloured like their marker
ax.hlines(y=plot_groups,
          xmin=0,
          xmax=event_counts.to_numpy(),
          color=[col_color_dict_filtered[g] for g in plot_groups],
          alpha=0.4,
          lw=4,
          zorder=0
         )


handles, labels  =  ax.get_legend_handles_labels()

# keep every second one of the last six legend entries
# NOTE(review): presumably those are the size entries — confirm on the rendered legend
ax.legend(handles[-6:][::2],labels[-6:][::2], loc='lower right', frameon=False, labelspacing=1.6, title='Number of \n     cells') #bbox_to_anchor=(1.01, 0.5))


ax.set_xscale("symlog")
ax.set_xlim(15,12000)
# ax.set_xticks(range())

ax.set_ylabel('')

ax.set_xlabel('Significant splicing \n events (Δψ > 0.1)',fontsize=18)
ax.tick_params(axis='both', which='major', labelsize=16)

ax.spines.right.set_visible(False)
ax.spines.top.set_visible(False)

fig.savefig(output_dir+'sig_events_percell_lolipop.pdf',dpi=600)

In [None]:
handles[-6:]

In [None]:
intron_list = sig_diff_spl_introns.copy()


In [None]:
%%time 
intron_list_ = []

for intron_group in set(intron_list[intron_list.abs_lfc_psi >= 0.25].intron_group):

    best_group = sig_diff_spl_intron_groups.loc[sig_diff_spl_intron_groups[sig_diff_spl_intron_groups.name == intron_group]['p_value'].idxmin()]['test_group']

    subset = sig_diff_spl_introns[(sig_diff_spl_introns.intron_group == intron_group) & (sig_diff_spl_introns.test_group == best_group)]

    intron = subset.loc[subset['abs_delta_psi'].idxmax()]['name']
    #intron = subset.sample(n=1)['name'].values[0]

    intron_list_.append(intron)

In [None]:
len(intron_list_)

In [None]:
##exclude exclusion events.

#intron_list = intron_list[intron_list.event_type != 'exclusion']

In [None]:
# drop Microglia again; filtering (instead of list.remove) keeps the cell
# re-runnable when the label is already gone
groups_test = [g for g in groups_test if g != 'Microglia']

In [None]:
# per-cell raw PSI for the significant introns with abs_delta_psi > 0.4, limited
# to cells of the tested groups
genedf = sc.get.obs_df(
    adata_spl_[adata_spl_.obs['C_scANVI_simple'].isin(groups_test)],
    keys=["C_scANVI_simple"] + intron_list.loc[intron_list['abs_delta_psi'] > 0.4, 'name'].tolist(),
    layer='PSI_raw',
)

# per-cell-type mean and variance of every intron column
grouped = genedf.groupby("C_scANVI_simple")
mean = grouped.mean()
var = grouped.var()

In [None]:
mean.shape

In [None]:
mean

In [None]:
mean_deduped = mean.T[~mean.T.duplicated()]#.T.fillna(0).T

In [None]:
from scipy.stats import zscore

In [None]:
# z-score the group means after setting missing means to 0. axis=0 applies zscore
# to each column, and the columns here are cell types (rows are introns).
# NOTE(review): the expression clustermap further down standardises per row
# (z_score=0) — confirm that per-column scaling is what is wanted here.
mean_z = mean_deduped.fillna(0).apply(zscore,axis=0)

## define column colors

In [None]:
mean_deduped_ = mean_deduped.fillna(0)

## define row colors

In [None]:
#annotation = pd.read_csv('annotated_introns.csv',index_col=0)

In [None]:
#adata_spl.var['event_type'] = adata_spl.var.index.map(dict(zip(annotation['name'],annotation['event_type'])))

In [None]:
#adata_spl.var['annotated'] = adata_spl.var.event_type.apply(lambda x: True if isinstance(x,str) \
#                                                else False )

In [None]:
#row_color_dict = dict(zip(adata_spl.var.annotated.astype('category').cat.categories.astype(str),sns.color_palette("Greys_r",n_colors=2)))

In [None]:
#row_color_dict

In [None]:
#sns.palplot(row_color_dict.values())

In [None]:
#row_ID_dict = dict(zip(adata_spl.var.index,adata_spl.var['annotated'].astype(str).map(row_color_dict)))

In [None]:
#h.ax_heatmap.get_yticklabels()

In [None]:
# `h` is only created by the clustermap cell below, so this inspection is left
# commented out to keep a fresh Run All from failing with NameError
# h.ax_heatmap.get_xminorticklabels()


In [None]:
# Clustered heatmap of the z-scored per-cell-type PSI means (rows: introns,
# columns: cell types), with a colour strip per column from col_color_dict_filtered.
# NOTE(review): the colour bar is labelled 'mean Ψ' while the plotted matrix is
# mean_z — confirm the label.
%matplotlib inline
sns.set(font_scale=1)

# copy of the coolwarm colormap in which masked cells are drawn grey
cmap =  sns.color_palette('coolwarm', as_cmap=True).copy()
cmap.set_bad("grey")

h = sns.clustermap(mean_z,
           col_colors=mean_deduped.columns.map(col_color_dict_filtered),
           cmap=cmap,
        #   row_colors=mean_deduped.index.map(row_ID_dict),

            colors_ratio=(.05,0.010),
            # row_linkage=h.dendrogram_row.linkage,
            # col_linkage=h.dendrogram_col.linkage,
            # cmap="coolwarm",
            figsize=(2,12),
            yticklabels=False,
            metric='sqeuclidean',
            # mask the cells whose mean was missing or infinite before the fillna(0)
            mask=(np.isinf(mean_deduped) |  np.isnan(mean_deduped)),
            dendrogram_ratio=(.0,.05),
            cbar_pos=(1, .2, .2, .01),
            cbar_kws={'orientation':'horizontal','label':'mean Ψ'},  
            # tree_kws
            xticklabels=1
                  )



h.ax_heatmap.set_xticklabels(h.ax_heatmap.get_xticklabels(), fontsize = 5,size = 5)
plt.show()
# h.ax_heatmap.set_yticklabels("")
#h.ax_heatmap.set_xticklabels(h.get_xticks(), size = 10)

# `h` is reused by the expression clustermap below for its row and column linkage
h.savefig('./plots/PSI_clustermap.pdf',dpi=600)

## plot gene expression

In [None]:
adata_gene.obs

In [None]:
intron_list

In [None]:
# per-cell 'nb_sample' expression of the genes hosting the introns with
# abs_delta_psi > 0.4, limited to cells of the tested groups
genedf_genes = sc.get.obs_df(
    adata_gene[adata_gene.obs['C_scANVI_simple'].isin(groups_test)],
    keys=["C_scANVI_simple"] + intron_list.loc[intron_list['abs_delta_psi'] > .4, 'gene_name'].tolist(),
    layer='nb_sample',
)

# per-cell-type mean and variance of every gene column
grouped_genes = genedf_genes.groupby("C_scANVI_simple")
mean_genes = grouped_genes.mean()
var_genes = grouped_genes.var()

In [None]:
mean_genes.shape

In [None]:
# Heatmap of mean expression per cell type (rows: genes, columns: cell types),
# standardised per row (z_score=0) and ordered with the row and column linkage
# taken from the PSI clustermap `h`.
# NOTE(review): reusing h's row linkage assumes the rows here match the rows of
# the PSI matrix one-to-one and in the same order. The PSI matrix had duplicated
# rows dropped and this one did not — confirm the two are aligned.
c = sns.clustermap(mean_genes.fillna(0).T,
               cmap="viridis",
               figsize=(2,12),
               yticklabels=False,
          #     metric='euclidean',
          #     mask=(np.isinf(mean) |  np.isnan(mean))
          #     dendrogram_ratio=(0,0),
               row_linkage=h.dendrogram_row.linkage,
               col_linkage=h.dendrogram_col.linkage,
               cbar_pos=(1.02, .2, .2, .01),
               cbar_kws={'orientation':'horizontal','label':'mean\nexpression'},
                z_score=0,
            #   standard_scale=1,
                   # vmax=1,
                   # vmin=-1
                   xticklabels=1
              )


c.ax_heatmap.set_xticklabels(c.ax_heatmap.get_xticklabels(), fontsize = 5,size = 5)
# hide both dendrograms; the ordering they describe comes from `h`
c.ax_row_dendrogram.set_visible(False)
c.ax_col_dendrogram.set_visible(False)

c.savefig('./plots/expression_clustermap.pdf',dpi=600)

In [None]:
!conda list