# MuTrans Analysis of Epithelial-Mesenchymal Transition (EMT)

## Load raw dataset and explore with Scanpy

In [1]:
# Core data handling and single-cell toolkit.
import pandas as pd
import scanpy as sc

# Default look for every scanpy figure: small, frameless, white background.
sc.settings.set_figure_params(
    dpi=100,
    frameon=False,
    figsize=(3, 3),
    facecolor='white',
)

import matplotlib.pyplot as plt
import seaborn as sns

# Five-colour 'Set1' palette, passed to the attractor UMAP further down.
color_palette = sns.color_palette('Set1', 5)

import pyMuTrans as pm

# Read the count table and transpose it so that var_names carry the gene IDs
# that the following cells rename.
datadir = "../Data/"
counts_csv = datadir + 'GSE110357_htseq_counts_all_v1.csv.gz'
adata = sc.read_csv(counts_csv).T



In [None]:
import mygene

# Keep only the part of each gene ID before the first '.'
# (presumably dropping an Ensembl version suffix - TODO confirm).
adata.var_names = [gene_id.partition(".")[0] for gene_id in adata.var_names]

# Ask MyGene.info for the symbol of every Ensembl gene ID (mouse).
mg = mygene.MyGeneInfo()
geneList = adata.var_names
geneSyms = mg.querymany(
    geneList,
    scopes='ensembl.gene',
    fields='symbol',
    species='mouse',
)

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.


In [None]:
# Rename genes from Ensembl IDs to symbols using the MyGene hits in `geneSyms`.
#
# The lookup is keyed on each hit's 'query' field instead of relying on list
# position: the hit list is not guaranteed to hold exactly one entry per
# queried ID in query order, and positional assignment would then either
# mislabel columns silently or fail with a length mismatch.
excluded_gene = 'ENSMUSG00000100417'  # removed from the data set, as before

symbol_by_query = {}
for hit in geneSyms:
    if hit['query'] != excluded_gene and 'symbol' in hit:
        # If an ID is reported more than once, keep the first symbol seen.
        symbol_by_query.setdefault(hit['query'], hit['symbol'])

# Materialise the subset so that renaming does not write through a view.
adata = adata[:, adata.var_names != excluded_gene].copy()

# IDs without a symbol keep their Ensembl ID (same fallback as before).
gene_name = [symbol_by_query.get(gene_id, gene_id) for gene_id in adata.var_names]
adata.var_names = gene_name

In [None]:
# Make duplicated gene names unique before any name-based selection.
adata.var_names_make_unique()
# Basic QC thresholds: cells need at least 200 detected genes,
# genes need to be detected in at least 3 cells.
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)

In [None]:
# Flag features whose name starts with 'ERCC' (presumably spike-in controls)
# so the QC metrics include their share of counts per cell (pct_counts_ERCC).
adata.var['ERCC'] = adata.var_names.str.startswith('ERCC')
sc.pp.calculate_qc_metrics(
    adata,
    qc_vars=['ERCC'],
    percent_top=None,
    log1p=False,
    inplace=True,
)

In [None]:
# Per-cell total counts, then keep only cells with more than 100,000 counts.
adata.obs['n_counts'] = adata.X.sum(axis=1)
enough_counts = adata.obs['n_counts'] > 100000
adata = adata[enough_counts, :]

In [None]:
# Keep cells in which ERCC-flagged features make up less than 2% of counts.
low_ercc = adata.obs['pct_counts_ERCC'] < 2
adata = adata[low_ercc, :]

In [None]:
# Store the current all-gene object in .raw before narrowing the gene set.
adata.raw = adata

# Restrict to genes listed in emt_genes.csv that also occur among the mapped names.
genes = pd.read_csv(datadir + 'emt_genes.csv')
listed_genes = set(genes['genes'].to_list())
genes_intersect = list(listed_genes & set(gene_name))
adata = adata[:, adata.var_names.isin(genes_intersect)]

In [None]:
# Log-transform the counts (log1p) and then scale each gene;
# the order matters because scaling is applied to the logged values.
sc.pp.log1p(adata)
sc.pp.scale(adata)

In [None]:
# PCA is computed and stored, but the neighbour graph below is built from
# adata.X itself (use_rep='X') with cosine distance; UMAP then uses that graph.
sc.tl.pca(adata, svd_solver='arpack')
sc.pp.neighbors(
    adata,
    n_neighbors=50,
    metric='cosine',
    use_rep='X',
)
sc.tl.umap(adata)

Use the Leiden algorithm to detect clusters.

In [None]:
# Re-apply the small-panel figure defaults for this section.
sc.settings.set_figure_params(
    dpi=100,
    frameon=False,
    figsize=(3, 3),
    facecolor='white',
)

# Leiden clustering at resolution 1.0, shown next to two marker genes;
# colour scales are capped at the 95th percentile.
sc.tl.leiden(adata, resolution=1.0)
sc.pl.umap(
    adata,
    color=['leiden', 'Epcam', 'Mmp19'],
    vmax='p95',
)

In [None]:
# Rank genes per Leiden cluster with the Wilcoxon method and plot the top 10
# of each cluster on independent y-axes.
sc.tl.rank_genes_groups(
    adata,
    'leiden',
    method='wilcoxon',
)
sc.pl.rank_genes_groups(
    adata,
    n_genes=10,
    sharey=False,
)

In [None]:
# Leiden clusters alongside Tmem123 and Yap1; colour scales capped at the 95th percentile.
sc.pl.umap(
    adata,
    color=['leiden', 'Tmem123', 'Yap1'],
    vmax='p95',
)

## Estimate Cluster Number with EPI (Eigen-Peak Index)

In [None]:
# Plot the cluster-number diagnostic for up to k_plot=10 using cosine distance.
fig = plt.figure(figsize=(6, 3))
par = {
    "choice_distance": "cosine",
}
# Look for the peaks in the resulting plot.
out = pm.plot_cluster_num(adata, par, k_plot=10)

## MuTrans Analysis Based on the AnnData Object

In [None]:
# Settings handed to MuTrans; see the pyMuTrans documentation for their meaning.
par = {
    "choice_distance": "cosine",
    "perplex": 150.0,
    "K_cluster": 5.0,
    "trials": 10,
    "reduction_coord": 'umap',
    "write_anndata": True,
}
# The call returns the AnnData object that the following cells plot from.
adata = pm.dynamical_analysis(adata, par)

In [None]:
# UMAP coloured by 'attractor' and 'entropy' (presumably written by the
# MuTrans step - TODO confirm) plus two marker genes; colour scales are
# capped at the 90th percentile and the figure handle is kept for saving.
fig = sc.pl.umap(
    adata,
    color=['attractor', 'entropy', 'Epcam', 'Mmp19'],
    vmax='p90',
    palette=color_palette,
    return_fig=True,
)
#fig.savefig('emt_umap.pdf')

In [None]:
# Rank genes per attractor with the Wilcoxon method; show the top 10 of each
# group on a shared y-axis, five panels per row.
sc.tl.rank_genes_groups(
    adata,
    'attractor',
    method='wilcoxon',
)
sc.pl.rank_genes_groups(
    adata,
    n_genes=10,
    sharey=True,
    ncols=5,
)

In [None]:
# Five genes on the UMAP, colour range clipped to the 1st-99th percentiles.
sc.pl.umap(
    adata,
    color=['Steap4', 'Prelp', 'Birc5', 'Dcn', 'Mme'],
    vmin='p1',
    vmax='p99',
    ncols=5,
)

In [None]:
# Expression distributions of five genes, split by attractor.
sc.pl.violin(
    adata,
    keys=['Pbk', 'Prelp', 'Birc5', 'Dcn', 'Mme'],
    groupby='attractor',
)

In [None]:
# Slightly wider panels for the violin plots.
sc.settings.set_figure_params(
    frameon=False,
    figsize=(4, 3),
)
# Expression distributions of six genes, split by attractor.
sc.pl.violin(
    adata,
    keys=['Epcam', 'Krt5', 'Zeb2', 'Prrx1', 'Mmp19', 'Aspn'],
    groupby='attractor',
)

Infer the transition trajectories from the epithelial to the mesenchymal state using the MPPT (most probable path tree) approach.

In [None]:
# MPPT lineage with si=3 and sf=0 (presumably the start and end attractor
# indices - TODO confirm), drawn on a fresh 8x8 figure and saved as PDF.
fig = plt.figure(figsize=(8, 8))
pm.infer_lineage(
    adata,
    si=3,
    sf=0,
    method="MPPT",
    size_point=40,
    alpha_point=0.5,
)
fig.savefig('emt_tpt_0.pdf')

In [None]:
# MPPT lineage with si=3 and sf=4 (presumably the start and end attractor
# indices - TODO confirm), drawn on a fresh 8x8 figure and saved as PDF.
fig = plt.figure(figsize=(8, 8))
pm.infer_lineage(
    adata,
    si=3,
    sf=4,
    method="MPPT",
    size_point=40,
    alpha_point=0.5,
)
fig.savefig('emt_tpt_4.pdf')

## PAGA Analysis

In [None]:
# Compute the PAGA graph over the Leiden clusters.
sc.tl.paga(adata, groups='leiden')

In [None]:
# Plot the PAGA graph coloured by cluster and by two marker genes.
sc.pl.paga(adata, color=['leiden','Epcam','Mmp19'])

In [None]:
# Graph layout initialised from the PAGA positions, then plotted with
# cluster labels drawn on the data.
sc.tl.draw_graph(adata, init_pos='paga')
sc.pl.draw_graph(
    adata,
    color=['leiden', 'Epcam', 'Mmp19', 'Lox'],
    legend_loc='on data',
)

In [None]:
from scipy.io import savemat

# Write the expression matrix and gene names to a MATLAB .mat file.
mdic = {
    "data": adata.X,
    "genes": adata.var_names.to_numpy(),
}
savemat('emt_data.mat', mdic)