In [None]:
# Ensure CUDA is working before any GPU-dependent step runs.
import torch

# Fail fast with an explicit message instead of a bare AssertionError.
assert torch.cuda.is_available(), "CUDA is not available to PyTorch"
print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.current_device())
# torch.cuda.device(0) is a context manager whose repr is only a memory
# address, so print the device identifier ("cuda:0") instead.
print(torch.device('cuda', 0))
print(torch.cuda.get_device_name(0))

In [None]:
import mira

import anndata
import scanpy as sc
import numpy as np
import pandas as pd
import scvelo as scv

import matplotlib
import matplotlib as mpl
import matplotlib.pyplot as plt
from copy import copy

# Global font size for every figure drawn after this point.
plt.rcParams['font.size'] = 14

# Work on a private copy of the Reds colormap so the registered one is not
# modified; values below a plot's vmin are drawn in light gray.
reds = copy(mpl.cm.Reds)
reds.set_under("lightgray")

import os
import sys
from pathlib import Path
# Point R_HOME at the lib/R folder under the running interpreter's prefix
# (presumably for R-backed tooling used elsewhere -- confirm).
r_home = sys.exec_prefix + "/lib/R/"
os.environ['R_HOME'] = r_home

# All relative paths used below are resolved against the project folder,
# which lives under the current user's home directory.
project_directory = '/Cranio_Lab/Louk_Seton/4_species_project'
project_root = os.path.expanduser("~") + project_directory
os.chdir(project_root)

In [None]:
# Dataset selectors: they determine which h5ad file is loaded.
species = 'mouse'  # specify the species
genome = 'mm10'  # specify the genome

# Load the concatenated AnnData object for this species/genome pair.
adata_path = f'h5ad_files/{species}/{genome}/adata_concat.h5ad'
adata = sc.read(adata_path)


In [None]:
# Read the transcription-factor gene names, one per line.
# strip() also removes '\r' and stray spaces that rstrip('\n') would leave
# behind (and which would silently prevent any match); blank lines are skipped.
with open('required_files/allTFs_mm.txt') as f:
    tf_list = [line.strip() for line in f if line.strip()]

# Flag every gene whose name is in the TF list. Index.isin already returns a
# boolean array, so no np.where(..., True, False) wrapper is needed.
adata.var['TF'] = adata.var.index.isin(tf_list)

In [None]:
# Keep only the genes flagged as transcription factors (copy, not a view).
tf_mask = adata.var['TF'].to_numpy()
adata = adata[:, tf_mask].copy()

In [None]:
# Normalise each cell to a total of 1e4, then log1p-transform.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Flag highly variable genes, using 'sample' as the batch key.
sc.pp.highly_variable_genes(adata, batch_key='sample', min_disp=0.5)

# PCA -> neighbour graph on 6 PCs -> UMAP embedding.
sc.tl.pca(adata)
sc.pp.neighbors(adata, n_pcs=6)
sc.tl.umap(adata, negative_sample_rate=0.2, min_dist=0.2)

# Show the embedding coloured by 'sample' and 'phase'.
sc.pl.umap(adata, frameon=False, color=['sample', 'phase'])

In [None]:
# Tally how many genes were / were not flagged as highly variable.
hv_flags = adata.var['highly_variable']
hv_flags.value_counts()

In [None]:
# Names of the genes flagged as highly variable in the current (TF-only) object.
tf_hv_mask = adata.var['highly_variable'].to_numpy()
TF_highly_variable = adata.var.index[tf_hv_mask].tolist()


In [None]:
# Reload the object from disk: the previous cells subset and transformed it.
adata = sc.read('h5ad_files/'+species+'/'+genome+'/'+'adata_concat.h5ad')

# Read the transcription-factor gene names, one per line.
# strip() also removes '\r' and stray spaces that rstrip('\n') would leave
# behind (and which would silently prevent any match); blank lines are skipped.
with open('required_files/allTFs_mm.txt') as f:
    tf_list = [line.strip() for line in f if line.strip()]

# Index.isin already returns a boolean array; keep the genes NOT in the list.
adata.var['TF'] = adata.var.index.isin(tf_list)
adata = adata[:, ~adata.var['TF']].copy()

# Normalise each cell to a total of 1e4, then log1p-transform.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Flag the top 500 highly variable genes, using 'sample' as the batch key.
# NOTE(review): scanpy documents that cutoffs such as min_disp are ignored
# when n_top_genes is given -- confirm min_disp is intentionally inert here.
sc.pp.highly_variable_genes(adata, min_disp=0.5, batch_key='sample',
                            n_top_genes=500)

# PCA -> neighbour graph on 6 PCs -> UMAP embedding, coloured by sample/phase.
sc.tl.pca(adata)
sc.pp.neighbors(adata, n_pcs=6)
sc.tl.umap(adata, min_dist=0.2, negative_sample_rate=0.2)
sc.pl.umap(adata, color=['sample', 'phase'], frameon=False)

# Names of the highly variable genes among the non-TF genes.
rest_highly_variable = adata.var.index[adata.var['highly_variable'].to_numpy()].tolist()


In [None]:
# Same embedding, additionally coloured by the Plp1 gene.
umap_colour_keys = ['sample', 'phase', 'Plp1']
sc.pl.umap(adata, frameon=False, color=umap_colour_keys)


In [None]:
# Combined feature list: highly variable TFs first, then the highly variable
# non-TF genes.
highly_variable_list = [*TF_highly_variable, *rest_highly_variable]

In [None]:
# Reload the object from disk so every gene is present again.
adata = sc.read(f'h5ad_files/{species}/{genome}/adata_concat.h5ad')

# Flag the genes that belong to the combined highly-variable list.
adata.var['highly_variable_list'] = adata.var.index.isin(highly_variable_list)

# Normalise each cell to a total of 1e4, then log1p-transform.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# scanpy's own highly-variable call, using 'sample' as the batch key.
sc.pp.highly_variable_genes(adata, batch_key='sample', min_disp=0.5)

# PCA -> neighbour graph on 6 PCs -> UMAP embedding, coloured by sample/phase.
sc.tl.pca(adata)
sc.pp.neighbors(adata, n_pcs=6)
sc.tl.umap(adata, negative_sample_rate=0.2, min_dist=0.2)
sc.pl.umap(adata, frameon=False, color=['sample', 'phase'])

In [None]:
# 2x2 table comparing the custom gene list with scanpy's own
# highly-variable flag (rows: highly_variable_list, columns: highly_variable).
hv_overlap = adata.var.groupby(['highly_variable_list', 'highly_variable']).size()
hv_overlap.unstack(fill_value=0)

In [None]:
## MIRA expression topic model
# Disabled alternatives kept for reference:
#   highly_variable_key='TF'
#   continuous_covariates=['S_score', 'G2M_score']
model = mira.topics.make_model(
    # Dataset dimensions: helps MIRA choose reasonable values for some
    # hyperparameters which are not tuned.
    adata.n_obs,
    adata.n_vars,
    feature_type='expression',
    highly_variable_key='highly_variable_list',
    counts_layer='original_counts',
    categorical_covariates='sample',
)

In [None]:
# Ask the model for its learning-rate bounds on this dataset; the result is
# the cell output (presumably a learning-rate range test -- see MIRA docs).
model.get_learning_rate_bounds(adata)

In [None]:
# For larger datasets, the default of 1e-3, 0.1 usually works well.
lr_lower, lr_upper = 1e-3, 0.1
model.set_learning_rates(lr_lower, lr_upper)

# Plot the bounds to check the chosen values visually.
model.plot_learning_rate_bounds(figsize=(7, 3))

In [None]:
# Run MIRA's gradient-based tuning; the result is used in a later cell to plot
# topic contributions when choosing the number of topics.
topic_contributions = mira.topics.gradient_tune(model, adata)

In [None]:
# Candidate topic count passed to the contribution plot.
# NOTE(review): a later cell overrides NUM_TOPICS with 24 before fitting.
NUM_TOPICS = 21

mira.pl.plot_topic_contributions(topic_contributions, NUM_TOPICS)

In [None]:
# Number of topics used for the final fit.
NUM_TOPICS = 24

# Apply the topic count, then train the model on the dataset.
model = model.set_params(num_topics=NUM_TOPICS).fit(adata)

In [None]:
# Run the fitted model on the dataset (presumably this writes the
# 'X_umap_features' representation used by the next cell -- confirm).
model.predict(adata,)

In [None]:
# Rebuild the neighbour graph on the 'X_umap_features' representation
# (Manhattan distance, 15 neighbours), then recompute the UMAP with scanpy's
# default settings.
sc.pp.neighbors(
    adata,
    n_neighbors=15,
    metric='manhattan',
    use_rep='X_umap_features',
)
sc.tl.umap(adata)



In [None]:
# Default-resolution clustering -> adata.obs['leiden'].
sc.tl.leiden(adata)

# Finer clustering (resolution 2) stored separately -> adata.obs['leiden_high'].
sc.tl.leiden(adata, key_added='leiden_high', resolution=2)


In [None]:
# Overview panels: marker genes plus clustering / sample / phase annotations.
# Values below vmin fall under the colormap range and are drawn in light gray.
overview_keys = [
    'Sox10',
    'leiden_high',
    'sample',
    'phase',
    'Epcam',
    'Alx3',
    'Insc',
]
sc.pl.umap(adata, color=overview_keys, cmap=reds, vmin=0.05, ncols=2)

In [None]:
# Wilcoxon test of 'leiden_high' cluster 29 against cluster 30 as reference.
# NOTE(review): these cluster labels depend on the clustering result above;
# re-check them whenever the model or clustering is re-run.
sc.tl.rank_genes_groups(
    adata,
    groupby='leiden_high',
    groups=['29'],
    reference='30',
    method='wilcoxon',
)

In [None]:
# Pull the ranking into a DataFrame and keep only strong hits:
# log fold change above 4.5 AND test score above 5.
gene_df = sc.get.rank_genes_groups_df(adata, group=None)
strong_hits = (gene_df['logfoldchanges'] > 4.5) & (gene_df['scores'] > 5)
gene_df = gene_df[strong_hits]
gene_df

In [None]:
# Panel size for the plots below.
plt.rcParams['figure.figsize'] = [5, 4]

# Plot up to the first 40 genes that passed the filter, three panels per row.
top_marker_names = gene_df['names'].head(40).tolist()
sc.pl.umap(adata, color=top_marker_names, cmap=reds, vmin=0.05, ncols=3)

In [None]:
# Panel size for the plots below.
plt.rcParams['figure.figsize'] = [5, 4]

# Annotations first, then the marker-gene panel.
panel_keys = [
    'sample', 'leiden', 'phase',
    'Tbx3', 'Sox2', 'Sox10', 'Cdh19', 'Foxd3', 'Alx3', 'Dlx2', 'Mpz',
    'Pax3', 'Tfap2a', 'Hand2', 'Dlx5', 'Meox1', 'Barx1', 'Lef1', 'Dlx6',
    'Dlx1',
]
sc.pl.umap(adata, color=panel_keys, cmap=reds, vmin=0.05, ncols=2)

In [None]:
# Restrict the plot to the cells of sample 'ME9'.
is_me9 = adata.obs['sample'] == 'ME9'
me9_keys = ['sample', 'phase', 'Sox2', 'Sox10', 'Alx3', 'Dlx2', 'Tfap2b']
sc.pl.umap(adata[is_me9], color=me9_keys, cmap=reds, vmin=0.05, ncols=2)

In [None]:
# Learning-rate bounds for the parameter sweep: the upper bound is 0.05 here,
# not the 0.1 default that usually works well for larger datasets.
lr_lower, lr_upper = 1e-3, 0.05
model.set_learning_rates(lr_lower, lr_upper)

# Plot the bounds to check the chosen values visually.
model.plot_learning_rate_bounds(figsize=(7, 3))

In [None]:
## Quick loop to try out some different parameters.
## Writes one PDF per topic count, with one page per epoch count.
from matplotlib.backends.backend_pdf import PdfPages

topics = [15,17,20,22,24,26,30]
epochs = [50,75,100,125,150]

# Derive the folder from the notebook's species/genome settings so it stays in
# sync with the data that was loaded (same path as before for mouse/mm10).
output_dir = 'figures_ignore/'+species+'/'+genome+'/mira_integration_tuning/'
# PdfPages does not create missing folders; make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

# Loop-invariant panel size, set once instead of on every iteration.
plt.rcParams['figure.figsize'] = [5,4]

for topic in topics:
    with PdfPages(output_dir+str(topic)+'_trials.pdf') as pdf:
        for epoch in epochs:
            # Refit with this (topics, epochs) combination, then rebuild the
            # embedding and the high-resolution clustering from the new fit.
            model = model.set_params(num_topics = topic,num_epochs = epoch).fit(adata)
            model.predict(adata,)
            sc.pp.neighbors(adata, use_rep = 'X_umap_features', metric = 'manhattan',n_neighbors=15)
            sc.tl.umap(adata, )
            sc.tl.leiden(adata,resolution = 2, key_added = 'leiden_high')

            axes = sc.pl.umap(adata, color = ['sample','leiden_high','phase','Sox10','Insc','Alx3','Dlx2'], cmap = reds,ncols = 2, vmin = 0.05, show = False)
            # Rasterise the panels to keep the PDF small.
            for panel in axes:
                panel.set_rasterized(True)

            # Label the page: without it the pages of one PDF cannot be told
            # apart by epoch setting.
            fig = axes[0].figure
            fig.suptitle('num_topics='+str(topic)+', num_epochs='+str(epoch), y=1.02)

            # Save and close this exact figure rather than the implicit
            # "current" one, so no figure is leaked between iterations.
            pdf.savefig(fig, dpi=150, bbox_inches='tight')
            plt.close(fig)