In [None]:
# Sanity-check the GPU setup before any training: stop immediately when CUDA
# is unavailable, otherwise print what torch can see.
import torch

assert torch.cuda.is_available()
cuda_report = (
    torch.cuda.is_available(),      # availability flag (True once the assert passed)
    torch.cuda.device_count(),      # number of visible CUDA devices
    torch.cuda.current_device(),    # index of the currently selected device
    torch.cuda.device(0),           # device-0 context-manager object (prints its repr)
    torch.cuda.get_device_name(0),  # name reported for device 0
)
for cuda_fact in cuda_report:
    print(cuda_fact)

In [None]:
import mira

import anndata
import scanpy as sc
import numpy as np
import pandas as pd
import scvelo as scv

# Plotting defaults shared by the figures in this notebook.
import matplotlib
import matplotlib as mpl
import matplotlib.pyplot as plt
from copy import copy

plt.rcParams['font.size'] = 14

# Private copy of the built-in Reds colormap whose "under" colour (used for
# values below a plot's vmin) is light gray.
reds = copy(mpl.cm.Reds)
reds.set_under("lightgray")

# Environment setup: set R_HOME to the R directory under the running
# interpreter's prefix, then make the project folder (located under the user's
# home directory) the working directory so the relative paths below resolve.
import os
import sys
from pathlib import Path

os.environ['R_HOME'] = f"{sys.exec_prefix}/lib/R/"

project_directory = '/Cranio_Lab/Louk_Seton/4_species_project'
home_directory = os.path.expanduser("~")
# project_directory starts with "/", so plain concatenation yields a valid path.
os.chdir(home_directory + project_directory)

In [None]:
# Fix every random number generator this notebook has in scope so that reruns
# are comparable.
import random

seed = 666
random.seed(seed)        # Python's stdlib RNG
np.random.seed(seed)     # NumPy's legacy global RNG
# torch is imported in the CUDA-check cell at the top of the notebook and was
# previously left unseeded; manual_seed seeds its generators on all devices.
torch.manual_seed(seed)

In [None]:
# Load the concatenated AnnData object from disk and reset X to a copy of the
# 'original_counts' layer.
adata = sc.read('h5ad_files/mouse/ecto_andrea/ecto_nasal_placode_derived_concat.h5ad')
adata.X = adata.layers['original_counts'].copy()

In [None]:
# Build the MIRA expression topic model, with batch as a categorical covariate
# and the two cell-cycle scores as continuous covariates.
model = mira.topics.make_model(
    adata.n_obs,
    adata.n_vars,  # dataset dimensions help MIRA choose reasonable values for untuned hyperparameters
    feature_type='expression',
    highly_variable_key='highly_variable',  # disabled alternative: 'TF'
    counts_layer='original_counts',
    categorical_covariates='batch',
    continuous_covariates=['S_score', 'G2M_score'],
    # max_learning_rate=0.1 is intentionally not passed here
)

In [None]:
# Compute learning-rate bounds for the model on this dataset; explicit bounds
# are set and the result is plotted in the next cell.
model.get_learning_rate_bounds(adata)

In [None]:
# Set the learning-rate range explicitly (presumably lower and upper bound, in
# that order — confirm against MIRA's docs), then plot the bounds.
model.set_learning_rates(1e-3, 0.1) # for larger datasets, the default of 1e-3, 0.1 usually works well.
model.plot_learning_rate_bounds(figsize=(7,3))

In [None]:
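# ## quick loop to try out some different parameters (disabled; an active version of this sweep, with different topic/epoch grids, runs further down in this notebook)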
# from matplotlib.backends.backend_pdf import PdfPages

# topics = [4,5,6,8,9,11]
# epochs = [50,75,100,125,150]
# output_dir = 'figures_ignore/mouse/mm39/mira_integration_tuning/andrea_ecto/'
# for topic in topics:
#     with PdfPages(output_dir+str(topic)+'_trials.pdf') as pdf:
#         for epoch in epochs:
#             model = model.set_params(num_topics = topic,num_epochs = epoch).fit(adata)
#             model.predict(adata,)
#             sc.pp.neighbors(adata, use_rep = 'X_umap_features', metric = 'manhattan',n_neighbors=15)
#             sc.tl.umap(adata, )

#             plt.rcParams['figure.figsize'] = [5,4]
#             ax = sc.pl.umap(adata, color = ['sample','phase','Fezf2','Aldh1a3','Foxa1','Ascl1','Maob','Pcp4','Fgf8'], cmap = reds,ncols = 2, vmin = 0.05, show = False)
#             for p in ax:
#                 p.set_rasterized(True)
#             pdf.savefig(dpi=150,bbox_inches='tight')
#             plt.close()

In [None]:
# Run MIRA's gradient_tune on the model and data; the returned
# topic_contributions are plotted in the next cell.
topic_contributions = mira.topics.gradient_tune(model, adata)

In [None]:
# Topic count passed to the contribution plot (the fit in the next cell
# overrides NUM_TOPICS with its own value).
NUM_TOPICS = 8

mira.pl.plot_topic_contributions(topic_contributions, NUM_TOPICS)

In [None]:
# Final fit of the model that includes the cell-cycle covariates: set the
# topic and epoch counts, then train on adata.
NUM_TOPICS = 6 #6 with the cell cycle covariates; 8 is used for the later run without cell cycle correction
model = model.set_params(num_topics = NUM_TOPICS,num_epochs = 100).fit(adata)# NOTE(review): a trailing "85" was noted here, presumably an earlier epoch count - confirm
#model = model.set_params(num_topics = NUM_TOPICS).fit(adata)


In [None]:
# Apply the fitted model to adata.
# NOTE(review): the next cell reads 'X_umap_features', presumably written by
# this call — confirm against MIRA's docs.
model.predict(adata,)

In [None]:
# Build the neighbor graph on the 'X_umap_features' representation (manhattan
# metric, 15 neighbors) and compute a UMAP with scanpy's default settings.
# The commented-out calls are alternative UMAP settings, currently disabled.
sc.pp.neighbors(adata, use_rep = 'X_umap_features', metric = 'manhattan',n_neighbors=15)
#sc.tl.umap(adata, min_dist=0.1, negative_sample_rate=0.05,)
#sc.tl.umap(adata, min_dist=0.3, negative_sample_rate=0.05,n_components =3)
sc.tl.umap(adata, )



In [None]:
# Leiden clustering with scanpy's default arguments; the commented-out call is
# a disabled higher-resolution variant stored under a separate key.
sc.tl.leiden(adata)
#sc.tl.leiden(adata,resolution = 2, key_added = 'leiden_high')


In [None]:
# UMAP coloured by the 'sample', 'batch' and 'phase' annotations.
sc.pl.umap(adata, color = ['sample','batch','phase'],)

In [None]:
# Rebuild normalized expression from the raw counts: reset X to the
# 'original_counts' layer, normalize per cell, log1p-transform, and keep a copy
# of the result as a layer.
adata.X = adata.layers['original_counts'].copy()
sc.pp.normalize_total(adata) # Normalizing to median total counts
sc.pp.log1p(adata) # Logarithmize the data
# Despite its name, this layer holds the normalized AND log1p-transformed values.
adata.layers["normalized_counts"] = adata.X.copy()

In [None]:

# UMAP panels for a set of marker genes plus the Leiden clustering (cluster
# labels drawn on the data), three panels per row, using the `reds` colormap
# with a lower colour limit of 0.05.
sc.pl.umap(
    adata,
    color=[
        'Dlx5', 'Fezf2', 'Fgf8', 'Casr', 'Prnp', 'Ascl1',
        'Neurog1', 'Maob', 'Pcp4', 'Aldh1a3', 'Fgf17', 'Wnt6',
        'Foxa1', 'Reg3g',
        'leiden',
    ],
    ncols=3,
    size=50,
    legend_loc='on data',
    cmap=reds,
    vmin=0.05,
)

In [None]:
# Save the annotated object (with the results added by the cells above) to a
# new '_integrated' h5ad file.
adata.write('h5ad_files/mouse/ecto_andrea/ecto_nasal_placode_derived_concat_integrated.h5ad')

In [None]:
# Reload the original concatenated object from disk, replacing the in-memory
# `adata` (and everything added to it above), and reset X to a copy of the
# 'original_counts' layer.
adata = sc.read('h5ad_files/mouse/ecto_andrea/ecto_nasal_placode_derived_concat.h5ad')
adata.X = adata.layers['original_counts'].copy()

In [None]:
# Normalize and log-transform X in place; the per-gene regressions defined
# below read expression from adata.X.
sc.pp.normalize_total(adata) # Normalizing to median total counts
sc.pp.log1p(adata) # Logarithmize the data

In [None]:
##do linear regression for genes and obs variable
#mostly made this to identify other cell cycle genes
from sklearn.linear_model import LinearRegression
def do_reg(gene,variable):
    """Return the R^2 of a linear regression of an obs variable on a gene.

    Fits ``adata.obs[variable] ~ expression(gene)`` with scikit-learn's
    ``LinearRegression`` and returns ``reg.score`` evaluated on the same data.
    The function reads the module-level ``adata`` (expression is taken from
    ``adata.X``), which keeps the signature small for use with ``Pool.starmap``.

    Parameters
    ----------
    gene
        Variable (gene) name used to index ``adata`` along the var axis.
    variable
        Name of the ``adata.obs`` column to regress against.

    Returns
    -------
    float
        Coefficient of determination (R^2) of the fit.
    """
    expression = adata[:, gene].X  # expression values of the selected gene
    # Sparse matrices must be densified for scikit-learn; a dense ndarray has
    # no .toarray(), so only call it when it exists.
    if hasattr(expression, 'toarray'):
        expression = expression.toarray()
    # Guarantee a 2-D (n_obs, n_features) design matrix.
    X = np.asarray(expression).reshape(adata.n_obs, -1)
    # Subsetting genes does not change obs, so read the column directly
    # instead of building a second view of adata.
    y = adata.obs[variable]
    reg = LinearRegression().fit(X, y)  # fit obs variable ~ gene expression
    return reg.score(X, y)  # R^2 of the linear regression

#now parallelize the function
from multiprocessing import Pool
import itertools

def do_reg_parallel(gene_list,variable,n_threads):
    """Run ``do_reg`` for every gene in ``gene_list`` using a process pool.

    Parameters
    ----------
    gene_list
        Iterable of gene names; each one is passed to ``do_reg``.
    variable
        Name of the ``adata.obs`` column every gene is regressed against.
    n_threads
        Number of worker processes in the pool. If launching this many
        processes needs more memory than is available, they will not start.

    Returns
    -------
    list
        One ``do_reg`` result (R^2) per gene, in the order of ``gene_list``.
    """
    # Pair every gene with the same obs column name so starmap can unpack the
    # two positional arguments do_reg expects.
    tasks = zip(gene_list, itertools.repeat(variable))
    with Pool(n_threads) as p:
        # starmap blocks until every task has finished and the with-block
        # shuts the pool down on exit, so no explicit close()/join() is needed
        # (the original calls sat after `return` and were never executed).
        scores = p.starmap(do_reg, tasks)
    return scores

In [None]:
# Check for missing G2M scores before regressing genes against this column.
# One .any() already reduces the boolean Series to a single value; the extra
# chained .any() was redundant and would fail on a plain Python bool.
adata.obs['G2M_score'].isnull().any()

In [None]:
# For each cell-cycle score, regress every gene against it (20 worker
# processes) and store the per-gene R^2 in an adata.var column that has the
# same name as the obs score column.
for score_key in ('G2M_score', 'S_score'):
    adata.var[score_key] = do_reg_parallel(adata.var.index, score_key, 20)

In [None]:
# Number of highly variable vs. other genes before the cell-cycle filtering below.
adata.var['highly_variable'].value_counts()

In [None]:
# Remove from the highly variable set every gene whose regression against the
# G2M score gave R^2 > 0.1 (adata.var['G2M_score'] holds the per-gene R^2
# computed above), then show the updated counts.
adata.var.loc[adata.var['G2M_score'] > .1, 'highly_variable'] = False
adata.var['highly_variable'].value_counts()

In [None]:
# Same filter for the S score: genes with per-gene R^2 > 0.1 are dropped from
# the highly variable set, then the updated counts are shown.
adata.var.loc[adata.var['S_score'] > .1, 'highly_variable'] = False
adata.var['highly_variable'].value_counts()

In [None]:
# Build the MIRA expression topic model with batch as the only covariate; the
# cell-cycle scores are deliberately NOT passed as continuous covariates here.
model = mira.topics.make_model(
    adata.n_obs,
    adata.n_vars,  # dataset dimensions help MIRA choose reasonable values for untuned hyperparameters
    feature_type='expression',
    highly_variable_key='highly_variable',  # disabled alternative: 'TF'
    counts_layer='original_counts',
    categorical_covariates='batch',
    # continuous_covariates=['S_score', 'G2M_score'] is disabled for this run
    # max_learning_rate=0.1 is intentionally not passed here
)

In [None]:
# Compute learning-rate bounds for the rebuilt model on this dataset; explicit
# bounds are set and the result is plotted in the next cell.
model.get_learning_rate_bounds(adata)

In [None]:
# Set the learning-rate range explicitly (presumably lower and upper bound, in
# that order — confirm against MIRA's docs), then plot the bounds.
model.set_learning_rates(1e-3, 0.1) # for larger datasets, the default of 1e-3, 0.1 usually works well.
model.plot_learning_rate_bounds(figsize=(7,3))

In [None]:
## quick loop to try out some different parameters
# For every topic count, write one PDF in which each page is the UMAP obtained
# after refitting the model with a given number of epochs.
from matplotlib.backends.backend_pdf import PdfPages

topics = [7,8,9]
epochs = [80,90,100,110,125,150,175]
output_dir = 'figures_ignore/mouse/mm39/mira_integration_tuning/andrea_ecto/'
# PdfPages cannot create missing folders, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)
# Loop-invariant figure size, set once instead of on every iteration.
plt.rcParams['figure.figsize'] = [5,4]
for topic in topics:
    with PdfPages(output_dir+str(topic)+'_trials.pdf') as pdf:
        for epoch in epochs:
            # Refit with the current settings and recompute the embedding.
            model = model.set_params(num_topics = topic,num_epochs = epoch).fit(adata)
            model.predict(adata,)
            sc.pp.neighbors(adata, use_rep = 'X_umap_features', metric = 'manhattan',n_neighbors=15)
            sc.tl.umap(adata, )

            ax = sc.pl.umap(adata, color = ['sample','phase','Fezf2','Aldh1a3','Foxa1','Ascl1','Maob','Pcp4','Fgf8'], cmap = reds,ncols = 2, vmin = 0.05, show = False)
            for p in ax:
                p.set_rasterized(True)  # rasterize the panels to keep the PDF small
            fig = ax[0].figure
            # Label the page so each PDF page can be matched to its settings.
            fig.suptitle(f'{topic} topics, {epoch} epochs', y=1.0, va='bottom')
            # Save and close this exact figure rather than relying on
            # matplotlib's implicit "current figure".
            pdf.savefig(fig, dpi=150,bbox_inches='tight')
            plt.close(fig)

In [None]:
# Run MIRA's gradient_tune on the model and data; the returned
# topic_contributions are plotted in the next cell.
# Note: `model` here is whatever the parameter sweep above last assigned.
topic_contributions = mira.topics.gradient_tune(model, adata)

In [None]:
# Topic count passed to the contribution plot.
NUM_TOPICS = 8

mira.pl.plot_topic_contributions(topic_contributions, NUM_TOPICS)

In [None]:
# Final fit of the model without cell-cycle covariates: set the topic and
# epoch counts, then train on adata.
NUM_TOPICS = 8 #8 and no cell cycle correction
model = model.set_params(num_topics = NUM_TOPICS,num_epochs = 100).fit(adata)# NOTE(review): a trailing "85" was noted here, presumably an earlier epoch count - confirm
#model = model.set_params(num_topics = NUM_TOPICS).fit(adata)


In [None]:
# Apply the fitted model to adata.
# NOTE(review): the next cell reads 'X_umap_features', presumably written by
# this call — confirm against MIRA's docs.
model.predict(adata,)

In [None]:
# Build the neighbor graph on the 'X_umap_features' representation (manhattan
# metric, 15 neighbors) and compute a UMAP with scanpy's default settings.
# The commented-out calls are alternative UMAP settings, currently disabled.
sc.pp.neighbors(adata, use_rep = 'X_umap_features', metric = 'manhattan',n_neighbors=15)
#sc.tl.umap(adata, min_dist=0.1, negative_sample_rate=0.05,)
#sc.tl.umap(adata, min_dist=0.3, negative_sample_rate=0.05,n_components =3)
sc.tl.umap(adata, )



In [None]:
# Leiden clustering with scanpy's default arguments; the commented-out call is
# a disabled higher-resolution variant stored under a separate key.
sc.tl.leiden(adata)
#sc.tl.leiden(adata,resolution = 2, key_added = 'leiden_high')


In [None]:
# UMAP coloured by the 'sample', 'batch' and 'phase' annotations.
sc.pl.umap(adata, color = ['sample','batch','phase'],)

In [None]:
# UMAP coloured by 'sample', restricted to the '14' category via `groups`.
sc.pl.umap(adata, color = ['sample'],groups = ['14'])

In [None]:
# Expression UMAPs for four genes, using the `reds` colormap with a lower
# colour limit of 0.05.
sc.pl.umap(adata, color = ['Bmp4','Fgf8','Aldh1a3','Acsm4',],cmap = reds, vmin = 0.05)

In [None]:
# Save the object from the run without cell-cycle covariates to its own
# '_integrated_nocycle' h5ad file.
adata.write('h5ad_files/mouse/ecto_andrea/ecto_nasal_placode_derived_concat_integrated_nocycle.h5ad')

In [None]:
#This cluster 28 is Oc90 positive, clearly some inner ear cells
# Re-cluster at resolution 2 and plot the result. No key_added is given, so
# this replaces the clustering computed earlier; it runs after the object was
# written to disk, so the saved file keeps the earlier clustering.
# NOTE(review): the cluster label '28' refers to this clustering and may change
# if the clustering is rerun with different inputs — confirm before relying on it.
sc.tl.leiden(adata, resolution = 2)
sc.pl.umap(adata, color = 'leiden')

In [None]:
# Oc90 expression on the UMAP, using the `reds` colormap with a lower colour
# limit of 0.05.
sc.pl.umap(adata, color = 'Oc90',cmap = reds, vmin = 0.05)

In [None]:
# Rank genes per Leiden cluster with the Wilcoxon method, compute the cluster
# dendrogram, then draw a dot plot of log fold changes for the top 10 genes of
# cluster '28' (minimum log fold change 2, colour scale clipped to [-4, 4]).
sc.tl.rank_genes_groups(adata, groupby='leiden', method='wilcoxon')
sc.tl.dendrogram(adata, groupby='leiden')
sc.pl.rank_genes_groups_dotplot(
    adata,
    groups=['28'],
    n_genes=10,
    values_to_plot='logfoldchanges',
    cmap='bwr',
    vmin=-4,
    vmax=4,
    min_logfoldchange=2,
)