In [None]:
import logging
logging.getLogger('matplotlib.font_manager').setLevel(logging.ERROR)
import scanpy as sc
import anndata as ad
import scvelo as scv
import scvi
import seaborn as sns
import plotly.express as px
import numpy as np
from dash import Dash, dcc, html, Input, Output

import pandas as pd
import warnings
import os
import sys
import time
import gc
# Point R_HOME at the R installation that lives under the running Python prefix.
os.environ['R_HOME'] = f"{sys.exec_prefix}/lib/R/"

# Plotting
import matplotlib
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib.colors import LinearSegmentedColormap, ListedColormap
from matplotlib.lines import Line2D 

from copy import copy
# Customise a private copy of the "Reds" colormap rather than the shared instance.
reds = copy(mpl.cm.Reds)
# Colour used for values that fall below the lower colour limit (vmin) of a plot.
reds.set_under("lightgray")

# Project folder, expressed relative to the current user's home directory.
# Plain concatenation is intentional: the value starts with "/", so os.path.join
# would discard the home-directory part.
project_directory = '/Cranio_Lab/Louk_Seton/4_species_project'
home_directory = os.path.expanduser("~")
os.chdir(home_directory + project_directory)

In [None]:
# NOTE(review): the name is misspelled ("ouput"); it is kept unchanged because
# other cells may refer to it.
ouput_dir = 'h5ad_files/mouse/ecto_andrea/'

# Read the combined AnnData object from that directory.
combined_h5ad_path = ouput_dir + 'ecto_combined_kaucka_all.h5ad'
adata = sc.read(combined_h5ad_path)

In [None]:
# Keep only the cells whose 'sample' label is one of the listed IDs.
# This overwrites `adata`; re-running the cell on the subset is a no-op.
samples_to_keep = ['10', '11', '12', '13', '14']
sample_mask = adata.obs['sample'].isin(samples_to_keep)
adata = adata[sample_mask].copy()

In [None]:
# Restart from the stored raw counts so the cell gives the same result when re-run.
adata.X = adata.layers['original_counts'].copy()
sc.pp.normalize_total(adata)  # normalise to the median total counts (no target_sum given)
sc.pp.log1p(adata)  # log(1 + x) transform
adata.layers["normalized_counts"] = adata.X.copy()  # keep the log-normalised matrix

# Highly variable genes, using 'sample' as the batch key.
sc.pp.highly_variable_genes(
    adata,
    n_top_genes=300,
    batch_key='sample',
)

# Dimensionality reduction: PCA, then the variance ratio per component on a log scale.
sc.tl.pca(adata)
sc.pl.pca_variance_ratio(adata, log=True)


In [None]:
# Neighbourhood graph on the first 8 PCs, then UMAP embedding and Leiden clustering.
sc.pp.neighbors(adata, n_pcs=8)
sc.tl.umap(adata)
sc.tl.leiden(adata, resolution=1.2, key_added='leiden_post_QC')

# Overview of sample, cell-cycle phase and the new clusters on the UMAP.
sc.pl.umap(
    adata,
    color=['sample', 'phase', 'leiden_post_QC'],
    ncols=3,
    size=50,
    legend_loc='on data',
    cmap=reds,
    vmin=0.05,
)

In [None]:
# UMAP coloured by cell-cycle phase and by Bard1 expression.
sc.pl.umap(
    adata,
    color=['phase', 'Bard1'],
    ncols=3,
    size=50,
    legend_loc='on data',
    cmap=reds,
    vmin=0.05,
)

In [None]:
# Principal components 3 and 4, coloured by cell-cycle phase.
sc.pl.pca(adata, color='phase', components='3,4')

In [None]:
# Gene loadings of principal components 3 and 4.
sc.pl.pca_loadings(adata, components='3,4')

In [None]:
# Rebuild the log-normalised matrix from the raw counts.
# NOTE(review): these are the same four steps as the earlier normalisation cell;
# confirm whether this repeat is still needed.
adata.X = adata.layers['original_counts'].copy()
sc.pp.normalize_total(adata)  # normalise to the median total counts (no target_sum given)
sc.pp.log1p(adata)  # log(1 + x) transform
adata.layers["normalized_counts"] = adata.X.copy()

In [None]:
# Wilcoxon marker-gene ranking per cell-cycle phase, plus a dendrogram over the phases.
sc.tl.rank_genes_groups(adata, groupby='phase', method='wilcoxon')
sc.tl.dendrogram(adata, groupby='phase')

# Dot plot of log fold changes for the top 30 genes of the G2M and S groups.
sc.pl.rank_genes_groups_dotplot(
    adata,
    groups=['G2M', 'S'],
    n_genes=30,
    values_to_plot='logfoldchanges',
    min_logfoldchange=2,
    cmap='bwr',
    vmin=-4,
    vmax=4,
)

In [None]:
# Ranking results for all groups as one table, then keep only rows that
# belong to G2M or S, have log fold change > 1.5 and score > 5.
cell_cycle_df = sc.get.rank_genes_groups_df(adata, group=None)
is_cycling_group = cell_cycle_df['group'].isin(['G2M', 'S'])
is_strong_lfc = cell_cycle_df['logfoldchanges'] > 1.5
is_high_score = cell_cycle_df['scores'] > 5
cell_cycle_df = cell_cycle_df[is_cycling_group & is_strong_lfc & is_high_score]
cell_cycle_df

In [None]:
import seaborn as sns
# Score versus log fold change for the filtered genes, with 40-bin marginal histograms.
sns.jointplot(
    data=cell_cycle_df,
    x="scores",
    y="logfoldchanges",
    marginal_kws={"bins": 40},
)

In [None]:
# Differential expression of one group against a single reference group.
cluster_of_interest = 'G2M'
reference_cluster = 'G1'
cat_name = 'phase'
sc.set_figure_params(dpi=50, figsize=[6, 4])

# Step 1: rank genes for the cluster of interest against the reference only,
# and keep the genes with log fold change > 1.
sc.tl.rank_genes_groups(
    adata,
    cat_name,
    groups=[cluster_of_interest],
    reference=reference_cluster,
    method='wilcoxon',
)
df = sc.get.rank_genes_groups_df(adata, group=None)
df = df[df['logfoldchanges'] > 1]

# Step 2: re-run the ranking over all phases (this replaces the pairwise result),
# then plot the first 50 genes selected in step 1.
sc.tl.rank_genes_groups(adata, groupby='phase', method='wilcoxon')
genes_to_plot = list(df['names'].head(50))
sc.pl.rank_genes_groups_dotplot(
    adata,
    var_names=genes_to_plot,
    dendrogram=True,
    values_to_plot="logfoldchanges",
    cmap='bwr',
    vmin=-4,
    vmax=4,
)


In [None]:
# Differential expression of one group against a single reference group.
cluster_of_interest = 'S'
reference_cluster = 'G1'
cat_name = 'phase'
sc.set_figure_params(dpi=50, figsize=[6, 4])

# Step 1: rank genes for the cluster of interest against the reference only,
# and keep the genes with log fold change > 1.
sc.tl.rank_genes_groups(
    adata,
    cat_name,
    groups=[cluster_of_interest],
    reference=reference_cluster,
    method='wilcoxon',
)
df = sc.get.rank_genes_groups_df(adata, group=None)
df = df[df['logfoldchanges'] > 1]

# Step 2: re-run the ranking over all phases (this replaces the pairwise result),
# then plot the first 50 genes selected in step 1.
sc.tl.rank_genes_groups(adata, groupby='phase', method='wilcoxon')
genes_to_plot = list(df['names'].head(50))
sc.pl.rank_genes_groups_dotplot(
    adata,
    var_names=genes_to_plot,
    dendrogram=True,
    values_to_plot="logfoldchanges",
    cmap='bwr',
    vmin=-4,
    vmax=4,
)


In [None]:
from sklearn.metrics import r2_score
gene = 'Ung'
# Restrict to the cells in which the gene has a value > 0.
gene_view = adata[:, gene]
expressing_cells = gene_view[gene_view.X > 0]
# NOTE(review): r2_score(y_true, y_pred) is not symmetric; here expression is passed
# as y_true and S_score as y_pred — confirm this is the intended comparison.
r2_score(
    expressing_cells.X.toarray().flatten(),
    expressing_cells.obs['S_score'],
)

In [None]:
# Empty result holder; nothing in the visible cells reads it — TODO confirm it is still needed.
test = {
    'S_r2': [],
    'G2M_r2': [],
}

In [None]:
# Scratch check of list concatenation; the result is displayed but not stored.
['2','5']+['3']

In [None]:
from sklearn.linear_model import LinearRegression

# R^2 of a one-feature linear regression (gene expression -> cell-cycle score),
# computed for every gene and for both score columns in adata.obs.
r2_dict = {'S_score': [], 'G2M_score': []}

# The obs columns are the same for every gene, so look them up once.
phase_targets = {score_name: adata.obs[score_name] for score_name in r2_dict}

for gene in list(adata.var.index):
    # Dense expression column for this gene, built once and reused for both scores.
    X = adata[:, gene].X.toarray()
    for phase, y in phase_targets.items():
        reg = LinearRegression().fit(X, y)
        # append() instead of `lst = lst + [...]`, which copied the whole list every time.
        r2_dict[phase].append(reg.score(X, y))

r2_dict

In [None]:
# Number of genes whose R^2 against G2M_score exceeds 0.1.
sum(1 for score in r2_dict['G2M_score'] if score > .1)

In [None]:
r2_dict['S_score']  # evaluated but not displayed: it is not the last expression of the cell
# Distribution of the G2M_score R^2 values above 0.1, in 30 bins.
g2m_r2_above_threshold = [score for score in r2_dict['G2M_score'] if score > .1]
sns.histplot(g2m_r2_above_threshold, bins=30)

In [None]:
##do linear regression for genes and obs variable
#mostly made this to identify other cell cycle genes
from sklearn.linear_model import LinearRegression
def do_reg(gene, variable):
    """Fit a linear regression of an obs column on one gene's expression.

    Reads the module-level ``adata`` object.

    Parameters
    ----------
    gene
        Variable (gene) name used to select a single column of ``adata``.
    variable
        Name of the ``adata.obs`` column used as the regression target.

    Returns
    -------
    The value of ``LinearRegression.score`` (R^2) for the fitted model,
    evaluated on the same data it was fitted on.
    """
    gene_view = adata[:, gene]
    expression = gene_view.X.toarray()  # the .toarray() call assumes a sparse X
    target = gene_view.obs[variable]
    model = LinearRegression().fit(expression, target)
    return model.score(expression, target)

#now parallelize the function
from multiprocessing import Pool
import itertools

def do_reg_parallel(gene_list, variable, n_threads):
    """Run ``do_reg`` for many genes in a pool of worker processes.

    Parameters
    ----------
    gene_list
        Iterable of gene names; one ``do_reg`` call is made per entry.
    variable
        Name of the ``adata.obs`` column passed to every ``do_reg`` call.
    n_threads
        Number of workers. Despite the name these are processes
        (``multiprocessing.Pool``), not threads.

    Returns
    -------
    list
        One ``do_reg`` result per gene, in the order of ``gene_list``.

    Notes
    -----
    ``do_reg`` reads the module-level ``adata``, so the worker processes must be
    able to see it — presumably via the fork start method; verify on platforms
    that spawn workers instead.
    """
    # starmap unpacks each (gene, variable) pair into do_reg's two arguments.
    tasks = zip(gene_list, itertools.repeat(variable))
    with Pool(n_threads) as p:
        results = p.starmap(do_reg, tasks)
        # Shut the workers down before returning. In the original these two calls
        # came after `return` and were never executed.
        p.close()
        p.join()
    return results

In [None]:
# Store the per-gene regression results in adata.var, one column per score.
# The columns get the same names as the obs columns they were regressed against.
N_WORKERS = 50  # size of the worker pool passed to do_reg_parallel
for score_column in ['G2M_score', 'S_score']:
    per_gene_r2 = do_reg_parallel(adata.var.index, score_column, N_WORKERS)
    adata.var[score_column] = per_gene_r2


In [None]:
# Dense expression values of Top2a across all cells (displayed in full).
adata[:, 'Top2a'].X.toarray()

In [None]:
# Force a garbage-collection pass; the cell displays the value returned by gc.collect().
gc.collect()

In [None]:
# Single-gene check: regress S_score on the expression of one gene and show the R^2.
# This sets `gene`, `X`, `y` and `reg` at module level.
gene = 'Ptn'

from sklearn.linear_model import LinearRegression
gene_view = adata[:, gene]
X = gene_view.X.toarray()
y = gene_view.obs['S_score']
reg = LinearRegression().fit(X, y)
reg.score(X, y)

In [None]:
adata[:, 'Ung'].X  # evaluated but not displayed: it is not the last expression of the cell
# Summary of the single-gene AnnData view for Ung.
adata[:, 'Ung']

In [None]:
# Expression of `gene` versus S_score, restricted to cells where the gene has a value > 0.
gene_view = adata[:, gene]
expressing_cells = gene_view[gene_view.X > 0]
sns.jointplot(
    x=expressing_cells.X.toarray().flatten(),
    y=expressing_cells.obs['S_score'],
    marginal_kws={"bins": 40},
)

In [None]:

# UMAP coloured by Leiden cluster, restricted to the listed clusters via `groups`.
sc.pl.umap(
    adata,
    color=['leiden_post_QC'],
    groups=['0', '5', '6', '7', '8', '13', '14', '15', '16', '18', '20'],
    ncols=3,
    size=50,
    legend_loc='on data',
    cmap=reds,
    vmin=0.05,
)

In [None]:
# Wilcoxon marker-gene ranking per Leiden cluster, plus a dendrogram over the clusters.
sc.tl.rank_genes_groups(adata, groupby='leiden_post_QC', method='wilcoxon')
sc.tl.dendrogram(adata, groupby='leiden_post_QC')

# Dot plot of log fold changes for the top 30 genes of cluster 20.
sc.pl.rank_genes_groups_dotplot(
    adata,
    groups=['20'],
    n_genes=30,
    values_to_plot='logfoldchanges',
    min_logfoldchange=2,
    cmap='bwr',
    vmin=-4,
    vmax=4,
)

In [None]:
# Put the raw counts back into X before saving.
adata.X = adata.layers['original_counts'].copy()

# Keep only the listed Leiden clusters (the same list as in the UMAP cell).
# This overwrites `adata`, and the cluster IDs are only valid for this clustering run.
clusters_to_keep = ['0', '5', '6', '7', '8', '13', '14', '15', '16', '18', '20']
cluster_mask = adata.obs['leiden_post_QC'].isin(clusters_to_keep)
adata = adata[cluster_mask].copy()

adata.write('h5ad_files/mouse/ecto_andrea/ecto_nasal_placode_derived_2023.h5ad')