Author: Erno Hänninen

Created: 25.02.2023

Title: run_scanvi.ipynb

Description: 
- Run scANVI and evaluate the reference dataset and model

Procedure
- Take the scVI-integrated data and model as input
- Train the scANVI model
- Evaluate the scANVI model by predicting the cell types of the reference and plotting the results as a scatter plot and a Sankey plot
- Evaluate the scANVI integration by visualizing it on a force-directed graph drawing (initialized using the PAGA graph)
- Explore marker gene expression of the reference using feature plots and dot plots

List of non-standard modules:
- scanpy, scvi, matplotlib, pandas, seaborn, numpy, sankey

Conda environment used:
- PYenv

Usage:
- The script was executed using Jupyter Notebook web interface. All the dependencies required by Jupyter are installed to PYenv Conda environment. See README file for further details

In [1]:
# Import packages
# The thread limits are exported BEFORE the numerical libraries are imported:
# MKL / OpenMP / numexpr typically read these variables when they are first
# loaded, so setting them after `import numpy` (as was done previously) may
# have no effect.
import os
os.environ["MKL_NUM_THREADS"] = "20"
os.environ["NUMEXPR_NUM_THREADS"] = "20"
os.environ["OMP_NUM_THREADS"] = "20"

import scanpy as sc
import scvi
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.colorbar as colorbar
import matplotlib.colors as clr
import matplotlib.pyplot as plt

from pySankey import sankey

Global seed set to 0
  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the scVI-integrated AnnData (contains the hypothalamic nuclei labels)
# and restore the pretrained scVI model against that object
adata = sc.read("Data/scvi_subtypes.h5ad")
vae = scvi.model.SCVI.load("scvi_model", adata=adata)

## run scANVI

In [None]:
# Build the scANVI model on top of the pretrained scVI model.
# Labels are read from "Cell_subpopulations_updated"; the category "Unknown"
# is declared as the unlabeled one.
scanvi_label_setup = {
    "labels_key": "Cell_subpopulations_updated",
    "unlabeled_category": "Unknown",
}
lvae = scvi.model.SCANVI.from_scvi_model(vae, adata=adata, **scanvi_label_setup)
lvae

In [None]:
# Train the scANVI model (early stopping monitors the training ELBO)
lvae.train(
    max_epochs=60,
    train_size=0.72,
    batch_size=502,
    early_stopping=True,
    early_stopping_monitor="elbo_train",
)

In [None]:
# Convergence check: draw the training and validation ELBO on the same axes.
# The first recorded training value is left out of the curve.
train_elbo = lvae.history["elbo_train"].iloc[1:]
test_elbo = lvae.history["elbo_validation"]
ax = train_elbo.plot()
test_elbo.plot(ax=ax)

In [26]:
# Persist the AnnData object and the trained scANVI model for later sessions
adata.write("Data/scanvi_adata.h5ad")
lvae.save("scanvi_model", overwrite=True)

In [3]:
#adata=sc.read("Data/scanvi_adata.h5ad")
#lvae = scvi.model.SCANVI.load("scanvi_model",adata)

[34mINFO    [0m File scanvi_model/model.pt already downloaded                                                             


No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)


## Evaluating scANVI model by predicting the cell types of reference

In [4]:
# Sanity check: predict the cell types of the very dataset the model was trained on.
# With soft=True the prediction is a dataframe (one row per cell, one column per label).
df = lvae.predict(adata, soft=True)

# Per cell: the label with the highest value and that value itself
cell_prob = pd.DataFrame(
    {
        "Cell_type": list(df.idxmax(axis=1)),
        "Probability": list(df.max(axis=1)),
    }
)

# Calls whose top probability is below 0.6 are relabelled "Unknown"
cell_prob["Cell_type"] = cell_prob["Cell_type"].mask(cell_prob["Probability"] < 0.6, "Unknown")

# Assign by position (cell_prob has a fresh integer index, adata.obs does not)
adata.obs["Predictions"] = cell_prob["Cell_type"].to_numpy()

In [6]:
# Build a (true cell type) x (predicted label) table of percentages.
true_labels = adata.obs["Cell_subpopulations_updated"]
cells = list(true_labels.cat.categories)

res_dict = {}
for cell in cells:
    # Predictions made for the cells annotated as this cell type
    is_cell = true_labels == cell
    n_cells = int(is_cell.sum())
    counts = adata.obs.loc[is_cell, "Predictions"].value_counts().to_dict()

    # Convert counts to percentages (2 decimals); keep only shares above 1 %
    shares = {label: round((count / n_cells * 100), 2) for label, count in counts.items()}
    res_dict[cell] = {label: share for label, share in shares.items() if share > 1}

# Rows = true cell types, columns = predicted labels; absent combinations become 0
prediction_df = pd.DataFrame.from_dict(res_dict, orient="index").fillna(0)

# Order the rows by their largest percentage, ascending
row_order = prediction_df.max(axis=1).sort_values(ascending=True).index
prediction_df = prediction_df.reindex(row_order)

In [8]:
# Build a per-cell-type summary: percentage of correct predictions, number of
# cells, and the broader group the cell type belongs to.

# Every true cell type needs a same-named prediction column; add a zero column
# for types that never appeared as a (>1 %) prediction
difference = [x for x in prediction_df.index if x not in prediction_df.columns]
for item in difference:
    prediction_df[item] = 0

# The "diagonal" entry (row label == column label) is the share of cells whose
# prediction matches their annotation
correct_dict = {cell_type: prediction_df.at[cell_type, cell_type] for cell_type in prediction_df.index}

# Number of cells annotated with each cell type
cell_labels = adata.obs["Cell_subpopulations_updated"]
cells = list(cell_labels.cat.categories)
num_cells_dict = {cell: int((cell_labels == cell).sum()) for cell in cells}

# Combine both dictionaries into one dataframe indexed by cell type
correct_prediction_df = pd.concat(
    [pd.Series(correct_dict), pd.Series(num_cells_dict)], axis=1
).fillna(0)
correct_prediction_df.columns = ["correct_predictions", "num_cells"]
correct_prediction_df["cell_type"] = correct_prediction_df.index

# Broader group of each cell type (used to colour the scatter plot).
# Cell types missing from this mapping keep their own name as group.
cell_group_by_type = {
    "Tanycytes": "Neuronal lineage",
    "TM": "Neurons",
    "PVN": "Neurons",
    "SMN": "Neurons",
    "VMH": "Neurons",
    "LHA": "Neurons",
    "MN": "Neurons",
    "ARC": "Neurons",
    "NP": "Neuronal lineage",
    "Astrocyte": "Neuronal lineage",
    "RadialGlia": "Neuronal lineage",
    "OPC": "Neuronal lineage",
    "Oligo": "Neuronal lineage",
    "Ependy": "Neuronal lineage",
    "Neuron": "Neurons",
    "Mural": "Non-neuronal lineage",
    "Endoth": "Non-neuronal lineage",
    "Microglia": "Non-neuronal lineage",
    "VLMC": "Non-neuronal lineage",
}
correct_prediction_df["cell_group"] = correct_prediction_df["cell_type"].replace(cell_group_by_type)
correct_prediction_df = correct_prediction_df.sort_values(by="correct_predictions", ascending=True)

In [None]:
# Scatter plot: x axis = proportion of correct predictions, y axis = number of
# cells of that cell type; points are coloured by cell group.
# Shows whether the number of cells has an impact on the prediction accuracy.

# To keep the plot readable only some points are annotated; the cell types
# dropped here are plotted but not labelled.
filtered_df = correct_prediction_df.drop(["TM", "PVN", "SMN", "VMH", "MN", "Mural", "Endoth", "Microglia", "VLMC"])

# plt.savefig does not create missing directories
os.makedirs("figures", exist_ok=True)

with plt.rc_context({"figure.dpi": (600)}):
    # Create the scatterplot on the axes created here
    fig, ax = plt.subplots()
    sns.scatterplot(data=correct_prediction_df, x="correct_predictions", y="num_cells", hue="cell_group", s=12, ax=ax)
    plt.legend(loc='upper left', borderaxespad=0.2)
    plt.xlabel("Correct prediction proportion (%)")
    plt.ylabel("Number of cells")

    # Annotate the points. Values are iterated directly instead of being looked
    # up with an integer position on a string-labelled Series (deprecated in pandas).
    for txt, x, y in zip(filtered_df["cell_type"], filtered_df["correct_predictions"], filtered_df["num_cells"]):
        if txt == "RadialGlia":
            # Shift the radial glia label to the upper left of its point
            ax.annotate(" " + txt, (x - 4.8, y + 400), fontsize=10, horizontalalignment="center")
        elif txt in ("Tanycytes", "Astrocyte"):
            # Padded label with an arrow pointing at the point
            ax.annotate("              " + txt, (x, y), fontsize=10, arrowprops=dict(arrowstyle='->', lw=0.3, fc="k"))
        elif txt == "Ependy":
            ax.annotate("           " + txt, (x, y), fontsize=10, arrowprops=dict(arrowstyle='->', lw=0.3))
        elif txt in ("Neuron", "NP", "ARC", "Oligo"):
            # Centered label above the point
            ax.annotate(" " + txt, (x, y + 400), fontsize=10, horizontalalignment="center")
        else:
            # Default: label just to the right of the point
            ax.annotate("" + txt, (x + 1, y), fontsize=10)
    plt.savefig("figures/prediction_scatter.png", dpi=600, bbox_inches='tight')
plt.show()

In [None]:
# Sankey plot of annotated cell types (left) against predicted labels (right)

# Colour per label
colorDict = {
    "ARC": "#EDCABE",
    "LHA": "#6cb16d",
    "VMH": "#f7c701",
    "PVN": "#EBB8DD",
    "Tanycytes": "#56c596",
    "TM": "#cc6677",
    "MN": "#723d46",
    "SMN": "#5c9090",
    "Neuron": "#FF7F0E",
}
colorDict.update(dict.fromkeys(["Microglia", "Unknown", "Endoth"], "#279E68"))
colorDict.update(dict.fromkeys(["NP", "Astrocyte", "OPC", "RadialGlia", "Oligo", "Ependy"], "#1F77B4"))

# Only the cell types the model had difficulties with are shown; cells
# annotated with any of the types below are excluded from the plot
excluded_types = ["Mural", "Endoth", "Microglia", "VLMC", "Ependy", "Neuron", "NP", "Oligo", "RadialGlia", "OPC", "Astrocyte"]
keep_cells = ~adata.obs["Cell_subpopulations_updated"].isin(excluded_types)
adata_sankey = adata[keep_cells]

with plt.rc_context({"figure.dpi": 300}):
    sankey.sankey(
        left=adata_sankey.obs["Cell_subpopulations_updated"],
        right=adata_sankey.obs["Predictions"],
        leftLabels=["PVN", "LHA", "VMH", "MN", "SMN", "TM", "ARC", "Tanycytes"],
        rightLabels=["Neuron", "ARC", "NP", "Astrocyte", "OPC", "RadialGlia", "Oligo", "Ependy", "Microglia", "Unknown", "Endoth"],
        colorDict=colorDict,
        rightColor=False,
        fontsize=12,
    )
    # Save the current figure with a white background
    fig = plt.gcf()
    fig.set_facecolor("w")
    fig.savefig("figures/predictions_nuclei.png", bbox_inches="tight", dpi=400)

# Plotting the batch corrected embedding

In [18]:
# Keep the scANVI latent representation in adata.obsm and build the
# neighborhood graph on that representation
scanvi_latent = lvae.get_latent_representation()
adata.obsm["X_scANVI"] = scanvi_latent
sc.pp.neighbors(adata, use_rep="X_scANVI")

In [None]:
# Compute the PAGA graph over the cell subpopulations and plot it
sc.tl.paga(adata, groups="Cell_subpopulations_updated")
paga_rc = {"figure.figsize": [9, 9], "figure.dpi": 400}
with plt.rc_context(paga_rc):
    sc.pl.paga(
        adata,
        threshold=0.086,
        node_size_scale=3,
        edge_width_scale=0.8,
        frameon=False,
        save="hypo.png",
    )

In [None]:
# Compute the force-directed graph layout (initialised from the PAGA positions)
# and plot it coloured by cell type
sc.tl.draw_graph(adata, init_pos="paga", random_state=2)
with plt.rc_context({"figure.dpi": 400}):
    sc.pl.draw_graph(
        adata,
        color=["Cell_types_4"],
        title="",
        frameon=False,
        legend_loc="on data",
        legend_fontsize="x-small",
        legend_fontweight="semibold",
        save="_Cell_types.png",
    )


In [None]:
# Feature plots of the markers used to identify radial glia (HOPX, EGFR) and
# tanycytes (CRYM, RAX), drawn with use_raw=True.
# The second panel title previously read "EGRF"; the plotted gene is EGFR.
with plt.rc_context({"figure.dpi": (400)}):
    sc.pl.draw_graph(adata, use_raw=True, color=["HOPX","EGFR","CRYM","RAX"],frameon=False, ncols=2, title=["HOPX - RadialGlia","EGFR - RadialGlia", "CRYM - Tanycytes","RAX - Tanycytes"],
                     colorbar_loc=None, save="_rg_tan_ex_.png")

In [None]:
# Force-directed layout coloured by the dataset each cell comes from
with plt.rc_context({"figure.dpi": 400}):
    sc.pl.draw_graph(
        adata,
        color=["source"],
        title="",
        frameon=False,
        legend_fontsize="x-small",
        legend_fontweight="semibold",
        save="_source.png",
    )


In [None]:
# Dot plot of marker gene expression, grouped by cell type
markers = {
    "Astrocyte": ["GFAP", "AQP4"],
    "RadialGlia": ["HOPX", "EGFR"],
    "OPC": ["OLIG1", "OLIG2"],
    "Oligo": ["PLP1", "MBP"],
    "Neuron": ["STMN2", "SYT1"],
    "Tanycytes": ["CRYM", "RAX"],
    "NP": ["NES", "MKI67"],
    "Ependy": "CCDC153",
    "VLMC": "COL1A1",
    "Microglia": "AIF1",
    "Endoth": "CLDN5",
    "Mural": "NDUFA4L2",
}
dotplot_rc = {"figure.dpi": 400, "font.size": "15"}
with plt.rc_context(dotplot_rc):
    sc.pl.dotplot(adata, markers, groupby="Cell_types_4", dot_min=0.12, save="_reference_dotplot.png")

# Plot hypothalamic nuclei markers










In [None]:
# Keep the hypothalamic nuclei plus the generic "Neuron" population and plot
# them on the force-directed layout
nuclei_and_neurons = ["VMH", "ARC", "LHA", "MN", "PVN", "SMN", "TM", "Neuron"]
adata_neurons = adata[adata.obs["Cell_subpopulations_updated"].isin(nuclei_and_neurons)]
with plt.rc_context({"figure.dpi": 400}):
    sc.pl.draw_graph(
        adata_neurons,
        color=["Cell_subpopulations_updated"],
        title="Hypothalamic nuclei",
        frameon=False,
        legend_fontsize="small",
        save="_neuronal_populations_2.png",
    )
    
# Restrict to the hypothalamic nuclei only (the "Neuron" population is left out)
adata_neurons = adata[adata.obs["Cell_subpopulations_updated"].isin(["VMH", "ARC", "LHA", "MN", "PVN", "SMN", "TM"])]

# One feature plot per marker gene; each title names the gene and its nucleus
title_list = ["PITX2 - SMN/MN", "LMX1A - SMN", "HDC - TM", "HCRT - LHA", "SIM1 - PVN", "AVP - PVN", "FEZF1 - VMH", "NR5A1 - VMH", "TBX3 - ARC", "GHRH - ARC", "FOXB1 - MN", "FOXA1 - SMN", "LHX1 - MN"]
gene_list = ["PITX2", "LMX1A", "HDC", "HCRT", "SIM1", "AVP", "FEZF1", "NR5A1", "TBX3", "GHRH", "FOXB1", "FOXA1", "LHX1"]
for gene, gene_title in zip(gene_list, title_list):
    with plt.rc_context({"figure.dpi": 400}):
        sc.pl.draw_graph(
            adata_neurons,
            color=gene,
            title=gene_title,
            use_raw=True,
            frameon=False,
            legend_fontsize="small",
            colorbar_loc=None,
            save=f"_{gene}.png",
        )

# Standalone vertical viridis colorbar with qualitative "Low"/"High" tick
# labels, saved as its own figure
fig = plt.figure()
ax = fig.add_axes([0.3, 0.05, 0.03, 0.3])
cb = colorbar.ColorbarBase(ax, cmap="viridis", orientation="vertical")
cb.set_ticks([0, 1])
cb.ax.set_yticklabels(["Low", "High"])
plt.savefig("figures/viridis_colorbar.png", bbox_inches="tight", dpi=450)

In [None]:
# Write the scANVI-integrated data, now including the draw-graph layout, to disk
adata.write(filename="Data/adata_draw_graph.h5ad")