# Drosophila Mature Neuronal Subset scVelo Analysis

### **Date:** 01 Aug 2025

In [None]:
# =============================================================================
# Import Necessary Packages
# =============================================================================
import numpy as np
import os
import scanpy as sc
import anndata as ad
import scvelo as scv

# Additional packages for plotting, logging, etc.
import pandas as pd
import matplotlib.pyplot as plt
import re
import logging
from scipy.stats import median_abs_deviation

In [None]:
# =============================================================================
# Initialise Environment Settings
# =============================================================================
# Seeds NumPy's global RNG only.
np.random.seed(12345)  # Set random seed for reproducibility

# Set working directory. The h5ad checkpoints, gene lists and GO tables written
# later in this notebook use relative file names, so they resolve against it.
working_directory = "/DataDrives/Drive2/Clifton/R_Projects/2025_Drosophila_scRNAseq_MonoamineSpecification/ANALYSIS/Step_3_scVelo/Step_3.2_Mature_neuronal_subset_analysis"
os.chdir(working_directory)
print(f"Current working directory: {os.getcwd()}")


In [None]:
# Configure Scanpy settings: raise the logging verbosity to level 3 and print
# the Scanpy header so the run log records the environment it came from.
sc.settings.verbosity = 3
sc.logging.print_header()

In [None]:
# Configure scVelo settings: verbosity level 3, presenter view enabled, and
# the 'scvelo' figure-parameter preset for all subsequent plots.
scv.settings.verbosity = 3
scv.settings.presenter_view = True
scv.set_figure_params('scvelo')

In [None]:
# List of shared cell types (annotation labels used across the analysis).
shared_cell_types = [
    "Acetylcholine",
    "Acetylcholine/GABA",
    "GABA",
    "GABA/Glutamate",
    "GABA/Serotonin",
    "Glutamate",
    "Serotonin",
    "Serotonin/GABA",
    "Monoamine",
    "Monoamine/Acetylcholine",
    "Monoamine/Serotonin",
    "Monoamine/GABA",
    "Immature_neurons",
    "New-born_neurons/Immature_neurons",
    "Unknown_mature_neurons",
    "Neuroblasts",
    "Neuroblasts/GMCs",
    "Neuroblasts/GMCs/Immature_neurons",
    "GMCs",
    "GMCs/New-born_neurons/Immature_neurons",
    "Neuroblasts/GMCs/New-born_neurons/Immature_neurons",
    "Unknown",
]

# Dictionary mapping cell types to hex colors.
shared_color_palette = {
    # Acetylcholine-related
    "Acetylcholine": "#FFD700",  # vivid gold
    "Acetylcholine/GABA": "#FFEE58",  # sunflower yellow

    # GABA-related
    "GABA": "#B71C1C",  # dark red
    "GABA/Glutamate": "#D84315",  # burnt orange-red
    "GABA/Serotonin": "#F06292",  # deep pink
    "Serotonin/GABA": "#E91E63",  # strong pink-rose
    "Monoamine/GABA": "#EF5350",  # soft red-pink

    # Glutamate-related
    "Glutamate": "#43A047",  # strong green

    # Serotonin-related
    "Serotonin": "#8E24AA",  # deep purple

    # Monoamine-related
    "Monoamine": "#FB8C00",  # bright orange
    "Monoamine/Acetylcholine": "#FFA726",  # soft orange
    "Monoamine/Serotonin": "#FF7043",  # orange-coral

    # Neuroblasts & GMCs
    "Neuroblasts": "#1565C0",  # cobalt blue
    "Neuroblasts/GMCs": "#1E88E5",  # vivid blue
    "Neuroblasts/GMCs/Immature_neurons": "#64B5F6",  # sky blue
    "GMCs": "#0D47A1",  # navy
    "GMCs/New-born_neurons/Immature_neurons": "#1976D2",  # medium blue
    "Neuroblasts/GMCs/New-born_neurons/Immature_neurons": "#90CAF9",  # pale blue

    # Developmental/Immature/Unknown
    "Immature_neurons": "#29B6F6",  # bright cyan
    "New-born_neurons/Immature_neurons": "#4DD0E1",  # teal
    "Unknown_mature_neurons": "#757575",  # neutral gray
    "Unknown": "#BDBDBD",  # light gray
}

# Fail fast if a listed cell type has no colour: the plotting cells index this
# dict by category name, so a gap would otherwise only surface as a KeyError
# deep inside the analysis.
_types_without_color = [ct for ct in shared_cell_types if ct not in shared_color_palette]
if _types_without_color:
    raise KeyError(
        f"shared_color_palette has no entry for: {_types_without_color}"
    )

In [None]:
# =============================================================================
# Define Input Files for the 16 Datasets (Full Dataset)
# =============================================================================

In [None]:
# One entry per loom file. The loading loop unpacks each tuple as
# (time, replicate, batch) and builds the loom path from
# "{time}_{replicate}_{batch}".
# NOTE(review): the second field looks like a sample/well ID ("A1".."A16") and
# the third like the replicate label ("Rep1"/"Rep2") — confirm that the
# unpacking names used by the loader match the intended meaning.
input_files = [
    ("hrs_00_03", "A1", "Rep1"),
    ("hrs_00_03", "A9", "Rep2"),
    ("hrs_02_05", "A2", "Rep1"),
    ("hrs_02_05", "A10", "Rep2"),
    ("hrs_04_07", "A3", "Rep1"),
    ("hrs_04_07", "A11", "Rep2"),
    ("hrs_05_08", "A4", "Rep1"),
    ("hrs_05_08", "A12", "Rep2"),
    ("hrs_06_10", "A5", "Rep1"),
    ("hrs_06_10", "A13", "Rep2"),
    ("hrs_09_13", "A6", "Rep1"),
    ("hrs_09_13", "A14", "Rep2"),
    ("hrs_12_17", "A7", "Rep1"),
    ("hrs_12_17", "A15", "Rep2"),
    ("hrs_16_22", "A8", "Rep1"),
    ("hrs_16_22", "A16", "Rep2")
]

In [None]:
# =============================================================================
# Load Loom Files into AnnData Objects and Concatenate (Full Dataset)
# =============================================================================

In [None]:
# Accumulators filled by the loom-loading loop in the next cell.
adata_list = []  # one AnnData per loom file, in input_files order
common_genes = set()  # gene-name set updated once per loaded sample

In [None]:
print("Reading loom files and creating AnnData objects for full dataset...")
for time, replicate, batch in input_files:
    # NOTE(review): given the contents of input_files, `replicate` holds the
    # sample ID ("A1") and `batch` the replicate label ("Rep1"); the obs
    # columns below inherit that naming. Left unchanged on purpose.
    adata_name = f"{time}_{replicate}"
    sample_tag = f"{time}_{replicate}_{batch}"
    # Despite its name, loom_dir is the full path of the .loom file.
    loom_dir = f"/DataDrives/Drive2/Clifton/R_Projects/2025_Drosophila_scRNAseq_MonoamineSpecification/ANALYSIS/Step_1_CellRanger_and_Velocyto_Processing/02_Counts/{sample_tag}/velocyto/{sample_tag}.loom"

    # Read the loom file with gene names and ensure unique variable names
    adata = sc.read(loom_dir, var_names='Gene', cache=True)
    adata.var_names_make_unique()

    # Save a copy of the raw counts in a separate layer
    adata.layers['counts'] = adata.X.copy()

    # Add metadata for time, replicate, and batch as categorical obs columns
    for column, value in (("time", time), ("replicate", replicate), ("batch", batch)):
        adata.obs[column] = str(value)
        adata.obs[column] = adata.obs[column].astype('category')

    # Update the common gene set. The first sample seeds the set (adata_list is
    # still empty at that point); every later sample narrows it by
    # intersection, so the result really is the set of genes shared by all
    # samples. The previous union accumulated every gene seen anywhere.
    current_genes = set(adata.var_names)
    print(f"{adata_name}: {len(current_genes)} genes")
    common_genes = current_genes if not adata_list else common_genes & current_genes

    # Append the AnnData object to our list
    adata_list.append(adata)
    print(f"Added {adata_name} to the list.")

In [None]:
# Concatenate all AnnData objects into one object covering every sample.
# join="outer" keeps the union of genes; `label` names the obs column that
# records which input each cell came from; index_unique=None leaves the
# barcodes untouched.
# NOTE(review): this presumes barcodes are already unique across loom files
# (e.g. carry a sample prefix) — confirm, otherwise obs_names will collide.
print("Concatenating all AnnData objects for full dataset...")
combined_adata = ad.concat(adata_list, index_unique=None, label="Neuronal Embryogenesis", join="outer")

In [None]:
# =============================================================================
# Process the Subset Dataset
# =============================================================================

In [None]:
# Load the subset Seurat h5ad file (path is relative to the working directory)
# and print a summary of its obs/var columns and embeddings for inspection.
adata_subset_seurat = sc.read_h5ad("4_mature_neuronal_subset_annotated.h5ad")
print(adata_subset_seurat)
print("Subset cell metadata columns:", adata_subset_seurat.obs.columns)
print("Subset gene metadata columns:", adata_subset_seurat.var.columns)
print("Subset embeddings available:", adata_subset_seurat.obsm.keys())

In [None]:
# Barcode bookkeeping: which cells exist in the loom data, in the annotated
# subset, in both, or in only one of the two.
seurat_subset_barcodes = set(adata_subset_seurat.obs_names)
loom_barcodes = set(combined_adata.obs_names)
common_cells = loom_barcodes & seurat_subset_barcodes
loom_only_cells = loom_barcodes.difference(common_cells)
seurat_only_cells = seurat_subset_barcodes.difference(common_cells)

In [None]:
# Summarise the barcode overlap, then show a few unmatched barcodes per side.
barcode_report = (
    ("Total cells in subset adata_loom", loom_barcodes),
    ("Total cells in adata_subset_seurat", seurat_subset_barcodes),
    ("Number of matching cells in subset", common_cells),
    ("Cells only in subset adata_loom", loom_only_cells),
    ("Cells only in adata_subset_seurat", seurat_only_cells),
)
for report_label, barcode_group in barcode_report:
    print(f"{report_label}: {len(barcode_group)}")

for example_label, barcode_group in (
    ("Examples (subset adata_loom only):", loom_only_cells),
    ("Examples (subset adata_subset_seurat only):", seurat_only_cells),
):
    print(example_label, list(barcode_group)[:5])

In [None]:
# Subset both objects to the common cells for the subset.
# Iterating a set of strings has no stable order between interpreter runs
# (string hashing is randomised), so indexing with list(common_cells) gave a
# different cell order on every run. Later steps depend on cell order (e.g.
# the root cell is "the first cell" of a timepoint), so keep the order of the
# annotated subset instead.
ordered_common_cells = [
    barcode for barcode in adata_subset_seurat.obs_names if barcode in common_cells
]
combined_adata_common = combined_adata[ordered_common_cells].copy()
adata_seurat_common = adata_subset_seurat[ordered_common_cells].copy()

In [None]:
# Align cells and genes for the subset to the annotated (Seurat-derived) object.
# Check gene availability first so a mismatch is reported by name rather than
# as an opaque indexing error.
genes_missing_from_loom = adata_seurat_common.var_names.difference(
    combined_adata_common.var_names
)
if len(genes_missing_from_loom) > 0:
    raise KeyError(
        f"{len(genes_missing_from_loom)} genes of the annotated subset are absent "
        f"from the loom data, e.g. {list(genes_missing_from_loom[:5])}"
    )

# Reorder both axes in one step and materialise the result, instead of
# leaving a view of a view.
combined_adata_common = combined_adata_common[
    adata_seurat_common.obs_names, adata_seurat_common.var_names
].copy()

In [None]:
# Save the common subset AnnData objects as checkpoint files in the working
# directory so the steps below can restart from disk.
combined_adata_common.write("1_combined_adata_common_subset.h5ad")
adata_seurat_common.write("1_adata_seurat_common_subset.h5ad")

In [None]:
# Clean up temporary variables: drop the list of per-sample AnnData objects.
# NOTE(review): combined_adata (the full concatenated object) is still
# referenced after this — delete it too if memory is the concern.
del adata_list

In [None]:
# Reload the common subset AnnData objects from the checkpoint files written
# above (allows restarting the notebook from this point).
combined_adata_common = ad.read_h5ad("1_combined_adata_common_subset.h5ad")
adata_seurat_common = ad.read_h5ad("1_adata_seurat_common_subset.h5ad")

In [None]:
# Merge the data for the subset as done for the full dataset.
# Everything below is copied positionally, so both objects must list the same
# cells and genes in the same order; verify that before overwriting anything.
if not combined_adata_common.obs_names.equals(adata_seurat_common.obs_names):
    raise ValueError("Cell barcodes of the loom and annotated objects are not aligned")
if not combined_adata_common.var_names.equals(adata_seurat_common.var_names):
    raise ValueError("Gene names of the loom and annotated objects are not aligned")

# Start from the loom-derived object, then take the expression matrix, layers,
# cell/gene metadata, embeddings and unstructured data from the annotated one.
adata = combined_adata_common.copy()
adata.X = adata_seurat_common.X.copy()
# This replaces the loom layers; the velocity layers are restored from
# combined_adata_common in the following cell.
adata.layers = adata_seurat_common.layers.copy()
adata.obs = adata_seurat_common.obs.copy()
adata.var = adata_seurat_common.var.copy()
for key in adata_seurat_common.obsm.keys():
    adata.obsm[key] = adata_seurat_common.obsm[key].copy()

adata.uns = adata_seurat_common.uns.copy()

In [None]:
# Carry the velocyto layers over from the loom-derived object, one copy each,
# so the merged object has the spliced/unspliced counts needed for velocity.
for layer_name in ('ambiguous', 'matrix', 'spliced', 'unspliced'):
    adata.layers[layer_name] = combined_adata_common.layers[layer_name].copy()

In [None]:
# Checkpoint: merged object (annotated data plus velocyto layers).
adata.write("1_Velocity_Start_Data_subset.h5ad")

In [None]:
# Restart point: reload the merged object from its checkpoint file.
adata = sc.read_h5ad("1_Velocity_Start_Data_subset.h5ad")

In [None]:
# Display the AnnData summary as the cell output.
adata

In [None]:
# Build the colour list in the categorical order of the annotation column.
# The "<key>_colors" list in .uns is matched to the categories by position
# (that is the order Scanpy uses), so it must follow `.cat.categories`.
# `.unique()` returns order of first appearance, which can pair colours with
# the wrong cell types.
annotation_as_category = adata.obs["neuronal_annotation_fine"].astype("category")
categories = annotation_as_category.cat.categories.tolist()

# Map them to colors
color_pal_hex = [shared_color_palette[cat] for cat in categories]

# Store in AnnData uns metadata
adata.uns["neuronal_annotation_fine_colors"] = color_pal_hex


In [None]:
import pandas as pd
import scanpy as sc

key = "neuronal_annotation_fine"

# 1) Ensure the obs column is categorical in the order you want.
#    Here we keep the *order of first appearance* in the data.
#    (isinstance on the dtype replaces pd.api.types.is_categorical_dtype,
#    which is deprecated in recent pandas.)
if not isinstance(adata.obs[key].dtype, pd.CategoricalDtype):
    order = pd.unique(adata.obs[key])  # preserves first-seen order
    adata.obs[key] = pd.Categorical(adata.obs[key], categories=order, ordered=True)

# 2) Get the exact category order that Scanpy will use
cats = adata.obs[key].cat.categories.tolist()

# 3) Map categories to hex colors in that SAME order
#    (raises a clear error if a category is missing from the palette; the
#    original lookup failure is kept as the chained cause)
try:
    color_pal_hex = [shared_color_palette[c] for c in cats]
except KeyError as e:
    missing = e.args[0]
    raise KeyError(f"Category '{missing}' missing from shared_color_palette") from e

# 4) Store in .uns using the Scanpy naming convention
adata.uns[f"{key}_colors"] = color_pal_hex

# 5) Plot the annotated UMAP once per output format (SVG first, then PDF)
for ext in ("svg", "pdf"):
    sc.pl.umap(
        adata,
        color=key,
        size=80,
        legend_loc="right margin",
        save=f"_mature_neuronal_subset_annotation.{ext}",
    )

In [None]:
# Plot proportions grouped by timepoint and save the figure in both the SVG
# and PDF formats.
for ext in ("svg", "pdf"):
    scv.pl.proportions(
        adata,
        groupby="timepoint",
        dpi=300,
        save=f'proportions_timepoint_subset.{ext}',
    )

In [None]:
# Define your list of genes of interest.
# Each gene is listed once: the earlier version repeated "Tbh", "Tdc2", "SerT"
# and "DAT", which duplicated entries in every list derived from it.
genes_of_interest = [
    "Trh",
    "Ddc",
    "Vmat",
    "SerT",
    "elav", "nSyb", "Syn", "Syt1", "Syt4",
    "DAT",
    "Tbh",
    "Tdc1",
    "Tdc2",
    "ple",
    "5-HT1A",
    "5-HT1B",
    "5-HT2A",
    "5-HT2B",
    "5-HT7", "Dop1R1", "Dop1R2", "Dop2R", "DopEcR",
    "CG4328",
    "Lmx1a",
    "Ets65A",
    "vvl",
    "salm",
    "salr",
    "CG32532", "dmrt99B", "fd59A", "scro", "Fer2",
    "Hdc",
    "HisCl1",
    "Oct-TyrR",
    "Octbeta1R",
    "Octbeta3R",
    "Oamb",
    "Octbeta2R",
    "Octalpha2R",
    "TyrR",
    "TyrRII",
]

# Guard against duplicates creeping back in when the list is edited;
# dict.fromkeys keeps the first occurrence and preserves order.
genes_of_interest = list(dict.fromkeys(genes_of_interest))

In [None]:
# Split the genes of interest, in list order, into those found in the
# dataset's gene names and those that are not.
genes_present, missing_genes = [], []
for candidate in genes_of_interest:
    target = genes_present if candidate in adata.var_names else missing_genes
    target.append(candidate)

In [None]:
# Print the lists to see the results (present vs. missing genes of interest):
print("Genes present in the dataset:", genes_present)
print("Genes missing from the dataset:", missing_genes)

In [None]:
# Filter and normalize. The genes of interest are passed as retain_genes so
# they are kept regardless of the count filter.
# NOTE(review): with subset_highly_variable=False the n_top_genes=8000
# selection presumably only flags genes instead of subsetting — confirm.
scv.pp.filter_and_normalize(adata, min_shared_counts=5, retain_genes = genes_of_interest, n_top_genes=8000, subset_highly_variable=False)

In [None]:
# Compute neighbors and moments, both with 100 PCs and 30 neighbours.
# NOTE(review): n_pcs=100 presumes a PCA with at least 100 components is
# available (or gets computed) for this object — confirm.
sc.pp.neighbors(adata, n_pcs=100, n_neighbors=30, random_state=0)
scv.pp.moments(adata, n_pcs=100, n_neighbors=30)

In [None]:
# Checkpoint: object after filtering, normalisation, neighbours and moments.
adata.write("2_Velocity_Processing_subset.h5ad")

In [None]:
# Restart point: reload the preprocessed object from its checkpoint file.
adata = sc.read_h5ad("2_Velocity_Processing_subset.h5ad")

In [None]:
# Recover the dynamics; later cells read the resulting fit_* columns from
# adata.var.
# NOTE(review): n_jobs=64 is hard-coded — confirm the host has that many cores.
scv.tl.recover_dynamics(adata, n_jobs=64, show_progress_bar=True)

In [None]:
# Compute velocities in 'dynamical' mode (uses the dynamics recovered above).
scv.tl.velocity(adata, mode='dynamical')

In [None]:
# Build the velocity graph used by the stream, pseudotime and transition plots.
scv.tl.velocity_graph(adata)

In [None]:
# Checkpoint: object with recovered dynamics, velocities and velocity graph.
adata.write("3_Velocity_scVelo_Dynamical_subset.h5ad")

In [None]:
# Load AnnData (restart point: the dynamical-model checkpoint written above)
adata = sc.read_h5ad("3_Velocity_scVelo_Dynamical_subset.h5ad")

In [None]:
# ------------------------------------------------------------------------------
# Velocity embedding stream by 'neuronal_annotation_fine' with custom colors,
# rendered once per output format (SVG first, then PDF).
for ext in ("svg", "pdf"):
    scv.pl.velocity_embedding_stream(
        adata,
        basis='X_umap',
        color='neuronal_annotation_fine',
        legend_loc='right margin',
        dpi=300,
        save=f"subset_annotated_clusters.{ext}",
    )

In [None]:
# ------------------------------------------------------------------------------
# 2. Load timepoint color mapping and assign custom colors for timepoint annotation
color_df = pd.read_csv("/DataDrives/Drive2/Clifton/R_Projects/2025_Drosophila_scRNAseq_MonoamineSpecification/ANALYSIS/Step_2_Initial_preprocessing/Supplementary_Data/timepoint2colour_mapping.csv")
time2color = dict(zip(color_df["Label"], color_df["Color"]))

# "<key>_colors" in .uns holds ONE colour per category, in category order.
# The previous version stored one colour per cell, which does not match that
# convention and bloats the saved object. Labels without a mapping default to
# black ("#000000").
timepoint_categories = adata.obs["timepoint"].astype("category").cat.categories
adata.uns["timepoint_colors"] = [
    time2color.get(tp, "#000000") for tp in timepoint_categories
]

In [None]:
# Use the timepoint mapping as the palette (a label -> colour dict; the
# timepoint plots below receive it through their `palette=` argument).
palette_time = time2color

In [None]:
# Velocity embedding stream coloured by 'timepoint' using the custom timepoint
# palette, rendered once per output format (SVG first, then PDF).
for ext in ("svg", "pdf"):
    scv.pl.velocity_embedding_stream(
        adata,
        basis='X_umap',
        color='timepoint',
        palette=palette_time,
        legend_loc='right margin',
        dpi=300,
        save=f"subset_timepoint.{ext}",
    )

In [None]:
# UMAP coloured by 'timepoint' with the custom palette, saved in both output
# formats (SVG first, then PDF).
for ext in ("svg", "pdf"):
    sc.pl.umap(
        adata,
        color='timepoint',
        palette=palette_time,
        size=80,
        legend_loc='right margin',
        save=f'subset_timepoint.{ext}',
    )

In [None]:
# ------------------------------------------------------------------------------
# 3. Plot and save UMAP colored by 'neuronal_annotation_fine' in both output
#    formats (SVG first, then PDF).
for ext in ("svg", "pdf"):
    sc.pl.umap(
        adata,
        color='neuronal_annotation_fine',
        size=80,
        legend_loc='right margin',
        save=f'_subset_annotation.{ext}',
    )

In [None]:
# ------------------------------------------------------------------------------
# 4. Create histograms for fit parameters (transcription, splicing, degradation rates)
# Work on a copy of the gene table and keep only genes that pass both filters:
# fit likelihood above 0.1 and flagged as velocity genes.
df = adata.var.copy()
passes_likelihood = df['fit_likelihood'] > 0.1
is_velocity_gene = df['velocity_genes'] == True
df = df[passes_likelihood & is_velocity_gene]

In [None]:
# Shared histogram options: log-scaled x axis and font size 16.
kwargs = dict(xscale='log', fontsize=16)
# Three side-by-side histograms of the fitted rates for the genes kept in `df`.
with scv.GridSpec(ncols=3) as pl:
    pl.hist(df['fit_alpha'], xlabel='transcription rate', **kwargs)
    # The splicing rate is plotted as fit_beta multiplied by fit_scaling.
    pl.hist(df['fit_beta'] * df['fit_scaling'], xlabel='splicing rate', xticks=[0.1, 0.4, 1], **kwargs)
    pl.hist(df['fit_gamma'], xlabel='degradation rate', xticks=[0.1, 0.4, 1], **kwargs)

In [None]:
# Optionally, print the first rows of all 'fit*' parameters (rows with missing
# values are dropped via dropna=True).
print(scv.get_df(adata, 'fit*', dropna=True).head())

In [None]:
# ------------------------------------------------------------------------------
# 5. Compute latent time and plot the scatter colored by latent time
# Set the root cell based on timepoint "hrs_00_03":
# use the first cell (in obs order) that belongs to this timepoint.
root_candidates = np.flatnonzero(adata.obs["timepoint"] == "hrs_00_03")
if root_candidates.size == 0:
    # A subset may contain no cells from the earliest window; say so
    # explicitly instead of failing with a bare IndexError.
    raise ValueError(
        "No cells with timepoint 'hrs_00_03' in this object; cannot pick a root cell"
    )
root_cell_index = root_candidates[0]
adata.uns['iroot'] = root_cell_index

In [None]:
# Compute latent time (later cells read it from adata.obs['latent_time']).
# NOTE(review): the root chosen above is stored in adata.uns['iroot'], but no
# `root_key` is passed here — confirm scVelo actually uses that root.
scv.tl.latent_time(adata)

In [None]:
# Embedding scatter coloured by latent time, saved in both output formats
# (SVG first, then PDF).
for ext in ("svg", "pdf"):
    scv.pl.scatter(
        adata,
        color='latent_time',
        color_map='gnuplot',
        size=80,
        save=f'latent_time.{ext}',
    )

In [None]:
# ------------------------------------------------------------------------------
# 6. Top 500 genes based on fit_likelihood and heatmap
# Sort genes by fit_likelihood in descending order and keep the first 500
# gene names; they feed the heatmap further below.
top_genes = adata.var['fit_likelihood'].sort_values(ascending=False).index[:500]

In [None]:
# GO for top dynamical genes (for R GO analysis)
import numpy as np
import pandas as pd
from tqdm import tqdm  # progress bar

# 1) choose the "dynamical" genes and the background
dyn_genes = adata.var_names[adata.var['fit_likelihood'] > 0.1]   # tweak threshold
background = adata.var_names[~adata.var['fit_likelihood'].isna()]  # all tested

# 2) assign each dynamical gene a latent-time "peak"
lt = adata.obs['latent_time'].values


def gene_peak_time(g):
    """Return the latent time of the cell where gene ``g`` is highest.

    Expression is taken from the "spliced" layer when present, otherwise from
    ``X``. The matrix is densified based on its own type. The previous
    one-liner inspected ``X`` for a sparse ``.A`` attribute but then read the
    spliced layer, so it broke (or silently used ``X``) whenever the two
    differed in sparsity, and ``.A`` is no longer available on recent SciPy
    sparse matrices.

    Parameters
    ----------
    g : str
        Gene name present in ``adata.var_names``.

    Returns
    -------
    The entry of ``lt`` for the first cell with maximal expression.
    """
    gene_view = adata[:, g]
    expr = gene_view.layers["spliced"] if "spliced" in gene_view.layers else gene_view.X
    if hasattr(expr, "toarray"):  # sparse matrix -> dense array
        expr = expr.toarray()
    return lt[np.argmax(np.asarray(expr).ravel())]


# Added tqdm progress bar here
peaks = {g: gene_peak_time(g) for g in tqdm(dyn_genes, desc="Calculating peak times")}

# 3) bin genes by peak latent time (tied bin edges are merged, so fewer than
#    k bins may result)
k = 20  # number of bins
quantiles = pd.qcut(pd.Series(peaks), q=k, labels=False, duplicates='drop')

# 4) write per-bin gene lists + background as plain text
bins = []
for b in sorted(quantiles.unique()):
    genes_in_bin = quantiles.index[quantiles == b].tolist()
    bins.append((b, genes_in_bin))
    with open(f"genes_bin_{b}.txt", "w", encoding="utf-8") as fh:
        fh.write("\n".join(genes_in_bin))

with open("genes_background.txt", "w", encoding="utf-8") as fh:
    fh.write("\n".join(background))


In [None]:
# Genes whose fit likelihood exceeds 0.1 are treated as dynamical.
dyn_genes = adata.var_names[adata.var['fit_likelihood'] > 0.1]
print(len(dyn_genes))

lt = adata.obs['latent_time'].values  # cache latent time outside the loop

# For every dynamical gene, record the latent time of the cell with the
# highest spliced expression. Note that this rebinds `gene_peak_time` (a
# function in the previous cell) to a dict.
gene_peak_time = {
    gene: lt[np.argmax(adata[:, gene].layers["spliced"].toarray().flatten())]  # or "X"
    for gene in tqdm(dyn_genes, desc="Calculating peak latent times")
}

# One row per gene, indexed by gene name.
gene_peak_df = pd.DataFrame.from_dict(
    gene_peak_time,
    columns=['peak_latent_time'],
    orient='index',
)

In [None]:
# Split into (up to) 20 quantile bins of peak latent time.
# duplicates='drop' merges bins whose edges coincide; without it pd.qcut raises
# a ValueError when many genes share the same peak time. This matches the
# binning used for the plain-text gene lists earlier in the notebook.
gene_peak_df['bin'] = pd.qcut(
    gene_peak_df['peak_latent_time'], q=20, labels=False, duplicates='drop'
)

In [None]:
from gprofiler import GProfiler
# g:Profiler client configured to return results as DataFrames.
gp = GProfiler(return_dataframe=True)

# Run one enrichment query per latent-time bin and store each result table as
# GO_bin_<bin>.csv.
for b in sorted(gene_peak_df['bin'].unique()):
    in_this_bin = gene_peak_df['bin'] == b
    genes_in_bin = gene_peak_df.index[in_this_bin].tolist()
    if len(genes_in_bin) < 5:  # avoid tiny bins
        continue
    res = gp.profile(
        query=genes_in_bin,
        organism='dmelanogaster',  # organism identifier passed to g:Profiler
    )
    res.to_csv(f"GO_bin_{b}.csv", index=False)


In [None]:
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from textwrap import fill

# --- Config ---
in_pattern = "GO_bin_*.csv"
top_per_bin = 10
term_col = "name"              # term-name column (previously "term_name")
pval_col = "p_value"           # p-value column (previously "p_value_adjusted")


def bin_label(path):
    """Return the bin label encoded in a result path, e.g. 'GO_bin_7.csv' -> '7'."""
    return path.split("_")[-1].split(".")[0]


def _bin_sort_key(path):
    """Sort key placing numeric bins in numeric order ('10' after '9')."""
    label = bin_label(path)
    return (0, int(label), label) if label.isdigit() else (1, 0, label)


# --- Load & pick top terms per bin ---
# Plain sorted() orders the files as 0, 1, 10, 11, ..., 2, which scrambled the
# latent-time axis of the heatmap; sort by the numeric bin instead.
tables = {}
for path in sorted(glob.glob(in_pattern), key=_bin_sort_key):
    go_table = pd.read_csv(path).sort_values(pval_col, ascending=True)
    # A term name can occur more than once in one table; keep its most
    # significant row so each term maps to exactly one heatmap cell.
    go_table = go_table.drop_duplicates(subset=term_col, keep="first")
    tables[bin_label(path)] = go_table.head(top_per_bin).copy()

if not tables:
    raise FileNotFoundError(f"No enrichment tables match '{in_pattern}'")

# --- Build union of terms ---
all_terms = pd.Index(sorted(set().union(*[set(t[term_col]) for t in tables.values()])))
if all_terms.empty:
    raise ValueError("The enrichment tables contain no terms to plot")
bins = list(tables.keys())

# --- Assemble matrix of -log10(p) ---
# Rows are terms, columns are bins; terms absent from a bin stay at 0.
M = pd.DataFrame(0.0, index=all_terms, columns=bins)
for b, go_table in tables.items():
    # Clip to avoid -log10(0) = inf for p-values reported as exactly zero.
    M.loc[go_table[term_col], b] = (-np.log10(go_table[pval_col].clip(lower=1e-300))).values

# Optional: order rows by the column where each term peaks (stable sort keeps
# the alphabetical order among terms peaking in the same bin)
order = M.values.argmax(axis=1)
M = M.iloc[np.argsort(order, kind="stable")]

# --- Plot ---
plt.figure(figsize=(10, max(4, 0.35*len(M))))
plt.imshow(M.values, aspect='auto', interpolation='nearest')
plt.xticks(range(len(bins)), bins, rotation=0)
labels = [fill(t, width=40) for t in M.index]  # wrap long term names
plt.yticks(range(len(M)), labels)
plt.xlabel("Latent time bin")
plt.title("GO enrichment by bin (−log10 p)")
cbar = plt.colorbar()
cbar.set_label("−log10 p")
plt.tight_layout()
plt.show()


In [None]:
# Generate a heatmap of the top genes, sorted by latent time.
# NOTE(review): `top_genes` and `color_pal_hex` come from earlier cells; after
# restarting from a checkpoint those cells must be re-run before this one.
scv.pl.heatmap(
    adata, 
    var_names=top_genes, 
    sortby='latent_time',
    col_color='neuronal_annotation_fine',  # set the column color annotation to neuronal_annotation_fine
    n_convolve=100, 
    palette=color_pal_hex, 
    save='_dynamicalgenes500_neuronal_annotation.pdf',
figsize=(12, 8)
)

In [None]:
# ------------------------------------------------------------------------------
# 7. Rank dynamical genes by 'neuronal_annotation_fine'
scv.tl.rank_dynamical_genes(adata, groupby='neuronal_annotation_fine')
# Table of ranked gene names; the plotting loop below looks up columns by
# annotation category.
ranked_df = scv.get_df(adata, 'rank_dynamical_genes/names')
print(ranked_df.head(5))

# Save the ranked dynamical genes DataFrame to CSV (without the row index)
ranked_df.to_csv("ranked_dynamical_genes.csv", index=False)

In [None]:
import re

# Annotation categories whose top-ranked dynamical genes are plotted below.
selected_categories = ['Monoamine', 'Monoamine/GABA', 'GABA', 'Acetylcholine', 'Glutamate', "GABA/Glutamate"]

def sanitize_filename(name):
    """Return ``name`` with every character that is not a word character
    (letter, digit, underscore), hyphen or dot replaced by an underscore."""
    unsafe_chars = re.compile(r'[^\w\-\.]')
    return unsafe_chars.sub('_', name)

# For each output format (all SVGs first, then all PDFs) plot the top five
# ranked genes of every selected category that has a column in ranked_df.
for ext in ("svg", "pdf"):
    for category in selected_categories:
        if category not in ranked_df.columns:
            continue
        # Top 5 genes for this category, ignoring NaN entries
        genes_to_plot = ranked_df[category].dropna().tolist()[:5]
        if not genes_to_plot:  # nothing to plot for this category
            continue
        safe_category = sanitize_filename(category)
        scv.pl.scatter(
            adata,
            genes_to_plot,
            ylabel=category,
            frameon=False,
            save=f'_{safe_category}_scatter.{ext}',
        )


In [None]:
# -------------------------------------------------------------------------------
# 9. Rank velocity genes grouped by 'neuronal_annotation_fine' (min_corr=0.3)
scv.tl.rank_velocity_genes(adata, groupby='neuronal_annotation_fine', min_corr=0.3)
# Create a DataFrame from the gene names ranked for each group
# (this rebinds `df`, which earlier held the filtered fit-parameter table)
df = pd.DataFrame(adata.uns['rank_velocity_genes']['names'])
# Save the ranked velocity genes to a CSV file
df.to_csv('top_velocity_genes_neuronal_annotation_fine.csv', index=False)

In [None]:
# -------------------------------------------------------------------------------
# 10. Calculate velocity confidence and plot scatter of velocity length and confidence
scv.tl.velocity_confidence(adata)
keys = ('velocity_length', 'velocity_confidence')
# Both quantities are drawn with the 5th-95th percentile colour range; the
# figure is saved as SVG first, then PDF.
for ext in ("svg", "pdf"):
    scv.pl.scatter(
        adata,
        c=keys,
        cmap='coolwarm',
        perc=[5, 95],
        dpi=300,
        save=f'velocity_length_and_confidence.{ext}',
    )

In [None]:
# -------------------------------------------------------------------------------
# 11. Cell transitions plot: Compute cell transitions and overlay the velocity graph
# Use the previously computed root cell if available; otherwise, default to 0.
starting_cell = adata.uns.get('iroot', 0)

# Compute cell transitions coordinates using the selected starting cell.
# Computed once so both output formats show the same trajectory.
x, y = scv.utils.get_cell_transitions(adata, basis='umap', starting_cell=starting_cell)

# Draw the figure from scratch for every output format. The previous version
# passed the first scatter call's return value as `ax` to the second one, but
# that call saves (and, without show=False, does not appear to hand the axes
# back), so the PDF was drawn without the grey velocity-graph layer.
# NOTE(review): confirm the return behaviour against the installed scVelo
# version; redrawing is correct either way.
for ext in ("svg", "pdf"):
    # Base layer: the velocity graph, kept open (show=False) for the overlay
    ax = scv.pl.velocity_graph(adata, c='lightgrey', edge_width=0.05, show=False)

    # Overlay the transition path on the same axes and save the figure
    ax = scv.pl.scatter(
        adata,
        x=x, y=y,
        s=120,
        c='ascending',
        cmap='gnuplot',
        ax=ax,
        save=f'cell_transitions.{ext}',
    )

In [None]:
# -------------------------------------------------------------------------------
# 12. Compute velocity pseudotime and plot scatter colored by pseudotime
scv.tl.velocity_pseudotime(adata)
# Save the pseudotime scatter in both output formats (SVG first, then PDF).
for ext in ("svg", "pdf"):
    scv.pl.scatter(
        adata,
        color='velocity_pseudotime',
        cmap='gnuplot',
        save=f'velocity_pseudotime.{ext}',
    )

In [None]:
# Checkpoint: fully processed object (latent time, rankings, confidence,
# pseudotime).
adata.write("4_Velocity_scVelo_processed_subset.h5ad")

In [None]:
# Load AnnData (restart point: the fully processed checkpoint written above)
adata = sc.read_h5ad("4_Velocity_scVelo_processed_subset.h5ad")

In [None]:
# Define your list of genes of interest.
# Each gene is listed once: the earlier version repeated "Tbh", "Tdc2", "SerT"
# and "DAT", so those genes were plotted (and their files rewritten) twice.
genes_of_interest = [
    "Trh",
    "Ddc",
    "Vmat",
    "SerT",
    "elav", "nSyb", "Syn", "Syt1", "Syt4",
    "DAT",
    "Tbh",
    "Tdc1",
    "Tdc2",
    "ple",
    "5-HT1A",
    "5-HT1B",
    "5-HT2A",
    "5-HT2B",
    "5-HT7", "Dop1R1", "Dop1R2", "Dop2R", "DopEcR",
    "CG4328",
    "Lmx1a",
    "Ets65A",
    "vvl",
    "salm",
    "salr",
    "CG32532", "dmrt99B", "fd59A", "scro", "Fer2",
    "Hdc",
    "HisCl1",
    "Oct-TyrR",
    "Octbeta1R",
    "Octbeta3R",
    "Oamb",
    "Octbeta2R",
    "Octalpha2R",
    "TyrR",
    "TyrRII",
]

# Add any additional genes that you want to include. Genes already in the list
# above are not added a second time: dict.fromkeys keeps the first occurrence
# of each name and preserves order.
extra_genes = ["elav", "nSyb", "Syn", "Syt1", "Syt4"]
genes_of_interest = list(dict.fromkeys([*genes_of_interest, *extra_genes]))

In [None]:
# Loop over the genes of interest and plot velocity for each gene.
# Duplicated names are visited once (a repeat would only redraw and overwrite
# the same file), and genes absent from the dataset are reported and skipped
# rather than handed to the plotting function.
available_genes = set(adata.var_names)
for gene in dict.fromkeys(genes_of_interest):
    if gene not in available_genes:
        print(f"Skipping {gene}: not present in adata.var_names")
        continue
    # Build a file-name-safe gene label: '/' and spaces are removed (hyphens,
    # e.g. in "5-HT1A", are kept).
    safe_gene = gene.replace("/", "").replace(" ", "")
    scv.pl.velocity(
        adata,
        [gene],
        save=f'{safe_gene}_velocity_subset.svg',
        dpi=300
    )