In [None]:
import os
import pandas as pd
import numpy as np
#import anndata
import time
import matplotlib.pyplot as plt
import json
import requests
import pickle
import gzip as gz

import sys

# Location of the project checkout. The default is the original author's path;
# set the ASD_CIRCUITS_ROOT environment variable to run this notebook from a
# different checkout without editing the cell.
PROJECT_ROOT = os.environ.get("ASD_CIRCUITS_ROOT", "/home/jw3514/Work/ASD_Circuits_CellType")

# The project's src/ directory must be on sys.path before the star import below.
sys.path.insert(0, os.path.join(PROJECT_ROOT, "src"))
# NOTE(review): the star import hides where later names come from (e.g.
# LoadGeneINFO, and presumably yaml) — consider importing them explicitly.
from CellType_PSY import *

# Later cells use paths relative to this directory
# ("../config/config.yaml", "dat/CellTypeHierarchy.csv").
os.chdir(os.path.join(PROJECT_ROOT, "notebooks_mouse_sc"))
print(f"Current working directory: {os.getcwd()}")

In [None]:
# Gene-identifier lookup objects returned by LoadGeneINFO().
# NOTE(review): LoadGeneINFO is not defined in this notebook — presumably it is
# provided by `from CellType_PSY import *`; confirm.
(
    HGNC,
    ENSID2Entrez,
    GeneSymbol2Entrez,
    Entrez2Symbol,
) = LoadGeneINFO()

In [None]:
# Read the project configuration (path is relative to the notebook directory).
# NOTE(review): `yaml` is never imported explicitly in this notebook —
# presumably it arrives through `from CellType_PSY import *`; confirm.
with open("../config/config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

# The CT_Spec analysis entry names the expression/bias matrix file; the stored
# path is resolved relative to the parent of the working directory.
expr_matrix_path = config["analysis_types"]["CT_Spec"]["expr_matrix"]
SC_BiasMat = pd.read_parquet(f"../{expr_matrix_path}")
# Anno = STR2Region()  # left disabled, as in the original notebook

In [None]:
# Cell-type hierarchy table; the first CSV column becomes the DataFrame index.
CellTypesDF = pd.read_csv(
    "dat/CellTypeHierarchy.csv",
    index_col=0,
)

In [None]:
# Display the cell-type hierarchy table as this cell's output.
CellTypesDF

In [None]:
def plot_gene_spec_across_superclusters(Gene_Sepc, Gene, Gene_Entrez, superclusters, Anno, cluster_col=None):
    """
    Draw one boxplot per group of a gene's specificity values, ordered by descending median.

    Cell types (the index of ``Anno``) are grouped by ``Anno[cluster_col]``. For every
    requested group, the values of ``Gene_Sepc`` at the matching index labels are
    collected; missing (NaN) values are dropped so they cannot turn a group's median
    into NaN or blank out its box. Groups left without any value are skipped.

    Parameters:
    - Gene_Sepc: Series, specificity values indexed by cell type for the gene.
    - Gene: str, gene symbol (used in the axis label and title only).
    - Gene_Entrez: int or str, Entrez ID of the gene (used in the axis label only).
    - superclusters: iterable of group names to plot; names absent from
                     ``Anno[cluster_col]`` are silently skipped.
    - Anno: DataFrame, annotation with 'Supercluster' (human) or 'class' (mouse) column
            and cell type indices matching Gene_Sepc index.
    - cluster_col: str, optional. Column name to use for grouping. If None, 'class'
                   is used when present, otherwise 'Supercluster'. Any other column
                   name is accepted and is used verbatim as the x-axis label.

    Returns:
    - None. The figure is shown with ``plt.show()``. If no group has any value, a
      warning is printed and nothing is plotted.

    Raises:
    - ValueError: if ``cluster_col`` is None and ``Anno`` has neither a 'class' nor a
      'Supercluster' column.
    """
    # Auto-detect the grouping column; 'class' takes precedence over 'Supercluster'.
    if cluster_col is None:
        if 'class' in Anno.columns:
            cluster_col = 'class'
        elif 'Supercluster' in Anno.columns:
            cluster_col = 'Supercluster'
        else:
            raise ValueError("Annotation DataFrame must have either 'Supercluster' or 'class' column")

    # Axis label plus a correctly pluralised noun for the title
    # (naively appending "s" produced "classs" for the mouse data).
    if cluster_col == 'class':
        xlabel, group_plural = "Class", "classes"
    elif cluster_col == 'Supercluster':
        xlabel, group_plural = "Supercluster", "superclusters"
    else:
        xlabel = str(cluster_col)
        group_plural = f"{xlabel} groups"

    # Collect (group name, values, median) for every group that has data.
    group_data = []
    for supercluster in superclusters:
        # Cell types annotated with this group that also exist in Gene_Sepc.
        cts = Anno.index[Anno[cluster_col] == supercluster]
        available_cts = cts.intersection(Gene_Sepc.index)
        values = Gene_Sepc.loc[available_cts].dropna().to_numpy()
        if values.size > 0:
            group_data.append((supercluster, values, np.median(values)))

    if len(group_data) == 0:
        print("Warning: No matching cell types found between annotation and gene specificity data.")
        return

    # Highest median first; the sort is stable, so ties keep their input order.
    group_data.sort(key=lambda item: item[2], reverse=True)
    group_labels = [str(name) for name, _, _ in group_data]
    group_values = [vals for _, vals, _ in group_data]

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.boxplot(group_values, notch=True)
    # Boxes sit at positions 1..n by default. Labelling the ticks here avoids the
    # `labels=` keyword of boxplot, which Matplotlib 3.9 deprecated in favour of
    # `tick_labels=`; this form works on both older and newer releases.
    ax.set_xticks(list(range(1, len(group_labels) + 1)))
    ax.set_xticklabels(group_labels, rotation=45, ha='right')
    ax.set_ylabel(f"Spec value for {Gene} ({Gene_Entrez})")
    ax.set_xlabel(xlabel)
    ax.set_title(f"{Gene} spec value across {group_plural} (sorted by median)")
    fig.tight_layout()
    plt.show()


# Sorted unique values of the 'class' column: the groups to plot.
# Read from CellTypesDF directly: Anno_mouse is only created in a later cell (as a
# copy of CellTypesDF), so referencing it here raises NameError on a fresh kernel.
classes = sorted(CellTypesDF['class'].unique())

In [None]:
# Annotation table for the mouse data: a copy of CellTypesDF. Its index is
# expected to match the cell-type labels of Gene_Sepc — verify if the plot is empty.
Anno_mouse = CellTypesDF.copy()

# Select the gene and pull the values labelled by its Entrez ID out of the bias matrix.
Gene = "SCN2A"
Gene_Entrez = GeneSymbol2Entrez[Gene]
Gene_Sepc = SC_BiasMat.loc[Gene_Entrez, :]

# One box per entry of `classes`, ordered by median specificity.
plot_gene_spec_across_superclusters(
    Gene_Sepc=Gene_Sepc,
    Gene=Gene,
    Gene_Entrez=Gene_Entrez,
    superclusters=classes,
    Anno=Anno_mouse,
)

In [None]:
# Annotation table for the mouse data: a copy of CellTypesDF. Its index is
# expected to match the cell-type labels of Gene_Sepc — verify if the plot is empty.
Anno_mouse = CellTypesDF.copy()

# Select the gene and pull the values labelled by its Entrez ID out of the bias matrix.
Gene = "SCN1A"
Gene_Entrez = GeneSymbol2Entrez[Gene]
Gene_Sepc = SC_BiasMat.loc[Gene_Entrez, :]

# One box per entry of `classes`, ordered by median specificity.
plot_gene_spec_across_superclusters(
    Gene_Sepc=Gene_Sepc,
    Gene=Gene,
    Gene_Entrez=Gene_Entrez,
    superclusters=classes,
    Anno=Anno_mouse,
)

In [None]:
# Display the specificity values of the most recently selected gene
# (SCN1A when the cells above are run in order).
Gene_Sepc