In [None]:
#@title Mount Drive
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/My Drive

Mounted at /content/drive
/content/drive/My Drive


In [1]:

import scanpy as sc
import pandas as pd
import os
import hdf5plugin
import numpy as np
#os.environ["Hdata5_PLUGIN_PATH"] = hdf5plugin.PLUGIN_PATH

Matplotlib is building the font cache; this may take a moment.


In [2]:
os.environ["Hdata5_PLUGIN_PATH"] = hdf5plugin.PLUGIN_PATH
data = sc.read_h5ad("combined_corrected.h5ad")
print(data.obs)

                                                                                     sample_id  \
exp_component_name                                                                               
AGAAGTAGTGCTGCAC-L8TX_210430_01_H04-1142430420  AGAAGTAGTGCTGCAC-L8TX_210430_01_H04-1142430420   
ATTCCTAGTATCGAGG-L8TX_210430_01_H04-1142430420  ATTCCTAGTATCGAGG-L8TX_210430_01_H04-1142430420   
ATTGTTCAGACCTCCG-L8TX_210430_01_H04-1142430420  ATTGTTCAGACCTCCG-L8TX_210430_01_H04-1142430420   
TGCTCCACACACAGCC-L8TX_210430_01_H04-1142430420  TGCTCCACACACAGCC-L8TX_210430_01_H04-1142430420   
CAGTACCCAGGCTACT-L8XR_210812_01_F11-1124987484  CAGTACCCAGGCTACT-L8XR_210812_01_F11-1124987484   
...                                                                                        ...   
AATGACCGTAGTTACC-L8TX_210729_01_F12-1153814344  AATGACCGTAGTTACC-L8TX_210729_01_F12-1153814344   
GGGATCCAGAAGCTGC-L8TX_210729_01_F12-1153814344  GGGATCCAGAAGCTGC-L8TX_210729_01_F12-1153814344   
GCTCAAATCCCTGTTG-L8T

In [3]:
data.obs.columns

Index(['sample_id', 'Neurotypical reference', 'Donor ID', 'Organism',
       'Brain Region', 'Sex', 'Gender', 'Age at Death', 'Race (choice=White)',
       'Race (choice=Black/ African American)',
       ...
       'Class', 'Subclass confidence', 'Subclass', 'Supertype confidence',
       'Supertype (non-expanded)', 'Supertype',
       'Continuous Pseudo-progression Score', 'Severely Affected Donor',
       'donor_id', 'genotype'],
      dtype='object', length=137)

In [5]:
data.shape

(1240908, 2000)

In [6]:
data.var_names

Index(['RSRC1', 'RNGTT', 'LIMS1', 'MAPK1', 'KIZ', 'SACS', 'RNF180', 'TRA2A',
       'MRPS30-DT', 'RAB3IP',
       ...
       'RBFOX1', 'IQCJ-SCHIP1', 'SGCZ', 'CNTN5', 'RALYL', 'DPP10', 'ADARB2',
       'ROBO2', 'KCNIP4', 'ERBB4'],
      dtype='object', length=2000)

# Marker Gene Define

In [9]:
# marker gene from 
# Wang, C., Nat Commun 15, 4710 (2024). https://doi.org/10.1038/s41467-024-49133-z

cell_type_markers = {
    "Astrocytes": ["GFAP", "AQP4", "GJA1", "SLC1A2", "FGFR3", "NKAIN4", "AGT", "PLXNB1", "SLC1A3"],
    "Endothelial cells": ["CLDN5", "VWF"],
    "Neurons": ["GLS", "RBFOX3", "CAMK2A"],
    "Excitatory neurons": ["SLC17A6", "SLC17A7", "SATB2"],
    "Inhibitory neurons": ["GAD1", "GAD2"],
    "Microglia": ["P2RY12", "CSF1R", "C3", "CX3CR1"],
    "Oligodendrocytes": ["OLIG2", "MBP", "MOBP", "PLP1", "MYRF", "MAG"],
    "Oligodendrocyte precursor cells": ["VCAN", "SOX8"],
    "Pericytes": ["AMBP", "HIGD1B", "PTH1R"]
}

# Dictionary to store the updated list of markers (i.e., markers that exist in adata.var_names)
updated_markers = {}

# Loop over each cell type and check for the presence of each marker gene in adata.var_names
for cell_type, genes in cell_type_markers.items():
    present_genes = [gene for gene in genes if gene in data.var_names]
    missing_genes = [gene for gene in genes if gene not in data.var_names]
    
    # Optionally print which genes are missing per cell type:
    if missing_genes:
        print(f"For cell type '{cell_type}', missing genes: {missing_genes}")
    
    updated_markers[cell_type] = present_genes
print(updated_markers)

For cell type 'Astrocytes', missing genes: ['GFAP', 'AQP4', 'GJA1', 'FGFR3', 'NKAIN4', 'AGT', 'PLXNB1']
For cell type 'Endothelial cells', missing genes: ['CLDN5', 'VWF']
For cell type 'Excitatory neurons', missing genes: ['SLC17A6', 'SLC17A7']
For cell type 'Microglia', missing genes: ['P2RY12', 'CSF1R', 'C3', 'CX3CR1']
For cell type 'Oligodendrocytes', missing genes: ['OLIG2', 'MYRF', 'MAG']
For cell type 'Oligodendrocyte precursor cells', missing genes: ['SOX8']
For cell type 'Pericytes', missing genes: ['AMBP', 'HIGD1B', 'PTH1R']
{'Astrocytes': ['SLC1A2', 'SLC1A3'], 'Endothelial cells': [], 'Neurons': ['GLS', 'RBFOX3', 'CAMK2A'], 'Excitatory neurons': ['SATB2'], 'Inhibitory neurons': ['GAD1', 'GAD2'], 'Microglia': [], 'Oligodendrocytes': ['MBP', 'MOBP', 'PLP1'], 'Oligodendrocyte precursor cells': ['VCAN'], 'Pericytes': []}


In [21]:
# create a subset of data to only include the marker genes
marker_genes = [
    "GFAP", "AQP4", "GJA1", "SLC1A2", "FGFR3", "NKAIN4", "AGT", "PLXNB1", "SLC1A3",  # Astrocytes
    "CLDN5", "VWF",                                                                # Endothelial cells
    "GLS", "RBFOX3", "CAMK2A",                                                      # Neurons
    "SLC17A6", "SLC17A7", "SATB2",                                                   # Excitatory neurons
    "GAD1", "GAD2",                                                                # Inhibitory neurons
    "P2RY12", "CSF1R", "C3", "CX3CR1",                                               # Microglia
    "OLIG2", "MBP", "MOBP", "PLP1", "MYRF", "MAG",                                  # Oligodendrocytes
    "VCAN", "SOX8",                                                                # Oligodendrocyte precursor cells
    "AMBP", "HIGD1B", "PTH1R"                                                       # Pericytes
]
valid_marker_genes = [gene for gene in marker_genes if gene in data.var_names]
subset_data = data[:, valid_marker_genes].copy()

In [22]:
subset_data.X.shape

(1240908, 12)

In [23]:
# k-means cluster
    
def distance(centroid, x): 
    # centroid and x are nparray  
    return np.sum((centroid - x)**2)

def clustering(centroids, x): 
    # centroids and x are nparray
    dis = [distance(centroid, x) for centroid in centroids] 
    idx = np.argmin(dis) 
    return idx, dis[idx] # return to the centroid index

def update(cluster): 
    # cluster is an ndarray that contains a list of points'coordinates 
    if len(cluster) == 0:
        return None  
    return np.mean(cluster, axis=0) 

def subdata(k): 
    return [[] for _ in range(k)]

def k_means(X, k, max_iterations=1000, random_state=42, tol=1e-4):
    """
    Perform k-means clustering on data X using custom functions.
    
    Parameters:
      X: np.ndarray
         Data matrix of shape (n_samples, n_features).
      k: int
         Number of clusters.
      max_iterations: int
         Maximum number of iterations.
      random_state: int
         Seed for reproducibility.
      tol: float
         Tolerance for convergence.
         
    Returns:
      centroids: np.ndarray
         Final centroids.
      cluster_assignments: list
         Cluster index for each sample.
    """
    np.random.seed(random_state)
    # Initialize centroids by randomly selecting k samples from X
    initial_indices = np.random.choice(X.shape[0], size=k, replace=False)
    centroids = X[initial_indices]

    for iteration in range(max_iterations):
        clusters = subdata(k)
        cluster_assignments = []
        
        # Assignment step: assign each sample to the closest centroid.
        for sample in X:
            idx, _ = clustering(centroids, sample)
            clusters[idx].append(sample)
            cluster_assignments.append(idx)
        
        # Update step: recalculate centroids
        new_centroids = []
        for cluster in clusters:
            new_centroid = update(np.array(cluster))
            if new_centroid is None:
                # Reinitialize empty clusters with a random sample from X
                new_centroid = X[np.random.choice(X.shape[0])]
            new_centroids.append(new_centroid)
        new_centroids = np.array(new_centroids)
        
        # Check convergence: if centroids do not change more than tol
        if np.allclose(centroids, new_centroids, atol=tol):
            print(f"Converged at iteration {iteration}")
            break
        centroids = new_centroids
        
    return centroids, cluster_assignments

In [25]:
X = subset_data.X
if hasattr(X, "toarray"):
    X = X.toarray()

k = 6 # 6 clusters since some clusters has no gene included in the 2000-highest variable genes 
centroids, cluster_assignments = k_means(X, k, max_iterations=1000, random_state=42)
print("Cluster assignments (first 10 cells):", cluster_assignments[:10])

# -----------------------------
# Append the Cluster Label as a New (Last) Column of the Expression Matrix
# -----------------------------
# Convert the list of cluster assignments to a column vector
cluster_labels = np.array(cluster_assignments).reshape(-1, 1)
# Horizontally stack the cluster label column to the expression matrix X
X_with_labels = np.hstack([X, cluster_labels])
print("New shape of expression matrix with cluster labels:", X_with_labels.shape)

# Optionally, you might want to store the cluster labels in the AnnData object’s .obs
subset_data.obs['cluster_label'] = cluster_assignments

Converged at iteration 12
Cluster assignments (first 10 cells): [np.int64(5), np.int64(2), np.int64(4), np.int64(1), np.int64(4), np.int64(5), np.int64(3), np.int64(4), np.int64(1), np.int64(5)]
New shape of expression matrix with cluster labels: (1240908, 13)


In [27]:
cell_type_markers = {
    "Astrocytes": ["GFAP", "AQP4", "GJA1", "SLC1A2", "FGFR3", "NKAIN4", "AGT", "PLXNB1", "SLC1A3"],
    "Endothelial cells": ["CLDN5", "VWF"],
    "Neurons": ["GLS", "RBFOX3", "CAMK2A"],
    "Excitatory neurons": ["SLC17A6", "SLC17A7", "SATB2"],
    "Inhibitory neurons": ["GAD1", "GAD2"],
    "Microglia": ["P2RY12", "CSF1R", "C3", "CX3CR1"],
    "Oligodendrocytes": ["OLIG2", "MBP", "MOBP", "PLP1", "MYRF", "MAG"],
    "Oligodendrocyte precursor cells": ["VCAN", "SOX8"],
    "Pericytes": ["AMBP", "HIGD1B", "PTH1R"]
}

# Create a mapping from gene to its column index in your subset matrix.
# Assume valid_marker_genes is your list of marker genes present in the data.
gene_to_index = {gene: idx for idx, gene in enumerate(valid_marker_genes)}

# Example centroids produced from your k_means_custom function, 
# where centroids is an array of shape (n_clusters, len(valid_marker_genes))
# For demonstration, assume centroids is already computed:
# centroids = ... (from k_means_custom)

# Now, assign a cell type to each cluster based on the centroid's marker expression.
cluster_cell_type_assignment = {}

for cluster_idx, centroid in enumerate(centroids):
    scores = {}
    # For each cell type, calculate the average expression of its markers in the centroid.
    for cell_type, markers in cell_type_markers.items():
        # Only consider markers that are in your valid_marker_genes.
        markers_in_data = [gene for gene in markers if gene in gene_to_index]
        if markers_in_data:
            # Get the indices corresponding to these marker genes.
            indices = [gene_to_index[gene] for gene in markers_in_data]
            # Calculate the average expression (or sum) for these marker genes.
            score = np.mean(centroid[indices])
            scores[cell_type] = score
    # If any scores were computed, pick the cell type with the highest score.
    if scores:
        assigned_cell_type = max(scores, key=scores.get)
    else:
        assigned_cell_type = "Unknown"
    cluster_cell_type_assignment[cluster_idx] = assigned_cell_type

# Print out the mapping from cluster number to cell type.
print("Cluster to cell type mapping:")
for cluster, cell_type in cluster_cell_type_assignment.items():
    print(f"Cluster {cluster}: {cell_type}")

Cluster to cell type mapping:
Cluster 0: Astrocytes
Cluster 1: Excitatory neurons
Cluster 2: Inhibitory neurons
Cluster 3: Oligodendrocytes
Cluster 4: Excitatory neurons
Cluster 5: Excitatory neurons


In [31]:
subset_data.obs["cluster_label"] = subset_data.obs["cluster_label"].replace({4: 1, 5: 1})

df = pd.DataFrame({
    "sample_id": subset_data.obs_names,
    "cell_type": subset_data.obs["cluster_label"]
})

# Save the DataFrame to a CSV file
output_csv = "sample_cell_types.csv"
df.to_csv(output_csv, index=False)

In [32]:
cell_type_counts = subset_data.obs["cluster_label"].value_counts()

# Print the counts
print("Count of cells in each cell type:")
print(cell_type_counts)

Count of cells in each cell type:
cluster_label
1    700188
2    295688
0    137210
3    107822
Name: count, dtype: int64


In [33]:
data.obs.columns

Index(['sample_id', 'Neurotypical reference', 'Donor ID', 'Organism',
       'Brain Region', 'Sex', 'Gender', 'Age at Death', 'Race (choice=White)',
       'Race (choice=Black/ African American)',
       ...
       'Class', 'Subclass confidence', 'Subclass', 'Supertype confidence',
       'Supertype (non-expanded)', 'Supertype',
       'Continuous Pseudo-progression Score', 'Severely Affected Donor',
       'donor_id', 'genotype'],
      dtype='object', length=137)

In [35]:
if hasattr(data.X, "toarray"):
    X_dense = data.X.toarray()
else:
    X_dense = data.X

# Create a DataFrame from the expression matrix. 
# Rows are labeled by sample IDs (adata.obs_names) and columns by gene names (adata.var_names).
df_expr = pd.DataFrame(X_dense, index=data.obs_names, columns=data.var_names)

# Optionally, if you prefer the sample IDs to be a column instead of the index:
df_expr = df_expr.reset_index().rename(columns={"index": "sample_id"})


In [36]:
df_expr["Donor ID"] = data.obs["Donor ID"].values

In [37]:
df_expr.to_csv("cell_type_expression.csv")

KeyboardInterrupt: 

In [45]:
mapping_dict = dict(zip(df['sample_id'], df['cell_type']))

# Step 3. Add a new column to df_expr by mapping the "exp_component_name" values.
df_expr['cell_type'] = df_expr['exp_component_name'].map(mapping_dict)


In [46]:
sampled_df = (
    df_expr.groupby(["Donor ID", "cell_type"])
           .apply(lambda x: x.sample(n=100, replace=(len(x) < 100), random_state=42))
           .reset_index(drop=True)
)
sampled_df.to_csv("sampled100.csv")

  df_expr.groupby(["Donor ID", "cell_type"])
  .apply(lambda x: x.sample(n=100, replace=(len(x) < 100), random_state=42))


In [50]:
with open("obs_columns.txt", "w") as f:
    for col in data.obs.columns:
        f.write(f"{col}\n")

In [None]:
# cognitive index 
# real value: 'Last CASI Score', 'Last MMSE Score'
# binary value: 'Cognitive Status'

In [57]:
# neuropathalogical index 
# all are classification: 

neuropath_cols = [
    'Overall AD neuropathological Change', 'Thal', 'Braak', 'CERAD score',
    'Overall CAA Score', 'Highest Lewy Body Disease',
    'Total Microinfarcts (not observed grossly)', 'Total microinfarcts in screening sections',
    'Atherosclerosis', 'Arteriolosclerosis', 'LATE'
]
print(data.obs['Overall AD neuropathological Change'].head())

exp_component_name
AGAAGTAGTGCTGCAC-L8TX_210430_01_H04-1142430420    Not AD
ATTCCTAGTATCGAGG-L8TX_210430_01_H04-1142430420    Not AD
ATTGTTCAGACCTCCG-L8TX_210430_01_H04-1142430420    Not AD
TGCTCCACACACAGCC-L8TX_210430_01_H04-1142430420    Not AD
CAGTACCCAGGCTACT-L8XR_210812_01_F11-1124987484    Not AD
Name: Overall AD neuropathological Change, dtype: category
Categories (4, object): ['High', 'Intermediate', 'Low', 'Not AD']
