In [None]:
import warnings
warnings.filterwarnings('ignore') 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score

# Model
import tensorflow as tf
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import layers, models, applications
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from tensorflow.keras.applications import MobileNetV2, ResNet50
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, LabelEncoder
from tensorflow.keras.applications.mobilenet import MobileNet, preprocess_input
from sklearn.metrics import adjusted_mutual_info_score, adjusted_rand_score
from sklearn.svm import SVC
from scipy.stats import iqr # For Silverman's rule
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

# RLAC MODEL
from sklearn.random_projection import GaussianRandomProjection
from scipy.stats import gaussian_kde
from scipy.signal import find_peaks  
from sklearn.neighbors import KernelDensity  
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from scipy import stats
from scipy.special import eval_hermitenorm  # For normalized Hermite polynomials H_n(x)
from scipy.stats import skew
from scipy.stats import norm
import diptest
from sklearn.metrics import (adjusted_mutual_info_score, adjusted_rand_score, 
                             homogeneity_score, completeness_score, v_measure_score,
                             fowlkes_mallows_score, silhouette_score, calinski_harabasz_score,
                             davies_bouldin_score)

In [None]:
import sc_loader as loader  # NOTE(review): must be importable from the notebook's sys.path at this point — confirm location

# ---- Dataset locations (relative to the notebook's working directory) ----
DATA_DIR = r'GSE55291 data'
SRA_PATH = r'SraRunTable.csv'

# Delegate loading to the loader module, which returns the raw expression
# matrix together with the per-sample metadata table.
# (Per the original note, the loader performs the "single-cell RNA-seq"
# sample selection internally.)
raw_matrix, metadata_df = loader.load_dataset_5(
    DATA_DIR,
    SRA_PATH,
)

In [None]:
# 1. PROFESSIONAL IMPORT SETUP
# `os` and `sys` are imported here because this cell uses them and no earlier
# cell imports them; without these lines a fresh-kernel "Run All" stops with a
# NameError on `os.getcwd()`.
import os
import sys

# Current working directory of the kernel (the notebook's directory when the
# kernel is started from there).
current_dir = os.getcwd()

# Define path to the shared_utils folder (one level up)
shared_path = os.path.abspath(os.path.join(current_dir, '..', 'shared_utils'))

# Add to system path if not already there (keeps re-runs of this cell idempotent)
if shared_path not in sys.path:
    sys.path.append(shared_path)

import sc_processor as scp

In [None]:
#  PROCESSING (Using Shared Module)
# Runs the shared sc_processor pipeline on the loaded matrix:
# QC filter -> log1p -> distribution plot -> HVG selection -> scaling -> PCA.

# Config for RPKM data (similar to FPKM)
QC_PARAMS = {
    'min_tpm': 1,               # RPKM > 1
    'min_genes_per_sample': 3000,
    'min_samples_per_gene': 5
}

# 1. Filter
# The loader cell binds the expression matrix to `raw_matrix`; the previous
# reference to `raw_df` was never defined in this notebook and raised a
# NameError on a fresh kernel.
filtered_df, qc_metrics = scp.filter_tpm_matrix(raw_matrix, **QC_PARAMS)

# 2. Normalize (Log1p)
log_df = scp.log_transform(filtered_df, method='log1p')

# 3. Visualize Normalization
scp.plot_expression_distribution(log_df, title="Log(RPKM+1) Distribution")

# 4. Feature Selection (HVG) & PCA
# Keep the 3000 most variable genes, scale them, then project onto 30 components.
df_hvg, hvg_metrics = scp.select_highly_variable_genes(log_df, n_top_genes=3000)
df_scaled = scp.scale_data(df_hvg)
df_pca, var_ratio, pca_model = scp.run_pca_pipeline(df_scaled, n_components=30)

# Visualize PCA
scp.plot_pca_results(df_pca, var_ratio)

In [None]:
# 1. PROFESSIONAL IMPORT SETUP
# `os` and `sys` are imported in this cell so it does not depend on a later
# cell having been executed first (they were otherwise undefined on a fresh
# kernel, producing a NameError).
import os
import sys

# Current working directory of the kernel (the notebook's directory when the
# kernel is started from there).
current_dir = os.getcwd()

# Define path to the shared_utils folder (one level up)
shared_path = os.path.abspath(os.path.join(current_dir, '..', 'shared_utils'))

# Add to system path if not already there (keeps re-runs of this cell idempotent)
if shared_path not in sys.path:
    sys.path.append(shared_path)

import sc_clustering as scc

In [None]:
import sc_clustering as scc  # shared clustering module

#  CLUSTERING BENCHMARK
# Metadata columns that the cluster assignments are scored against.
TARGETS = ['cell_type']

# Guard first: the benchmark needs both the PCA embedding and the metadata
# table produced by earlier cells.
if 'df_pca' not in locals() or 'metadata_df' not in locals():
    print("Error: PCA or Metadata not found. Please run previous cells.")
else:
    # Cluster counts to evaluate
    k_range = [3, 5]

    clustering_results = scc.run_clustering_benchmark(
        pca_df=df_pca,
        cell_metadata=metadata_df,
        n_clusters_range=k_range,
        target_cols=TARGETS,
    )

    # --- DISPLAY RESULTS ---
    # Only the first ten rows of a non-empty leaderboard are shown.
    if not clustering_results.empty:
        print("\n=== Final Clustering Leaderboard ===")
        print(clustering_results.head(10))

        # Optional: persist the leaderboard next to the notebook
        # clustering_results.to_csv('clustering_benchmark_GSE55291.csv', index=False)

In [None]:
Clustering results:
        Method  n_clusters  AMI_cell_type  ARI_cell_type  
0       KMeans           2       0.690935       0.598930    
1       HClust           2       0.048250       0.038386   
2         Ncut           2       0.591832       0.536889    
3         RLAC           2       0.461140       0.547696    
4     RLAC-Dip           2       0.591832       0.536889    
5   RLAC-Holes           2       0.139126       0.084130    
6     RLAC-Min           2       0.485661       0.496004    
7     RLAC-Max           2      -0.016632      -0.016451    
8    Mdh Model           2       0.087522       0.051396

In [None]:
import os
import sys

# Make the project root (the directory above the working directory)
# importable so the custom model modules can be found.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Append only once so re-running the cell does not duplicate the entry.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Custom clustering models located via the path added above.
from rlac import RLAC
from mdh import MDH

print("Successfully imported models from:", parent_dir)

In [None]:
# Benchmark RLAC (over a full hyper-parameter grid) and MDH against the
# `cell_type` labels, then print every run sorted by AMI.
#
# NOTE(review): `single_cell_scaled_data` and `single_cell_metadata` are not
# defined by any cell visible above (those cells produce `df_scaled` / `df_pca`
# and `metadata_df`) — confirm where these names come from before relying on a
# fresh-kernel run.
X_train = single_cell_scaled_data
y_train = single_cell_metadata['cell_type']

from itertools import product

from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score
# Import your custom models
from rlac import RLAC
from mdh import MDH

# Scoring compares one predicted label per row of X_train with y_train. A
# length mismatch would make every single run fail, so stop here with a clear
# message instead of producing a table made only of failure sentinels.
if len(X_train) != len(y_train):
    raise ValueError(
        f"X_train has {len(X_train)} rows but y_train has {len(y_train)} labels; "
        "both must describe the same samples."
    )

# --- CONFIGURATION ---
n_clusters = len(set(y_train))

# Score recorded for a run that raised, so it still appears (last) in the table.
FAILED_SCORE = -1

# RLAC Parameters
rlac_methods = [
    'depth_ratio', 'dip', 'holes', 'min_kurt', 'max_kurt',
    'negentropy', 'skewness', 'fisher', 'hermite', 'friedman_tukey'
]
rlac_params = {
    'random_state': [32, 42, 43, 44, 45],
    'bw_adjust': [0.05, 0.1, 0.2, 0.3, 0.4],
    'r': [None, 50, 100, 200, 300, 500]
}

# Full grid in the same order as nested loops over method -> r -> bw -> seed
# (the last factor varies fastest).
rlac_grid = list(product(
    rlac_methods,
    rlac_params['r'],
    rlac_params['bw_adjust'],
    rlac_params['random_state'],
))

# MDH Parameters
mdh_config = {
    "h_multiplier": 1.0,
    "alphamax_val": 0.9,
    "alpha_steps": 5,
    "random_state": 42
}

results = []
rlac_failures = []  # (method, params, error) for every RLAC run that raised

print(f"\nStarting Benchmark on Single Cell Data (n={len(X_train)}, k={n_clusters})...")
print(f"\nTotal RLAC Combinations: {len(rlac_grid)}")
print("-" * 80)

# ==========================================
# 1. RLAC LOOP
# ==========================================
for method, r_val, bw, seed in rlac_grid:

    param_str = f"r={r_val}, bw={bw}, s={seed}"
    # end="" keeps the cursor on this line so the outcome is appended to it
    print(f"\n\rRunning RLAC {method:<15} | {param_str} ... ", end="")

    try:
        model = RLAC(
            n_clusters=n_clusters,
            method=method,
            r=r_val,
            bw_adjust=bw,
            random_state=seed,
            plot=False
        )

        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=UserWarning)
            model.fit(X_train)

        ami = adjusted_mutual_info_score(y_train, model.labels_)
        ari = adjusted_rand_score(y_train, model.labels_)

        results.append({
            'Model': 'RLAC',
            'Method': method,
            'Params': param_str,
            'AMI': ami,
            'ARI': ari
        })
        print(f"Done (AMI: {ami:.4f})", end="")

    except Exception as exc:
        # Best-effort: one failing configuration must not abort the sweep.
        # The run is still listed with the sentinel score, and the reason is
        # kept so failures can be diagnosed instead of vanishing.
        results.append({
            'Model': 'RLAC', 'Method': method, 'Params': param_str,
            'AMI': FAILED_SCORE, 'ARI': FAILED_SCORE
        })
        rlac_failures.append((method, param_str, repr(exc)))
        print(f"FAILED ({type(exc).__name__})", end="")

print("\nRLAC Loop Complete.")
if rlac_failures:
    print(f"{len(rlac_failures)} of {len(rlac_grid)} RLAC runs failed "
          f"and were scored {FAILED_SCORE}. First errors:")
    for failed_method, failed_params, reason in rlac_failures[:5]:
        print(f"  {failed_method} | {failed_params} -> {reason}")

# ==========================================
# 2. MDH RUN
# ==========================================
# The banner is built from mdh_config so it cannot drift from the values used.
print(f"\nRunning MDH {'Standard':<15} | "
      f"h={mdh_config['h_multiplier']}, a={mdh_config['alphamax_val']} ... ")
try:
    mdh_model = MDH(
        n_clusters=n_clusters,
        h_multiplier=mdh_config['h_multiplier'],
        alphamax_val=mdh_config['alphamax_val'],
        alpha_steps=mdh_config['alpha_steps'],
        random_state=mdh_config['random_state'],
        verbose=False,
        plot=False
    )

    mdh_model.fit(X_train)

    ami_mdh = adjusted_mutual_info_score(y_train, mdh_model.labels_)
    ari_mdh = adjusted_rand_score(y_train, mdh_model.labels_)

    print(f"Done (AMI: {ami_mdh:.4f})")

    results.append({
        'Model': 'MDH',
        'Method': 'Standard',
        'Params': 'Fixed',
        'AMI': ami_mdh,
        'ARI': ari_mdh
    })

except Exception as e:
    print(f"MDH FAILED. Error: {e}")

# ==========================================
# 3. RESULTS TABLE
# ==========================================
print("\n" + "="*80)
print("FINAL RESULTS (ALL MODELS - SORTED BY AMI)")
print("="*80)

# Create DataFrame and sort. Explicit columns keep the sort key present even
# if no run produced a row.
results_df = pd.DataFrame(
    results, columns=['Model', 'Method', 'Params', 'AMI', 'ARI']
).sort_values(by='AMI', ascending=False)

# Print everything
print(results_df.to_string(index=False))

In [None]:
================================================================================
FINAL RESULTS (ALL MODELS - SORTED BY AMI)
================================================================================
Model         Method                Params       AMI       ARI
  MDH       Standard                 Fixed  0.802932  0.701835
 RLAC         fisher   r=200, bw=0.2, s=44  0.754999  0.735711
 RLAC         fisher   r=200, bw=0.4, s=44  0.754999  0.735711
 RLAC         fisher   r=200, bw=0.1, s=44  0.754999  0.735711
 RLAC         fisher  r=200, bw=0.05, s=44  0.754999  0.735711
 RLAC     negentropy   r=200, bw=0.4, s=44  0.754999  0.735711
 RLAC         fisher   r=200, bw=0.3, s=44  0.754999  0.735711
 RLAC     negentropy   r=200, bw=0.2, s=44  0.754999  0.735711
 RLAC     negentropy   r=200, bw=0.3, s=44  0.754999  0.735711
 RLAC friedman_tukey   r=300, bw=0.4, s=45  0.722946  0.731874
 RLAC         fisher   r=200, bw=0.4, s=45  0.698183  0.702977
 RLAC         fisher   r=500, bw=0.1, s=44  0.697274  0.706558
 RLAC         fisher   r=500, bw=0.3, s=44  0.697274  0.706558
 RLAC         fisher   r=500, bw=0.2, s=44  0.697274  0.706558
 RLAC         fisher   r=500, bw=0.4, s=44  0.697274  0.706558
 RLAC         fisher  r=500, bw=0.05, s=44  0.697274  0.706558
 RLAC            dip   r=200, bw=0.4, s=45  0.690207  0.692245
 RLAC        hermite   r=200, bw=0.4, s=45  0.688946  0.684325
 RLAC         fisher r=None, bw=0.05, s=32  0.678041  0.570674
 RLAC         fisher  r=None, bw=0.3, s=32  0.678041  0.570674