In [None]:
import warnings
warnings.filterwarnings('ignore') 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score

# Model
import tensorflow as tf
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import layers, models, applications
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from tensorflow.keras.applications import MobileNetV2, ResNet50
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, LabelEncoder
from tensorflow.keras.applications.mobilenet import MobileNet, preprocess_input
from sklearn.metrics import adjusted_mutual_info_score, adjusted_rand_score
from sklearn.svm import SVC
from scipy.stats import iqr # For Silverman's rule
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

# RLAC MODEL
from sklearn.random_projection import GaussianRandomProjection
from scipy.stats import gaussian_kde
from scipy.signal import find_peaks  
from sklearn.neighbors import KernelDensity  
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from scipy import stats
from scipy.special import eval_hermitenorm  # For normalized Hermite polynomials H_n(x)
from scipy.stats import skew
from scipy.stats import norm
from sklearn.metrics import (adjusted_mutual_info_score, adjusted_rand_score, 
                             homogeneity_score, completeness_score, v_measure_score,
                             fowlkes_mallows_score, silhouette_score, calinski_harabasz_score,
                             davies_bouldin_score)

In [None]:
import sc_loader as loader  # project-local loader module

# --- Configuration ---
# NOTE(review): the leading backslash makes this path resolve against the
# drive root on Windows -- confirm that is where SraRunTable.csv really lives.
METADATA_PATH = r"\SraRunTable.csv"
DATA_PATH = r"GSE42268 data"

# --- 1. LOAD DATA ---
# The loader hands back the expression matrix and the sample metadata together.
raw_counts_df, metadata_df = loader.load_gse42268_data(DATA_PATH, METADATA_PATH)

# --- 2. INSPECT ---
# Peek at the top-left 5x5 corner of the expression matrix.
print("\n--- Expression Matrix Head (Genes x Samples) ---")
print(raw_counts_df.iloc[:5, :5])

print("\n--- Metadata Head ---")
print(metadata_df.head())

# --- 3. ALIGNMENT CHECK ---
# Matrix columns must equal the metadata index (same labels, same order).
# On a mismatch, both frames are restricted to the labels they share.
if not raw_counts_df.columns.equals(metadata_df.index):
    print("\nWARNING: Alignment mismatch. Check sample IDs.")
    common = raw_counts_df.columns.intersection(metadata_df.index)
    raw_counts_df = raw_counts_df[common]
    metadata_df = metadata_df.loc[common]
else:
    print("\nSUCCESS: Data and Metadata are perfectly aligned.")

In [None]:
# 1. PROFESSIONAL IMPORT SETUP
# os and sys are not part of the notebook's top import cell, so they are
# imported here; otherwise this cell raises NameError on a fresh kernel.
import os
import sys

# Get the current notebook directory
current_dir = os.getcwd()

# Define path to the shared_utils folder (one level up)
shared_path = os.path.abspath(os.path.join(current_dir, '..', 'shared_utils'))

# Add to system path if not already there (keeps re-runs from duplicating it)
if shared_path not in sys.path:
    sys.path.append(shared_path)

import sc_processor as scp

In [None]:
#  SETUP & CONFIGURATION
# Keyword arguments forwarded verbatim to scp.filter_tpm_matrix.
QC_CONFIG = {
    'min_tpm': 1,              # FPKM > 1 is a standard detection threshold
    'min_genes_per_sample': 1000, # High quality Smart-Seq usually has >2000 genes
    'min_samples_per_gene': 2
}

#  FILTERING & NORMALIZATION
# The processing module is imported as `scp`; every call below goes through
# that alias so the cell works on a fresh kernel.
if 'raw_counts_df' in locals():
    
    # Filter, Note: The function says 'tpm', but it works mathematically identical for FPKM
    filtered_df, qc_metrics = scp.filter_tpm_matrix(
        raw_counts_df, 
        **QC_CONFIG
    )
    # Log Transform (log(FPKM + 1))
    log_df = scp.log_transform(filtered_df, method='log1p')

    # Visual Check
    print("\n--- QC Summary ---")
    print(qc_metrics.describe())
    scp.plot_expression_distribution(log_df, title="Log(FPKM+1) Distribution")

else:
    print("Error: raw_counts_df not found. Please run the Loading step first.")

# 3. FEATURE SELECTION & PCA
# Skipped when the filtering step above did not run or left nothing behind.
if 'log_df' in locals() and not log_df.empty:
    
    # 1. Select Highly Variable Genes (HVG)
    df_hvg, hvg_metrics = scp.select_highly_variable_genes(log_df, n_top_genes=3000)
    scp.plot_hvg_dispersion(hvg_metrics)

    # 2. Scale Data (Z-score)
    df_scaled = scp.scale_data(df_hvg)

    # 3. Run PCA
    df_pca, var_ratio, pca_model = scp.run_pca_pipeline(df_scaled, n_components=50)

    # 4. Visualize Results
    scp.plot_pca_results(df_pca, var_ratio)
    
    print("PCA processing complete.")

In [None]:
# 1. PROFESSIONAL IMPORT SETUP
# os and sys are not part of the notebook's top import cell, so they are
# imported here; otherwise this cell raises NameError on a fresh kernel.
import os
import sys

# Get the current notebook directory
current_dir = os.getcwd()

# Define path to the shared_utils folder (one level up)
shared_path = os.path.abspath(os.path.join(current_dir, '..', 'shared_utils'))

# Add to system path if not already there (keeps re-runs from duplicating it)
if shared_path not in sys.path:
    sys.path.append(shared_path)

import sc_clustering as scc 

In [None]:
#  CLUSTERING BENCHMARK
# Metadata column used as ground truth for the benchmark.
TARGET_COL = 'cell_type' 

# Check if the column exists in our metadata; fall back to the filename-derived group
if 'metadata_df' in locals() and TARGET_COL not in metadata_df.columns:
    print(f"Warning: '{TARGET_COL}' not found. Switching to 'Filename_Group'.")
    TARGET_COL = 'Filename_Group'

if 'df_pca' in locals() and 'metadata_df' in locals():
    
    print(f"Running Benchmark against Ground Truth: {TARGET_COL}")
    
    # Define range of K to test
    # (e.g., if we expect 3 cell types, we test 2, 3, 4, 5)
    k_range = [2, 3, 4, 5, 6]

    # The clustering module is imported as `scc`, so the call goes through that alias.
    clustering_results = scc.run_baseline_clustering(
        pca_df=df_pca, 
        cell_metadata=metadata_df, 
        n_clusters_range=k_range,
        true_label_col=TARGET_COL
    )
    # --- DISPLAY LEADERBOARD ---
    if not clustering_results.empty:
        print("\n=== Final Clustering Leaderboard ===")
        print(clustering_results)
        
        # Grab the best result
        # NOTE(review): assumes run_baseline_clustering returns rows sorted
        # best-first -- TODO confirm, otherwise the "Winner" line is arbitrary.
        best = clustering_results.iloc[0]
        print(f"\nWinner: {best['Method']} with k={best['k']} (AMI={best['AMI']:.3f})")
else:
    print("Required data (PCA or Metadata) is missing.")

In [None]:
import os
import sys

# Resolve the directory one level above the notebook's working directory
# (the project root that holds rlac.py and mdh.py).
parent_dir = os.path.normpath(os.path.join(os.getcwd(), os.pardir))

# Register it on the module search path once, so re-running the cell
# does not append duplicates.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# With the path in place the custom models import like any other module.
from rlac import RLAC
from mdh import MDH

print("Successfully imported models from:", parent_dir)

In [None]:
# Feature matrix and ground-truth labels for the RLAC / MDH benchmark.
# The PCA cell produces `df_pca` and the loading cell produces `metadata_df`;
# the labels use TARGET_COL so this benchmark is scored against the same
# ground truth as the baseline clustering benchmark.
X_train = df_pca

# QC filtering may have dropped samples, so keep only the labels of the rows
# that reached PCA, in the same order.
# NOTE(review): assumes df_pca is indexed by the same sample IDs as
# metadata_df -- TODO confirm against run_pca_pipeline.
y_train = metadata_df[TARGET_COL].reindex(X_train.index)
if y_train.isna().any():
    raise ValueError(
        f"{int(y_train.isna().sum())} of {len(y_train)} PCA rows have no "
        f"'{TARGET_COL}' label in metadata_df; check that sample IDs match."
    )

from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score
# Import your custom models
from rlac import RLAC
from mdh import MDH

# --- CONFIGURATION ---
# Number of clusters requested from every model = number of distinct true labels.
n_clusters = y_train.nunique()

# RLAC Parameters
# Projection-index methods to sweep.
rlac_methods = [
    'depth_ratio', 'dip', 'holes', 'min_kurt', 'max_kurt', 
    'negentropy', 'skewness', 'fisher', 'hermite', 'friedman_tukey'
]
# Grid swept for every method (full Cartesian product).
rlac_params = {
    'random_state': [32, 42, 43, 44, 45],
    'bw_adjust': [0.05, 0.1, 0.2, 0.3, 0.4, 0.5],
    'r': [None, 50, 100, 200, 300, 500] 
}

# MDH Parameters
# Keys mirror the keyword arguments passed to the MDH constructor.
mdh_config = {
    "h_multiplier": 1.0,
    "alphamax_val": 0.9,
    "alpha_steps": 5,
    "random_state": 42
}

# One dict per fitted configuration; filled by the benchmark loops.
results = []

print(f"\nStarting Benchmark on Single Cell Data (n={len(X_train)}, k={n_clusters})...")
print(f"\nTotal RLAC Combinations: {len(rlac_methods) * len(rlac_params['r']) * len(rlac_params['bw_adjust']) * len(rlac_params['random_state'])}")
print("-" * 80)

# ==========================================
# 1. RLAC LOOP
# ==========================================
from itertools import product

# product() walks the grid in the same order as four nested loops would:
# method -> r -> bw_adjust -> random_state.
rlac_grid = product(
    rlac_methods,
    rlac_params['r'],
    rlac_params['bw_adjust'],
    rlac_params['random_state'],
)

for method, r_val, bw, seed in rlac_grid:
    param_str = f"r={r_val}, bw={bw}, s={seed}"
    # end="" leaves the line open so the outcome is appended to it below
    print(f"Running RLAC {method:<15} | {param_str} ... ", end="")

    try:
        model = RLAC(
            n_clusters=n_clusters,
            method=method,
            r=r_val,
            bw_adjust=bw,
            random_state=seed,
            plot=False
        )

        # Silence UserWarnings raised while fitting this one configuration.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=UserWarning)
            model.fit(X_train)

        ami = adjusted_mutual_info_score(y_train, model.labels_)
        ari = adjusted_rand_score(y_train, model.labels_)
    except Exception as e:
        # One failing configuration must not abort the sweep. Record the run
        # as NaN rather than -1: AMI/ARI can legitimately be negative, so a
        # numeric sentinel would read as a real (very bad) score. NaN rows
        # still sort to the bottom of a descending sort.
        ami = ari = float("nan")
        print(f"FAILED ({type(e).__name__}: {e})")
    else:
        print(f"Done (AMI: {ami:.4f})")

    results.append({
        'Model': 'RLAC',
        'Method': method,
        'Params': param_str,
        'AMI': ami,
        'ARI': ari
    })

print("\nRLAC Loop Complete.")

# ==========================================
# 2. MDH RUN
# ==========================================
# The progress line is derived from mdh_config so it cannot drift from the
# values actually passed to the model.
print(f"\nRunning MDH {'Standard':<15} | h={mdh_config['h_multiplier']}, a={mdh_config['alphamax_val']} ... ")
try:
    # mdh_config keys match the constructor's keyword names one-to-one.
    mdh_model = MDH(
        n_clusters=n_clusters,
        **mdh_config,
        verbose=False,
        plot=False
    )
    
    mdh_model.fit(X_train)
    
    ami_mdh = adjusted_mutual_info_score(y_train, mdh_model.labels_)
    ari_mdh = adjusted_rand_score(y_train, mdh_model.labels_)
    
    print(f"Done (AMI: {ami_mdh:.4f})")
    
    results.append({
        'Model': 'MDH',
        'Method': 'Standard',
        'Params': 'Fixed',
        'AMI': ami_mdh,
        'ARI': ari_mdh
    })
    
except Exception as e:
    # Report the failure and carry on; no MDH row is added to `results`.
    print(f"MDH FAILED. Error: {e}")

# ==========================================
# 3. RESULTS TABLE
# ==========================================
# Banner: blank line, rule, title, rule.
print(
    "\n" + "="*80,
    "FINAL RESULTS (ALL MODELS - SORTED BY AMI)",
    "="*80,
    sep="\n",
)

# Collect every recorded run into a frame, best AMI first.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='AMI', ascending=False)

# Full, untruncated table without the index column.
print(results_df.to_string(index=False))