In [None]:
import warnings
warnings.filterwarnings('ignore') 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score

# Model
import tensorflow as tf
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import layers, models, applications
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from tensorflow.keras.applications import MobileNetV2, ResNet50
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, LabelEncoder
from tensorflow.keras.applications.mobilenet import MobileNet, preprocess_input
from sklearn.metrics import adjusted_mutual_info_score, adjusted_rand_score
from sklearn.svm import SVC
from scipy.stats import iqr # For Silverman's rule
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

# RLAC MODEL
from sklearn.random_projection import GaussianRandomProjection
from scipy.stats import gaussian_kde
from scipy.signal import find_peaks  
from sklearn.neighbors import KernelDensity  
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from scipy import stats
from scipy.special import eval_hermitenorm  # For normalized Hermite polynomials H_n(x)
from scipy.stats import skew
from scipy.stats import norm
import diptest
from sklearn.metrics import (adjusted_mutual_info_score, adjusted_rand_score, 
                             homogeneity_score, completeness_score, v_measure_score,
                             fowlkes_mallows_score, silhouette_score, calinski_harabasz_score,
                             davies_bouldin_score)

In [None]:
import os

import sc_loader as loader  # Local File Parsing

#  LOAD DATA
# The series-matrix path is built with os.path.join rather than a hard-coded
# backslash: "\" is only a path separator on Windows, so the literal form
# fails to resolve on macOS/Linux. On Windows the resulting string is unchanged.
DATA_DIR = r'GSE65528 data'
SERIES_MATRIX = os.path.join(DATA_DIR, 'GSE65528_series_matrix.txt')

# Load the expression matrix and the accompanying metadata table.
raw_tpm, metadata_df = loader.load_gse65528_data(DATA_DIR, SERIES_MATRIX)

# Inspect: number of rows per combined label.
print("\n--- Metadata Summary ---")
print(metadata_df['Combined_Label'].value_counts())

In [None]:
# 1. PROFESSIONAL IMPORT SETUP
# os and sys must be imported in this cell: it is the first cell that uses
# them, so without these imports a fresh kernel raises NameError here.
import os
import sys

# Get the current notebook directory
current_dir = os.getcwd()

# Define path to the shared_utils folder (one level up)
shared_path = os.path.abspath(os.path.join(current_dir, '..', 'shared_utils'))

# Add to system path if not already there (keeps re-runs of this cell idempotent)
if shared_path not in sys.path:
    sys.path.append(shared_path)

import sc_processor as scp

In [None]:
# 3. PROCESSING
# Standard pipeline. The input is TPM, so pre-processing is limited to
# QC filtering followed by a log transform.
QC_PARAMS = dict(
    min_tpm=1,
    min_genes_per_sample=2000,
    min_samples_per_gene=3,
)

# Step 1: QC filter, then log1p transform
filtered_df, qc_metrics = scp.filter_tpm_matrix(raw_tpm, **QC_PARAMS)
log_df = scp.log_transform(filtered_df, method='log1p')

# Step 2: keep the 2000 most variable genes and scale them
df_hvg, hvg_metrics = scp.select_highly_variable_genes(log_df, n_top_genes=2000)
df_scaled = scp.scale_data(df_hvg)

# Step 3: PCA with 50 components
df_pca, var_ratio, pca_model = scp.run_pca_pipeline(df_scaled, n_components=50)

# Visualize
scp.plot_pca_results(df_pca, var_ratio)

In [None]:
# 1. PROFESSIONAL IMPORT SETUP
# os and sys are imported here so this cell does not depend on a later cell
# having been run first; without them a fresh kernel raises NameError.
import os
import sys

# Get the current notebook directory
current_dir = os.getcwd()

# Define path to the shared_utils folder (one level up)
shared_path = os.path.abspath(os.path.join(current_dir, '..', 'shared_utils'))

# Add to system path if not already there (keeps re-runs of this cell idempotent)
if shared_path not in sys.path:
    sys.path.append(shared_path)

import sc_clustering as scc

In [None]:
#  CLUSTERING BENCHMARK
# We test against Time, Status, AND the combination of both
TARGETS = ['TimePoint', 'Infection_Status', 'Combined_Label']

# Ensure we have the data
if 'df_pca' in locals() and 'metadata_df' in locals():

    # Candidate numbers of clusters to benchmark.
    k_range = [3, 4, 5]

    clustering_results = scc.run_clustering_benchmark(
        pca_df=df_pca,
        cell_metadata=metadata_df,
        n_clusters_range=k_range,
        target_cols=TARGETS
    )

    # --- DISPLAY RESULTS ---
    if not clustering_results.empty:
        print("\n=== Final Clustering Leaderboard ===")
        # Show the first 10 rows of the benchmark table
        print(clustering_results.head(10))

        # Optional: Save to CSV for your portfolio evidence
        # clustering_results.to_csv('clustering_benchmark_GSE65528.csv', index=False)
    else:
        # Make an empty benchmark visible instead of finishing silently.
        print("Warning: clustering benchmark returned no results.")

else:
    print("Error: PCA or Metadata not found. Please run previous cells.")

In [None]:
import os
import sys

# Resolve the directory one level above the current working directory
# (os.pardir is the platform's ".." component).
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Register it on Python's module search path, once.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# With the search path extended, the custom model modules can be imported.
from rlac import RLAC
from mdh import MDH

print("Successfully imported models from:", parent_dir)

In [None]:
from itertools import product

from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score

# Import your custom models
from rlac import RLAC
from mdh import MDH

# --- 1. DATA ---
# NOTE(review): `pca_results_df` and `filtered_cell_metadata` are not assigned
# by any earlier cell shown in this notebook (the pipeline cells produce
# `df_pca` and `metadata_df`), so this cell depends on leftover kernel state.
# Confirm the intended inputs and their row alignment before a
# Restart & Run All.
TARGET_COL = 'TimePoint_h'
X_train = pca_results_df
y_train = filtered_cell_metadata[TARGET_COL].astype(str)
n_clusters = y_train.nunique()  # k = number of distinct target labels

print(f"Number of Clusters (k): {n_clusters}")
print("-" * 80)

# --- 2. CONFIGURATION ---
rlac_methods = [
    'depth_ratio', 'dip', 'holes', 'min_kurt', 'max_kurt',
    'negentropy', 'skewness', 'fisher', 'hermite', 'friedman_tukey'
]

# Grid Search Parameters
rlac_params = {
    'random_state': [32, 42, 43, 44, 45],
    'bw_adjust': [0.05, 0.1, 0.2, 0.3, 0.4],
    'r': [None, 50, 100, 300, 500]
}

# MDH Fixed Parameters
mdh_config = {
    "h_multiplier": 1.0,
    "alphamax_val": 0.9,
    "alpha_steps": 5,
    "random_state": 42
}

# Score stored for a run that raised, so failures stay visible in the table
# and sort to the bottom of the AMI ranking.
FAILED_SCORE = -1

results = []


def _result_row(model_name, method, params, ami, ari):
    """Return one results-table row as a dict with the table's column names."""
    return {
        'Model': model_name,
        'Method': method,
        'Params': params,
        'AMI': ami,
        'ARI': ari
    }


def _score_labels(y_true, labels):
    """Return (AMI, ARI) of predicted `labels` against reference `y_true`."""
    return (
        adjusted_mutual_info_score(y_true, labels),
        adjusted_rand_score(y_true, labels),
    )


# ==========================================
# 3. RLAC LOOP
# ==========================================
# Full Cartesian grid, iterated in the order method -> r -> bw_adjust -> seed.
rlac_grid = list(product(
    rlac_methods,
    rlac_params['r'],
    rlac_params['bw_adjust'],
    rlac_params['random_state'],
))
print(f"\nStarting RLAC Grid Search ({len(rlac_grid)} runs)...")

for method, r_val, bw, seed in rlac_grid:

    # Format r for display (r=None is shown as "JL")
    r_str = "JL" if r_val is None else str(r_val)
    param_str = f"r={r_str}, bw={bw}, s={seed}"

    print(f"\nRunning RLAC {method:<15} | {param_str} ... ", end="")

    try:
        # Instantiate
        model = RLAC(
            n_clusters=n_clusters,
            method=method,
            r=r_val,
            bw_adjust=bw,
            random_state=seed,
            plot=False
        )

        # Fit, silencing UserWarnings only for the duration of the fit
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=UserWarning)
            model.fit(X_train)

        # Evaluate
        ami, ari = _score_labels(y_train, model.labels_)

        print(f"Done (AMI: {ami:.4f})")
        results.append(_result_row('RLAC', method, param_str, ami, ari))

    except Exception as e:
        # Best-effort sweep: report the failure, record it, keep going.
        print(f"FAILED. Error: {e}")
        results.append(
            _result_row('RLAC', method, param_str, FAILED_SCORE, FAILED_SCORE)
        )

# ==========================================
# 4. MDH RUN
# ==========================================
print("-" * 80)
# Progress line is derived from mdh_config so it cannot drift from the
# parameters actually passed to the model.
mdh_param_str = f"h={mdh_config['h_multiplier']}, a={mdh_config['alphamax_val']}"
print(f"Running MDH {'Standard':<15} | {mdh_param_str} ... ", end="")
try:
    mdh_model = MDH(
        n_clusters=n_clusters,
        h_multiplier=mdh_config['h_multiplier'],
        alphamax_val=mdh_config['alphamax_val'],
        alpha_steps=mdh_config['alpha_steps'],
        random_state=mdh_config['random_state'],
        verbose=False,
        plot=False
    )

    mdh_model.fit(X_train)
    ami_mdh, ari_mdh = _score_labels(y_train, mdh_model.labels_)

    print(f"Done (AMI: {ami_mdh:.4f})")
    results.append(_result_row('MDH', 'Standard', 'Fixed', ami_mdh, ari_mdh))

except Exception as e:
    print(f"FAILED. Error: {e}")
    # Record the failure the same way the RLAC loop does, so a failed MDH run
    # is not silently missing from the final table.
    results.append(
        _result_row('MDH', 'Standard', 'Fixed', FAILED_SCORE, FAILED_SCORE)
    )

# ==========================================
# 5. RESULTS TABLE
# ==========================================
if results:
    results_df = pd.DataFrame(results)

    print("\n" + "="*80)
    # Name the target column and k; previously the cluster count was printed
    # under the label "TARGET".
    print(f"FINAL RESULTS FOR TARGET: {TARGET_COL} (k={n_clusters}, Sorted by AMI)")
    print("="*80)

    # Sort and display from best AMI to worst
    results_df_sorted = results_df.sort_values(by='AMI', ascending=False)
    print(results_df_sorted.to_string(index=False))
else:
    print("No results collected.")

In [None]:
================================================================================
FINAL RESULTS FOR TARGET: 4 (Sorted by AMI)
================================================================================
Model         Method               Params       AMI       ARI
 RLAC       skewness  r=500, bw=0.1, s=43  0.251717  0.180310
 RLAC       skewness r=500, bw=0.05, s=43  0.251717  0.180310
 RLAC    depth_ratio   r=JL, bw=0.4, s=32  0.214503  0.126993
 RLAC    depth_ratio r=500, bw=0.05, s=44  0.214196  0.107437
 RLAC        hermite  r=300, bw=0.3, s=45  0.199606  0.134232
 RLAC        hermite  r=300, bw=0.2, s=45  0.199606  0.134232
 RLAC    depth_ratio  r=JL, bw=0.05, s=32  0.193110  0.126934
 RLAC friedman_tukey   r=JL, bw=0.2, s=42  0.189521  0.111926
 RLAC friedman_tukey   r=JL, bw=0.1, s=42  0.189521  0.111926
 RLAC friedman_tukey  r=JL, bw=0.05, s=42  0.189521  0.111926
