In [2]:
pip install pandas numpy scikit-learn matplotlib seaborn

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns
import csv

In [4]:
# Load SVD files
chi_df = pd.read_csv("CHI_trunc.csv")
nyc_df = pd.read_csv("NYC_trunc.csv")

# Add back state labels for alignment and safety checks
chi_df["state"] = "CHI"
nyc_df["state"] = "NYC"

full_df = pd.concat([chi_df, nyc_df], axis=0).reset_index(drop=True)

# Prototype
proto_df = full_df.sample(frac=0.05, random_state=42)

# Select features
print("Selecting relevant features for DBSCAN...")

keep_cols = (
    [f"unit_svd_{i}" for i in range(1, 21)] +
    ['us_aqi_max', 'us_aqi_pm2_5_max', 'us_aqi_pm10_max',
     'us_aqi_ozone_max', 'us_aqi_carbon_monoxide_max', 'us_aqi_sulphur_dioxide_max',
     'us_aqi_nitrogen_dioxide_max', 'us_aqi_max_lag',
     'apparent_temperature_max', 'relative_humidity_2m_mean',
     'wind_speed_10m_mean']
)

# Check for missing columns
missing = [col for col in keep_cols if col not in proto_df.columns]
if missing:
    print("Missing columns:", missing)

# Filter and cast for DBSCAN
proto_df = proto_df[keep_cols].dropna()
X_proto = proto_df.astype(float)

Selecting relevant features for DBSCAN...


In [5]:
# Parameter sweep
print("Sweeping DBSCAN parameters...")
eps_values = np.linspace(0.5, 5.0, 10)
min_samples_values = [3, 5, 10]

best_score = -1
best_params = None

for eps in eps_values:
    for min_samples in min_samples_values:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        labels = dbscan.fit_predict(X_proto)
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        n_noise = list(labels).count(-1)

        if n_clusters > 1:
            score = silhouette_score(X_proto, labels)
            print(f"→ eps={eps:.2f}, min_samples={min_samples}, clusters={n_clusters}, noise={n_noise}, silhouette={score:.4f}")
            if score > best_score:
                best_score = score
                best_params = (eps, min_samples)
        else:
            print(f"→ eps={eps:.2f}, min_samples={min_samples}, clusters={n_clusters}, insufficient")

Sweeping DBSCAN parameters...
→ eps=0.50, min_samples=3, clusters=1454, noise=0, silhouette=0.7518
→ eps=0.50, min_samples=5, clusters=1453, noise=4, silhouette=0.7518
→ eps=0.50, min_samples=10, clusters=1331, noise=985, silhouette=0.7190
→ eps=1.00, min_samples=3, clusters=836, noise=0, silhouette=0.3017
→ eps=1.00, min_samples=5, clusters=835, noise=4, silhouette=0.3017
→ eps=1.00, min_samples=10, clusters=762, noise=591, silhouette=0.2833
→ eps=1.50, min_samples=3, clusters=204, noise=0, silhouette=-0.2228
→ eps=1.50, min_samples=5, clusters=203, noise=4, silhouette=-0.2228
→ eps=1.50, min_samples=10, clusters=185, noise=143, silhouette=-0.2218
→ eps=2.00, min_samples=3, clusters=71, noise=0, silhouette=-0.0625
→ eps=2.00, min_samples=5, clusters=71, noise=0, silhouette=-0.0625
→ eps=2.00, min_samples=10, clusters=67, noise=34, silhouette=-0.0635
→ eps=2.50, min_samples=3, clusters=31, noise=0, silhouette=0.0585
→ eps=2.50, min_samples=5, clusters=31, noise=0, silhouette=0.0585
→ e

In [6]:
# DBSCAN Fit & Visualization
if best_params:
    print(f"\n Best DBSCAN params: eps={best_params[0]:.2f}, min_samples={best_params[1]}")
    dbscan_final = DBSCAN(eps=best_params[0], min_samples=best_params[1])
    labels_final = dbscan_final.fit_predict(X_proto)
    proto_df['Cluster'] = labels_final

    silhouette_final = silhouette_score(X_proto, labels_final)
    print(f" Final silhouette score: {silhouette_final:.4f}")

    plt.figure(figsize=(8, 6))
    sns.scatterplot(
        x=X_proto.iloc[:, 0], y=X_proto.iloc[:, 1],
        hue=proto_df['Cluster'], palette='tab20', alpha=0.6, legend='full'
    )
    plt.xlabel("SVD Component 1")
    plt.ylabel("SVD Component 2")
    plt.title(f"DBSCAN Clustering (eps={best_params[0]:.2f}, min_samples={best_params[1]})")
    plt.grid(True)
    plt.tight_layout()
    plt.savefig("dbscan_proto_clusters.png")
    plt.close()
    print(" Saved: dbscan_proto_clusters.png")


 Best DBSCAN params: eps=0.50, min_samples=3
 Final silhouette score: 0.7518


  plt.tight_layout()


 Saved: dbscan_proto_clusters.png


In [7]:
# DBSCAN on full dataset
print("Running DBSCAN on full dataset...")
full_df_cluster_input = full_df[keep_cols].dropna().astype(float)
full_df['Cluster'] = dbscan_final.fit_predict(full_df_cluster_input)
full_df.to_csv("full_with_dbscan.csv", index=False)
print("Saved: full_with_dbscan.csv")

Running DBSCAN on full dataset...
Saved: full_with_dbscan.csv


In [8]:
# Combine with final_cleaned.csv
print("Concatenating DBSCAN clusters with final_cleaned.csv...")

# Load final cleaned dataset
final_df = pd.read_csv("final_cleaned.csv")

# Sort final_df by city then date, to align with CHI followed by NYC in full_df
final_df = final_df.sort_values(by=["state", "date"]).reset_index(drop=True)

# Safety check: row count cmatch
if len(final_df) != len(full_df):
    raise ValueError(f" Row count mismatch: final_cleaned={len(final_df)}, full_df={len(full_df)}")

# Assign cluster labels from full_df to final_df
final_df['Cluster'] = full_df['Cluster'].values

# Save output
final_df.to_csv("trunc-svd_dbscan_final.csv", index=False)
print(" Final dataset saved: trunc-svd_dbscan_final.csv")

Concatenating DBSCAN clusters with final_cleaned.csv...


  final_df = pd.read_csv("final_cleaned.csv")


 Final dataset saved: trunc-svd_dbscan_final.csv
