<a href="https://colab.research.google.com/github/cwjng/si670-final-project/blob/main/si670_final_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Git repo to clone
# git clone https://github.com/cwjng/si670-final-project.git

In [1]:
import pandas as pd
import sklearn as sk
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error

from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, f1_score
from xgboost import XGBClassifier

from imblearn.over_sampling import SMOTE

In [2]:
cmr_df = pd.read_csv("si670-final-project/cancer_mortality_rates.csv")
cmr_df.head()

Unnamed: 0,cmRate,population,medianIncome,pctPoverty,avgHouse,medianAge,medianAgeM,medianAgeF,pctHS,pctBach,...,pctBlack,pctAsian,pctOther,rmRace,pctMarried,pctMarriedHouse,birthRate,region,state,county
0,191.2,25859,38013,18.6,2.28,45.4,44.6,46.2,36.4,7.9,...,4.925408,0.338357,0.065365,white,62.1,57.173258,4.68779,Southeast,Alabama,Cherokee County
1,187.9,82005,39922,17.2,2.55,40.5,39.4,41.9,34.7,9.1,...,1.190638,0.557031,0.737356,white,57.6,54.850257,3.913309,Southeast,Alabama,Cullman County
2,217.4,41131,26602,35.2,2.53,38.7,36.2,40.0,34.6,8.2,...,69.184419,0.341605,0.161313,black,33.4,30.987632,8.433257,Southeast,Alabama,Dallas County
3,193.8,81468,54298,14.4,2.65,37.9,36.7,39.1,35.4,13.4,...,21.218875,0.516326,0.678529,white,51.0,54.326202,5.800147,Southeast,Alabama,Elmore County
4,205.8,16759,35664,20.6,2.36,43.2,41.3,46.1,38.8,8.4,...,12.387547,0.390625,0.0,white,53.6,51.499218,5.730897,Southeast,Alabama,Fayette County


In [7]:
# corr_matrix = cmr_df.drop(columns=["rmRace", "region",
#                      "state", "county"]).corr()

In [3]:
corr = cmr_df["pctMarried"].corr(cmr_df["pctMarriedHouse"])
print("Correlation:", corr)

Correlation: 0.8487861233872582


In [None]:
# cmr_df.info()

In [4]:
from sklearn.cluster import KMeans
# import numpy as np

# # Step 1: Run K-means on cmRate only
kmeans = KMeans(n_clusters=3, random_state=42)
cmr_df["cluster_raw"] = kmeans.fit_predict(cmr_df[["cmRate"]])

# Step 2: Sort clusters by actual mean cmRate
# (so cluster 0 = low, 1 = medium, 2 = high)
cluster_order = (
    cmr_df.groupby("cluster_raw")["cmRate"]
    .mean()
    .sort_values()
    .index
)

# Mapping raw clusters → ordered levels (0=low, 1=med, 2=high)
cluster_map = {old: new for new, old in enumerate(cluster_order)}
cmr_df["cmClass"] = cmr_df["cluster_raw"].map(cluster_map)

# Step 3: Drop the raw cluster
cmr_df = cmr_df.drop(columns=["cluster_raw"])

# Step 4: Check distribution
print(cmr_df["cmClass"].value_counts())

# Step 5: Extract labels per county
county_labels = cmr_df[["county", "state", "cmRate", "cmClass"]]
county_labels.head()


cmClass
1    263
0    142
2    132
Name: count, dtype: int64


Unnamed: 0,county,state,cmRate,cmClass
0,Cherokee County,Alabama,191.2,1
1,Cullman County,Alabama,187.9,1
2,Dallas County,Alabama,217.4,2
3,Elmore County,Alabama,193.8,1
4,Fayette County,Alabama,205.8,2


In [9]:
# def categorize_cmrates(rate, q1, median, q3):
#   if rate < q1:
#     return "Low"
#   elif rate < median:
#     return "Medium"
#   return "High"

# quartile_1 = cmr_df["cmRate"].quantile(0.25)
# median = cmr_df["cmRate"].quantile(0.5)
# quartile_3 = cmr_df["cmRate"].quantile(0.75)

# cmr_df["cmClass"] = cmr_df["cmRate"].apply(lambda x: categorize_cmrates(x, quartile_1,
#                                                                                median, quartile_3))
# cmr_df.head()

In [5]:
# log_transform_cols = ["population", "medianIncome", "pctPoverty"]
X = cmr_df.drop(columns=["cmRate", "cmClass", "medianAgeM",
                         "medianAgeF", "pctMarriedHouse", "rmRace"])
y = cmr_df["cmClass"]

cat_cols = X.select_dtypes(include="object").columns
num_cols = X.select_dtypes(include="number").columns

In [6]:
# Create a pipeline for numeric features
# numeric_pipeline = Pipeline([
#     # Log-transform selected features
#     ("log", ColumnTransformer([
#         ("log_features", FunctionTransformer(np.log1p, validate=True), log_transform_cols)
#     ], remainder="passthrough")),  # keep other numeric features unchanged
#     ("scaler", StandardScaler())
# ])
preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
    ]
)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train_proc = preprocess.fit_transform(X_train)
X_test_proc = preprocess.transform(X_test)

# 2. Apply SMOTE on preprocessed numeric array
smote = SMOTE(sampling_strategy="not majority", random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_proc, y_train)

In [7]:
# Logistic Regression
log_clf = LogisticRegression(
    multi_class="multinomial",
    max_iter=2000
)
log_clf.fit(X_train_res, y_train_res)

# Random Forest
rf_clf = RandomForestClassifier(
    n_estimators=400,
    random_state=42,
    class_weight="balanced"
)
rf_clf.fit(X_train_res, y_train_res)

# SVM
svm_clf = SVC(
    kernel="rbf",
    probability=True,
    class_weight="balanced"
)
svm_clf.fit(X_train_res, y_train_res)

# kNN
knn_clf = KNeighborsClassifier(
    n_neighbors=7,
    weights="distance"
)
knn_clf.fit(X_train_res, y_train_res)

# Gradient Boost
gb_clf = GradientBoostingClassifier(random_state=42)
gb_clf.fit(X_train_res, y_train_res)



In [8]:
models = {
    "Logistic Regression": log_clf,
    "Random Forest": rf_clf,
    "Gradient Boosting": gb_clf,
    "SVM (RBF)": svm_clf,
    "KNN": knn_clf
}

for name, model in models.items():
    preds = model.predict(X_test_proc)
    print(f"\n=== {name} ===")
    print("Accuracy:", accuracy_score(y_test, preds))
    print("Macro F1:", f1_score(y_test, preds, average="macro"))
    print("Weighted F1:", f1_score(y_test, preds, average="weighted"))
    print(classification_report(y_test, preds))


=== Logistic Regression ===
Accuracy: 0.5648148148148148
Macro F1: 0.566761658886629
Weighted F1: 0.5639858182939776
              precision    recall  f1-score   support

           0       0.53      0.59      0.56        29
           1       0.61      0.51      0.56        53
           2       0.53      0.65      0.59        26

    accuracy                           0.56       108
   macro avg       0.56      0.58      0.57       108
weighted avg       0.57      0.56      0.56       108


=== Random Forest ===
Accuracy: 0.6296296296296297
Macro F1: 0.6209025439227112
Weighted F1: 0.6280066579500913
              precision    recall  f1-score   support

           0       0.65      0.52      0.58        29
           1       0.65      0.66      0.65        53
           2       0.58      0.69      0.63        26

    accuracy                           0.63       108
   macro avg       0.63      0.62      0.62       108
weighted avg       0.63      0.63      0.63       108


=== Gr

In [9]:
# Random Forest Grid
rf_params = {
    "n_estimators": [200, 400],
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2]
}

# Gradient Boosting Grid
gb_params = {
    "n_estimators": [100, 200],
    "learning_rate": [0.05, 0.1],
    "max_depth": [3, 5]
}

# SVM grid
svm_params = {
    "C": [1, 5, 10],
    "gamma": ["scale", 0.1, 0.01]
}

# KNN
knn_param_grid = {
    "n_neighbors": [3, 5, 7, 9, 11, 15, 21],
    # "weights": ["uniform", "distance"],
    "metric": ["euclidean", "manhattan", "minkowski"],
    "p": [1, 2]  # p=1 → manhattan, p=2 → euclidean
}

In [11]:
# Random Forest
rf_grid = GridSearchCV(
    RandomForestClassifier(class_weight="balanced", random_state=42),
    rf_params,
    scoring="f1_macro",
    cv=5,
    n_jobs=-1
)
rf_grid.fit(X_train_res, y_train_res)
best_rf = rf_grid.best_estimator_
print("Best RF:", rf_grid.best_params_)

# Gradient Boosting
gb_grid = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    gb_params,
    scoring="f1_macro",
    cv=5,
    n_jobs=-1
)
gb_grid.fit(X_train_res, y_train_res)
best_gb = gb_grid.best_estimator_
print("Best GB:", gb_grid.best_params_)

# SVM
svm_grid = GridSearchCV(
    SVC(kernel="rbf", probability=True, class_weight="balanced"),
    svm_params,
    scoring="f1_macro",
    cv=5,
    n_jobs=-1
)
svm_grid.fit(X_train_res, y_train_res)
best_svm = svm_grid.best_estimator_
print("Best SVM:", svm_grid.best_params_)

# KNN
knn_grid = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=knn_param_grid,
    scoring="f1_macro",
    cv=5,
    n_jobs=-1
)

knn_grid.fit(X_train_proc, y_train)
best_knn = knn_grid.best_estimator_
print("Best KNN Parameters:", knn_grid.best_params_)


Best RF: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 400}
Best GB: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
Best SVM: {'C': 10, 'gamma': 0.1}
Best KNN Parameters: {'metric': 'euclidean', 'n_neighbors': 5, 'p': 1}


In [18]:
stack = StackingClassifier(
    estimators=[
        ("rf", best_rf),
        # ("knn", best_knn) # KNN makes some classes slightly worse
        # ("gb", best_gb), # GB doesn't improve anything nor decrease
        # ("svm", best_svm) # SVM doesn't work well and reduces the F1 score
    ],
    final_estimator=LogisticRegression(
        max_iter=2000,
        class_weight="balanced",
        multi_class="auto"
    ),
    stack_method="predict_proba",
    n_jobs=-1
)

# Fit on resampled data
stack.fit(X_train_res, y_train_res)



In [19]:
y_pred = stack.predict(X_test_proc)

print("\n=== STACKED MODEL ===")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Macro F1:", f1_score(y_test, y_pred, average="macro"))
print("Weighted F1:", f1_score(y_test, y_pred, average="weighted"))
print(classification_report(y_test, y_pred))


=== STACKED MODEL ===
Accuracy: 0.6666666666666666
Macro F1: 0.6490699615962598
Weighted F1: 0.6613194560397766
              precision    recall  f1-score   support

           0       0.67      0.48      0.56        29
           1       0.67      0.75      0.71        53
           2       0.67      0.69      0.68        26

    accuracy                           0.67       108
   macro avg       0.67      0.64      0.65       108
weighted avg       0.67      0.67      0.66       108



In [None]:
# RF
# === STACKED MODEL ===
# Accuracy: 0.6666666666666666
# Macro F1: 0.6490699615962598
# Weighted F1: 0.6613194560397766
#               precision    recall  f1-score   support

#            0       0.67      0.48      0.56        29
#            1       0.67      0.75      0.71        53
#            2       0.67      0.69      0.68        26

#     accuracy                           0.67       108
#    macro avg       0.67      0.64      0.65       108
# weighted avg       0.67      0.67      0.66       108