In [26]:
import os
import sqlite3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import subprocess

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FunctionTransformer
from sklearn.metrics import precision_recall_curve, auc, brier_score_loss, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from pathlib import Path

In [27]:
# Make sure we can open the db file
try:
    root = subprocess.check_output(["git", "rev-parse", "--show-toplevel"], text=True).strip()
    os.chdir(root)
except Exception:
    while not (Path.cwd() / "data").exists() and Path.cwd() != Path.home():
        os.chdir("..")

print("Working directory:", Path.cwd())

Working directory: /Users/ecasto/cs230-project


In [28]:
conn = sqlite3.connect("data/routes_scores.db")
df = pd.read_sql("SELECT * FROM routes;", conn)

In [29]:
df["county_count"] = df["counties"].str.count(",").fillna(0).astype(int) + 1

numeric_features = ["county_count"]
text_feature = "counties"
target = "impacting_delivery"

df[text_feature] = df[text_feature].fillna("")

print("Label distribution:\n", df[target].value_counts(normalize=True))

Label distribution:
 impacting_delivery
0    0.76505
1    0.23495
Name: proportion, dtype: float64


In [30]:
# ---------------------------------------------------------------
# Split data (70/15/15)
# ---------------------------------------------------------------
train, test = train_test_split(df, test_size=0.15, random_state=42, stratify=df[target])
train, val  = train_test_split(train, test_size=0.1765, random_state=42, stratify=train[target])

X_train, y_train = train[[text_feature] + numeric_features], train[target]
X_val,   y_val   = val[[text_feature] + numeric_features],   val[target]
X_test,  y_test  = test[[text_feature] + numeric_features],  test[target]

In [None]:
# ---------------------------------------------------------------
# Preprocessing
# ---------------------------------------------------------------

to_1d = FunctionTransformer(lambda x: x.squeeze(), validate=False)

county_bow = Pipeline(steps=[
    ("to_1d", to_1d),
    ("vec", CountVectorizer(
        tokenizer=lambda s: [t.strip() for t in s.split(",") if t.strip()],
        lowercase=False,
        min_df=5
    ))
])

numeric_pipe = Pipeline(steps=[("scaler", StandardScaler())])

preprocessor = ColumnTransformer(transformers=[
    ("county_bow", county_bow, [text_feature]),
    ("num", numeric_pipe, numeric_features),
])

In [32]:
# ------------------------------
# Random Forest model
# ------------------------------
rf = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", RandomForestClassifier(
        n_estimators=500,
        max_depth=None,
        min_samples_leaf=1,
        class_weight="balanced",
        n_jobs=-1,
        random_state=42,
    )),
])

rf.fit(X_train, y_train)



0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('county_bow', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,func,<function <la...t 0x1523e6d40>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,False
,preprocessor,
,tokenizer,<function <la...t 0x1523e7b00>
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,500
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [33]:
val_probs = rf.predict_proba(X_val)[:, 1]
prec_v, rec_v, thr_v = precision_recall_curve(y_val, val_probs)

# Choose threshold that maximizes F1 on val
f1_v = (2 * prec_v * rec_v) / (prec_v + rec_v + 1e-12)
best_idx = np.nanargmax(f1_v)
best_thr = thr_v[best_idx] if best_idx < len(thr_v) else 0.5
print(f"Chosen threshold from val (F1-optimal): {best_thr:.3f}")

Chosen threshold from val (F1-optimal): 0.404


In [34]:
# ------------------------------
# Evaluate performance
# ------------------------------
probs = rf.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, probs)

auc_pr = auc(recall, precision)
brier  = brier_score_loss(y_test, probs)

print("AUPRC:", auc_pr)
print("Brier Score:", brier)
print("\nClassification Report:\n")
print(classification_report(y_test, rf.predict(X_test)))

os.makedirs("results", exist_ok=True)
plt.plot(recall, precision)
plt.title(f"Precision-Recall Curve (RF, AUC={auc_pr:.3f})")
plt.xlabel("Recall"); plt.ylabel("Precision")
plt.savefig("results/pr_curve_random_forest.png", dpi=150)
plt.close()

AUPRC: 0.8970652103196126
Brier Score: 0.07120121866666666

Classification Report:

              precision    recall  f1-score   support

           0       0.93      0.96      0.94      2295
           1       0.84      0.76      0.80       705

    accuracy                           0.91      3000
   macro avg       0.88      0.86      0.87      3000
weighted avg       0.91      0.91      0.91      3000



In [None]:
# ------------------------------
# Feature importance
# ------------------------------
pre = rf.named_steps["preprocess"]
vec = pre.named_transformers_["county_bow"].named_steps["vec"]

bow_names = vec.get_feature_names_out()
num_names = np.array(["county_count"])

feat_names = np.concatenate([
    [f"county:{t}" for t in bow_names],
    [f"num:{n}" for n in num_names],
])

importances = rf.named_steps["model"].feature_importances_

imp_df = pd.DataFrame(
    {"feature": feat_names, "importance": importances}
).sort_values("importance", ascending=False)

imp_df.to_csv("results/rf_feature_importances.csv", index=False)
imp_df.head(15)


Unnamed: 0,feature,importance
178,county:SAN BERNARDINO,0.054176
83,county:KERN,0.034005
182,county:SAN DIEGO,0.033845
170,county:RIVERSIDE,0.02842
312,num:county_count,0.028366
78,county:INYO,0.028008
60,county:FRESNO,0.026817
272,county:TULARE,0.026515
233,county:SONOMA,0.022912
94,county:LOS ANGELES,0.02163


In [36]:
pd.DataFrame([{"AUPRC": auc_pr, "brier_score": brier}]).to_csv(
    "results/random_forest_metrics.csv", index=False
)

print("✅ Random Forest run complete. Artifacts written to results/")

✅ Random Forest run complete. Artifacts written to results/


In [37]:
conn.close()