# Here we'll train some models!

In [50]:

from __future__ import annotations
import pathlib, yaml
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import sys, pathlib
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
import warnings, pandas as pd
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_validate, KFold



# --- Project paths and configuration ---
# The project root is the parent of the current working directory, so this
# cell assumes the kernel runs from a first-level subfolder of the repository
# (NOTE(review): presumably `notebooks/` -- confirm).
PROJ = pathlib.Path().resolve().parent
CONF_DIR = PROJ / "conf"


def _load_yaml(path):
    """Parse the YAML file at ``path`` and return the resulting Python object.

    The file is opened inside a ``with`` block so the handle is closed as soon
    as parsing finishes, and it is decoded as UTF-8 instead of whatever the
    platform default encoding happens to be.
    """
    with open(path, encoding="utf-8") as fh:
        return yaml.safe_load(fh)


CFG = _load_yaml(CONF_DIR / "config.yaml")
var_types = _load_yaml(CONF_DIR / "variable_types.yaml")
RULES = _load_yaml(CONF_DIR / "cleaning_rules.yaml")

# Data directories are stored in the config relative to the project root.
RAW_DIR = PROJ / CFG["data"]["raw_dir"]
PROC_DIR = PROJ / CFG["data"]["processed_dir"]
numeric_cols = var_types["numeric_variables"]

# Put the project root on sys.path so the local `pipeline` package can be
# imported from inside the notebook.
if str(PROJ) not in sys.path:
    sys.path.insert(0, str(PROJ))

from pipeline.transformers import (MedianImputer, AutoTransform, OneHotEncoder)

# Suppress pandas PerformanceWarning messages.
PerfWarn = pd.errors.PerformanceWarning
warnings.filterwarnings("ignore", category=PerfWarn)

# ---------- Column groups from config ----------
# Same file as `var_types`: reuse the parsed mapping instead of reading and
# parsing variable_types.yaml a second time.
TYPES = var_types

In [51]:
# Load the cleaned dataset from the processed-data directory.
df = pd.read_parquet(PROC_DIR / "cbecs_2018_clean.parquet")

# Keep only the configured column names that are actually present in the frame.
cat_cols = [c for c in TYPES["categorical_variables"] if c in df.columns]
num_cols = [c for c in TYPES["numeric_variables"] if c in df.columns]

# Assign X, y
# NOTE(review): if the target is listed under "numeric_variables" it will also
# be in `num_cols` even though it is dropped from X -- confirm that downstream
# transformers tolerate a configured column that is missing from X.
TARGET = "LOG_MFBTU"
X = df.drop(columns=TARGET)
y = df[TARGET]

# Split the data into train, validation, and test sets (80/10/10).
test_size = 0.10
val_size  = 0.10
rand      = 42

# 1) Hold out the test set first.
X_rest, X_test, y_rest, y_test = train_test_split(
    X, y, test_size=test_size, random_state=rand, shuffle=True
)

# 2) Carve the validation set out of the remainder. `val_size` is a fraction
#    of the full dataset, so rescale it to a fraction of what is left.
val_rel = val_size / (1.0 - test_size)
X_train, X_val, y_train, y_val = train_test_split(
    X_rest, y_rest, test_size=val_rel, random_state=rand, shuffle=True
)

# Sanity check: the three partitions must account for every row exactly once.
check_shape = X_train.shape[0]+X_test.shape[0]+X_val.shape[0] == df.shape[0]

print(f"Split correctly: {check_shape}")
# Fail loudly on Run All instead of relying on someone reading the printed flag.
assert check_shape, "train/val/test row counts do not add up to the full dataset"

Split correctly: True


# Run the linear-model-specific pipeline to finish preparing the data

In [53]:
# Preprocessing steps for the linear models, as (name, transformer) pairs in
# the order they should run. The same four steps are written out again where
# `preproc` is built later in the notebook; keep the two in sync.
#
# NOTE(review): AutoTransform receives `num_cols` (filtered to the columns
# present in df) while OneHotEncoder receives the unfiltered lists from
# `var_types` -- confirm the encoder tolerates configured columns that are
# absent from the data.
base_steps = [
    # Project transformer, configured with the "impute_rules" section of the cleaning rules.
    ("median_imputer", MedianImputer(RULES["impute_rules"])),
    # Project transformer; presumably picks per-column transforms by 5-fold CV R^2 -- TODO confirm.
    ("auto", AutoTransform(scoring="r2", cv=5, n_jobs=-1, num_cols=num_cols, suffix_identity=False)),
    # Project one-hot encoder (not sklearn's), given the categorical and numeric column lists.
    ("ohe", OneHotEncoder(var_types["categorical_variables"], var_types["numeric_variables"])),
    # Scale each feature to unit variance without centering (with_mean=False).
    ("scaler", StandardScaler(with_mean=False))
]


In [55]:
# Pipeline, StandardScaler, numpy and the linear models are already imported in
# the first cell; only cross_val_score is missing from that import block.
from sklearn.model_selection import cross_val_score

# 1) Define the preprocessing pipeline and fit it once on the training split.
preproc = Pipeline(steps=[
    ("median_imputer", MedianImputer(RULES["impute_rules"])),
    ("auto", AutoTransform(scoring="r2", cv=5, n_jobs=-1, num_cols=num_cols, suffix_identity=False)),
    ("ohe", OneHotEncoder(var_types["categorical_variables"], var_types["numeric_variables"])),
    ("scaler", StandardScaler(with_mean=False))
])

X_train_proc = preproc.fit_transform(X_train, y_train)
# The test split is only transformed with the already-fitted pipeline (no refit).
X_test_proc = preproc.transform(X_test)

# NOTE(review): the preprocessing above was fit on all of X_train (and was given
# y_train), and only the final estimator is cross-validated below. Every CV
# validation fold has therefore already influenced the preprocessing, so these
# scores may be optimistic. Cross-validating a single Pipeline of
# preprocessing + model on the raw X_train would remove that leakage, at the
# cost of refitting the preprocessing in every fold.

# 2) Try different linear models on the preprocessed data.
models = {
    "OLS": LinearRegression(),
    # Ridge searches 25 log-spaced penalties from 1e-6 to 1e6 with an inner 5-fold CV.
    "Ridge": RidgeCV(alphas=np.logspace(-6, 6, 25), cv=5),
    "Lasso": LassoCV(cv=5, n_jobs=-1),
    "ElasticNet": ElasticNetCV(cv=5, n_jobs=-1)
}

# Mean 5-fold R^2 per model, kept in a dict so later cells can compare or plot
# the scores instead of re-reading them from printed output.
cv_r2 = {}
for name, m in models.items():
    score = cross_val_score(m, X_train_proc, y_train, cv=5, scoring="r2").mean()
    cv_r2[name] = score
    print(f"{name}: {score:.3f}")

OLS: -826741400936559044198400.000
Ridge: 0.917
Lasso: 0.923
ElasticNet: 0.923


### Quick thoughts about the above:
When I initially fit an unregularized OLS model, cross-validated performance collapsed (R² ≈ −8×10²³). A score this extreme is the signature of a numerically unstable, near-singular design matrix, which points to severe multicollinearity in the expanded feature space — a common issue after one-hot encoding. I therefore fit regularized models (Ridge, Lasso, and ElasticNet), which stabilized the solution and raised cross-validated R² to roughly 0.92 (0.917 for Ridge, 0.923 for Lasso and ElasticNet). I considered manually examining the collinearity of the features and pruning them accordingly but, given the width of this dataset, I opted for regularization instead.