In [6]:
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    mean_squared_error,
    r2_score,
    roc_auc_score, 
    roc_curve, 
    precision_recall_curve
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the preprocessed modelling table; the path is relative to the notebook's
# working directory.
model_ready_csv = "../data/processed/model_ready.csv"
df = pd.read_csv(model_ready_csv)

In [7]:
# Baseline model: ordinary least-squares regression of `final_four` on a small
# set of standardized numeric features, evaluated on a held-out split and then
# pickled together with its scaler.
# NOTE(review): `final_four` reads like a 0/1 indicator -- confirm; if it is, a
# classifier would likely be a better fit than a linear regression.

# Choose target and numeric features
target = "final_four"
features = ["adj_o", "adj_d", "wab", "ov_cur_sos", "season", "seed_filled"]

# Split settings, named so they can be tuned in one place
TEST_SIZE = 0.2
RANDOM_STATE = 42

# Prepare data: keep only rows that have a value for every feature and the target
data = df.dropna(subset=features + [target])
if data.empty:
    # Fail here with a clear message instead of deep inside train_test_split.
    raise ValueError(
        "No rows left after dropping missing values in: "
        + ", ".join(features + [target])
    )
X = data[features]
y = data[target]

# Train/test split (seeded so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Scale features: statistics are fit on the training split only and then
# reused for the test split, so the test data does not influence the scaling.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit linear regression
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)

# Predictions & metrics on the held-out split
y_pred = lr.predict(X_test_scaled)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.4f}")
print(f"R^2: {r2:.4f}")

# Coefficients are on the standardized feature scale, because the model was
# fit on the scaled inputs; they line up with `features` by position.
coef_df = pd.DataFrame({"feature": features, "coefficient": lr.coef_})
print(coef_df)

# Save model and scaler: the model was trained on scaled inputs, so new data
# must go through this same scaler before calling predict.
with open("linreg_model.pkl", "wb") as f:
    pickle.dump(lr, f)
with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

RMSE: 0.1108
R^2: 0.1262
       feature  coefficient
0        adj_o     0.017084
1        adj_d    -0.012730
2          wab    -0.020124
3   ov_cur_sos    -0.006831
4       season    -0.001610
5  seed_filled    -0.033465
