In [1]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import (mean_squared_error, mean_absolute_error, r2_score)
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor


import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [None]:
# Load the preprocessed dataset from a pickle file (path is relative to the working directory).
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
df_raw = pd.read_pickle("data/preprocessed_data.pkl")

In [None]:
# Preview the first rows of the loaded frame.
df_raw.head()

In [None]:
# High-level correlation heatmap of the raw frame.
import seaborn as sns

# Correlate numeric/boolean columns only: since pandas 2.0 DataFrame.corr() no longer
# drops non-numeric columns silently and raises when it meets them.
corr = df_raw.select_dtypes(include=["number", "bool"]).corr()
f, ax = plt.subplots(figsize=(100, 100))
# Hide the upper triangle and the diagonal so each pair is shown once.
mask = np.triu(np.ones_like(corr, dtype=bool))
# Draw on the axes created above instead of relying on the implicit current axes.
sns.heatmap(corr, annot=True, mask=mask, ax=ax)
plt.show()

In [None]:
def identify_correlated(df, threshold):
    """Return the columns to drop so that no remaining pair is too correlated.

    Only numeric and boolean columns take part in the correlation; any other
    column is ignored and never reported. For every pair of columns whose
    absolute correlation is strictly greater than ``threshold``, the column
    that appears first in ``df`` is reported.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are compared pairwise.
    threshold : float
        Absolute-correlation cut-off; a pair is flagged when its value exceeds it.

    Returns
    -------
    list
        Column labels to drop, in the order they appear in ``df``.
    """
    # Restrict to numeric/boolean data: DataFrame.corr() raises on string columns
    # in pandas >= 2.0 instead of silently skipping them.
    numeric = df.select_dtypes(include=["number", "bool"])
    if numeric.shape[1] < 2:
        # No pair of columns exists, so nothing can be correlated.
        return []

    corr_matrix = numeric.corr().abs()
    # Blank out the upper triangle and the diagonal so each pair is inspected once
    # and a column is never compared with itself.
    upper_and_diagonal = np.triu(np.ones(corr_matrix.shape, dtype=bool))
    lower_triangle = corr_matrix.mask(upper_and_diagonal)
    # NaN entries compare as False, so masked cells never flag a column.
    exceeds = (lower_triangle > threshold).any(axis=0)
    return [column for column, flagged in exceeds.items() if flagged]

In [None]:
# Filter mutually correlated features only. The target ("los") is kept out of the
# comparison: otherwise the filter could drop the target itself, or drop a feature
# precisely because it correlates strongly with the target.
to_drop = identify_correlated(df_raw.drop(columns=["los"]), threshold=.5)

In [None]:
# Show which columns the correlation filter selected for removal.
print(to_drop)

In [None]:
# Remove the columns flagged by the correlation filter.
df_raw = df_raw.drop(columns=to_drop)

In [None]:
# List the remaining column names, one per line.
for col in df_raw.columns:
    print(col)

In [None]:
# TODO: decide how to preprocess these two fields; until then they are excluded.
df_raw = df_raw.drop(columns=['city', 'state'])

In [None]:
# Target vector.
y = df_raw["los"]

# Every column except the target is used as a feature.
features = [col for col in df_raw.columns if col != "los"]

X = df_raw[features]


In [None]:
# Preview the feature matrix.
X.head()

In [None]:
# Second, stricter correlation filter applied to the feature matrix only (threshold 0.2).
to_drop = identify_correlated(X, threshold=.2)

In [None]:
# Remove the features flagged by the stricter correlation filter.
X = X.drop(columns=to_drop)

In [None]:
# Lift pandas' display truncation for both rows and columns.
for option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option, None)

In [None]:
# for col in df_raw.columns:
#     print(col)

In [None]:
# Inspect the dtype of every remaining column.
df_raw.dtypes

In [None]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=312,
)

In [None]:
# # set up hyperparameter search space for the random forest regressor
# random_grid = {
#     "bootstrap": [True, False],
#     "max_depth": [int(x) for x in np.linspace(10, 110, num=11)],
#     "max_features": ["auto", "sqrt", "log2"],
#     # "max_leaf_nodes": None,
#     # "max_samples": None,
#     # "min_impurity_decrease": 0.0,
#     "min_samples_leaf": [1, 2, 4, 8],
#     "min_samples_split": [2, 4, 8, 16],
#     # "min_weight_fraction_leaf": 0.0,
#     "n_estimators": [int(x) for x in np.linspace(start=200, stop=2000, num=10)],
# }

In [None]:
# Candidate regressors compared with their default hyperparameters.
# Every estimator that accepts a seed gets one so the comparison is reproducible
# (DecisionTreeRegressor was previously unseeded).
# XGBRegressor and LGBMRegressor stay disabled: per the original note, the feature
# names need to be changed because of symbols before they can be used.
models = [
    SGDRegressor(random_state=0),
    GradientBoostingRegressor(random_state=0),
    LinearRegression(),
    DecisionTreeRegressor(random_state=0),
    RandomForestRegressor(random_state=0),
    # XGBRegressor(),
    # LGBMRegressor(),
]

In [None]:
# Fit every candidate on the training split and record its test-set R-squared.
results = {}

for estimator in models:
    # The instance stored in `models` is fitted in place.
    regr = estimator
    regr.fit(X_train, y_train)

    y_test_pred = regr.predict(X_test)

    # Label the score with the text before the first "(" of the estimator's repr.
    name = str(estimator).split("(")[0]
    score = r2_score(y_test, y_test_pred)
    results[name] = score
    print('{} done. R-squared: {:.2f}'.format(name, score))



In [None]:
# Randomized hyperparameter search for the random forest regressor.
# The search is defined and run in this cell because the following cells read
# `regr_random`; with the search commented out they fail with a NameError on a
# fresh kernel.
# NOTE: 100 sampled candidates x 5 folds is expensive; lower `n_iter` for a quick run.
random_grid = {
    "bootstrap": [True, False],
    "max_depth": [int(x) for x in np.linspace(10, 110, num=11)],
    # None means "use all features", which is what "auto" meant for regressors;
    # "auto" itself is no longer accepted by recent scikit-learn releases.
    "max_features": [None, "sqrt", "log2"],
    "min_samples_leaf": [1, 2, 4, 8],
    "min_samples_split": [2, 4, 8, 16],
    "n_estimators": [int(x) for x in np.linspace(start=200, stop=2000, num=10)],
}

# The default criterion is squared error; the old "mse" alias has been removed
# from scikit-learn, so it is not passed explicitly.
regr = RandomForestRegressor(max_depth=5, random_state=312)
regr_random = RandomizedSearchCV(
    estimator=regr,
    param_distributions=random_grid,
    n_iter=100,
    cv=5,
    random_state=312,
    n_jobs=-1,
)

regr_random.fit(X_train, y_train)

In [None]:
# Print out the selected parameters.
# Requires `regr_random` to be a fitted search object from an earlier cell.
best_params = regr_random.best_params_

# best_score_ is the search's best cross-validated score.
print(regr_random.best_score_)
print(best_params)

In [None]:
# Build a fresh forest, apply the tuned hyperparameters, then fit on the training split.
regr = RandomForestRegressor(n_jobs=-1, random_state=312)
regr.set_params(**best_params)
regr.fit(X_train, y_train)

In [None]:
# Predict on the held-out test features with the retrained model.
yhat = regr.predict(X_test)

In [None]:
# Scatter of predicted vs. actual values, with a dashed identity line as reference.
fig, ax = plt.subplots()
ax.scatter(yhat, y_test, edgecolors=(0, 0, 1))

# The reference line spans the observed range of the actual values.
actual_range = [y_test.min(), y_test.max()]
ax.plot(actual_range, actual_range, 'r--', lw=3)

ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

In [None]:
# Pair each importance value with the feature name the fitted model recorded.
feature_importances = pd.Series(regr.feature_importances_, index=regr.feature_names_in_)

In [None]:
# Show the 20 features with the highest importance values.
display(feature_importances.sort_values(ascending=False).head(20))

In [None]:
# Evaluate the retrained random forest on the test split.
metrics_rfr = {
    "mse": mean_squared_error(y_test, yhat),
    "mae": mean_absolute_error(y_test, yhat),
    "r2": r2_score(y_test, yhat),
}

# Evaluate a baseline that always guesses the training-set mean.
# The baseline gets its own variable: reusing `yhat` would overwrite the model
# predictions, so re-running this cell (or the plot cell) would silently score
# or plot the baseline instead of the model.
yhat_baseline = np.full(y_test.shape, y_train.mean())

metrics_baseline = {
    "mse": mean_squared_error(y_test, yhat_baseline),
    "mae": mean_absolute_error(y_test, yhat_baseline),
    "r2": r2_score(y_test, yhat_baseline),
}

In [None]:
# One column per model, one row per metric.
df_metrics = pd.DataFrame(
    {
        "Baseline": metrics_baseline,
        "RandomForestRegressor": metrics_rfr,
    }
)

In [None]:
# Render the metric comparison table.
display(df_metrics)

### XGBoost

In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
# Search space for the XGBoost regressor: three candidate values for each of the
# five hyperparameters (243 combinations in total).
param_grid = dict(
    n_estimators=[100, 500, 1000],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.5],
    subsample=[0.5, 0.75, 1.0],
    colsample_bytree=[0.5, 0.75, 1.0],
)

In [None]:
def _xgb_safe_name(col):
    """Return the column label as a string with "[", "]" and "<" replaced by "_"."""
    return str(col).translate(str.maketrans({"[": "_", "]": "_", "<": "_"}))


# Exhaustive grid search over `param_grid` with 5-fold cross-validation.
regr = XGBRegressor(random_state=312, objective="reg:squarederror")
regr_cv = GridSearchCV(estimator=regr, param_grid=param_grid, cv=5, n_jobs=-1)

# XGBoost rejects feature names containing "[", "]" or "<" (the "<" case was
# previously not handled). Train and test frames are renamed together so their
# column names can never drift apart; renaming again later is a harmless no-op.
X_train.columns = [_xgb_safe_name(col) for col in X_train.columns]
X_test.columns = [_xgb_safe_name(col) for col in X_test.columns]

regr_cv.fit(X_train, y_train)

In [None]:
# Print out the parameters selected by the grid search.
best_params = regr_cv.best_params_
# best_score_ is the search's best cross-validated score.
print(f"Best score: {regr_cv.best_score_}")
print(f"Best parameters: {best_params}")

In [None]:
# Build a fresh XGBoost regressor, apply the tuned hyperparameters, then fit on the training split.
regr = XGBRegressor(random_state=312, objective="reg:squarederror")
regr.set_params(**best_params)
regr.fit(X_train, y_train)

In [None]:
# Give the test columns the same bracket-free names used for training, then predict.
sanitized_test_columns = []
for column_name in X_test.columns:
    without_open = column_name.replace("[", "_")
    sanitized_test_columns.append(without_open.replace("]", "_"))
X_test.columns = sanitized_test_columns

y_pred = regr.predict(X_test)

In [None]:
# Rank the fitted model's feature importances and print the 20 highest.
feature_importances = pd.Series(regr.feature_importances_, index=X_train.columns)
ranked_importances = feature_importances.sort_values(ascending=False)
print("Top 20 most important features:")
print(ranked_importances.head(20))

In [None]:
# Score the XGBoost predictions against the test targets.
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

metrics_xgb = dict(zip(("mse", "mae", "r2"), (mse, mae, r2)))

In [None]:
# Baseline: predict the training-set mean for every test row.
y_mean = y_train.mean()
y_pred_baseline = np.full(y_test.shape, y_mean)

mse_baseline, mae_baseline, r2_baseline = (
    metric(y_test, y_pred_baseline)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

metrics_baseline = {"mse": mse_baseline, "mae": mae_baseline, "r2": r2_baseline}

# One column per model, one row per metric.
df_metrics = pd.DataFrame(
    {"Baseline": metrics_baseline, "XGBRegressor": metrics_xgb}
)
print(df_metrics)