In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import seaborn as sns
from lightgbm import LGBMRegressor
from sklearn.ensemble import (
    BaggingRegressor,
    ExtraTreesRegressor,
    RandomForestRegressor,
)
from sklearn.linear_model import ElasticNetCV, LassoCV, SGDRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from xgboost import XGBRegressor

from data_utilities import get_clean_data, get_raw_data, get_split

# Try out a few models
# Candidate regressors keyed by the label that the training loop prints.
# Lasso, elastic net, SVR and SGD are placed behind a StandardScaler; the
# tree ensembles and gradient boosters receive the features unchanged.
SEED = 57


def _scaled(estimator):
    """Return a pipeline that runs a fresh StandardScaler before `estimator`."""
    return make_pipeline(StandardScaler(), estimator)


models = {
    "lasso": _scaled(LassoCV(random_state=SEED)),
    "elastic net": _scaled(ElasticNetCV(random_state=SEED)),
    "support vector machine": _scaled(SVR()),
    "sgd": _scaled(SGDRegressor(random_state=SEED)),
    "random forest": RandomForestRegressor(random_state=SEED),
    "bagging": BaggingRegressor(random_state=SEED),
    "extra trees": ExtraTreesRegressor(random_state=SEED),
    "lgbm": LGBMRegressor(random_state=SEED, verbose=-1),
    # NOTE(review): "xbg" looks like a typo for "xgb"; the key is kept as-is
    # because it is the label printed while training.
    "xbg": XGBRegressor(random_state=SEED),
}

In [2]:
# Fit every candidate on the same null-free split and print its MAE on the
# held-out part. The split does not depend on the model, so it is requested
# once up front instead of once per loop iteration.
X_train, y_train, X_test, y_test = get_split(remove_all_nulls=True)

# NOTE(review): the MAE below is computed on X_test/y_test; if these numbers
# are used to pick a winner, model selection is presumably touching the test
# split. Consider ranking candidates with cross-validation on the training
# data instead.
trained_models = []
for name, model in models.items():
    print(f"Training model: {name}")
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    score = mean_absolute_error(y_test, prediction)
    print(f"Finished training, MAE: {score:.2f}")
    # Kept in fit order; these are the same objects stored in `models`.
    trained_models.append(model)

Loading raw dataset
Training model: lasso


  model = cd_fast.enet_coordinate_descent(


Finished training, MAE: 50.18
Training model: elastic net
Finished training, MAE: 50.45
Training model: support vector machine
Finished training, MAE: 49.94
Training model: sgd
Finished training, MAE: 3004728183.75
Training model: random forest
Finished training, MAE: 46.97
Training model: bagging
Finished training, MAE: 49.02
Training model: extra trees
Finished training, MAE: 47.63
Training model: lgbm
Finished training, MAE: 45.30
Training model: xbg
Finished training, MAE: 45.36


In [3]:
# Tune the best model a bit

# LGBM can deal with nulls, so the split is requested with
# remove_all_nulls=False here. This rebinds the notebook-level
# X_train / y_train / X_test / y_test names that were set by the model
# comparison loop; test_lgbm below reads X_train and y_train as globals.
X_train, y_train, X_test, y_test = get_split(remove_all_nulls=False)


def test_lgbm(**params):
    """Fit an LGBMRegressor with `params` and print train and CV MAE.

    Reads the notebook-level `X_train` and `y_train`. `verbose` is always
    forced to -1 and `random_state` to 57, overriding any value passed in.

    Prints one line with:
      * the MAE of the fitted model on the data it was trained on,
      * the mean MAE from `cross_val_score` (default fold setting) on the
        same training data,
      * the full parameter dict that was used.

    Returns None.
    """
    # Fixed settings are merged into the caller's kwargs so they show up in
    # the printed parameter dict as well.
    params.update(verbose=-1, random_state=57)

    lgbm = LGBMRegressor(**params)
    lgbm.fit(X_train, y_train)
    train_score = mean_absolute_error(y_train, lgbm.predict(X_train))

    # The scorer returns negated MAE (higher is better), so flip the sign of
    # the fold average to get a plain MAE.
    fold_scores = cross_val_score(
        lgbm, X_train, y_train, scoring="neg_mean_absolute_error"
    )
    cv_score = -fold_scores.mean()

    print(f"Train score {train_score}. CV score {cv_score}. Params {params}")

Loading raw dataset


In [4]:
# Sweep n_estimators with every other setting left at the LGBMRegressor default.
for n_estimators in (100, 300, 500, 700):
    test_lgbm(n_estimators=n_estimators)

Train score 43.487531918126. CV score 45.49206620041728. Params {'n_estimators': 100, 'verbose': -1, 'random_state': 57}
Train score 40.568781117676245. CV score 45.24527945150708. Params {'n_estimators': 300, 'verbose': -1, 'random_state': 57}
Train score 38.31883726280727. CV score 45.299863319913975. Params {'n_estimators': 500, 'verbose': -1, 'random_state': 57}
Train score 36.327306397487575. CV score 45.37926359268093. Params {'n_estimators': 700, 'verbose': -1, 'random_state': 57}


In [5]:
# Hold n_estimators at 500 and step the learning rate down from 0.1 to 0.04.
for learning_rate in (0.1, 0.08, 0.06, 0.04):
    test_lgbm(n_estimators=500, learning_rate=learning_rate)

Train score 38.31883726280727. CV score 45.299863319913975. Params {'n_estimators': 500, 'learning_rate': 0.1, 'verbose': -1, 'random_state': 57}
Train score 39.31234122951901. CV score 45.27662095210038. Params {'n_estimators': 500, 'learning_rate': 0.08, 'verbose': -1, 'random_state': 57}
Train score 40.538392745666165. CV score 45.21862648141531. Params {'n_estimators': 500, 'learning_rate': 0.06, 'verbose': -1, 'random_state': 57}
Train score 41.84434827302484. CV score 45.23014727932341. Params {'n_estimators': 500, 'learning_rate': 0.04, 'verbose': -1, 'random_state': 57}


In [13]:
# With the learning rate fixed at 0.04, try larger n_estimators values.
for n_estimators in (700, 1000):
    test_lgbm(n_estimators=n_estimators, learning_rate=0.04)

Train score 40.75791319257313. CV score 45.18474642449393. Params {'n_estimators': 700, 'learning_rate': 0.04, 'verbose': -1, 'random_state': 57}
Train score 39.32810645373689. CV score 45.206251481761356. Params {'n_estimators': 1000, 'learning_rate': 0.04, 'verbose': -1, 'random_state': 57}


In [14]:
# Check whether an even lower learning rate (0.03) helps at n_estimators=1000.
test_lgbm(n_estimators=1000, learning_rate=0.03)

Train score 40.53235459207658. CV score 45.206404664756704. Params {'n_estimators': 1000, 'learning_rate': 0.03, 'verbose': -1, 'random_state': 57}


In [15]:
# Test score of tuned model
# n_estimators=700 with learning_rate=0.04 gave the lowest CV score among the
# combinations tried above, so that setting is refit on the training split
# and scored once on the held-out split.
lgbm = LGBMRegressor(
    n_estimators=700,
    learning_rate=0.04,
    random_state=57,
    verbose=-1,
).fit(X_train, y_train)
prediction = lgbm.predict(X_test)
# Bare last expression so the notebook displays the MAE as the cell result.
mean_absolute_error(y_test, prediction)

44.4533196550708