# Test XGBoost, LGB, blending

In [1]:
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from collections import Counter
import mlflow
import sklearn
from bayes_opt import BayesianOptimization

# local imports
from prepare import *
from evaluate import *

### Set new experiment to keep track in mlflow

In [2]:
mlflow.set_experiment('Regressors with cv full features')
#mlflow.start_run(run_name='run of multiple regressors', nested=True)

### Read in initial datasets if needed

In [3]:
#raw_train, raw_train_labels, raw_test, specs, sample = read_raw_csvs()
#raw_train_labels = pd.read_csv('data/train_labels.csv')

### Load large train/test features from Josh's work

In [5]:
# Try out balanced classes
reduced_train = pd.read_csv('reduced_train_balanced.csv')

#reduced_train = pd.read_csv('reduce_train.csv')
reduced_test = pd.read_csv('reduce_test.csv')
reduced_train.shape, reduced_test.shape

categoricals = ['session_title']
cols_to_drop = ['game_session', 'installation_id', 'accuracy_group']

features = joblib.load('features.pkl')

# Regressors 

In [6]:
from sklearn.ensemble import (
    RandomForestRegressor,
    AdaBoostRegressor,
    GradientBoostingRegressor,
    StackingRegressor,
    VotingRegressor,
    BaggingRegressor,
)

from sklearn.linear_model import (
    HuberRegressor,
    ARDRegression,
    ModifiedHuber,
    PassiveAggressiveRegressor,
    SGDRegressor,
)

from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

  return _load(spec)


In [7]:
rfr = RandomForestRegressor()
abr = AdaBoostRegressor()
lgbm = LGBMRegressor()
gbr = GradientBoostingRegressor()
hr = HuberRegressor(max_iter=300)
sgd = SGDRegressor()
et = ExtraTreeRegressor()
cbr = CatBoostRegressor( loss_function='RMSE',
    task_type="CPU",
    learning_rate=0.05,
    iterations=2000,
    od_type="Iter",
    early_stopping_rounds=500,
    random_seed=42)

estimators = [
    ('RFR', rfr),
    ('ABR', abr),
    ('LGBM', lgbm),
    ('GBR', gbr),
    ('HR', hr),
    ('SGD', sgd),
    ('ET', et),
    ('CBR', cbr)
]

In [8]:
for i in estimators:
    print(i[0])
    cv_reg(i[1], reduced_train, n_splits=4)

RFR


KeyError: "['game_session' 'installation_id'] not found in axis"

In [75]:
bag_reg = BaggingRegressor(base_estimator=lgbm,
                      n_estimators=10, random_state=42)
bag_reg = quick_eval(reduced_train, bag_reg, pc=True)

The accuracy of BaggingRegressor is 0.5720746184284907
The QWK of BaggingRegressor is 0.5970717147236163


## Tune LGBM

## Vote, stacked, bagged regressors

In [40]:
vote_regressor = VotingRegressor(estimators=estimators,
                                n_jobs=-1)
vote_regressor = quick_eval(reduced_train, vote_regressor, pc=True)

The accuracy of VotingRegressor is 0.5712266817410967
The QWK of VotingRegressor is 0.5915220967303223


In [53]:
stacked_regressor = StackingRegressor(estimators=estimators,
                                n_jobs=-1)
stacked_regressor = quick_eval(reduced_train, stacked_regressor, pc=True)

The accuracy of StackingRegressor is 0.5672696438665913
The QWK of StackingRegressor is 0.5979707068817519


In [55]:
bagged_regressor = BaggingRegressor(base_estimator=lgbm,
                               n_jobs=-1,
                                   warm_start=True)
quick_eval(reduced_train, bagged_regressor, pc=True)

The accuracy of BaggingRegressor is 0.5746184284906727
The QWK of BaggingRegressor is 0.5958131257022264


BaggingRegressor(base_estimator=LGBMRegressor(boosting_type='gbdt',
                                              class_weight=None,
                                              colsample_bytree=1.0,
                                              importance_type='split',
                                              learning_rate=0.1, max_depth=-1,
                                              min_child_samples=20,
                                              min_child_weight=0.001,
                                              min_split_gain=0.0,
                                              n_estimators=100, n_jobs=-1,
                                              num_leaves=31, objective=None,
                                              random_state=None, reg_alpha=0.0,
                                              reg_lambda=0.0, silent=True,
                                              subsample=1.0,
                                              subsample_for_bin=200000,
     

##### Grid Search/Random Search

In [39]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold

In [69]:
#grid of parameters
gridParams = {
    'learning_rate': [0.05],
    'num_leaves': [31,90,200],
    'boosting_type' : ['gbdt','dart','rf'],
    'objective' : ['regression'],
    'max_depth' : [5,6,7,8],
    'random_state' : [42], 
    'colsample_bytree' : [0.3,0.5,0.7],
    'subsample' : [0.3,0.5,0.7],
    'min_split_gain' : [0.01],
    'min_data_in_leaf':[10],
    'metric':['rmse']
    }
#modelling
lgbm = LGBMRegressor()
grid = RandomizedSearchCV(lgbm,gridParams,cv=10,n_jobs = -1,n_iter=10)
X = reduced_train.drop(cols_to_drop, axis=1)[features]
y = reduced_train.accuracy_group

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
        
grid.fit(X_train,y_train)

grid.best_params_
lgbm = grid.best_estimator_.fit(X_train, y_train)
y_pred = get_class_pred(
    grid.predict(X_test),
    reduced_train)
accuracy = accuracy_score(y_test, y_pred)
qwk = cohen_kappa_score(y_test, y_pred, weights="quadratic")
mlflow.log_param("features_shape", X.shape)
mlflow.log_param("estimator", 'xgb')
mlflow.log_metric("Accuracy", accuracy)
mlflow.log_metric("QWK", qwk)
print(accuracy, qwk)

0.5712266817410967 0.5988696990398876


## Manual crossval

In [47]:
kf = KFold()



## XGBoost

In [63]:
mlflow.start_run(run_name='xgb-tuned-params', nested=True)


import xgboost as xgb
from sklearn import datasets
from sklearn.model_selection import train_test_split


X = reduced_train.drop(cols_to_drop, axis=1)[features]
y = reduced_train.accuracy_group

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

dtrain = xgb.DMatrix(X_train, label=y_train)
params = {
            'colsample_bytree': 0.2,                 
            'learning_rate': 0.01,
            'objective':'reg:squarederror',
            'max_depth': 6,
            'subsample': 1,
            'min_child_weight': 3,
            'gamma': 0.25,
            'n_estimators': 1400
         }
xgb_model = xgb.train(params=params, dtrain=dtrain, num_boost_round=10)

xgb_model_path = "xgb_model.pth"
xgb_model.save_model(xgb_model_path)

dtest = xgb.DMatrix(X_test, label=y_test)

print('through the test')
y_pred = xgb_model.predict(dtest)

y_pred = get_class_pred(y_pred,reduced_train)

print('through the test')

accuracy = accuracy_score(y_test, y_pred)
qwk = cohen_kappa_score(y_test, y_pred, weights="quadratic")
mlflow.log_param("features_shape", X.shape)
mlflow.log_param("estimator", 'xgb')
mlflow.log_metric("Accuracy", accuracy)
mlflow.log_metric("QWK", qwk)
mlflow.sklearn.log_model(xgb_model, "model")
mlflow.end_run()
print('complete')

through the test
through the test
complete
