In [1]:
import sys
sys.path.insert(0, '../utils/')

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

import pandas as pd
import numpy as np

import util_michael as um
from util import *

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import xgboost as xgb

In [2]:
train = pd.read_csv('./data/fill_train.csv')
test = pd.read_csv('./data/fill_test.csv')
Ylabel = 'College_Persistence_Rate'

Y = train[Ylabel]
train.drop(columns=[Ylabel], inplace=True)
train.shape, test.shape, Y.shape

((69, 166), (38, 166), (69,))

In [7]:
X = train.drop(columns=['rowid', 'unique_id'])
Xt = test.drop(columns=['rowid', 'unique_id'])

X_scaled, Xt_scaled = um.apply_scale(X, Xt)

In [62]:
class EstimatorContainer():
    def __init__(self, metric: str=''):
        self.metric = metric
        self.reset()
        
    def save(self, obj: dict) -> None:
        name = type(obj['estimator']).__name__
        if name in self.names:
            ix = self.names.index(name)
            self.names.pop(ix)
            self.estimators.pop(ix)
            self.Xs.pop(ix)
            self.scores.pop(ix)
        self.names.append(name)
        self.estimators.append(obj['estimator'])
        self.Xs.append(obj['X'])
        self.scores.append(obj['score'])
        
    def get_df(self) -> pd.DataFrame:
        self.df = pd.DataFrame()
        self.df['Estimator'] = self.names
        self.df[self.metric] = self.scores
        return self.df
        
    def reset(self) -> None:
        self.names = []
        self.estimators = []
        self.Xs = []
        self.scores = []
        
estimators = EstimatorContainer('MSE')

# RandomForest

In [63]:
from sklearn.ensemble import RandomForestRegressor
param_grid = [
    { 'n_estimators': [250, 300] }
]
model = RandomForestRegressor()
grid_search = GridSearchCV(model, param_grid, cv=10, scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(X, Y)
show_grid_results(grid_search)



Best parameters:
 {'n_estimators': 250} 
 -82.8673466272465 

-82.8673466272465 {'n_estimators': 250}
-85.46663511111129 {'n_estimators': 300}


In [43]:
# evaluate_regressor(grid_search.best_estimator_, X, Y, 'RandomForest')

In [64]:
aux = {
    'estimator': grid_search.best_estimator_,
    'X': X,
    'score': grid_search.best_score_
}
estimators.save(aux)

# AdaBoost Regressor

In [66]:
from sklearn.ensemble import AdaBoostRegressor
param_grid = [
    { 'n_estimators': [350, 400] }
]
model = AdaBoostRegressor()
grid_search = GridSearchCV(model, param_grid, cv=10, scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(X, Y)
show_grid_results(grid_search)



Best parameters:
 {'n_estimators': 350} 
 -90.22177568284954 

-90.22177568284954 {'n_estimators': 350}
-93.73312533942473 {'n_estimators': 400}


In [67]:
# evaluate_regressor(grid_search.best_estimator_, X, Y, 'RandomForest', 10)

In [68]:
aux = {
    'estimator': grid_search.best_estimator_,
    'X': X,
    'score': grid_search.best_score_
}
estimators.save(aux)

# XGBoost

In [69]:
xgb.XGBRegressor()

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [70]:
param_grid = [
#     { 'n_estimators': [55], 'objective': ['reg:squarederror'] }
#         { 'n_estimators': [50, 55, 60], 'colsample_bytree': [0.2], 'objective': ['reg:squarederror'] }
    { 'n_estimators': [50, 55, 60],  'eta': [0.2, 0.3], 
      'colsample_bytree': [0.15, 0.2, 0.25], 'objective': ['reg:squarederror'] }
]
model = xgb.XGBRegressor()
grid_search = GridSearchCV(model, param_grid, cv=10, scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(X, Y)
show_grid_results(grid_search)

Best parameters:
 {'colsample_bytree': 0.2, 'eta': 0.2, 'n_estimators': 55, 'objective': 'reg:squarederror'} 
 -79.01418783044306 

-86.81668468412836 {'colsample_bytree': 0.15, 'eta': 0.2, 'n_estimators': 50, 'objective': 'reg:squarederror'}
-86.76364018131724 {'colsample_bytree': 0.15, 'eta': 0.2, 'n_estimators': 55, 'objective': 'reg:squarederror'}
-86.02315309877507 {'colsample_bytree': 0.15, 'eta': 0.2, 'n_estimators': 60, 'objective': 'reg:squarederror'}
-86.81668468412836 {'colsample_bytree': 0.15, 'eta': 0.3, 'n_estimators': 50, 'objective': 'reg:squarederror'}
-86.76364018131724 {'colsample_bytree': 0.15, 'eta': 0.3, 'n_estimators': 55, 'objective': 'reg:squarederror'}
-86.02315309877507 {'colsample_bytree': 0.15, 'eta': 0.3, 'n_estimators': 60, 'objective': 'reg:squarederror'}
-79.73395796118629 {'colsample_bytree': 0.2, 'eta': 0.2, 'n_estimators': 50, 'objective': 'reg:squarederror'}
-79.01418783044306 {'colsample_bytree': 0.2, 'eta': 0.2, 'n_estimators': 55, 'objective': 'r



In [71]:
aux = {
    'estimator': grid_search.best_estimator_,
    'X': X,
    'score': grid_search.best_score_
}
estimators.save(aux)

In [76]:
results = estimators.get_df()
results.sort_values(by='MSE', ascending=False)

Unnamed: 0,Estimator,MSE
2,XGBRegressor,-79.014188
0,RandomForestRegressor,-82.867347
1,AdaBoostRegressor,-90.221776


In [77]:
estimators.estimators[2]

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.2, eta=0.2, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=55,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, verbosity=1)

In [78]:
best_model = estimators.estimators[2]
Yt = best_model.predict(Xt)

In [85]:
output = pd.DataFrame()
output['unique_id'] = test['unique_id']
output[Ylabel] = Yt.astype(float)
output[Ylabel] = output[Ylabel].apply(lambda x: round(x, 2))
output.head()

Unnamed: 0,unique_id,College_Persistence_Rate
0,21,74.02
1,100,70.47
2,77,68.03
3,63,80.1
4,16,54.52


In [87]:
import datetime
tnow = datetime.datetime.now()
_result_file = 'results/result_'+tnow.strftime("%b-%d %H %M %S")+'.csv'
output.to_csv(_result_file, index=False)