In [1]:
import sys
sys.path.insert(0, '../utils/')

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

import pandas as pd
import numpy as np

import util_michael as um
from util import *

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import xgboost as xgb

In [2]:
train = pd.read_csv('./data/fill_train.csv')
test = pd.read_csv('./data/fill_test.csv')
Ylabel = 'College_Persistence_Rate'

Y = train[Ylabel]
train.drop(columns=[Ylabel], inplace=True)
train.shape, test.shape, Y.shape

((69, 57), (38, 57), (69,))

In [3]:
X = train.drop(columns=['unique_id'])
Xt = test.drop(columns=['unique_id'])

X_scaled, Xt_scaled = um.apply_scale(X, Xt)

In [4]:
class EstimatorContainer():
    def __init__(self, metric: str=''):
        self.metric = metric
        self.reset()
        
    def save(self, obj: dict) -> None:
        name = type(obj['estimator']).__name__
        if name in self.names:
            ix = self.names.index(name)
            self.names.pop(ix)
            self.estimators.pop(ix)
            self.Xs.pop(ix)
            self.scores.pop(ix)
        self.names.append(name)
        self.estimators.append(obj['estimator'])
        self.Xs.append(obj['X'])
        self.scores.append(obj['score'])
        
    def get_df(self) -> pd.DataFrame:
        self.df = pd.DataFrame()
        self.df['Estimator'] = self.names
        self.df[self.metric] = self.scores
        return self.df
        
    def reset(self) -> None:
        self.names = []
        self.estimators = []
        self.Xs = []
        self.scores = []
        
estimators = EstimatorContainer('MSE')

# Ridge

In [32]:
from sklearn.linear_model import Ridge
param_grid = [
#     { 'solver': ['auto'], 'alpha': [12.5, 12.75, 12.8] },
#     { 'solver': ['svd'], 'alpha': [12.5, 12.75, 12.8] },
    { 'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'], 'alpha': [12.5, 12.75, 12.8] }
]
model = Ridge()
grid_search = GridSearchCV(model, param_grid, cv=10, scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(X_scaled, Y)
show_grid_results(grid_search)

Best parameters:
 {'alpha': 12.75, 'solver': 'svd'} 
 -85.91270381437569 

-85.91348064581788 {'alpha': 12.5, 'solver': 'auto'}
-85.91348064581783 {'alpha': 12.5, 'solver': 'svd'}
-85.91348064581788 {'alpha': 12.5, 'solver': 'cholesky'}
-86.04793716946004 {'alpha': 12.5, 'solver': 'lsqr'}
-86.02041389652803 {'alpha': 12.5, 'solver': 'sparse_cg'}
-86.0837901945741 {'alpha': 12.5, 'solver': 'sag'}
-86.11659065962623 {'alpha': 12.5, 'solver': 'saga'}
-85.9127038143757 {'alpha': 12.75, 'solver': 'auto'}
-85.91270381437569 {'alpha': 12.75, 'solver': 'svd'}
-85.9127038143757 {'alpha': 12.75, 'solver': 'cholesky'}
-86.03943370342851 {'alpha': 12.75, 'solver': 'lsqr'}
-86.0394337111661 {'alpha': 12.75, 'solver': 'sparse_cg'}
-86.0742749133118 {'alpha': 12.75, 'solver': 'sag'}
-86.07525990117492 {'alpha': 12.75, 'solver': 'saga'}
-85.91277733191797 {'alpha': 12.8, 'solver': 'auto'}
-85.91277733191788 {'alpha': 12.8, 'solver': 'svd'}
-85.91277733191797 {'alpha': 12.8, 'solver': 'cholesky'}
-86.0



In [33]:
aux = {
    'estimator': grid_search.best_estimator_,
    'X': X_scaled,
    'score': grid_search.best_score_
}
estimators.save(aux)

# Lasso

In [54]:
from sklearn.linear_model import Lasso
param_grid = [
    { 'alpha': [1.35, 1.355, 1.3555, 1.35555] }
]
model = Lasso()
grid_search = GridSearchCV(model, param_grid, cv=10, scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(X_scaled, Y)
show_grid_results(grid_search)

Best parameters:
 {'alpha': 1.3555} 
 -87.44201088090824 

-87.44344232164515 {'alpha': 1.35}
-87.44224971257792 {'alpha': 1.355}
-87.44201088090824 {'alpha': 1.3555}
-87.44201280692893 {'alpha': 1.35555}




In [55]:
aux = {
    'estimator': grid_search.best_estimator_,
    'X': X,
    'score': grid_search.best_score_
}
estimators.save(aux)

# Neural Network

In [156]:
from sklearn.neural_network import MLPRegressor
param_grid = [
    # Relu is better than other activations
    { 'hidden_layer_sizes': [ (30,30), (40, 40), (30, 40) ], 'learning_rate_init': [0.005, 0.001] }
]
model = MLPRegressor()
grid_search = GridSearchCV(model, param_grid, cv=10, scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(X, Y)
show_grid_results(grid_search)

Best parameters:
 {'hidden_layer_sizes': (40, 40), 'learning_rate_init': 0.005} 
 -88.6711336892311 

-101.37710053253483 {'hidden_layer_sizes': (30, 30), 'learning_rate_init': 0.005}
-160.13333305075471 {'hidden_layer_sizes': (30, 30), 'learning_rate_init': 0.001}
-88.6711336892311 {'hidden_layer_sizes': (40, 40), 'learning_rate_init': 0.005}
-238.34178732600085 {'hidden_layer_sizes': (40, 40), 'learning_rate_init': 0.001}
-101.36645092123045 {'hidden_layer_sizes': (30, 40), 'learning_rate_init': 0.005}
-215.87637816130294 {'hidden_layer_sizes': (30, 40), 'learning_rate_init': 0.001}




In [157]:
aux = {
    'estimator': grid_search.best_estimator_,
    'X': X,
    'score': grid_search.best_score_
}
estimators.save(aux)

# RandomForest

In [60]:
from sklearn.ensemble import RandomForestRegressor
param_grid = [
    { 'n_estimators': [305, 310, 315] }
]
model = RandomForestRegressor()
grid_search = GridSearchCV(model, param_grid, cv=10, scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(X, Y)
show_grid_results(grid_search)



Best parameters:
 {'n_estimators': 305} 
 -83.13582480632836 

-83.13582480632836 {'n_estimators': 305}
-83.6144696164924 {'n_estimators': 310}
-83.8018791971109 {'n_estimators': 315}


In [61]:
aux = {
    'estimator': grid_search.best_estimator_,
    'X': X,
    'score': grid_search.best_score_
}
estimators.save(aux)

# AdaBoost Regressor

In [120]:
from sklearn.ensemble import AdaBoostRegressor
param_grid = [
#     { 'n_estimators': [350, 400], 'loss': ['linear', 'square', 'exponential'], 'learning_rate': [0.5, 1] }
    { 'n_estimators': [300, 320], 'loss': ['exponential'], 'learning_rate': [0.8, 1, 1.2] }
]
model = AdaBoostRegressor()
grid_search = GridSearchCV(model, param_grid, cv=10, scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(X, Y)
show_grid_results(grid_search)



Best parameters:
 {'learning_rate': 1, 'loss': 'exponential', 'n_estimators': 300} 
 -89.05106469394772 

-91.69176166887173 {'learning_rate': 0.8, 'loss': 'exponential', 'n_estimators': 300}
-94.33850212047268 {'learning_rate': 0.8, 'loss': 'exponential', 'n_estimators': 320}
-89.05106469394772 {'learning_rate': 1, 'loss': 'exponential', 'n_estimators': 300}
-91.91590660902946 {'learning_rate': 1, 'loss': 'exponential', 'n_estimators': 320}
-90.89424870359683 {'learning_rate': 1.2, 'loss': 'exponential', 'n_estimators': 300}
-91.6959814772328 {'learning_rate': 1.2, 'loss': 'exponential', 'n_estimators': 320}


In [121]:
aux = {
    'estimator': grid_search.best_estimator_,
    'X': X,
    'score': grid_search.best_score_
}
estimators.save(aux)

# XGBoost

In [64]:
xgb.XGBRegressor()

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [71]:
param_grid = [
#     { 'n_estimators': [55], 'objective': ['reg:squarederror'] }
#         { 'n_estimators': [50, 55, 60], 'colsample_bytree': [0.2], 'objective': ['reg:squarederror'] }
    { 'n_estimators': [60, 65, 70],  'eta': [0.05, 0.1], 
      'colsample_bytree': [0.25, 0.3], 'objective': ['reg:squarederror'] }
]
model = xgb.XGBRegressor()
grid_search = GridSearchCV(model, param_grid, cv=10, scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(X, Y)
show_grid_results(grid_search)

Best parameters:
 {'colsample_bytree': 0.25, 'eta': 0.05, 'n_estimators': 65, 'objective': 'reg:squarederror'} 
 -79.07382221471613 

-79.62863849768557 {'colsample_bytree': 0.25, 'eta': 0.05, 'n_estimators': 60, 'objective': 'reg:squarederror'}
-79.07382221471613 {'colsample_bytree': 0.25, 'eta': 0.05, 'n_estimators': 65, 'objective': 'reg:squarederror'}
-79.30281273210471 {'colsample_bytree': 0.25, 'eta': 0.05, 'n_estimators': 70, 'objective': 'reg:squarederror'}
-79.62863849768557 {'colsample_bytree': 0.25, 'eta': 0.1, 'n_estimators': 60, 'objective': 'reg:squarederror'}
-79.07382221471613 {'colsample_bytree': 0.25, 'eta': 0.1, 'n_estimators': 65, 'objective': 'reg:squarederror'}
-79.30281273210471 {'colsample_bytree': 0.25, 'eta': 0.1, 'n_estimators': 70, 'objective': 'reg:squarederror'}
-84.56614521615492 {'colsample_bytree': 0.3, 'eta': 0.05, 'n_estimators': 60, 'objective': 'reg:squarederror'}
-84.27056743231799 {'colsample_bytree': 0.3, 'eta': 0.05, 'n_estimators': 65, 'objecti



In [72]:
aux = {
    'estimator': grid_search.best_estimator_,
    'X': X,
    'score': grid_search.best_score_
}
estimators.save(aux)

# VotingRegressor

In [159]:
estis = list()
skip = ['VotingRegressor']
# skip = []
for k, v in zip(estimators.names, estimators.estimators):
#     print(k, ':', v)
    if k not in skip:
        estis.append((k, v))
estis

[('Ridge', Ridge(alpha=12.75, copy_X=True, fit_intercept=True, max_iter=None,
        normalize=False, random_state=None, solver='svd', tol=0.001)),
 ('Lasso', Lasso(alpha=1.3555, copy_X=True, fit_intercept=True, max_iter=1000,
        normalize=False, positive=False, precompute=False, random_state=None,
        selection='cyclic', tol=0.0001, warm_start=False)),
 ('RandomForestRegressor',
  RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                        max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=305,
                        n_jobs=None, oob_score=False, random_state=None,
                        verbose=0, warm_start=False)),
 ('XGBRegressor',
  XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
               colsample_bynode=1, cols

In [161]:
from sklearn.ensemble import VotingRegressor
param_grid = [
    { 'weights': [ [0.15, 0.15, 0.2, 0.4, 0.1, 0], [0.1, 0.1, 0.2, 0.5, 0.1, 0], [0.1, 0.1, 0.2, 0.5, 0.05, 0.05] ] }
#     { 'weights': [ [0.1, 0.3, 0.1, 0.5], [0.125, 0.25, 0.125, 0.5], [0.05, 0.2, 0.15, 0.5] ] }
]
model = VotingRegressor(estimators=estis)
grid_search = GridSearchCV(model, param_grid, cv=10, scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(X, Y)
show_grid_results(grid_search)



Best parameters:
 {'weights': [0.15, 0.15, 0.2, 0.4, 0.1, 0]} 
 -72.98273083930076 

-72.98273083930076 {'weights': [0.15, 0.15, 0.2, 0.4, 0.1, 0]}
-74.57169742776755 {'weights': [0.1, 0.1, 0.2, 0.5, 0.1, 0]}
-73.33454943788806 {'weights': [0.1, 0.1, 0.2, 0.5, 0.05, 0.05]}


In [162]:
aux = {
    'estimator': grid_search.best_estimator_,
    'X': X,
    'score': grid_search.best_score_
}
estimators.save(aux)

# MSE of each Estimator

## Our predictor College_Persistence_Rate is a value ranged from 0 to 100

In [167]:
results = estimators.get_df()
results['MSE'] = results['MSE'] * -1
results['STD'] = np.sqrt( results['MSE'] )
results.sort_values(by='MSE', ascending=True)

Unnamed: 0,Estimator,MSE,STD
6,VotingRegressor,72.982731,8.542993
3,XGBRegressor,79.073822,8.892346
2,RandomForestRegressor,83.135825,9.117885
0,Ridge,85.912704,9.268911
1,Lasso,87.442011,9.351043
5,MLPRegressor,88.671134,9.416535
4,AdaBoostRegressor,89.051065,9.436687


# Get best estimator and predict

In [137]:
estimators.estimators[5]

VotingRegressor(estimators=[('Ridge',
                             Ridge(alpha=12.75, copy_X=True, fit_intercept=True,
                                   max_iter=None, normalize=False,
                                   random_state=None, solver='svd',
                                   tol=0.001)),
                            ('Lasso',
                             Lasso(alpha=1.3555, copy_X=True,
                                   fit_intercept=True, max_iter=1000,
                                   normalize=False, positive=False,
                                   precompute=False, random_state=None,
                                   selection='cyclic', tol=0.0001,
                                   warm_start=False)),
                            ('R...
                                          missing=None, n_estimators=65,
                                          n_jobs=1, nthread=None,
                                          objective='reg:squarederror',
                    

In [138]:
best_model = estimators.estimators[2]
Yt = best_model.predict(Xt)

In [139]:
output = pd.DataFrame()
output['unique_id'] = test['unique_id']
output[Ylabel] = Yt.astype(float)
output[Ylabel] = output[Ylabel].apply(lambda x: round(x, 2))
output.head()

Unnamed: 0,unique_id,College_Persistence_Rate
0,21,73.64
1,100,68.92
2,77,68.06
3,63,76.07
4,16,56.95


In [140]:
import datetime
tnow = datetime.datetime.now()
_result_file = 'results/result_'+tnow.strftime("%b-%d %H %M %S")+'.csv'
output.to_csv(_result_file, index=False)