## Import needed libraries

In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPRegressor
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostRegressor, BaggingRegressor, AdaBoostClassifier, RandomForestRegressor
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyRegressor
import lightgbm
from sklearn import neighbors
from sklearn.model_selection import GridSearchCV

## Load Data

In [24]:
train_data = './data/train_preprocess.csv'    # origin path 'data/train_pyspark.csv'
test_data = './data/test_preprocess.csv'      # origin path 'data/test_pyspark.csv'

train = pd.read_csv(train_data, sep='\t')
test = pd.read_csv(test_data, sep='\t')


## Seperate data

In [25]:
train_x = train.dropna() # clean empty values
train_y = np.array(train_x['Correct First Attempt'])
train_x = train_x.drop(labels='Correct First Attempt',axis = 1)
valid_x = test.dropna() # clean empty values
valid_y = np.array(valid_x['Correct First Attempt'])
valid_x = valid_x.drop(labels='Correct First Attempt',axis = 1)

In [26]:
print("train_x's shape is", train_x.shape)
print("train_y's shape is", train_y.shape)
print("valid_x's shape is", valid_x.shape)
print("valid_y's shape is", valid_y.shape)

train_x's shape is (232744, 15)
train_y's shape is (232744,)
valid_x's shape is (666, 15)
valid_y's shape is (666,)


## Define RMSE for later evaluation

In [4]:
def RMSE(x, y):
    MSE = mean_squared_error(x,y)
    RMSE = math.sqrt(MSE)
    return RMSE

## Adaboost

In [27]:
model = AdaBoostRegressor()
model.fit(train_x, train_y)
valid_y_hat = model.predict(valid_x)
default_AdaboostRegressor_RMSE = RMSE(valid_y, valid_y_hat)
print("RMSE for adaboostRegressor is", default_AdaboostRegressor_RMSE)

RMSE for adaboostRegressor is 0.38568308323957284


In [28]:
model = AdaBoostClassifier()
model.fit(train_x, train_y)
valid_y_hat = model.predict(valid_x)
default_AdaboostClassifier_RMSE = RMSE(valid_y, valid_y_hat)
print("RMSE for adaboostClassifier is", default_AdaboostClassifier_RMSE)

RMSE for adaboostClassifier is 0.4173417953838007


## DecisionTreeClassifier

In [29]:
model = tree.DecisionTreeClassifier()
model.fit(train_x, train_y)
valid_y_hat = model.predict(valid_x)
defalut_DecisionTreeClassifier_RMSE = RMSE(valid_y, valid_y_hat)
print("RMSE for Decision Tree is", defalut_DecisionTreeClassifier_RMSE)

RMSE for Decision Tree is 0.4633731916228157


## RandomForest

In [30]:
model = RandomForestRegressor()
model.fit(train_x, train_y)
valid_y_hat = model.predict(valid_x)
defalut_RandomForestRegressor_RMSE = RMSE(valid_y, valid_y_hat)
print("RMSE for RandomForestRegressor is", defalut_RandomForestRegressor_RMSE)

RMSE for RandomForestRegressor is 0.3497440348131438


In [31]:
model = RandomForestClassifier()
model.fit(train_x, train_y)
valid_y_hat = model.predict(valid_x)
defalut_RandomForestClassifier_RMSE = RMSE(valid_y, valid_y_hat)
print("RMSE for RandomForestClassifier is", defalut_RandomForestClassifier_RMSE)

RMSE for RandomForestClassifier is 0.4155390146215788


## XGBoost

In [32]:
model = XGBClassifier()
model.fit(train_x, train_y)
valid_y_hat = model.predict(valid_x)
default_XGBClassifier_RMSE = RMSE(valid_y, valid_y_hat)
print("RMSE for XGBoost is", default_XGBClassifier_RMSE)

RMSE for XGBoost is 0.39516598557587945


## LightGBM

In [33]:
model = lightgbm.LGBMRegressor()
model.fit(train_x, train_y)
valid_y_hat = model.predict(valid_x)
default_LightbgmRegressor_RMSE = RMSE(valid_y, valid_y_hat)
print("RMSE for LightbgmRegressor is", default_LightbgmRegressor_RMSE)

RMSE for LightbgmRegressor is 0.3537449722295338


In [34]:
model = lightgbm.LGBMClassifier()
model.fit(train_x, train_y)
valid_y_hat = model.predict(valid_x)
print("RMSE for LightbgmClassifier is", RMSE(valid_y, valid_y_hat))

RMSE for LightbgmClassifier is 0.4026936331284146


## KNN

In [35]:
model = neighbors.KNeighborsRegressor()
model.fit(train_x, train_y)
valid_y_hat = model.predict(valid_x)
valid_y_hat
print("RMSE for KNeighborsRegressor is", RMSE(valid_y, valid_y_hat))

RMSE for KNeighborsRegressor is 0.4015734817704741


In [36]:
model = neighbors.KNeighborsClassifier()
model.fit(train_x, train_y)
valid_y_hat = model.predict(valid_x)
print("RMSE for KNeighborsClassifier is", RMSE(valid_y, valid_y_hat))

RMSE for KNeighborsClassifier is 0.4568464826489405


## MLP

In [30]:
model = MLPRegressor(hidden_layer_sizes=(100, 5, 100), activation='tanh', solver='adam')
model.fit(train_x, train_y)
valid_y_hat = model.predict(valid_x)
print("RMSE for MLPRegressor is", RMSE(valid_y, valid_y_hat))

RMSE for MLPRegressor is 0.3715221816894079


## RSME visualization

For tuning hyperparameters, here we choose the 3 algorithms to do the further step.   
They are **AdaboostRegresser, RandomForestRegressor, LGBMRegressor**.

## Hyperparameter Tuning

#### AdaBoostRegressor part

In [48]:
# for AdaboostRegressor
param_dist = {
    'n_estimators': range(20, 100, 4),
    'learning_rate': np.linspace(0.01, 2, 20)
    # 'loss': ['linear', 'square', 'exponential']
}
model_adaboost = GridSearchCV(estimator=XGBClassifier(),
                        param_grid=param_dist, scoring='neg_mean_squared_error', 
                        verbose=3, n_jobs=-1, cv=5)
model_adaboost.fit(train_x, train_y)

In [51]:
_ # the output of Hyperparameter tuning of adaboost after 200minutes' training

GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     feature_types=None, gamma=None,
                                     gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None,...
                                     random_state=None, ...),
             n_jobs=-1,
             param_grid={'learning_rate': array([0.01      , 0.11473684, 0.21947368, 0.32421053, 0.42894737,
       0.53368421, 0.63842105, 0.74315789, 0.8478947

In [63]:
# print("best parameter:",model_adaboost.best_params_)
# print("best score:", model_adaboost.best_score_)
# print(model.best_estimator_)
# the output was learning_rate = 0.11473684 and n_estimator = 28

In [64]:
model = AdaBoostRegressor(learning_rate=0.11473684, n_estimators=28)
model.fit(train_x, train_y)
valid_y_hat = model.predict(valid_x)
print("RMSE for adaboostRegressor is", RMSE(valid_y, valid_y_hat))

RMSE for adaboostRegressor is 0.3796168526901849


In [65]:
# the result of the tuned hyperparameter of AdaBoostRegressor wasn't that good
# manually use the parameters of learning_rate=0.01, n_estimators=50
model = AdaBoostRegressor(learning_rate=0.01, n_estimators=50)
model.fit(train_x, train_y)
valid_y_hat = model.predict(valid_x)
print("RMSE for adaboostRegressor is", RMSE(valid_y, valid_y_hat))

RMSE for adaboostRegressor is 0.3733618521673437


#### RandomForestRegressor Part

In [7]:
# For RandomForestRegressor
param_dist = {
    'n_estimators' : range(10, 200 ,10)
    # 'criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson']
}
model_randomforest = GridSearchCV(estimator=RandomForestRegressor(),
                        param_grid=param_dist, scoring='neg_mean_squared_error', 
                        verbose=3, n_jobs=-1, cv=5)
model_randomforest.fit(train_x, train_y)


Fitting 5 folds for each of 19 candidates, totalling 95 fits


GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'n_estimators': range(10, 200, 10)},
             scoring='neg_mean_squared_error', verbose=3)

In [8]:
_

GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'n_estimators': range(10, 200, 10)},
             scoring='neg_mean_squared_error', verbose=3)

In [9]:
print("best parameter:",model_randomforest.best_params_)
print("best score:", model_randomforest.best_score_)
# print(model.best_estimator_)
# the output was n_estimators=150

best parameter: {'n_estimators': 150}
best score: -0.15268760875134982


In [10]:
model = RandomForestRegressor(n_estimators=150)
model.fit(train_x, train_y)
valid_y_hat = model.predict(valid_x)
print("RMSE for RandomForestRegressor is", RMSE(valid_y, valid_y_hat))

RMSE for RandomForestRegressor is 0.3530385561029303


In [11]:
model = RandomForestRegressor(n_estimators=100)
model.fit(train_x, train_y)
valid_y_hat = model.predict(valid_x)
print("RMSE for RandomForestRegressor is", RMSE(valid_y, valid_y_hat))

RMSE for RandomForestRegressor is 0.35115575941502875


In [12]:
model = RandomForestRegressor(n_estimators=190, max_depth=15, max_leaf_nodes=900)
model.fit(train_x, train_y)
valid_y_hat = model.predict(valid_x)
print("RMSE for RandomForestRegressor is", RMSE(valid_y, valid_y_hat))

RMSE for RandomForestRegressor is 0.3535061846695293


#### LGBMRegressor Part

In [18]:
param_dist = {
    'num_leaves': range(40, 200, 5),
    'n_estimator': range(60, 200, 5)
    # 'criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson']
}
model_LGBM = GridSearchCV(estimator=lightgbm.LGBMRegressor(),
                        param_grid=param_dist, scoring='neg_mean_squared_error', 
                        verbose=3, n_jobs=-1, cv=5)
model_LGBM.fit(train_x, train_y)

Fitting 5 folds for each of 896 candidates, totalling 4480 fits


GridSearchCV(cv=5, estimator=LGBMRegressor(), n_jobs=-1,
             param_grid={'n_estimator': range(60, 200, 5),
                         'num_leaves': range(40, 200, 5)},
             scoring='neg_mean_squared_error', verbose=3)

In [22]:
print("best parameter:",model_LGBM.best_params_)
print("best score:", model_LGBM.best_score_)
# print(model.best_estimator_)

best parameter: {'n_estimator': 60, 'num_leaves': 150}
best score: -0.14274718547591378


In [37]:
model = lightgbm.LGBMRegressor(n_estimators=60, num_leaves=150)
model.fit(train_x, train_y)
valid_y_hat = model.predict(valid_x)
tuned_LightbgmRegressor_RMSE = RMSE(valid_y, valid_y_hat)
print("RMSE for LightbgmRegressor is", tuned_LightbgmRegressor_RMSE)

RMSE for LightbgmRegressor is 0.3489215935347196


## Output result to test.csv

In [40]:
def output_testCSV():
    output_x = test
    output_y = np.array(output_x['Correct First Attempt']).astype(float)
    output_x = output_x.drop(['Correct First Attempt'], axis=1)
    model = lightgbm.LGBMRegressor(n_estimators=60, num_leaves=150)
    model.fit(train_x, train_y)
    output_res = model.predict(output_x)
    for id, val in enumerate(output_y):
        if np.isnan(val):
            output_y[id] = output_res[id]
    new_test = pd.read_csv('./data/test.csv', sep='\t')
    new_test['Correct First Attempt'] = output_y
    new_test.to_csv('test.csv', sep='\t', index=False)

In [42]:
output_testCSV()