# Multi-Stage Classification and Regression
## Gradient Boosting Classifier and Regressor
Cheng-Hao Tai (A09340296)

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import pdb
import pickle
import collections
import string

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error

## Load the Data

In [2]:
# Every data directory is resolved relative to the current working directory.
# Each path keeps a trailing '/' so file names can be appended directly.
parent_path = ''.join((os.getcwd(), '/'))
train_path, test_path = (parent_path + folder for folder in ('train_data/', 'test_data/'))

In [3]:
# File locations: the test features sit under test_path; the train features
# and the pickled train labels sit under train_path.
path_test_df = ''.join((test_path, 'test_processed.csv'))
path_train_df = ''.join((train_path, 'train_processed.csv'))
path_train_labels = ''.join((train_path, 'train_labels.pickle'))

In [4]:
# Read the feature tables (train first, then test) with pandas' default CSV settings.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (path_train_df, path_test_df))

# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only open label files that come from a trusted source.
with open(path_train_labels, 'rb') as handle:
    train_labels = pickle.load(handle)

## Classify 0's

In [5]:
# Stage 0 target: int8 indicator, 1 where the label equals 0 and 0 elsewhere.
classify_0_labels = np.array(train_labels == 0, dtype=np.int8)

# 80/20 hold-out split used only to gauge generalization of this stage.
xtrain0, xval0, ytrain0, yval0 = train_test_split(
    train_df, classify_0_labels, test_size=0.2, random_state=0
)
clf = GradientBoostingClassifier(
    n_estimators=1000, max_depth=3, learning_rate=0.1, random_state=0, verbose=0
)
pred0t = clf.fit(xtrain0, ytrain0).predict(xtrain0)
pred0v = clf.predict(xval0)

# First line: accuracy on the fitted split; second line: accuracy on the held-out split.
print(accuracy_score(ytrain0, pred0t), accuracy_score(yval0, pred0v), sep='\n')

0.96181875
0.959325


In [6]:
# Refit the stage-0 classifier on every training row, then score both tables.
clf_train_0_predict = clf.fit(train_df, classify_0_labels).predict(train_df)
clf_test_0_predict = clf.predict(test_df)
# Accuracy measured on the same rows the model was fitted on.
print(accuracy_score(clf_train_0_predict, classify_0_labels))

0.961135


In [7]:
# Route rows by the stage-0 prediction.
# Despite the "correct" in the names, these are the rows *predicted* positive
# (prediction == 1); they are not compared against the true labels here.
# Rows predicted 0 are passed on to the stage-1 classifier.

# Train: positional hits, then the matching train_df index labels.
pred_0_correct_idx_reference = np.where(clf_train_0_predict == 1)[0]
classify_1_train_indexes_reference = np.where(clf_train_0_predict == 0)[0]
pred_0_correct_idx_actual = train_df.index[pred_0_correct_idx_reference]
classify_1_train_indexes_actual = train_df.index[classify_1_train_indexes_reference]

# Test: same routing against test_df.
test_0_correct_idx_reference = np.where(clf_test_0_predict == 1)[0]
classify_1_test_indexes_reference = np.where(clf_test_0_predict == 0)[0]
test_0_correct_idx_actual = test_df.index[test_0_correct_idx_reference]
classify_1_test_indexes_actual = test_df.index[classify_1_test_indexes_reference]

print('Number of training elements going to the next stage:', len(classify_1_train_indexes_actual))
print('Number of test elements going to the next stage:', len(classify_1_test_indexes_actual))

Number of training elements going to the next stage: 62487
Number of test elements going to the next stage: 4368


## Classify 1's

In [8]:
# Stage-1 inputs: only the rows that stage 0 predicted as "not 0".
# Feature tables are selected by index label; labels use the same index values.
train_1_df, test_1_df = (
    train_df.loc[classify_1_train_indexes_actual],
    test_df.loc[classify_1_test_indexes_actual],
)
train_1_labels = train_labels[classify_1_train_indexes_actual]

In [9]:
# Stage 1 target: int8 indicator, 1 where the label equals 1 and 0 elsewhere.
classify_1_labels = np.array(train_1_labels == 1, dtype=np.int8)

# 80/20 hold-out split used only to gauge generalization of this stage.
xtrain1, xval1, ytrain1, yval1 = train_test_split(
    train_1_df, classify_1_labels, test_size=0.2, random_state=0
)
clf = GradientBoostingClassifier(
    n_estimators=1000, max_depth=3, learning_rate=0.1, random_state=0, verbose=0
)
pred1t = clf.fit(xtrain1, ytrain1).predict(xtrain1)
pred1v = clf.predict(xval1)

# First line: accuracy on the fitted split; second line: accuracy on the held-out split.
print(accuracy_score(ytrain1, pred1t), accuracy_score(yval1, pred1v), sep='\n')

0.8156594450779171
0.8133301328212514


In [10]:
# Refit the stage-1 classifier on every stage-1 training row, then score both subsets.
clf_train_1_predict = clf.fit(train_1_df, classify_1_labels).predict(train_1_df)
clf_test_1_predict = clf.predict(test_1_df)
# Accuracy measured on the same rows the model was fitted on.
print(accuracy_score(clf_train_1_predict, classify_1_labels))

0.8145534271128395


In [11]:
# Route rows by the stage-1 prediction.
# Despite the "correct" in the names, these are the rows *predicted* positive
# (prediction == 1); they are not compared against the true labels here.
# Rows predicted 0 are passed on to the stage-2 classifier.

# Train: positions within train_1_df, then the matching index labels.
pred_1_correct_idx_reference = np.where(clf_train_1_predict == 1)[0]
classify_2_train_indexes_reference = np.where(clf_train_1_predict == 0)[0]
pred_1_correct_idx_actual = train_1_df.index[pred_1_correct_idx_reference]
classify_2_train_indexes_actual = train_1_df.index[classify_2_train_indexes_reference]

# Test: same routing against test_1_df.
test_1_correct_idx_reference = np.where(clf_test_1_predict == 1)[0]
classify_2_test_indexes_reference = np.where(clf_test_1_predict == 0)[0]
test_1_correct_idx_actual = test_1_df.index[test_1_correct_idx_reference]
classify_2_test_indexes_actual = test_1_df.index[classify_2_test_indexes_reference]

print('Number of training elements going to the next stage:', len(classify_2_train_indexes_actual))
print('Number of test elements going to the next stage:', len(classify_2_test_indexes_actual))

Number of training elements going to the next stage: 32972
Number of test elements going to the next stage: 2298


## Classify 2's

In [12]:
# Stage-2 inputs: only the rows that stage 1 predicted as "not 1".
# Feature tables are selected by index label; labels use the same index values.
train_2_df, test_2_df = (
    train_df.loc[classify_2_train_indexes_actual],
    test_df.loc[classify_2_test_indexes_actual],
)
train_2_labels = train_labels[classify_2_train_indexes_actual]

In [13]:
# Stage 2 target: int8 indicator, 1 where the label equals 2 and 0 elsewhere.
classify_2_labels = np.array(train_2_labels == 2, dtype=np.int8)

# 80/20 hold-out split used only to gauge generalization of this stage.
xtrain2, xval2, ytrain2, yval2 = train_test_split(
    train_2_df, classify_2_labels, test_size=0.2, random_state=0
)
clf = GradientBoostingClassifier(
    n_estimators=100, max_depth=3, learning_rate=0.1, random_state=0, verbose=0
)
pred2t = clf.fit(xtrain2, ytrain2).predict(xtrain2)
pred2v = clf.predict(xval2)

# First line: accuracy on the fitted split; second line: accuracy on the held-out split.
print(accuracy_score(ytrain2, pred2t), accuracy_score(yval2, pred2v), sep='\n')

0.7951245403192175
0.7928733889310083


In [14]:
# Refit the stage-2 classifier on every stage-2 training row, then score both subsets.
clf_train_2_predict = clf.fit(train_2_df, classify_2_labels).predict(train_2_df)
clf_test_2_predict = clf.predict(test_2_df)
# Accuracy measured on the same rows the model was fitted on.
print(accuracy_score(clf_train_2_predict, classify_2_labels))

0.7951898580613854


In [15]:
# Route rows by the stage-2 prediction.
# Despite the "correct" in the names, these are the rows *predicted* positive
# (prediction == 1); they are not compared against the true labels here.
# Rows predicted 0 are passed on to the stage-3 classifier.

# Train: positions within train_2_df, then the matching index labels.
pred_2_correct_idx_reference = np.where(clf_train_2_predict == 1)[0]
classify_3_train_indexes_reference = np.where(clf_train_2_predict == 0)[0]
pred_2_correct_idx_actual = train_2_df.index[pred_2_correct_idx_reference]
classify_3_train_indexes_actual = train_2_df.index[classify_3_train_indexes_reference]

# Test: same routing against test_2_df.
test_2_correct_idx_reference = np.where(clf_test_2_predict == 1)[0]
classify_3_test_indexes_reference = np.where(clf_test_2_predict == 0)[0]
test_2_correct_idx_actual = test_2_df.index[test_2_correct_idx_reference]
classify_3_test_indexes_actual = test_2_df.index[classify_3_test_indexes_reference]

print('Number of training elements going to the next stage:', len(classify_3_train_indexes_actual))
print('Number of test elements going to the next stage:', len(classify_3_test_indexes_actual))

Number of training elements going to the next stage: 23531
Number of test elements going to the next stage: 1638


## Classify 3's

In [16]:
# Stage-3 inputs: only the rows that stage 2 predicted as "not 2".
# Feature tables are selected by index label; labels use the same index values.
train_3_df, test_3_df = (
    train_df.loc[classify_3_train_indexes_actual],
    test_df.loc[classify_3_test_indexes_actual],
)
train_3_labels = train_labels[classify_3_train_indexes_actual]

In [17]:
# Stage 3 target: int8 indicator, 1 where the label equals 3 and 0 elsewhere.
classify_3_labels = np.array(train_3_labels == 3, dtype=np.int8)

# 80/20 hold-out split used only to gauge generalization of this stage.
xtrain3, xval3, ytrain3, yval3 = train_test_split(
    train_3_df, classify_3_labels, test_size=0.2, random_state=0
)
clf = GradientBoostingClassifier(
    n_estimators=100, max_depth=3, learning_rate=0.1, random_state=0, verbose=0
)
pred3t = clf.fit(xtrain3, ytrain3).predict(xtrain3)
pred3v = clf.predict(xval3)

# First line: accuracy on the fitted split; second line: accuracy on the held-out split.
print(accuracy_score(ytrain3, pred3t), accuracy_score(yval3, pred3v), sep='\n')

0.8167233319167021
0.8143190992139366


In [18]:
# Refit the stage-3 classifier on every stage-3 training row, then score both subsets.
clf_train_3_predict = clf.fit(train_3_df, classify_3_labels).predict(train_3_df)
clf_test_3_predict = clf.predict(test_3_df)
# Accuracy measured on the same rows the model was fitted on.
print(accuracy_score(clf_train_3_predict, classify_3_labels))

0.816752369215078


In [19]:
# Route rows by the stage-3 prediction.
# Despite the "correct" in the names, these are the rows *predicted* positive
# (prediction == 1); they are not compared against the true labels here.
# Rows predicted 0 are passed on to the stage-4 classifier.

# Train: positions within train_3_df, then the matching index labels.
pred_3_correct_idx_reference = np.where(clf_train_3_predict == 1)[0]
classify_4_train_indexes_reference = np.where(clf_train_3_predict == 0)[0]
pred_3_correct_idx_actual = train_3_df.index[pred_3_correct_idx_reference]
classify_4_train_indexes_actual = train_3_df.index[classify_4_train_indexes_reference]

# Test: same routing against test_3_df.
test_3_correct_idx_reference = np.where(clf_test_3_predict == 1)[0]
classify_4_test_indexes_reference = np.where(clf_test_3_predict == 0)[0]
test_3_correct_idx_actual = test_3_df.index[test_3_correct_idx_reference]
classify_4_test_indexes_actual = test_3_df.index[classify_4_test_indexes_reference]

print('Number of training elements going to the next stage:', len(classify_4_train_indexes_actual))
print('Number of test elements going to the next stage:', len(classify_4_test_indexes_actual))

Number of training elements going to the next stage: 19510
Number of test elements going to the next stage: 1331


## Classify 4's

In [20]:
# Stage-4 inputs: only the rows that stage 3 predicted as "not 3".
# Feature tables are selected by index label; labels use the same index values.
train_4_df, test_4_df = (
    train_df.loc[classify_4_train_indexes_actual],
    test_df.loc[classify_4_test_indexes_actual],
)
train_4_labels = train_labels[classify_4_train_indexes_actual]

In [21]:
# Stage 4 target: int8 indicator, 1 where the label equals 4 and 0 elsewhere.
classify_4_labels = np.array(train_4_labels == 4, dtype=np.int8)

# 80/20 hold-out split used only to gauge generalization of this stage.
xtrain4, xval4, ytrain4, yval4 = train_test_split(
    train_4_df, classify_4_labels, test_size=0.2, random_state=0
)
# This stage uses its own settings: fewer, deeper trees and a smaller learning rate.
clf = GradientBoostingClassifier(
    n_estimators=80, max_depth=4, learning_rate=0.025, random_state=0, verbose=0
)
pred4t = clf.fit(xtrain4, ytrain4).predict(xtrain4)
pred4v = clf.predict(xval4)

# First line: accuracy on the fitted split; second line: accuracy on the held-out split.
print(accuracy_score(ytrain4, pred4t), accuracy_score(yval4, pred4v), sep='\n')

0.851742696053306
0.8552024602767812


In [22]:
# Refit the stage-4 classifier on every stage-4 training row, then score both subsets.
clf_train_4_predict = clf.fit(train_4_df, classify_4_labels).predict(train_4_df)
clf_test_4_predict = clf.predict(test_4_df)
# Accuracy measured on the same rows the model was fitted on.
print(accuracy_score(clf_train_4_predict, classify_4_labels))

0.8518708354689902


In [23]:
# Route rows by the stage-4 prediction.
# Despite the "correct" in the names, these are the rows *predicted* positive
# (prediction == 1); they are not compared against the true labels here.
# Rows predicted 0 are passed on to the stage-5 classifier.

# Train: positions within train_4_df, then the matching index labels.
pred_4_correct_idx_reference = np.where(clf_train_4_predict == 1)[0]
classify_5_train_indexes_reference = np.where(clf_train_4_predict == 0)[0]
pred_4_correct_idx_actual = train_4_df.index[pred_4_correct_idx_reference]
classify_5_train_indexes_actual = train_4_df.index[classify_5_train_indexes_reference]

# Test: same routing against test_4_df.
test_4_correct_idx_reference = np.where(clf_test_4_predict == 1)[0]
classify_5_test_indexes_reference = np.where(clf_test_4_predict == 0)[0]
test_4_correct_idx_actual = test_4_df.index[test_4_correct_idx_reference]
classify_5_test_indexes_actual = test_4_df.index[classify_5_test_indexes_reference]

print('Number of training elements going to the next stage:', len(classify_5_train_indexes_actual))
print('Number of test elements going to the next stage:', len(classify_5_test_indexes_actual))

Number of training elements going to the next stage: 19376
Number of test elements going to the next stage: 1324


## Classify 5's

In [24]:
# Stage-5 inputs: only the rows that stage 4 predicted as "not 4".
# Feature tables are selected by index label; labels use the same index values.
train_5_df, test_5_df = (
    train_df.loc[classify_5_train_indexes_actual],
    test_df.loc[classify_5_test_indexes_actual],
)
train_5_labels = train_labels[classify_5_train_indexes_actual]

In [25]:
# Stage 5 target: int8 indicator, 1 where the label equals 5 and 0 elsewhere.
classify_5_labels = np.array(train_5_labels == 5, dtype=np.int8)

# 80/20 hold-out split used only to gauge generalization of this stage.
xtrain5, xval5, ytrain5, yval5 = train_test_split(
    train_5_df, classify_5_labels, test_size=0.2, random_state=0
)
clf = GradientBoostingClassifier(
    n_estimators=100, max_depth=3, learning_rate=0.05, random_state=0, verbose=0
)
pred5t = clf.fit(xtrain5, ytrain5).predict(xtrain5)
pred5v = clf.predict(xval5)

# First line: accuracy on the fitted split; second line: accuracy on the held-out split.
print(accuracy_score(ytrain5, pred5t), accuracy_score(yval5, pred5v), sep='\n')

0.9076129032258065
0.9086687306501547


In [26]:
# Refit the stage-5 classifier on every stage-5 training row, then score both subsets.
clf_train_5_predict = clf.fit(train_5_df, classify_5_labels).predict(train_5_df)
clf_test_5_predict = clf.predict(test_5_df)
# Accuracy measured on the same rows the model was fitted on.
print(accuracy_score(clf_train_5_predict, classify_5_labels))

0.9073596201486375


In [27]:
# Route rows by the stage-5 prediction.
# Despite the "correct" in the names, these are the rows *predicted* positive
# (prediction == 1); they are not compared against the true labels here.
# Rows predicted 0 leave the classifier cascade and go to the regression stage.

# Train: positions within train_5_df, then the matching index labels.
pred_5_correct_idx_reference = np.where(clf_train_5_predict == 1)[0]
regression_train_indexes_reference = np.where(clf_train_5_predict == 0)[0]
pred_5_correct_idx_actual = train_5_df.index[pred_5_correct_idx_reference]
regression_train_indexes_actual = train_5_df.index[regression_train_indexes_reference]

# Test: same routing against test_5_df.
test_5_correct_idx_reference = np.where(clf_test_5_predict == 1)[0]
regression_test_indexes_reference = np.where(clf_test_5_predict == 0)[0]
test_5_correct_idx_actual = test_5_df.index[test_5_correct_idx_reference]
regression_test_indexes_actual = test_5_df.index[regression_test_indexes_reference]

print('Number of training elements going to the next stage:', len(regression_train_indexes_actual))
print('Number of test elements going to the next stage:', len(regression_test_indexes_actual))

Number of training elements going to the next stage: 18577
Number of test elements going to the next stage: 1281


## Regression

In [28]:
# Build the regression sets from the rows that every classifier stage rejected.
# regression_*_indexes_actual hold index *labels* (taken from `.index`), so the
# rows are selected with label-based `.loc`, matching how every classifier
# stage builds its subset. Positional `.iloc` only gives the same rows when the
# frame has a default 0..n-1 index and picks the wrong rows otherwise.
# Train
reg_train = train_df.loc[regression_train_indexes_actual]
reg_label = train_labels[regression_train_indexes_actual]
# Test
reg_test = test_df.loc[regression_test_indexes_actual]

In [29]:
# Partition the regression rows on the LAST feature column: values strictly
# below reg_cut form the "small" set, values at or above it the "large" set.
# NOTE(review): what the last column represents is not visible here — confirm.
reg_cut = 15

train_split_values = reg_train.iloc[:, -1]
test_split_values = reg_test.iloc[:, -1]

# Small-value set: train features, their index, matching labels; then test.
small_train = reg_train[train_split_values < reg_cut]
small_train_idx = small_train.index
small_labels = train_labels[small_train_idx]
small_test = reg_test[test_split_values < reg_cut]
small_test_idx = small_test.index

# Large-value set: same layout for rows at or above the cut.
large_train = reg_train[train_split_values >= reg_cut]
large_train_idx = large_train.index
large_labels = train_labels[large_train_idx]
large_test = reg_test[test_split_values >= reg_cut]
large_test_idx = large_test.index

### Regression Stage 1: Small Set

In [30]:
# Hold-out check (80/20) for the small-value regressor.
xtrains, xvals, ytrains, yvals = train_test_split(
    small_train, small_labels, test_size=0.2, random_state=0
)
est = GradientBoostingRegressor(
    n_estimators=110, max_depth=3, learning_rate=0.05, random_state=0, verbose=0
)
preds_train = est.fit(xtrains, ytrains).predict(xtrains)
preds_val = est.predict(xvals)

# First line: MAE on the fitted split; second line: MAE on the held-out split.
print(mean_absolute_error(ytrains, preds_train), mean_absolute_error(yvals, preds_val), sep='\n')

0.8403328024234439
0.8549405874844691


In [31]:
# Refit the same regressor object on the whole small-value training set.
# fit() returns the estimator itself, so small_est and est are the same object.
small_est = est.fit(small_train, small_labels)
small_est_train_predict, small_est_test_predict = (
    small_est.predict(rows) for rows in (small_train, small_test)
)
print('Mean absolute error on train set:', mean_absolute_error(small_est_train_predict, small_labels))

Mean absolute error on train set: 0.8422724714421939


### Regression Stage 2: Large Set

In [32]:
# Hold-out check for the large-value regressor. Unlike the other stages this
# split is 50/50 (test_size=0.5).
xtrainl, xvall, ytrainl, yvall = train_test_split(
    large_train, large_labels, test_size=0.5, random_state=0
)
est = GradientBoostingRegressor(
    n_estimators=110, max_depth=3, learning_rate=.08, random_state=0, verbose=0
)
predl_train = est.fit(xtrainl, ytrainl).predict(xtrainl)
predl_val = est.predict(xvall)

# First line: MAE on the fitted split; second line: MAE on the held-out split.
print(mean_absolute_error(ytrainl, predl_train), mean_absolute_error(yvall, predl_val), sep='\n')

1.7741534572618822
2.7787880577057713


In [33]:
# Refit the large-value regressor on the whole large-value training set,
# then predict for both the train and test rows of that set.
large_est_train_predict = est.fit(large_train, large_labels).predict(large_train)
large_est_test_predict = est.predict(large_test)
print('Mean absolute error on train set:', mean_absolute_error(large_est_train_predict, large_labels))

Mean absolute error on train set: 1.9208655929287737


## Stitch Together Predictions

In [34]:
# Assemble one prediction per training row, starting from all zeros.
master_train_pred = np.zeros(len(train_labels))

# Classifier stages: rows a stage predicted positive receive that stage's
# class value (stage k -> k), written in stage order 0..5.
stage_train_rows = (
    pred_0_correct_idx_actual,
    pred_1_correct_idx_actual,
    pred_2_correct_idx_actual,
    pred_3_correct_idx_actual,
    pred_4_correct_idx_actual,
    pred_5_correct_idx_actual,
)
for stage_value, stage_rows in enumerate(stage_train_rows):
    master_train_pred[stage_rows] = stage_value

# Regression stages: small-value set first, then large-value set.
master_train_pred[small_train_idx] = small_est_train_predict
master_train_pred[large_train_idx] = large_est_train_predict

In [35]:
# MAE of the stitched cascade output against the true training labels.
overall_train_mae = mean_absolute_error(train_labels, master_train_pred)
print('Overall train MAE:', overall_train_mae)

Overall train MAE: 0.16316402664958665


In [36]:
# Assemble one prediction per test row, starting from all zeros.
master_test_pred = np.zeros(len(test_df))

# Classifier stages: rows a stage predicted positive receive that stage's
# class value (stage k -> k), written in stage order 0..5.
stage_test_rows = (
    test_0_correct_idx_actual,
    test_1_correct_idx_actual,
    test_2_correct_idx_actual,
    test_3_correct_idx_actual,
    test_4_correct_idx_actual,
    test_5_correct_idx_actual,
)
for stage_value, stage_rows in enumerate(stage_test_rows):
    master_test_pred[stage_rows] = stage_value

# Regression stages: small-value set first, then large-value set.
master_test_pred[small_test_idx] = small_est_test_predict
master_test_pred[large_test_idx] = large_est_test_predict

In [37]:
# Write the submission file: load the template, fill in the stitched test
# predictions, and save it next to the notebook.
template_path = parent_path + 'pairs_Helpful.txt'
submit_path = parent_path + 'submit_kaggle_hailmary.csv'

format_df = pd.read_csv(template_path)
# Item assignment always writes a real column. Attribute assignment
# (`format_df.prediction = ...`) only does so when the column already exists;
# otherwise it sets a plain Python attribute and the CSV is saved without predictions.
format_df['prediction'] = master_test_pred
# index=False is the documented way to leave the row index out of the file.
format_df.to_csv(submit_path, index=False)