In [1]:
import pandas as pd
from utils import *

In [5]:
# Load the training data through the project helper `numerize_csv`
# (presumably supplied by the star import from utils — confirm), with
# expand_classes turned on.
raw = numerize_csv(
    'train.csv',
    expand_classes=True,
)
# Optional follow-up step, currently disabled:
#raw = combine_related_columns(raw)

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

# Split the frame into features (every column except 'Discontinued') and the
# target (the 'Discontinued' column itself).
X = raw.drop(columns=['Discontinued'])
Y = raw['Discontinued']

# Seeded 80/20 split: 80% of the rows go to the training partition, the
# remainder is held out for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    train_size=0.8,
    random_state=0,
)

In [6]:
# Sweep the maximum tree depth of the gradient-boosting model and report
# ROC AUC and accuracy on both the held-out and the training partition, so
# the gap between the two can be compared per depth.
max_depths = list(range(1, 6))
for depth in max_depths:
    print(f"Depth: {depth}")
    clf = GradientBoostingClassifier(max_depth=depth, random_state=0)
    clf.fit(X_train, y_train)

    # ROC AUC is scored on the probability column at index 1.
    test_auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    train_auc = roc_auc_score(y_train, clf.predict_proba(X_train)[:, 1])

    # Accuracy is scored on the hard class predictions.
    test_acc = accuracy_score(y_test, clf.predict(X_test))
    train_acc = accuracy_score(y_train, clf.predict(X_train))

    print(f'ROC AUC Testing Accuracy with Probabilities: {test_auc}')
    print(f'ROC AUC Training Accuracy with Probabilities: {train_auc}')
    print(f'Testing Accuracy: {test_acc}')
    print(f'Training Accuracy: {train_acc}')

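# Depth 1 has the smallest train/test gap, but in the recorded output depth 2 has the
# highest test ROC AUC and depths 2-3 the highest test accuracy. Deeper trees overfit
# quickly (training scores keep rising while test ROC AUC falls), so the model is very
# prone to overfitting; maybe RandomForest is the way to go.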

Depth: 1
ROC AUC Testing Accuracy with Probabilities: 0.8638486147375175
ROC AUC Training Accuracy with Probabilities: 0.8450486163603388
Testing Accuracy: 0.7960710944808232
Training Accuracy: 0.802526906878802
Depth: 2
ROC AUC Testing Accuracy with Probabilities: 0.8643186861170769
ROC AUC Training Accuracy with Probabilities: 0.8603106748293575
Testing Accuracy: 0.8072965388213283
Training Accuracy: 0.817735142723444
Depth: 3
ROC AUC Testing Accuracy with Probabilities: 0.8605402924690556
ROC AUC Training Accuracy with Probabilities: 0.8832350450922843
Testing Accuracy: 0.8072965388213283
Training Accuracy: 0.8331773514272345
Depth: 4
ROC AUC Testing Accuracy with Probabilities: 0.855304900327045
ROC AUC Training Accuracy with Probabilities: 0.9136992924608606
Testing Accuracy: 0.8016838166510758
Training Accuracy: 0.8547028544688816
Depth: 5
ROC AUC Testing Accuracy with Probabilities: 0.8496217150699092
ROC AUC Training Accuracy with Probabilities: 0.9462247838453174
Testing Accur

In [7]:
# Sweep the number of boosting stages (n_estimators) at a fixed max_depth of 2
# and report ROC AUC and accuracy on both the held-out and the training
# partition for each setting.
num_estimators = [50, 100, 120, 150, 170, 200]
for num in num_estimators:
    print(f"Number of Estimators: {num}")
    clf = GradientBoostingClassifier(n_estimators=num, max_depth=2, random_state=0)
    clf.fit(X_train, y_train)

    # ROC AUC is scored on the probability column at index 1.
    test_auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    train_auc = roc_auc_score(y_train, clf.predict_proba(X_train)[:, 1])

    # Accuracy is scored on the hard class predictions.
    test_acc = accuracy_score(y_test, clf.predict(X_test))
    train_acc = accuracy_score(y_train, clf.predict(X_train))

    print(f'ROC AUC Testing Accuracy with Probabilities: {test_auc}')
    print(f'ROC AUC Training Accuracy with Probabilities: {train_auc}')
    print(f'Testing Accuracy: {test_acc}')
    print(f'Training Accuracy: {train_acc}')

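# 200 seemed to be best in the original run, but it is the largest value tested, so nothing
# here shows what happens beyond it. NOTE(review): in the recorded output test ROC AUC peaks
# at n_estimators=100 while training ROC AUC keeps rising (a sign of overfitting on the
# training data), and the submission cell uses 100 — confirm which value is intended.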

Number of Estimators: 50
ROC AUC Testing Accuracy with Probabilities: 0.8630644198294376
ROC AUC Training Accuracy with Probabilities: 0.8513034257878381
Testing Accuracy: 0.7998129092609916
Training Accuracy: 0.8088441740758072
Number of Estimators: 100
ROC AUC Testing Accuracy with Probabilities: 0.8643186861170769
ROC AUC Training Accuracy with Probabilities: 0.8603106748293575
Testing Accuracy: 0.8072965388213283
Training Accuracy: 0.817735142723444
Number of Estimators: 120
ROC AUC Testing Accuracy with Probabilities: 0.8630755589616546
ROC AUC Training Accuracy with Probabilities: 0.8628281760380987
Testing Accuracy: 0.8082319925163705
Training Accuracy: 0.8191389798783341
Number of Estimators: 150
ROC AUC Testing Accuracy with Probabilities: 0.8619081779053085
ROC AUC Training Accuracy with Probabilities: 0.8667960246605878
Testing Accuracy: 0.8101028999064547
Training Accuracy: 0.8226485727655592
Number of Estimators: 170
ROC AUC Testing Accuracy with Probabilities: 0.861413600

In [None]:
# Generate submission using test.csv

# Load the unlabeled test set with the test-side helper, using the same
# expand_classes=True setting that was used for train.csv.
raw_test = numerize_csv_test('test.csv', expand_classes=True)
#_test = combine_related_columns_test(raw_test)

# Fit the submission model with max_depth=2 and n_estimators=100.
# random_state is pinned to 0, matching the tuning cells, so that re-running
# this cell rebuilds the same model and therefore the same submission.
# NOTE(review): the model is fit on X_train only (the 80% partition), not on
# all labelled rows — confirm this is intended for the final submission.
clf = GradientBoostingClassifier(max_depth=2, n_estimators=100, random_state=0)
clf.fit(X_train, y_train)

# NOTE(review): assumes raw_test carries the same feature columns as X_train
# (expand_classes could yield different columns per file) — confirm.
y_test_prob_preds = clf.predict_proba(raw_test)

# Submit the probability column at index 1, the same column the tuning cells
# score with ROC AUC.
write_submission(y_test_prob_preds[:, 1])