In [49]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# The bare 'seaborn-*' style names were deprecated in matplotlib 3.6 (renamed
# to 'seaborn-v0_8-*') and later removed. Use the original name when this
# matplotlib still ships it, otherwise fall back to the renamed style.
plt.style.use(
    'seaborn-notebook'
    if 'seaborn-notebook' in plt.style.available
    else 'seaborn-v0_8-notebook'
)
%matplotlib inline
import csv
import src.scrubbing as scrubbing
import src.utility 
from sklearn.model_selection import cross_val_score

In [50]:
# Names of the columns used as model inputs, in the order they are selected.
feature_names = [
    # 'p<lo>_<hi>' columns (presumably time-window features -- TODO confirm).
    'p7_9', 'p10_12', 'p13_18', 'p19_24', 'p25_36',
] + [
    # Five-digit columns (presumably ZIP-code indicator features -- TODO confirm).
    '94013', '94014', '94080',
    '94101', '94102', '94103', '94104', '94105', '94107', '94108', '94109',
    '94110', '94111', '94112', '94114', '94115', '94116', '94117', '94118',
    '94120', '94121', '94122', '94123', '94124', '94127', '94129',
    '94130', '94131', '94132', '94133', '94134',
    '94143', '94158',
]

In [51]:
# Load the pickled inspection dataset; later cells select columns from it by name.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
df8 = pd.read_pickle('data/sf_inspection_master.pkl')

In [52]:
# Model inputs: the columns listed in feature_names, in that order.
X = df8[feature_names]
# Prediction target: the 'y_label' column.
y = df8['y_label']

In [53]:
# Hold out 20% of the rows as the test set. The remaining 80% (X_tr, y_tr)
# is the development data. random_state fixes the shuffle so the split repeats.
X_tr, X_test, y_tr, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=38,
)

In [54]:
# Split the development data (X_tr, y_tr) again: 25% of it becomes the
# validation set and the other 75% is used for fitting. Because X_tr is 80%
# of all rows, that is roughly a 60/20/20 train/validation/test partition.
X_train, X_validation, y_train, y_validation = train_test_split(
    X_tr,
    y_tr,
    test_size=0.25,
    random_state=28,
)

# Gradient Boosting Model

In [66]:
# Sweep the learning rate with every other hyper-parameter held fixed, and
# report the score on the training and validation splits for each value.
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    # fit() returns the fitted estimator, so construction and fitting chain.
    gb = GradientBoostingClassifier(
        n_estimators=20,
        learning_rate=learning_rate,
        max_features=2,
        max_depth=5,
        random_state=0,
    ).fit(X_train, y_train)
    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(gb.score(X_train, y_train)))
    print("Accuracy score (validation): {0:.3f}".format(gb.score(X_validation, y_validation)))

Learning rate:  0.05
Accuracy score (training): 0.633
Accuracy score (validation): 0.613
Learning rate:  0.1
Accuracy score (training): 0.720
Accuracy score (validation): 0.667
Learning rate:  0.25
Accuracy score (training): 0.742
Accuracy score (validation): 0.704
Learning rate:  0.5
Accuracy score (training): 0.752
Accuracy score (validation): 0.702
Learning rate:  0.75
Accuracy score (training): 0.758
Accuracy score (validation): 0.707
Learning rate:  1
Accuracy score (training): 0.761
Accuracy score (validation): 0.702


In [78]:
# Fit the model with the chosen learning rate of 0.5, using 40 trees.
gb = GradientBoostingClassifier(
    n_estimators=40,
    learning_rate=0.5,
    max_features=2,
    max_depth=5,
    random_state=0,
).fit(X_train, y_train)
# The second score is measured on the validation split (X_validation), which
# is the right name for it: the test split (X_test) is not used here.
print(
    "Accuracy score (training): {0:.3f}".format(gb.score(X_train, y_train)),
    "Accuracy score (validation): {0:.3f}".format(gb.score(X_validation, y_validation)),
    sep="\n",
)

Accuracy score (training): 0.776
Accuracy score (validation): 0.711


In [79]:
# 5-fold cross-validation of the estimator configured in `gb`, run on the
# combined training + validation rows (X_tr, y_tr). The test split is not used.
scores = cross_val_score(
    gb,
    X_tr,
    y_tr,
    cv=5,
)

In [80]:
# Display the per-fold scores returned by cross_val_score (one value per fold).
scores

array([0.69299363, 0.70498084, 0.71392082, 0.70498084, 0.68965517])

In [81]:
# Mean cross-validation score across the folds. np.mean replaces the
# hand-rolled sum/len and also accepts plain Python sequences.
np.mean(scores)

0.701306261235978

## Max depth

In [71]:
# Sweep the tree depth with the other hyper-parameters held fixed
# (learning_rate=0.5, 20 trees). The last value, 100, is far deeper than the
# rest of the grid.
max_depths = [2, 3, 4, 5, 6, 7, 8, 100]
for max_depth in max_depths:
    gb = GradientBoostingClassifier(
        n_estimators=20,
        learning_rate=0.5,
        max_features=2,
        max_depth=max_depth,
        random_state=38,
    )
    gb.fit(X_train, y_train)
    print("Max Depth: ", max_depth)
    # Report the training score, then the validation score, one per line.
    print(
        "Accuracy score (training): {0:.3f}".format(gb.score(X_train, y_train)),
        "Accuracy score (validation): {0:.3f}".format(gb.score(X_validation, y_validation)),
        sep="\n",
    )

Max Depth:  2
Accuracy score (training): 0.646
Accuracy score (validation): 0.639
Max Depth:  3
Accuracy score (training): 0.687
Accuracy score (validation): 0.680
Max Depth:  4
Accuracy score (training): 0.733
Accuracy score (validation): 0.709
Max Depth:  5
Accuracy score (training): 0.758
Accuracy score (validation): 0.720
Max Depth:  6
Accuracy score (training): 0.774
Accuracy score (validation): 0.717
Max Depth:  7
Accuracy score (training): 0.813
Accuracy score (validation): 0.705
Max Depth:  8
Accuracy score (training): 0.831
Accuracy score (validation): 0.708
Max Depth:  100
Accuracy score (training): 0.886
Accuracy score (validation): 0.687


## n_estimators

In [77]:
# Sweep the number of boosting stages (trees) with the other hyper-parameters
# held fixed (learning_rate=0.5, max_depth=5), and report the score on the
# training and validation splits for each ensemble size.
n_estimators = [20, 25, 30, 35, 40, 45, 50, 55, 60]
for n_estimator in n_estimators:
    # fit() returns the fitted estimator, so construction and fitting chain.
    gb = GradientBoostingClassifier(
        n_estimators=n_estimator,
        learning_rate=0.5,
        max_features=2,
        max_depth=5,
        random_state=38,
    ).fit(X_train, y_train)
    print("n_estimator: ", n_estimator)
    print("Accuracy score (training): {0:.3f}".format(gb.score(X_train, y_train)))
    print("Accuracy score (validation): {0:.3f}".format(gb.score(X_validation, y_validation)))

n_estimator:  20
Accuracy score (training): 0.758
Accuracy score (validation): 0.720
n_estimator:  25
Accuracy score (training): 0.762
Accuracy score (validation): 0.727
n_estimator:  30
Accuracy score (training): 0.772
Accuracy score (validation): 0.723
n_estimator:  35
Accuracy score (training): 0.774
Accuracy score (validation): 0.720
n_estimator:  40
Accuracy score (training): 0.782
Accuracy score (validation): 0.728
n_estimator:  45
Accuracy score (training): 0.784
Accuracy score (validation): 0.727
n_estimator:  50
Accuracy score (training): 0.790
Accuracy score (validation): 0.724
n_estimator:  55
Accuracy score (training): 0.794
Accuracy score (validation): 0.730
n_estimator:  60
Accuracy score (training): 0.801
Accuracy score (validation): 0.718
