In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
plt.style.use('seaborn-notebook')
%matplotlib inline
import csv
import src.scrubbing as scrubbing
import src.utility 
from sklearn.model_selection import cross_val_score

In [2]:
feature_names = ['p7_9','p10_12', 'p13_18', 'p19_24', 'p25_36', '94013', '94014', '94080', '94101', '94102', '94103', '94104',
       '94105', '94107', '94108', '94109', '94110', '94111', '94112', '94114',
       '94115', '94116', '94117', '94118', '94120', '94121', '94122', '94123',
       '94124', '94127', '94129', '94130', '94131', '94132', '94133', '94134',
       '94143', '94158']

In [3]:
df8 = pd.read_pickle('data/sf_inspection2.pkl')

In [4]:
y = df8['y_label']
X = df8[feature_names]

In [5]:
X_tr, X_test, y_tr, y_test = train_test_split(X, y, test_size=0.20, random_state=38)

In [6]:
X_train, X_validation, y_train, y_validation = train_test_split(X_tr, y_tr, test_size=0.25, random_state=28)

# Gradient Boosting Model

In [7]:
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    gb = GradientBoostingClassifier(n_estimators=20, learning_rate = learning_rate, max_features=2, max_depth = 2, random_state = 0)
    gb.fit(X_train, y_train)
    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(gb.score(X_train, y_train)))
    print("Accuracy score (validation): {0:.3f}".format(gb.score(X_validation, y_validation)))

Learning rate:  0.05
Accuracy score (training): 0.571
Accuracy score (validation): 0.569
Learning rate:  0.1
Accuracy score (training): 0.600
Accuracy score (validation): 0.601
Learning rate:  0.25
Accuracy score (training): 0.674
Accuracy score (validation): 0.676
Learning rate:  0.5
Accuracy score (training): 0.698
Accuracy score (validation): 0.705
Learning rate:  0.75
Accuracy score (training): 0.694
Accuracy score (validation): 0.703
Learning rate:  1
Accuracy score (training): 0.687
Accuracy score (validation): 0.696


In [14]:
# Let's use learning rate of 0.75
gb = GradientBoostingClassifier(n_estimators=20, learning_rate = 0.75, max_features=2, max_depth = 2, random_state = 0)
gb.fit(X_train, y_train)
print("Accuracy score (training): {0:.3f}".format(gb.score(X_train, y_train)))
print("Accuracy score (validation): {0:.3f}".format(gb.score(X_validation, y_validation)))
# The "validation" here is a correct term. It is not a "test" set.

Accuracy score (training): 0.694
Accuracy score (validation): 0.703


In [9]:
scores = cross_val_score(gb, X_tr, y_tr, cv=5)

In [10]:
scores

array([0.68280255, 0.70753512, 0.68710089, 0.67943806, 0.66794381])

In [11]:
sum(scores)/len(scores)

0.6849640855439231

## Max depth

In [16]:
max_depths = [2, 10, 100]
for max_depth in max_depths:
    gb = GradientBoostingClassifier(n_estimators=20, learning_rate = 0.75, max_features=2, max_depth = max_depth, random_state = 38)
    gb.fit(X_train, y_train)
    print("Max Depth: ", max_depth)
    print("Accuracy score (training): {0:.3f}".format(gb.score(X_train, y_train)))
    print("Accuracy score (validation): {0:.3f}".format(gb.score(X_validation, y_validation)))

Max Depth:  2
Accuracy score (training): 0.644
Accuracy score (validation): 0.642
Max Depth:  10
Accuracy score (training): 0.865
Accuracy score (validation): 0.702
Max Depth:  100
Accuracy score (training): 0.886
Accuracy score (validation): 0.684
