In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
plt.style.use('seaborn-notebook')
%matplotlib inline
import csv
import src.scrubbing_develop as scrubbing
import src.utility 
from sklearn.model_selection import cross_val_score

In [2]:
feature_names = ['p7_9','p10_12', 'p13_18', 'p19_24', 'p25_36', '94013', '94014', '94080', '94101', '94102', '94103', '94104',
       '94105', '94107', '94108', '94109', '94110', '94111', '94112', '94114',
       '94115', '94116', '94117', '94118', '94120', '94121', '94122', '94123',
       '94124', '94127', '94129', '94130', '94131', '94132', '94133', '94134',
       '94143', '94158']

In [3]:
df2 = pd.read_pickle('data/sf_inspection_master.pkl')
df8 = scrubbing.remove_rows_zero_violation2(df2) 

In [4]:
y = df8['y_label']
X = df8[feature_names]

In [5]:
X_tr, X_test, y_tr, y_test = train_test_split(X, y, test_size=0.20, random_state=38)

In [6]:
X_train, X_validation, y_train, y_validation = train_test_split(X_tr, y_tr, test_size=0.25, random_state=28)

# Gradient Boosting Model

In [7]:
# Randomized Search for Algorithm Tuning
from scipy.stats import uniform as sp_rand
from sklearn import datasets
from sklearn.linear_model import Ridge
from sklearn.model_selection import RandomizedSearchCV
# load the diabetes datasets
#dataset = datasets.load_diabetes()
# prepare a uniform distribution to sample for the alpha parameter
param_grid = {'learning_rate': [x/200 for x in range(1,20,1)], 'max_depth': range(1,10,1), 
              'max_features':range(2,10,1), 'n_estimators': range(10,120,2), 'subsample': [x/100 for x in range(10,100,5)]}
# create and fit a ridge regression model, testing random alpha values
model = GradientBoostingClassifier()
rsearch = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=100, n_jobs=-1)
rsearch.fit(X_train.values, y_train.values)
print(rsearch)
# summarize the results of the random parameter search
print(rsearch.best_score_)
print(rsearch.best_estimator_.learning_rate, rsearch.best_estimator_.max_depth, 
      rsearch.best_estimator_.max_features, rsearch.best_estimator_.n_estimators,
      rsearch.best_estimator_.subsample)

RandomizedSearchCV(cv=None, error_score='raise',
          estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
          fit_params=None, iid=True, n_iter=100, n_jobs=-1,
          param_distributions={'learning_rate': [0.005, 0.01, 0.015, 0.02, 0.025, 0.03, 0.035, 0.04, 0.045, 0.05, 0.055, 0.06, 0.065, 0.07, 0.075, 0.08, 0.085, 0.09, 0.095], 'max_depth': range(1, 10), 'max_features': range(2, 10), 'n_estimators': range(10, 120, 2), 'subsample': [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]},
          pre_disp

In [8]:
# Let's use learning rate of 0.5
gb = GradientBoostingClassifier(n_estimators=40, learning_rate = 0.08, max_features=4, max_depth = 8, subsample=0.4, random_state = 0)
gb.fit(X_train, y_train)
print("Accuracy score (training): {0:.3f}".format(gb.score(X_train, y_train)))
print("Accuracy score (validation): {0:.3f}".format(gb.score(X_validation, y_validation)))
print("Accuracy score (test): {0:.3f}".format(gb.score(X_test, y_test)))
# The "validation" here is a correct term. It is not a "test" set.

Accuracy score (training): 0.773
Accuracy score (validation): 0.731
Accuracy score (test): 0.706


In [9]:
scores = cross_val_score(gb, X_tr, y_tr, cv=5)

In [10]:
scores

array([0.70573248, 0.72413793, 0.70114943, 0.68710089, 0.69604087])

In [11]:
sum(scores)/len(scores)

0.7028323205700759

# Decision tree Model

In [12]:
# Randomized Search for Algorithm Tuning
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import uniform as sp_rand
from sklearn.linear_model import Ridge
from sklearn.model_selection import RandomizedSearchCV
param_grid = {'max_depth': range(5,50,1), 'min_samples_split': range(2,60,1), 
              'max_features': range(2,30,1), 'min_samples_leaf': range(2,40,1)}
# create and fit a ridge regression model, testing random alpha values
model = DecisionTreeClassifier()
rsearch = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=8000, n_jobs=-1)
rsearch.fit(X_train.values, y_train.values)
print(rsearch)
# summarize the results of the random parameter search
print(rsearch.best_score_)
print(rsearch.best_estimator_.max_depth, rsearch.best_estimator_.min_samples_split, 
      rsearch.best_estimator_.max_features, rsearch.best_estimator_.min_samples_leaf)

RandomizedSearchCV(cv=None, error_score='raise',
          estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          fit_params=None, iid=True, n_iter=8000, n_jobs=-1,
          param_distributions={'max_depth': range(5, 50), 'min_samples_split': range(2, 60), 'max_features': range(2, 30), 'min_samples_leaf': range(2, 40)},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)
0.6979911474293496
15 54 27 10


In [13]:
dt = DecisionTreeClassifier(max_depth=12, min_samples_split=53, max_features=16, 
                            min_samples_leaf=7, random_state = 0)
dt.fit(X_train, y_train)
print("Accuracy score (training): {0:.3f}".format(dt.score(X_train, y_train)))
print("Accuracy score (validation): {0:.3f}".format(dt.score(X_validation, y_validation)))
print("Accuracy score (test): {0:.3f}".format(dt.score(X_test, y_test)))
# The "validation" here is a correct term. It is not a "test" set.

Accuracy score (training): 0.712
Accuracy score (validation): 0.693
Accuracy score (test): 0.696


# Random Forest Model

In [14]:
# Randomized Search for Algorithm Tuning
from scipy.stats import uniform as sp_rand
from sklearn.linear_model import Ridge
from sklearn.model_selection import RandomizedSearchCV
param_grid = {'max_depth': range(5,50,1), 'min_samples_split': range(2,60,1), 'n_estimators': range(2,40,2),
              'max_features': range(2,30,1), 'min_samples_leaf': range(2,40,1)}
# create and fit a ridge regression model, testing random alpha values
model = RandomForestClassifier()
rsearch = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=8000, n_jobs=-1)
rsearch.fit(X_train.values, y_train.values)
print(rsearch)
# summarize the results of the random parameter search
print(rsearch.best_score_)
print(rsearch.best_estimator_.max_depth, rsearch.best_estimator_.min_samples_split, 
      rsearch.best_estimator_.n_estimators, rsearch.best_estimator_.max_features, 
      rsearch.best_estimator_.min_samples_leaf)

RandomizedSearchCV(cv=None, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid=True, n_iter=8000, n_jobs=-1,
          param_distributions={'max_depth': range(5, 50), 'min_samples_split': range(2, 60), 'n_estimators': range(2, 40, 2), 'max_features': range(2, 30), 'min_samples_leaf': range(2, 40)},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)
0.7007150153217568
43 22 24 9 3


In [15]:
rf = RandomForestClassifier(max_depth=11, min_samples_split=4, max_features=16, n_estimators=34,
                            min_samples_leaf=4, random_state = 0)
rf.fit(X_train, y_train)
print("Accuracy score (training): {0:.3f}".format(rf.score(X_train, y_train)))
print("Accuracy score (validation): {0:.3f}".format(rf.score(X_validation, y_validation)))
print("Accuracy score (test): {0:.3f}".format(rf.score(X_test, y_test)))
# The "validation" here is a correct term. It is not a "test" set.

Accuracy score (training): 0.750
Accuracy score (validation): 0.714
Accuracy score (test): 0.703


# Ada Boost Model

In [16]:
# Randomized Search for Algorithm Tuning
from scipy.stats import uniform as sp_rand
from sklearn.linear_model import Ridge
from sklearn.model_selection import RandomizedSearchCV
param_grid = {'n_estimators': range(10,60,1), 'learning_rate': [x/300 for x in range(20,80,2)]}
# create and fit a ridge regression model, testing random alpha values
model = AdaBoostClassifier()
rsearch = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=1000, n_jobs=-1)
rsearch.fit(X_train.values, y_train.values)
print(rsearch)
# summarize the results of the random parameter search
print(rsearch.best_score_)
print(rsearch.best_estimator_.learning_rate,  rsearch.best_estimator_.n_estimators)

RandomizedSearchCV(cv=None, error_score='raise',
          estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None),
          fit_params=None, iid=True, n_iter=1000, n_jobs=-1,
          param_distributions={'n_estimators': range(10, 60), 'learning_rate': [0.06666666666666667, 0.07333333333333333, 0.08, 0.08666666666666667, 0.09333333333333334, 0.1, 0.10666666666666667, 0.11333333333333333, 0.12, 0.12666666666666668, 0.13333333333333333, 0.14, 0.14666666666666667, 0.1533333333333333...22, 0.22666666666666666, 0.23333333333333334, 0.24, 0.24666666666666667, 0.25333333333333335, 0.26]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)
0.6789240721824992
0.18666666666666668 51


In [17]:
adc = AdaBoostClassifier(learning_rate=0.17, n_estimators=56, random_state = 0)
adc.fit(X_train, y_train)
print("Accuracy score (training): {0:.3f}".format(adc.score(X_train, y_train)))
print("Accuracy score (validation): {0:.3f}".format(adc.score(X_validation, y_validation)))
print("Accuracy score (test): {0:.3f}".format(adc.score(X_test, y_test)))
# The "validation" here is a correct term. It is not a "test" set.

Accuracy score (training): 0.673
Accuracy score (validation): 0.681
Accuracy score (test): 0.685
