In [116]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# The seaborn style sheets were renamed to 'seaborn-v0_8-*' in matplotlib 3.6
# and the old names were removed afterwards, so use whichever name this
# matplotlib installation actually provides.
plt.style.use(
    'seaborn-notebook'
    if 'seaborn-notebook' in plt.style.available
    else 'seaborn-v0_8-notebook'
)
%matplotlib inline

In [117]:
# Load the cleaned inspection data (path is relative to the notebook's working directory).
DATA_CSV_PATH = 'data/data_zipcode_cleaned_0423.csv'
df = pd.read_csv(DATA_CSV_PATH)

In [118]:
# The 9-month period (p1_3, p4_6, p7_9) is used as the target:
# the label is True when those three columns sum to more than zero.
nine_month_total = df['p1_3'] + df['p4_6'] + df['p7_9']
df['y_label'] = nine_month_total.gt(0)

# Make dummies for zipcodes

In [119]:
# Scratch preview of the zip-code dummies (left disabled): pd.get_dummies(df['business_postal_code'])

In [120]:
# Inspect the column names, including the newly added 'y_label'.
df.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'business_id', 'business_name',
       'business_address', 'business_city', 'business_state',
       'business_postal_code', 'business_latitude', 'business_longitude',
       'business_location', 'business_phone_number', 'inspection_id',
       'inspection_date', 'inspection_score', 'inspection_type',
       'violation_id', 'violation_description', 'risk_category',
       'short_inspect_date', 'short_violation_id', 'inspect_date', 'p1_3',
       'p4_6', 'p7_9', 'p10_12', 'p13_18', 'p19_24', 'p25_36', 'y_label'],
      dtype='object')

In [121]:
# One indicator column per distinct value of 'business_postal_code'.
zip_dummies = pd.get_dummies(df['business_postal_code'])
# Drop any dummy columns left over from an earlier run of this cell before
# concatenating, so that re-running the cell cannot append duplicate columns
# (duplicates would silently widen any later df[[...]] selection).
df = pd.concat(
    [df.drop(zip_dummies.columns, axis=1, errors='ignore'), zip_dummies],
    axis=1,
)

In [122]:
# Copy of df without the 'zzzzz' column.
# NOTE(review): 'zzzzz' looks like a placeholder postal code - confirm.
df2 = df.drop('zzzzz', axis='columns')

In [123]:
# Confirm that 'zzzzz' is gone and the zip-code dummy columns are present.
df2.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'business_id', 'business_name',
       'business_address', 'business_city', 'business_state',
       'business_postal_code', 'business_latitude', 'business_longitude',
       'business_location', 'business_phone_number', 'inspection_id',
       'inspection_date', 'inspection_score', 'inspection_type',
       'violation_id', 'violation_description', 'risk_category',
       'short_inspect_date', 'short_violation_id', 'inspect_date', 'p1_3',
       'p4_6', 'p7_9', 'p10_12', 'p13_18', 'p19_24', 'p25_36', 'y_label',
       '92672', '94013', '94014', '94080', '94101', '94102', '94103', '94104',
       '94105', '94107', '94108', '94109', '94110', '94111', '94112', '94114',
       '94115', '94116', '94117', '94118', '94120', '94121', '94122', '94123',
       '94124', '94127', '94129', '94130', '94131', '94132', '94133', '94134',
       '94143', '94158', '95105'],
      dtype='object')

In [138]:
# Target: the boolean label column.
y = df['y_label']

# Features: the p10_12 ... p25_36 columns plus zip-code indicator columns.
# NOTE(review): this selects from `df`, not `df2`; the explicit list below
# already leaves out 'zzzzz', and also omits the '92672' and '95105'
# indicator columns - confirm that is intentional.
period_feature_cols = ['p10_12', 'p13_18', 'p19_24', 'p25_36']
zip_feature_cols = [
    '94013', '94014', '94080', '94101', '94102', '94103', '94104',
    '94105', '94107', '94108', '94109', '94110', '94111', '94112',
    '94114', '94115', '94116', '94117', '94118', '94120', '94121',
    '94122', '94123', '94124', '94127', '94129', '94130', '94131',
    '94132', '94133', '94134', '94143', '94158',
]
X = df[period_feature_cols + zip_feature_cols]

In [139]:
# Hold out 25% of the rows as the test set; the seed keeps the split reproducible.
X_tr, X_test, y_tr, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=38,
)

## Let's set X_test and y_test aside for final testing only. Split X_tr and y_tr again into training and validation sets.

In [140]:
# Split the remaining rows again: 25% of them (18.75% of all rows) become the
# validation set, the rest is used for training.
X_train, X_validation, y_train, y_validation = train_test_split(
    X_tr,
    y_tr,
    test_size=0.25,
    random_state=28,
)

In [141]:
# Baseline model: logistic regression with scikit-learn's default settings,
# fitted on the training split only.
log_model = LogisticRegression()
log_model.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [142]:
# Fitted coefficients of the logistic regression, one per column of X_train.
log_model.coef_

array([[-4.66558055e-01, -1.11990050e-01,  1.05898411e-01,
         1.23484062e-01,  0.00000000e+00, -5.90566334e-01,
         3.35342536e-01,  4.44933150e-01, -6.36710163e-04,
         4.62486247e-01,  1.36534566e+00,  7.82003937e-02,
        -1.86514528e-01,  3.98857014e-02,  6.84782621e-01,
         1.93099061e+00, -1.33660726e-01,  5.48631869e-01,
         8.44519730e-01, -1.21759785e+00, -1.85391697e-01,
        -2.68148788e-01,  4.51186873e-01, -3.76180639e-01,
         2.23023097e-01,  8.09788400e-01,  1.51897142e+00,
        -5.15936881e-02,  1.23665556e-01, -4.43700320e-01,
        -7.31322483e-01,  7.08667670e-01,  1.30931565e+00,
         3.65174352e-01,  5.94399438e-01,  8.31343452e-01,
        -9.57495433e-01]])

In [143]:
# Fitted intercept term of the logistic regression.
log_model.intercept_

array([-0.03105758])

In [144]:
# Scratch preview of the validation predictions (left disabled): log_model.predict(X_validation)

In [145]:
# Mean accuracy of the logistic regression on the validation split.
log_model.score(X_validation, y_validation)

0.7033639143730887

## Check whether the data set is imbalanced

In [146]:
# Number of rows per label value, to see how balanced the two classes are.
y.value_counts()

True     3310
False    1919
Name: y_label, dtype: int64

In [147]:
# Count the rows of each class once and pull out both totals.
label_counts = y.value_counts()
num_true = label_counts[True]
num_false = label_counts[False]

In [148]:
# Share of True labels in the full data set, i.e. the accuracy obtained on it
# by always predicting True.
total_labels = num_true + num_false
True_rate = num_true / total_labels
True_rate

0.6330082233696691

## Gradient Boosting Model

In [149]:
# Sweep the learning rate of a small gradient-boosting classifier (all other
# hyperparameters fixed) and report training vs. validation accuracy for each.
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    gb = GradientBoostingClassifier(
        n_estimators=20,
        learning_rate=learning_rate,
        max_features=2,
        max_depth=2,
        random_state=0,
    )
    gb.fit(X_train, y_train)
    train_accuracy = gb.score(X_train, y_train)
    validation_accuracy = gb.score(X_validation, y_validation)
    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(train_accuracy))
    print("Accuracy score (validation): {0:.3f}".format(validation_accuracy))

Learning rate:  0.05
Accuracy score (training): 0.628
Accuracy score (validation): 0.626
Learning rate:  0.1
Accuracy score (training): 0.647
Accuracy score (validation): 0.642
Learning rate:  0.25
Accuracy score (training): 0.704
Accuracy score (validation): 0.682
Learning rate:  0.5
Accuracy score (training): 0.713
Accuracy score (validation): 0.708
Learning rate:  0.75
Accuracy score (training): 0.719
Accuracy score (validation): 0.712
Learning rate:  1
Accuracy score (training): 0.717
Accuracy score (validation): 0.718


In [150]:
# Refit the gradient-boosting classifier with a learning rate of 0.75.
# NOTE(review): in the sweep output recorded in this notebook, learning rate 1
# had the highest validation accuracy (0.718 vs 0.712) - confirm that 0.75 is
# the intended choice.
gb = GradientBoostingClassifier(
    n_estimators=20,
    learning_rate=0.75,
    max_features=2,
    max_depth=2,
    random_state=0,
)
gb.fit(X_train, y_train)
train_accuracy = gb.score(X_train, y_train)
validation_accuracy = gb.score(X_validation, y_validation)
print("Accuracy score (training): {0:.3f}".format(train_accuracy))
print("Accuracy score (validation): {0:.3f}".format(validation_accuracy))
# "Validation" is the right term here: this split is not the held-out test set.

Accuracy score (training): 0.719
Accuracy score (validation): 0.712
