In [200]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# matplotlib 3.6 renamed its bundled seaborn style sheets with a
# 'seaborn-v0_8-' prefix and later removed the old names, so use whichever
# spelling this installation actually provides instead of failing on import.
plt.style.use(
    'seaborn-notebook'
    if 'seaborn-notebook' in plt.style.available
    else 'seaborn-v0_8-notebook'
)
%matplotlib inline

In [201]:
# Load the cleaned inspection records from a CSV at a relative path.
df = pd.read_csv(filepath_or_buffer='data/data_zipcode_cleaned_0423.csv')

In [202]:
# Target label: True when p1_3 + p4_6 + p7_9 (the 9-month period) is positive.
# A NaN in any of the three columns makes the sum NaN, which compares as False.
recent_nine_months = df['p1_3'].add(df['p4_6']).add(df['p7_9'])
df['y_label'] = recent_nine_months.gt(0)

# Make dummies for zipcodes

In [203]:
#pd.get_dummies(df['business_postal_code'])

In [204]:
# Show every column currently in the frame (y_label was added above).
df.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'business_id',
       'business_name', 'business_address', 'business_city', 'business_state',
       'business_postal_code', 'business_latitude', 'business_longitude',
       'business_location', 'business_phone_number', 'inspection_id',
       'inspection_date', 'inspection_score', 'inspection_type',
       'violation_id', 'violation_description', 'risk_category',
       'short_inspect_date', 'short_violation_id', 'inspect_date', 'p1_3',
       'p4_6', 'p7_9', 'p10_12', 'p13_18', 'p19_24', 'p25_36', 'y_label'],
      dtype='object')

In [205]:
# One-hot encode the postal code and append the dummy columns to df.
# Dummy columns left over from an earlier run of this cell are dropped first,
# so re-running the cell does not append a second copy of every zipcode
# column (which would silently duplicate features when selecting X later).
# On a first run nothing is dropped and the result is the plain concat.
zipcode_dummies = pd.get_dummies(df['business_postal_code'])
df = pd.concat(
    [df.drop(columns=zipcode_dummies.columns, errors='ignore'), zipcode_dummies],
    axis=1,
)

In [206]:
# Copy of df without the 'zzzzz' dummy column
# (presumably a placeholder postal code -- TODO confirm).
df2 = df.drop(columns=['zzzzz'])

In [207]:
# Confirm the zipcode dummies are present and 'zzzzz' is gone.
df2.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'business_id',
       'business_name', 'business_address', 'business_city', 'business_state',
       'business_postal_code', 'business_latitude', 'business_longitude',
       'business_location', 'business_phone_number', 'inspection_id',
       'inspection_date', 'inspection_score', 'inspection_type',
       'violation_id', 'violation_description', 'risk_category',
       'short_inspect_date', 'short_violation_id', 'inspect_date', 'p1_3',
       'p4_6', 'p7_9', 'p10_12', 'p13_18', 'p19_24', 'p25_36', 'y_label',
       '92672', '94013', '94014', '94080', '94101', '94102', '94103', '94104',
       '94105', '94107', '94108', '94109', '94110', '94111', '94112', '94114',
       '94115', '94116', '94117', '94118', '94120', '94121', '94122', '94123',
       '94124', '94127', '94129', '94130', '94131', '94132', '94133', '94134',
       '94143', '94158', '95105'],
      dtype='object')

In [208]:
# Target vector.
y = df['y_label']

# Feature matrix: the four older-period columns followed by the zipcode
# dummies. The '92672', '95105' and 'zzzzz' dummies are not selected.
history_cols = ['p10_12', 'p13_18', 'p19_24', 'p25_36']
zipcode_cols = [
    '94013', '94014', '94080', '94101', '94102', '94103', '94104',
    '94105', '94107', '94108', '94109', '94110', '94111', '94112',
    '94114', '94115', '94116', '94117', '94118', '94120', '94121',
    '94122', '94123', '94124', '94127', '94129', '94130', '94131',
    '94132', '94133', '94134', '94143', '94158',
]
X = df[history_cols + zipcode_cols]

In [209]:
# Hold out 25% of the rows as the test set; the seed keeps the split repeatable.
X_tr, X_test, y_tr, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=38,
)

## Set X_test and y_test aside for final testing only, then split X_tr and y_tr again into training and validation sets.

In [210]:
# Split the non-test rows again: 25% of them become the validation set.
X_train, X_validation, y_train, y_validation = train_test_split(
    X_tr,
    y_tr,
    test_size=0.25,
    random_state=28,
)

In [211]:
# Baseline classifier: logistic regression with the library's default settings,
# fitted on the training split only.
log_model = LogisticRegression()
log_model.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [212]:
# Fitted weights of the logistic model, one per feature column of X.
log_model.coef_

array([[-0.43839924, -0.07915832,  0.13217221,  0.01327126,  0.        ,
        -0.4607488 ,  0.36312195,  0.42414452,  0.03396266,  0.46095747,
         1.41283528,  0.12920134, -0.20170406,  0.07326997,  0.72164796,
         1.92400157, -0.11243817,  0.55662967,  0.85845043, -1.1790387 ,
        -0.13737307, -0.2332825 ,  0.49191878, -0.41121283,  0.28387545,
         0.88415598,  1.54680027, -0.06004234,  0.17983336, -0.4449442 ,
        -0.69428182,  0.67522965,  1.27355397,  0.44143793,  0.60578207,
         0.78705684, -0.93323218]])

In [213]:
# Fitted intercept (bias) term of the logistic model.
log_model.intercept_

array([0.19493948])

In [214]:
#log_model.predict(X_validation)

In [215]:
# Accuracy of the logistic model on the validation split (not the test set).
log_model.score(X_validation, y_validation)

0.6901121304791029

## Check whether the data set is imbalanced

In [216]:
# Number of rows in each class of the target.
y.value_counts()

True     3310
False    1919
Name: y_label, dtype: int64

In [217]:
# Tally the labels once, then read off the count for each class.
label_counts = y.value_counts()
num_true, num_false = label_counts[True], label_counts[False]

In [218]:
# Fraction of rows whose label is True.
total_rows = num_true + num_false
True_rate = num_true / total_rows
True_rate

0.6330082233696691

## Gradient Boosting Model

In [219]:
# Sweep the learning rate while every other hyper-parameter stays fixed, and
# report accuracy on the training and validation splits for each value.
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
fixed_params = dict(n_estimators=20, max_features=2, max_depth=2, random_state=0)
for learning_rate in learning_rates:
    gb = GradientBoostingClassifier(learning_rate=learning_rate, **fixed_params)
    gb.fit(X_train, y_train)
    train_acc = gb.score(X_train, y_train)
    validation_acc = gb.score(X_validation, y_validation)
    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(train_acc))
    print("Accuracy score (validation): {0:.3f}".format(validation_acc))

Learning rate:  0.05
Accuracy score (training): 0.628
Accuracy score (validation): 0.626
Learning rate:  0.1
Accuracy score (training): 0.645
Accuracy score (validation): 0.639
Learning rate:  0.25
Accuracy score (training): 0.706
Accuracy score (validation): 0.683
Learning rate:  0.5
Accuracy score (training): 0.709
Accuracy score (validation): 0.700
Learning rate:  0.75
Accuracy score (training): 0.719
Accuracy score (validation): 0.712
Learning rate:  1
Accuracy score (training): 0.716
Accuracy score (validation): 0.715


In [220]:
# Refit a single model with a learning rate of 0.75.
gb = GradientBoostingClassifier(
    n_estimators=20,
    learning_rate=0.75,
    max_features=2,
    max_depth=2,
    random_state=0,
)
gb.fit(X_train, y_train)
train_acc = gb.score(X_train, y_train)
validation_acc = gb.score(X_validation, y_validation)
print("Accuracy score (training): {0:.3f}".format(train_acc))
print("Accuracy score (validation): {0:.3f}".format(validation_acc))
# "Validation" is the right word here: these rows are not the held-out test set.

Accuracy score (training): 0.719
Accuracy score (validation): 0.712
