In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*' and
# later releases removed the old names. Use whichever name this installation
# provides so the notebook runs on both old and new Matplotlib versions.
if 'seaborn-v0_8-notebook' in plt.style.available:
    plt.style.use('seaborn-v0_8-notebook')
else:
    plt.style.use('seaborn-notebook')
%matplotlib inline

In [37]:
# Load the pickled inspection DataFrame from the local data directory.
# NOTE(review): unpickling can execute arbitrary code embedded in the file —
# only load this pickle if it comes from a trusted source.
df = pd.read_pickle('data/sf_inspection_master.pkl')

# Make dummies for geo coords

In [38]:
#pd.get_dummies(df['business_postal_code'])

In [39]:
df.columns

Index(['business_id', 'business_name', 'business_address', 'business_city',
       'business_state', 'business_postal_code', 'business_latitude',
       'business_longitude', 'business_location', 'business_phone_number',
       'inspection_id', 'inspection_date', 'inspection_score',
       'inspection_type', 'violation_id', 'violation_description',
       'risk_category', 'short_violation_id', 'inspect_date', 'p1_3', 'p4_6',
       'p7_9', 'p10_12', 'p13_18', 'p19_24', 'p25_36', 'number_turnovers',
       'start_date', 'duration_business', 'y_label', '92672', '94013', '94014',
       '94080', '94101', '94102', '94103', '94104', '94105', '94107', '94108',
       '94109', '94110', '94111', '94112', '94114', '94115', '94116', '94117',
       '94118', '94120', '94121', '94122', '94123', '94124', '94127', '94129',
       '94130', '94131', '94132', '94133', '94134', '94143', '94158', '95105',
       'p7_36', 'rating', 'price', 'longitude_round', 'latitude_round'],
      dtype='object')

In [40]:
# Number of distinct combined longitude/latitude keys (a NaN key, if any, counts once).
(df['longitude_round'] * 1000000 + df['latitude_round'] * 100).nunique(dropna=False)

122

In [79]:
# Fold the rounded coordinates into one positive number per row:
# longitude scaled by 1e6 plus latitude scaled by 1e2, then the absolute value.
geo_combo = (df['longitude_round'] * 1000000 + df['latitude_round'] * 100).abs()

In [83]:
# Convert every numeric geo key to the string form of its integer part and store
# it as a categorical-style label column.
# NOTE(review): int() truncates instead of rounding, so a key that lands just
# below a whole number because of float error would get a label one lower —
# confirm the labels match the intended grid cells.
geo_str = geo_combo.map(lambda key: str(int(key))).tolist()
df['geo_coord_combo'] = geo_str

In [43]:
# One indicator column per distinct geo_coord_combo label.
geo_dummies = pd.get_dummies(df['geo_coord_combo'])
# Remove any same-named columns left behind by an earlier run of this cell before
# appending, so re-running the cell does not add a second copy of every indicator
# column. On a first run nothing matches and df is passed through unchanged.
df = pd.concat(
    [df.drop(geo_dummies.columns, axis=1, errors='ignore'), geo_dummies],
    axis=1,
)
df.columns

Index(['business_id', 'business_name', 'business_address', 'business_city',
       'business_state', 'business_postal_code', 'business_latitude',
       'business_longitude', 'business_location', 'business_phone_number',
       ...
       '122496225', '122496226', '122496227', '122496228', '122496229',
       '122506222', '122506223', '122506224', '122506225', '122506226'],
      dtype='object', length=193)

In [44]:
# Leave out one geo indicator column ('122496225'); the result is a new frame,
# df itself is not modified.
df2 = df.drop(labels=['122496225'], axis='columns')

In [45]:
df2.columns

Index(['business_id', 'business_name', 'business_address', 'business_city',
       'business_state', 'business_postal_code', 'business_latitude',
       'business_longitude', 'business_location', 'business_phone_number',
       ...
       '122496224', '122496226', '122496227', '122496228', '122496229',
       '122506222', '122506223', '122506224', '122506225', '122506226'],
      dtype='object', length=192)

In [46]:
df2.columns.values

array(['business_id', 'business_name', 'business_address',
       'business_city', 'business_state', 'business_postal_code',
       'business_latitude', 'business_longitude', 'business_location',
       'business_phone_number', 'inspection_id', 'inspection_date',
       'inspection_score', 'inspection_type', 'violation_id',
       'violation_description', 'risk_category', 'short_violation_id',
       'inspect_date', 'p1_3', 'p4_6', 'p7_9', 'p10_12', 'p13_18',
       'p19_24', 'p25_36', 'number_turnovers', 'start_date',
       'duration_business', 'y_label', '92672', '94013', '94014', '94080',
       '94101', '94102', '94103', '94104', '94105', '94107', '94108',
       '94109', '94110', '94111', '94112', '94114', '94115', '94116',
       '94117', '94118', '94120', '94121', '94122', '94123', '94124',
       '94127', '94129', '94130', '94131', '94132', '94133', '94134',
       '94143', '94158', '95105', 'p7_36', 'rating', 'price',
       'longitude_round', 'latitude_round', 'geo_coord_com

In [47]:
# Gather every column name of df2 into a single string, each name followed by
# ', ', so the names can be saved to a text file and copied during feature selection.
# 95105 and 92672 do not belong to SF; they are NOT filtered out here and have to
# be left out by hand when the feature list is put together.
s = ''.join(col_name + ', ' for col_name in df2.columns.values)

In [48]:
# Persist the column-name string; mode 'w' replaces any existing file content.
with open('data/col_names.txt', 'w') as names_file:
    names_file.write(s)

In [64]:
# Target: the 'y_label' column of df2.
# Features, in order: four of the p*_* columns ('p10_12' through 'p25_36'), the
# 9-digit geo_coord_combo indicator columns, and the 5-digit columns (presumably
# zip-code indicators — TODO confirm). The list is hard-coded: '92672' and '95105'
# are deliberately absent, as is the geo indicator '122496225' that was dropped
# when df2 was created. Any change to the geo labels requires updating this list.
y = df2['y_label']
X = df2[['p10_12', 'p13_18', 'p19_24', 'p25_36', '122376226', '122376227', '122386220', '122386221', '122386222',
       '122386223', '122386224', '122386225', '122386226', '122386227',
       '122386228', '122386229', '122396219', '122396220', '122396221',
       '122396222', '122396223', '122396224', '122396225', '122396226',
       '122396227', '122396228', '122396229', '122406219', '122406220',
       '122406221', '122406222', '122406223', '122406224', '122406225',
       '122406226', '122406227', '122406228', '122406229', '122416219',
       '122416220', '122416221', '122416222', '122416223', '122416224',
       '122416225', '122416226', '122416227', '122416228', '122416229',
       '122426219', '122426220', '122426221', '122426222', '122426223',
       '122426224', '122426225', '122426226', '122426227', '122426228',
       '122426229', '122436219', '122436220', '122436221', '122436222',
       '122436223', '122436224', '122436225', '122436226', '122436227',
       '122436228', '122436229', '122446219', '122446220', '122446221',
       '122446222', '122446223', '122446224', '122446225', '122446226',
       '122446227', '122446228', '122446229', '122456221', '122456222',
       '122456223', '122456224', '122456225', '122456226', '122456227',
       '122456228', '122456229', '122466222', '122466223', '122466224',
       '122466225', '122466226', '122466227', '122466228', '122466229',
       '122476222', '122476223', '122476224', '122476225', '122476226',
       '122476227', '122476228', '122476229', '122486222', '122486224',
       '122486225', '122486226', '122486227', '122486228', '122496221',
       '122496222', '122496224', '122496226', '122496227', '122496228',
       '122496229', '122506222', '122506223', '122506224', '122506225',
       '122506226', '94013', '94014', '94080', '94101', '94102', '94103', '94104',
       '94105', '94107', '94108', '94109', '94110', '94111', '94112', '94114',
       '94115', '94116', '94117', '94118', '94120', '94121', '94122', '94123',
       '94124', '94127', '94129', '94130', '94131', '94132', '94133', '94134',
       '94143', '94158']]

In [65]:
# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable.
X_tr, X_test, y_tr, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=38,
)

## Let's stash X_test and y_test away and use them only for final testing. Split X_tr and y_tr again into training and validation sets.

In [66]:
# Split the non-test rows again: 25% of them become the validation set, the rest
# the training set. The fixed seed makes the split repeatable.
X_train, X_validation, y_train, y_validation = train_test_split(
    X_tr,
    y_tr,
    test_size=0.25,
    random_state=28,
)

In [67]:
# Baseline classifier: logistic regression with the library's default settings,
# fitted on the training split only. The fitted estimator is the cell's output.
log_model = LogisticRegression()
log_model.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [68]:
log_model.coef_

array([[-0.1529798 ,  0.10124806,  0.13424308,  0.05603751,  0.13040801,
         0.19057994,  0.11711988, -0.47072977, -1.25425618, -1.08742991,
        -0.42488751, -0.19682666,  1.07755222,  0.31402353, -0.79692458,
         0.        ,  0.5875758 , -0.7854608 ,  0.24447651,  0.32042389,
        -0.87143111,  1.21033518,  0.19314095, -0.08556268, -0.09969861,
        -0.36240114,  0.51255917,  0.29509485,  0.13405638, -0.17013423,
         0.62407076, -0.46419711,  0.40087761, -0.34683572,  0.36923077,
         1.14574527,  0.        , -1.24342167,  0.49072135,  1.08156122,
         0.85175174, -0.34838347, -0.13943042,  0.28981441,  0.83755585,
         0.06733671,  0.23876461,  0.51666884, -0.32870876, -0.47597521,
         0.74340514, -0.55106834, -0.04955092, -0.31066673,  0.27266866,
         0.45936049,  0.52231471,  0.02236847, -0.59511792,  0.31227459,
        -0.35295831,  0.05682354, -0.5787271 , -1.03305266, -0.69584885,
        -0.84450606, -0.20754456,  1.2415662 ,  0.7

In [69]:
log_model.intercept_

array([-0.90214557])

In [70]:
#log_model.predict(X_validation)

In [71]:
log_model.score(X_validation, y_validation)

0.6469387755102041

## Find out if my data set is unbalanced or not

In [72]:
y.value_counts()

False    2781
True     2116
Name: y_label, dtype: int64

In [73]:
# Tally the labels once and read both class totals from the same tally.
label_counts = y.value_counts()
num_true = label_counts[True]
num_false = label_counts[False]

In [74]:
# Share of rows labelled True among all labelled rows.
total_labels = num_true + num_false
True_rate = num_true / total_labels
True_rate

0.43210128650193996

## Gradient Boosting Model

In [75]:
# Try several learning rates with all other hyper-parameters held fixed, and
# report training and validation accuracy for each fitted model.
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    gb = GradientBoostingClassifier(
        n_estimators=20,
        learning_rate=learning_rate,
        max_features=2,
        max_depth=2,
        random_state=0,
    )
    gb.fit(X_train, y_train)
    train_accuracy = gb.score(X_train, y_train)
    validation_accuracy = gb.score(X_validation, y_validation)
    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(train_accuracy))
    print("Accuracy score (validation): {0:.3f}".format(validation_accuracy))

Learning rate:  0.05
Accuracy score (training): 0.571
Accuracy score (validation): 0.570
Learning rate:  0.1
Accuracy score (training): 0.579
Accuracy score (validation): 0.577
Learning rate:  0.25
Accuracy score (training): 0.623
Accuracy score (validation): 0.617
Learning rate:  0.5
Accuracy score (training): 0.630
Accuracy score (validation): 0.624
Learning rate:  0.75
Accuracy score (training): 0.627
Accuracy score (validation): 0.632
Learning rate:  1
Accuracy score (training): 0.633
Accuracy score (validation): 0.624


In [76]:
# Refit the gradient boosting classifier with a learning rate of 0.75; every other
# hyper-parameter is the same as in the learning-rate comparison.
gb = GradientBoostingClassifier(
    n_estimators=20,
    learning_rate=0.75,
    max_features=2,
    max_depth=2,
    random_state=0,
)
gb.fit(X_train, y_train)
train_accuracy = gb.score(X_train, y_train)
validation_accuracy = gb.score(X_validation, y_validation)
print("Accuracy score (training): {0:.3f}".format(train_accuracy))
print("Accuracy score (validation): {0:.3f}".format(validation_accuracy))
# "Validation" is the right word here: this score comes from the validation
# split, not from the held-out test set.

Accuracy score (training): 0.627
Accuracy score (validation): 0.632
