In [49]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and the
# old names were later removed, so use whichever spelling this install provides.
notebook_style = next(
    (name for name in ('seaborn-v0_8-notebook', 'seaborn-notebook')
     if name in plt.style.available),
    'default',  # neither present: keep Matplotlib's stock look instead of failing
)
plt.style.use(notebook_style)
%matplotlib inline

In [50]:
# Load the prepared modelling table; the path is relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer='data/ready4model_v2.csv')

In [51]:
# Target label: True when the p1_3, p4_6 and p7_9 columns sum to a positive
# value (together these are the 9-month period used as the y label).
nine_month_total = df['p1_3'].add(df['p4_6']).add(df['p7_9'])
df['y_label'] = nine_month_total.gt(0)

In [52]:
# Preview the first rows to sanity-check the loaded columns and the y_label column.
df.head(15)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,business_id,business_name,business_address,business_city,business_state,business_postal_code,business_latitude,business_longitude,...,short_violation_id,inspect_date,p1_3,p4_6,p7_9,p10_12,p13_18,p19_24,p25_36,y_label
0,0,0,1757,Dar Bar Pakistani/Indian Cusine,1412 Polk St,San Francisco,CA,94109.0,37.789784,-122.420455,...,103131,2017-09-28,0,0,5,0,0,6,5,True
1,1,1,4864,DRAGON CITY BAKERY & CAFE,2367 MISSION St,San Francisco,CA,94110.0,37.759174,-122.419066,...,103157,2016-12-06,0,6,0,0,5,5,3,True
2,2,2,79782,Deli 23,2449 23rd St,San Francisco,CA,94110.0,,,...,103120,2016-05-03,4,0,2,0,3,3,2,True
3,3,3,73840,L'acajou Bakery and Cafe,498 09th St Ste. C,San Francisco,CA,94103.0,,,...,103105,2017-12-07,0,5,0,0,6,0,0,True
4,4,4,76437,Sweetheart Cafe,909 Grant Ave,San Francisco,CA,94108.0,,,...,103113,2016-03-29,0,11,0,0,0,6,6,True
5,5,5,82123,Cafe New Honolulu,888 Stockton St,San Francisco,CA,94108.0,,,...,103131,2017-12-07,0,3,0,0,0,8,8,True
6,6,6,1146,Aux Delices Vietnam,2327 Polk St,San Francisco,CA,94109.0,37.798274,-122.422264,...,103103,2018-02-23,6,0,0,0,0,8,0,True
7,7,7,3838,CAFE PICARO,3120 16th St,San Francisco,CA,94103.0,37.764908,-122.422442,...,103142,2018-02-22,7,0,0,0,0,1,2,True
8,8,8,38929,Jelly Donut,3198 24th St,San Francisco,CA,94110.0,37.752439,-122.41625,...,103139,2016-01-14,0,0,7,0,3,4,6,True
9,9,9,81161,Limon Peruvian Rotisserie,1001 South Van Ness Ave,San Francisco,CA,94110.0,,,...,103149,2016-03-25,1,0,3,2,4,0,3,True


In [53]:
# Predictors are the four p10_12 ... p25_36 columns; none of the three columns
# that were summed to build y_label is included in X.
feature_columns = ['p10_12', 'p13_18', 'p19_24', 'p25_36']
X = df[feature_columns]
y = df['y_label']

In [54]:
# Hold out 25% of the rows for evaluation; the fixed random_state keeps the
# split identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=38,
)

In [55]:
# Baseline classifier. The solver is pinned to 'liblinear', the value shown in
# the estimator repr recorded for this cell: scikit-learn 0.22 changed the
# default solver to 'lbfgs', so relying on the default would give different
# coefficients on a newer install.
log_model = LogisticRegression(solver='liblinear')
log_model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [56]:
# Fitted coefficients, one per column of X (p10_12, p13_18, p19_24, p25_36).
log_model.coef_

array([[-0.39302937, -0.08415884,  0.15803904,  0.03339293]])

In [57]:
# Fitted intercept (bias) term of the logistic model.
log_model.intercept_

array([0.51972381])

In [58]:
# Predicted labels for the held-out rows (displayed only; the result is not stored).
log_model.predict(X_test)

array([ True,  True,  True, ...,  True,  True, False])

In [59]:
# Mean accuracy of the logistic model on the held-out split.
log_model.score(X_test, y_test)

0.658256880733945

## Check whether the data set is imbalanced

In [60]:
# Count of each target class, to see how skewed the labels are.
y.value_counts()

True     3310
False    1919
Name: y_label, dtype: int64

In [61]:
# Tally the labels once and pull out the count for each class.
label_counts = y.value_counts()
num_true, num_false = label_counts[True], label_counts[False]

In [62]:
# Share of True labels in the full data set, i.e. the accuracy a constant
# "always True" predictor would reach on it.
num_labels = num_true + num_false
True_rate = num_true / num_labels
True_rate

0.6330082233696691

## Gradient Boosting Model

In [63]:
# Sweep the learning rate of a small gradient-boosting classifier and report
# accuracy on the training rows and on the held-out rows for each setting.
# All other hyperparameters and the seed are held fixed.
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    gb = GradientBoostingClassifier(
        n_estimators=20,
        learning_rate=learning_rate,
        max_features=2,
        max_depth=2,
        random_state=0,
    )
    gb.fit(X_train, y_train)
    train_acc = gb.score(X_train, y_train)
    val_acc = gb.score(X_test, y_test)
    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(train_acc))
    print("Accuracy score (validation): {0:.3f}".format(val_acc))

Learning rate:  0.05
Accuracy score (training): 0.686
Accuracy score (validation): 0.684
Learning rate:  0.1
Accuracy score (training): 0.684
Accuracy score (validation): 0.681
Learning rate:  0.25
Accuracy score (training): 0.686
Accuracy score (validation): 0.680
Learning rate:  0.5
Accuracy score (training): 0.687
Accuracy score (validation): 0.671
Learning rate:  0.75
Accuracy score (training): 0.685
Accuracy score (validation): 0.675
Learning rate:  1
Accuracy score (training): 0.679
Accuracy score (validation): 0.672


In [64]:
# Refit a single gradient-boosting model with a learning rate of 0.5.
# NOTE(review): in the sweep output recorded above, 0.5 has the highest training
# accuracy (0.687) but the lowest validation accuracy (0.671, vs 0.684 at 0.05);
# confirm that 0.5 is the intended choice.
gb = GradientBoostingClassifier(
    n_estimators=20,
    learning_rate=0.5,
    max_features=2,
    max_depth=2,
    random_state=0,
)
gb.fit(X_train, y_train)
train_accuracy = gb.score(X_train, y_train)
validation_accuracy = gb.score(X_test, y_test)
print("Accuracy score (training): {0:.3f}".format(train_accuracy))
print("Accuracy score (validation): {0:.3f}".format(validation_accuracy))
# "Validation" is the right term here: this split was used to compare learning
# rates, so it is not an untouched "test" set.

Accuracy score (training): 0.687
Accuracy score (validation): 0.671
