# Some data cleaning

Since we wanted to estimate each restaurant's probability of failure for its most recent inspection, we deleted all inspections that were not the most recent for each restaurant. Importantly, we did use information about past inspections of each restaurant as covariates. Also, we deleted variables from the model by looking at variable importances generated from a random forest model and variables with coefficients set to zero by lasso logistic regression.

In [1]:
# import modules
import numpy as np
import pandas as pd
import scipy as sp
import math
from IPython.display import display
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
import sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model
import matplotlib.pyplot as plt
from sklearn import discriminant_analysis
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn import tree
from sklearn import ensemble
from sklearn.cross_validation import KFold
from sklearn.ensemble import RandomForestClassifier as RandomForest
from sklearn.svm import SVC
import csv
import StringIO
import string

In [2]:
# load data
final = pd.read_csv('final_data.csv')
final.columns

Index([u'Inspection ID', u'DBA Name', u'AKA Name', u'License #',
       u'Facility Type', u'Risk', u'Address', u'City', u'State', u'Zip',
       u'Inspection Date', u'Inspection Type', u'Results', u'Violations',
       u'Latitude', u'Longitude', u'Location', u'Day of Week', u'Month',
       u'Community Area', u'num_crimes_comm_area', u'num_complaints_comm_area',
       u'COMMUNITY AREA NAME', u'PERCENT OF HOUSING CROWDED',
       u'PERCENT HOUSEHOLDS BELOW POVERTY', u'PERCENT AGED 16+ UNEMPLOYED',
       u'PERCENT AGED 25+ WITHOUT HIGH SCHOOL DIPLOMA',
       u'PERCENT AGED UNDER 18 OR OVER 64', u'PER CAPITA INCOME ',
       u'HARDSHIP INDEX', u'Birth Rate', u'General Fertility Rate',
       u'Low Birth Weight', u'Prenatal Care Beginning in First Trimester',
       u'Preterm Births', u'Teen Birth Rate', u'Assault (Homicide)',
       u'Breast cancer in females', u'Cancer (All Sites)',
       u'Colorectal Cancer', u'Diabetes-related', u'Firearm-related',
       u'Infant Mortality Rate', 

In [3]:
# divide into canvass and complaint datasets
final_canvass = final[final['Inspection Type'] == 'Canvass']
final_complaint = final[final['Inspection Type'] == 'Complaint']

In [4]:
# sort by id, then by date
canvass_sort = final_canvass.sort_values(by = ['restaurant_id', 'num prev canvass inspections'])
canvass_sort.reset_index(drop = True, inplace = True)

In [5]:
# delete all observations that are not the most recent observation for each restaurant
canvass_sort_copy = canvass_sort.copy()

for i in range(1, len(canvass_sort)):
    if i % 1000 == 0:
        print i,
    if canvass_sort['restaurant_id'].values[i - 1] == canvass_sort['restaurant_id'].values[i]:
        canvass_sort_copy = canvass_sort_copy.drop(i - 1)

1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 12000 13000 14000 15000 16000 17000 18000 19000 20000 21000 22000 23000 24000 25000 26000 27000 28000 29000 30000 31000 32000 33000 34000 35000 36000 37000 38000 39000 40000 41000 42000 43000 44000 45000 46000 47000 48000 49000 50000 51000 52000 53000 54000 55000


In [21]:
# delete duplicates
canvass_sort_copy_2 = canvass_sort_copy.copy()

for i in range(1, len(canvass_sort_copy)):
    if canvass_sort_copy['AKA Name'].values[i - 1][0:10] == canvass_sort_copy['AKA Name'].values[i][0:10] and \
       canvass_sort_copy['Address'].values[i - 1][0:10] == canvass_sort_copy['Address'].values[i][0:10] and \
       canvass_sort_copy['Inspection Date'].values[i - 1] == canvass_sort_copy['Inspection Date'].values[i]:
            canvass_sort_copy_2 = canvass_sort_copy_2.drop(canvass_sort_copy.index[i - 1])

We had some duplicate inspections in the data that needed to be deleted.

In [99]:
# drop columns that won't be used in model
final_canvass_2 = canvass_sort_copy_2.drop(['Inspection ID', 'DBA Name', 'AKA Name', 'License #', 'Address', 'City', 'State', 
                                            'Zip', 'Inspection Type', 'Violations', 'Location', 'Day of Week', 
                                            'Month', 'COMMUNITY AREA NAME', 'restaurant_id', 'Inspection Date', 'Community Area'], axis = 1)

In [100]:
# create dummy variables
canvass_dum = pd.get_dummies(final_canvass_2)

canvass_dum['Inspection Date'] = pd.Series(canvass_sort_copy_2['Inspection Date'].values, index = canvass_dum.index)

In [101]:
# want to see columns created from get_dummies
for i in range(len(canvass_dum.columns)):
    print canvass_dum.columns[i] + ',',

Latitude, Longitude, num_crimes_comm_area, num_complaints_comm_area, PERCENT OF HOUSING CROWDED, PERCENT HOUSEHOLDS BELOW POVERTY, PERCENT AGED 16+ UNEMPLOYED, PERCENT AGED 25+ WITHOUT HIGH SCHOOL DIPLOMA, PERCENT AGED UNDER 18 OR OVER 64, PER CAPITA INCOME , HARDSHIP INDEX, Birth Rate, General Fertility Rate, Low Birth Weight, Prenatal Care Beginning in First Trimester, Preterm Births, Teen Birth Rate, Assault (Homicide), Breast cancer in females, Cancer (All Sites), Colorectal Cancer, Diabetes-related, Firearm-related, Infant Mortality Rate, Lung Cancer, Prostate Cancer in Males, Stroke (Cerebrovascular Disease), Tuberculosis, Dependency, canvass fail rate, complaint fail rate, num prev canvass inspections, num prev complaint inspections, num canvass failures, num complaint failures, last inspection fail, num consec failures, consec rate, complaint-canvass-rate interaction, complaint-canvass-fail interaction, inspection-canvass-rate interaction, inspection-complaint-rate interaction,

In [102]:
# combining daycares into one facility type
Facility_Type_Daycare = canvass_dum['Facility Type_Children\'s Services Facility'].values + \
                        canvass_dum['Facility Type_Daycare'].values

In [103]:
# add combined daycare column and drop other daycare columns
to_delete = ['Facility Type_Children\'s Services Facility', 'Facility Type_Daycare', \
             'Results_Pass', 'Facility Type_Hospital', 'Facility Type_Liquor', 
             'Facility Type_Long Term Care', 'Facility Type_Catering', 'Facility Type_Wholesale',
             'Facility Type_Golden Diner']

canvass_dum['Facility Type_Daycare 2'] = pd.Series(Facility_Type_Daycare, index = canvass_dum.index)

canvass_dum_2 = canvass_dum.drop(to_delete, axis = 1)

In [104]:
# get latitude-longitude interaction
lat_long_interaction = canvass_dum_2['Longitude'].values * canvass_dum_2['Latitude'].values
canvass_dum_2['lat_long_interaction'] = pd.Series(lat_long_interaction, index = canvass_dum_2.index)

In [105]:
# break into train and test and x and y
canvass_train = canvass_dum_2[canvass_dum_2['Inspection Date'] < '2016-07-01']
canvass_test = canvass_dum_2[(canvass_dum_2['Inspection Date'] > '2016-07-01') & (canvass_dum_2['Inspection Date'] < '2016-11-01')]

canvass_train_y = canvass_train['Results_Fail']
canvass_test_y = canvass_test['Results_Fail']

canvass_train_x = canvass_train.drop(['Results_Fail', 'Inspection Date'], axis = 1)
canvass_test_x = canvass_test.drop(['Results_Fail', 'Inspection Date'], axis = 1)

In [106]:
# random forest; want to see variable importances
rand_forest_2 = RandomForest(n_estimators = 501, max_features = 10, class_weight = {0 : 1, 1 : 1.0}, max_depth = 10)
rand_forest_2.fit(canvass_train_x, canvass_train_y)
print 'Random forest test score: ' + str(rand_forest_2.score(canvass_test_x, canvass_test_y))

Random forest test score: 0.740093240093


In [107]:
# create data frame of feature importances from random forest
var_imp = pd.Series(rand_forest_2.feature_importances_)
predictors = pd.Series(canvass_train_x.columns)

var_imp_df = pd.concat([predictors, var_imp], axis = 1, ignore_index = True)

var_imp_df_sort = var_imp_df.sort_values(by = [1], ascending = False)
var_imp_df_sort.reset_index(drop = True, inplace = True)
var_imp_df_sort

Unnamed: 0,0,1
0,Latitude,0.086758
1,Longitude,0.085018
2,lat_long_interaction,0.083567
3,canvass fail rate,0.044164
4,complaint-canvass-rate interaction,0.039616
5,inspection-fail interaction,0.030424
6,inspection-complaint-rate interaction,0.03001
7,inspection-canvass-rate interaction,0.028505
8,num prev canvass inspections,0.027547
9,inspection-complaint interaction,0.024135


In [108]:
# dropping variables with low variable importance
canvass_train_x_dropped = canvass_train_x.drop(var_imp_df_sort.iloc[:, 0].values[30:], axis = 1)
canvass_test_x_dropped = canvass_test_x.drop(var_imp_df_sort.iloc[:, 0].values[30:], axis = 1)

canvass_train_x2 = canvass_train_x.drop(canvass_train_x_dropped.columns, axis = 1)
canvass_test_x2 = canvass_test_x.drop(canvass_test_x_dropped.columns, axis = 1)

canvass_train_x2.columns

Index([u'num_crimes_comm_area', u'PERCENT OF HOUSING CROWDED',
       u'PERCENT AGED 25+ WITHOUT HIGH SCHOOL DIPLOMA',
       u'PERCENT AGED UNDER 18 OR OVER 64', u'PER CAPITA INCOME ',
       u'HARDSHIP INDEX', u'Birth Rate', u'General Fertility Rate',
       u'Breast cancer in females', u'Cancer (All Sites)',
       u'Colorectal Cancer', u'Infant Mortality Rate',
       u'Prostate Cancer in Males', u'Stroke (Cerebrovascular Disease)',
       u'Tuberculosis', u'Dependency', u'num complaint failures',
       u'last inspection fail', u'complaint-canvass-fail interaction',
       u'consec-last interaction', u'Facility Type_Bakery',
       u'Facility Type_Other', u'Facility Type_School',
       u'Risk_Risk 2 (Medium)', u'Risk_Risk 3 (Low)',
       u'Facility Type_Daycare 2'],
      dtype='object')

In [109]:
# lasso logistic regression; want to see variable coefficients
log_reg_2 = linear_model.LogisticRegression(penalty = 'l1', class_weight = {0 : 1, 1 : 1.0}, C = 0.01)
log_reg_2.fit(canvass_train_x2, canvass_train_y)

print 'Logistic regression test score: ' + str(log_reg_2.score(canvass_test_x2, canvass_test_y))

Logistic regression test score: 0.73951048951


In [110]:
# create data frame of logistic coefficients
coef_series = pd.Series(log_reg_2.coef_[0])
predictors = pd.Series(canvass_train_x2.columns)

coef_df = pd.concat([predictors, coef_series], axis = 1, ignore_index = True)

coef_df_sort = coef_df.sort_values(by = [1], ascending = False)
coef_df_sort.reset_index(drop = True, inplace = True)
coef_df_sort

Unnamed: 0,0,1
0,last inspection fail,0.17473
1,num complaint failures,0.066085
2,complaint-canvass-fail interaction,0.050979
3,HARDSHIP INDEX,0.010154
4,Birth Rate,0.009385
5,PERCENT OF HOUSING CROWDED,0.00651
6,Prostate Cancer in Males,0.004708
7,Breast cancer in females,0.001853
8,PERCENT AGED UNDER 18 OR OVER 64,0.001393
9,Cancer (All Sites),0.001027


After running a random forest model and a lasso logistic regression, I was able to eliminate several variables. I deleted variables with low variable importance in the random forest model or variables set to coefficients of zero by the lasso logistic regression model. 

Note that what we have on the website for variables included in the final model differs from the results here because the random forest model is non-deterministic.