In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import math

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm

In [2]:
df = pd.read_csv('../data/spatial_merged_solid.csv')
topics = pd.read_csv('../data/topic_output.csv')
complaint = pd.read_csv('../data/311_complaint.csv')
insp = pd.read_csv('../data/clean_bz.csv')

In [3]:
# merge original inspection stuff with main df
df = df.merge(insp, on='CAMIS', how='left')

In [4]:
# merge 311 stuff with main df
df = df.merge(complaint, on='CAMIS', how='left')

In [5]:
# feature selection
df = df.loc[:, ['CAMIS', # inspection data
                'new_grade',
                'INSPECTION DATE_y',
                'VIOLATION CODE',
                'SCORE',
                'GRADE_y',
                'CD', # pluto data, BBL-level (spatially merged)
                'HealthArea',
                'SanitDistr',
                'AssessTot',
                'YearAlter1',
                'YearAlter2',
                'YearBuilt',
                'NumFloors',
                'yelp_rating', # yelp data, restaurant-level
                'yelp_categories',
                'yelp_review_count',
                'stats.checkinsCount', # foursquare data, restaurant-level
                'stats.tipCount',
                'stats.usersCount',
                '311_complaint_count', # 311 food complaints, restaurant-level
                'goog_lat', # google location
                'goog_lng']]

In [6]:
# rename a couple weird columns post-merge
df.columns =   ['CAMIS', # inspection data
                'new_grade',
                'INSPECTION DATE',
                'VIOLATION CODE',
                'SCORE',
                'GRADE',
                'CD', # pluto data, BBL-level (spatially merged)
                'HealthArea',
                'SanitDistr',
                'AssessTot',
                'YearAlter1',
                'YearAlter2',
                'YearBuilt',
                'NumFloors',
                'yelp_rating', # yelp data, restaurant-level
                'yelp_categories',
                'yelp_review_count',
                'stats.checkinsCount', # foursquare data, restaurant-level
                'stats.tipCount',
                'stats.usersCount',
                '311_complaint_count', # 311 food complaints, restaurant-level
                'goog_lat', # google location
                'goog_lng']

In [7]:
# only care about the records with a y-label (either A, B, or C)
df = df.loc[map(lambda g: type(g)==str, df.GRADE), :]
df = df.loc[df.GRADE.isin(['A', 'B', 'C']), :]

In [8]:
def clean_cat(cat):
    """
    cleans a single yelp category string
    """
    cat = cat.replace('[', '')\
    .replace(']', '')\
    .replace('u', '')\
    .replace("'", "")\
    .replace(" ", "")
    
    return cat

In [9]:
def get_top_categories(n):
    """
    returns a list of top n yelp categories from observed data
    """
    yelp_cats = list(df.yelp_categories)
    yelp_cats = filter(lambda cat: type(cat)==str, yelp_cats)
    yelp_cats = ','.join(yelp_cats).split(',')
    yelp_cats = map(clean_cat, yelp_cats)
    
    return map(lambda x: x[0], Counter(yelp_cats).most_common(n))

In [10]:
# testing top categories function
get_top_categories(10)

['coffee',
 'pizza',
 'sandwiches',
 'chinese',
 'hotdogs',
 'italian',
 'breakfast_brnch',
 'newamerican',
 'bakeries',
 'brgers']

In [11]:
def substring(cat, cats):
    """
    checks if a string contains a substring, with error handling
    """
    try:
        return int(cat in cats)
    except TypeError:
        return 0

In [12]:
def category_one_hot(df, n):
    """
    appends 'n' columns to a dataframe
    corresponding to yes/no for the 'n' most common yelp categories
    """
    for cat in get_top_categories(n):
        col_name = 'is_' + cat
        df.loc[:, col_name] = df.apply(lambda row: substring(cat, row['yelp_categories']), axis=1)
    return df

## SET NUMBER OF FOOD CATEGORIES HERE

In [13]:
# SET NUMBER OF FOOD CATEGORIES HERE
food_num = 20

In [14]:
# append cuisine category one-hots to dataframe
df = category_one_hot(df, food_num)
df.head()

Unnamed: 0,CAMIS,new_grade,INSPECTION DATE,VIOLATION CODE,SCORE,GRADE,CD,HealthArea,SanitDistr,AssessTot,...,is_tradamerican,is_bars,is_mexican,is_donts,is_cafes,is_salad,is_japanese,is_desserts,is_seafood,is_icecream
225,50060711,A,03/22/2017,10H,7.0,A,107.0,3500.0,7.0,2049750.0,...,0,0,0,0,0,0,0,0,0,0
259,50060463,A,03/02/2017,06F,13.0,A,310.0,7820.0,10.0,141750.0,...,0,0,0,0,0,0,0,0,0,0
266,50060418,A,03/22/2017,06C,9.0,A,106.0,5000.0,6.0,15238350.0,...,0,0,0,0,0,0,0,0,0,0
270,50060396,A,03/06/2017,10F,7.0,A,414.0,3700.0,14.0,329850.0,...,0,0,0,0,0,0,0,0,0,0
276,50060377,A,03/23/2017,04L,13.0,A,401.0,400.0,1.0,27216.0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
def create_y(row):
    """
    output label: 1 for bad inspection, 0 for normal
    """
    if row['GRADE'] in ['B', 'C']:
        return 1
    else:
        return 0

In [16]:
# create the final label variable
df.loc[:, 'failed'] = df.apply(create_y, axis=1)

In [17]:
df.groupby('failed').count()

Unnamed: 0_level_0,CAMIS,new_grade,INSPECTION DATE,VIOLATION CODE,SCORE,GRADE,CD,HealthArea,SanitDistr,AssessTot,...,is_tradamerican,is_bars,is_mexican,is_donts,is_cafes,is_salad,is_japanese,is_desserts,is_seafood,is_icecream
failed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,8078,7872,8078,7946,8078,8078,8078,8078,8068,8078,...,8078,8078,8078,8078,8078,8078,8078,8078,8078,8078
1,846,813,846,843,846,846,846,846,845,846,...,846,846,846,846,846,846,846,846,846,846


In [18]:
# years since renovation
def get_yrs_since_reno(row):
    last_reno = max(row['YearAlter1'], row['YearAlter2'])
    if last_reno == 0:
        return 2017 - row['YearBuilt']
    else:
        return 2017 - last_reno

In [19]:
# years since renovation
df.loc[:, 'yrs_since_reno'] = df.apply(get_yrs_since_reno, axis=1)

In [20]:
# building age
df.loc[:, 'bldg_age'] = df.apply(lambda row: 2017 - row['YearBuilt'], axis=1)

## feature selection here

In [21]:
numerics = [u'AssessTot', u'NumFloors', u'yelp_rating', u'yelp_review_count',
       u'stats.checkinsCount', u'stats.tipCount', u'stats.usersCount',
       u'311_complaint_count', u'yrs_since_reno', u'bldg_age']

In [22]:
#categoricals = [u'CD', u'HealthArea', u'SanitDistr'] + \
#    map(lambda cuisine: 'is_' + cuisine, get_top_categories(food_num))
    
categoricals = ['CD'] + map(lambda cuisine: 'is_' + cuisine, get_top_categories(food_num))

In [23]:
# standardize numeric columns
for col in numerics:
    vect = df.loc[:, col]
    mean = np.nanmean(vect)
    std = np.nanstd(vect)
    df.loc[:, col] = (vect - mean) / std

In [24]:
# these columns look like numericals, but are actually categorical
# convert to strings (with a prefix), so that scikitlearn algos can treat them as categorical
def stringize(num, prefix):
    if math.isnan(num):
        return np.nan
    else:
        return prefix + '_' + str(int(num))

for col in [u'CD', u'HealthArea', u'SanitDistr']:
    df.loc[:, col] = map(lambda val: stringize(val, col), df.loc[:, col])

In [25]:
# fill missing FS, Yelp, and 311 data with zeroes
df = df.fillna(0)

In [26]:
# create X and y from all selected features
X = pd.concat([df.loc[:, numerics], df.loc[:, categoricals]], axis=1)
y = df.failed

In [27]:
# one-hot encoding for the three high-cardinality columns
if 'CD' in X.columns:
    X = pd.concat([X, pd.get_dummies(X.loc[:, 'CD'])], axis=1)
    X.drop('CD', axis=1, inplace=True)

if 'HealthArea' in X.columns:
    X = pd.concat([X, pd.get_dummies(X.loc[:, 'HealthArea'])], axis=1)
    X.drop('HealthArea', axis=1, inplace=True)

if 'SanitDistr' in X.columns:
    X = pd.concat([X, pd.get_dummies(X.loc[:, 'SanitDistr'])], axis=1)
    X.drop('SanitDistr', axis=1, inplace=True)

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [50]:
X.to_csv('feature.csv')
df['SCORE'].to_csv('yscore.csv')

## GaussianNB

In [29]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

In [30]:
TP = sum((y_pred==1)&(y_test==1))
FP = sum((y_pred==1)&(y_test==0))
FN = sum((y_pred==0)&(y_test==1))
TN = sum((y_pred==0)&(y_test==0))

In [31]:
print '{} predicted failures'.format(sum(y_pred))

2758 predicted failures


In [32]:
# calculate precision, accuracy, and recall
Acc = 100.0 * (TP + TN) / (TP + TN + FP + FN) # correct predictions / all records
Rec = 100.0 * TP / (TP + FN) # correctly predicted positives / all positive records
Prec = 100.0 * TP / (TP + FP) # correctly predicted positives / all predicted positives

print('GaussianNB:')
print('Precision = %.2f%%\nAccuracy = %.2f%%\nRecall = %.2f%%'%(Prec,Acc,Rec))

GaussianNB:
Precision = 9.61%
Accuracy = 14.80%
Recall = 94.31%


## Logistic Regression

In [33]:
log_model = LogisticRegression(C=1e6)
log_model.fit(X_train, y_train)
y_pred = log_model.predict(X_test)

In [34]:
TP = sum((y_pred==1)&(y_test==1))
FP = sum((y_pred==1)&(y_test==0))
FN = sum((y_pred==0)&(y_test==1))
TN = sum((y_pred==0)&(y_test==0))

In [35]:
float(sum(y_train)) / len(y_train)

0.09449740759324302

In [36]:
print '{} predicted failures'.format(sum(y_pred))

2 predicted failures


In [37]:
# calculate precision, accuracy, and recall
Acc = 100.0 * (TP + TN) / (TP + TN + FP + FN)
Rec = 100.0 * TP / (TP + FN)
Prec = 100.0 * TP / (TP + FP)

print('Logistic Regression:')
print('Precision = %.2f%%\nAccuracy = %.2f%%\nRecall = %.2f%%'%(Prec,Acc,Rec))

Logistic Regression:
Precision = 50.00%
Accuracy = 90.46%
Recall = 0.36%


## SVM

In [38]:
svm_mod = svm.SVC() 
svm_mod.fit(X_train, y_train)
y_pred = svm_mod.predict(X_test)

In [39]:
TP = sum((y_pred==1)&(y_test==1))
FP = sum((y_pred==1)&(y_test==0))
FN = sum((y_pred==0)&(y_test==1))
TN = sum((y_pred==0)&(y_test==0))

In [40]:
print '{} predicted failures'.format(sum(y_pred))

0 predicted failures


In [41]:
# calculate precision, accuracy, and recall
Acc = 100.0 * (TP + TN) / (TP + TN + FP + FN)
Rec = 100.0 * TP / (TP + FN)
Prec = 100.0 * TP / (TP + FP)

print('Logistic Regression:')
print('Precision = %.2f%%\nAccuracy = %.2f%%\nRecall = %.2f%%'%(Prec,Acc,Rec))

ZeroDivisionError: float division by zero