In [174]:
import pandas as pd
import numpy as np
from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder
import xgboost

In [175]:
# Load the NYC restaurant inspection export and keep only fully populated rows:
# dropna() with no arguments discards every row that has a missing value in
# any column.
restaurant_csv_path = "data/DOHMH_New_York_City_Restaurant_Inspection_Results.csv"
df = pd.read_csv(restaurant_csv_path).dropna()

In [176]:
# Keep only inspections graded A, B or C (all other ratings are dropped) and
# recode the letter as an ordinal integer: A -> 3, B -> 2, C -> 1.
grades = ["A", "B", "C"]
grade_to_score = {"A": 3, "B": 2, "C": 1}

# .copy() detaches the filtered rows from the frame they were sliced from, so
# the column assignment below cannot raise SettingWithCopyWarning.
df = df[df["GRADE"].isin(grades)].copy()

# A single map() followed by an explicit cast gives a true integer column.
# Assigning ints through per-grade .loc masks into the string column leaves
# GRADE with dtype object, which numeric consumers do not accept.
df["GRADE"] = df["GRADE"].map(grade_to_score).astype("int64")


In [177]:
# How many rows does each restaurant (keyed by CAMIS) have? The largest and
# smallest counts show how much inspection history is available per restaurant
# (relevant for building "previous violations" features).
rows_per_restaurant = df.groupby(['CAMIS']).size()

print("Range of times when a restaurant has been visited: ")
print(rows_per_restaurant.max())
print(rows_per_restaurant.min())


Range of times when a restaurant has been visited: 
36
1


In [178]:
# Target-encode violation code, zipcode and cuisine type: each category value
# is replaced by a number derived from the average GRADE of the rows sharing it.
target = df["GRADE"]
target_encoded_columns = ["VIOLATION CODE", "ZIPCODE", "CUISINE DESCRIPTION"]
te = TargetEncoder(cols=target_encoded_columns)

# NOTE(review): the encoder is fitted on the complete frame. If a train/test
# split is made later, these encodings carry target information from the test
# rows -- confirm whether the fit should be restricted to the training split.
df = te.fit(df, target).transform(df)


In [179]:
# Perform one-hot encoding on BORO and CRITICAL FLAG. Both columns are expanded
# in one call; the indicator columns are appended in the order listed here
# (all BORO_* columns first, then all CRITICAL FLAG_* columns).
one_hot_columns = ["BORO", "CRITICAL FLAG"]
df = pd.get_dummies(df, columns=one_hot_columns)

In [180]:
# Show every column that is currently available before pruning.
print(df.columns)

# Columns that are not carried forward as features.
columns_to_drop = [
    "DBA",
    "BUILDING",
    "STREET",
    "PHONE",
    "ACTION",
    "VIOLATION DESCRIPTION",
    "SCORE",
    "Latitude",
    "Longitude",
    "Community Board",
    "Council District",
    "Census Tract",
    "RECORD DATE",
    "INSPECTION DATE",
    "INSPECTION TYPE",
    # Only two CRITICAL FLAG indicator columns exist, so this one is the
    # complement of "CRITICAL FLAG_Not Critical", which is kept.
    "CRITICAL FLAG_Critical",
    "BIN",
    "BBL",
    "NTA",
]
df = df.drop(columns=columns_to_drop)


Index(['CAMIS', 'DBA', 'BUILDING', 'STREET', 'ZIPCODE', 'PHONE',
       'CUISINE DESCRIPTION', 'INSPECTION DATE', 'ACTION', 'VIOLATION CODE',
       'VIOLATION DESCRIPTION', 'SCORE', 'GRADE', 'GRADE DATE', 'RECORD DATE',
       'INSPECTION TYPE', 'Latitude', 'Longitude', 'Community Board',
       'Council District', 'Census Tract', 'BIN', 'BBL', 'NTA', 'BORO_Bronx',
       'BORO_Brooklyn', 'BORO_Manhattan', 'BORO_Queens', 'BORO_Staten Island',
       'CRITICAL FLAG_Critical', 'CRITICAL FLAG_Not Critical'],
      dtype='object')


In [181]:
# Display the frame after encoding and column pruning to check the result.
df

Unnamed: 0,CAMIS,ZIPCODE,CUISINE DESCRIPTION,VIOLATION CODE,GRADE,GRADE DATE,BORO_Bronx,BORO_Brooklyn,BORO_Manhattan,BORO_Queens,BORO_Staten Island,CRITICAL FLAG_Not Critical
0,41015947,2.752896,2.834303,2.593379,2,08/17/2017,1,0,0,0,0,0
1,40520604,2.804124,2.888018,2.666100,3,08/05/2019,0,0,0,1,0,1
4,50053497,2.793103,2.888018,2.861388,3,08/04/2021,0,0,1,0,0,0
9,41695440,2.681974,2.724997,2.847353,3,02/23/2022,0,1,0,0,0,1
10,41695440,2.681974,2.724997,2.847353,3,02/23/2022,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
332250,50034665,2.842721,2.705991,2.623058,2,07/25/2018,0,1,0,0,0,0
332254,40399217,2.818828,2.705991,2.910271,3,06/10/2019,0,0,0,1,0,1
332256,50033846,2.760197,2.781909,2.668837,3,11/30/2017,0,1,0,0,0,0
332257,50075701,2.942177,2.614637,2.910271,3,06/10/2019,0,0,0,1,0,1
