In [3]:
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt

from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

from xgboost import XGBRFRegressor
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix

from sklearn.utils.class_weight import compute_sample_weight

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_validate
from sklearn.model_selection import RandomizedSearchCV

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline as imb_make_pipeline

  from pandas import MultiIndex, Int64Index


In [192]:
restaurant_csv_path = "data/DOHMH_New_York_City_Restaurant_Inspection_Results.csv"
df = pd.read_csv(restaurant_csv_path)
df = df.drop(["DBA", 
              "BUILDING", 
              "STREET",
              "PHONE",
              "ACTION",
              "VIOLATION DESCRIPTION",
              "Latitude",
              "Longitude",
              "Community Board",
              "Council District",
              "Census Tract", 
              "RECORD DATE",
              "INSPECTION DATE",
              "INSPECTION TYPE",
              "BIN",
              "BBL",
              "SCORE",
              "NTA"], axis=1)
df.shape

(334478, 8)

In [193]:
df[df.CAMIS == 41262792].sort_values(by = 'GRADE DATE')

Unnamed: 0,CAMIS,BORO,ZIPCODE,CUISINE DESCRIPTION,VIOLATION CODE,CRITICAL FLAG,GRADE,GRADE DATE
103615,41262792,Manhattan,10032.0,Chicken,10F,Not Critical,A,07/22/2019
103616,41262792,Manhattan,10032.0,Chicken,10F,Not Critical,A,07/22/2019
191016,41262792,Manhattan,10032.0,Chicken,06C,Critical,A,07/22/2019
280699,41262792,Manhattan,10032.0,Chicken,10B,Not Critical,A,07/22/2019
0,41262792,Manhattan,10032.0,Chicken,10F,Not Critical,A,08/26/2021
283439,41262792,Manhattan,10032.0,Chicken,08C,Not Critical,A,08/26/2021
141318,41262792,Manhattan,10032.0,Chicken,20A,Not Critical,,


In [107]:
# Convert A, B, C to 3, 2, 1 for encodings, drop all other ratings
grades = ["A", "B", "C"]
df = df[df['GRADE'].isin(grades)]
df['SCORE'] = df.loc[:, 'GRADE']
df.loc[df["SCORE"] == "A", "GRADE"] = 3
df.loc[df["SCORE"] == "B", "GRADE"] = 2
df.loc[df["SCORE"] == "C", "GRADE"] = 1

# Replace violation code, zipcode and cuisine type with its average grade:
target = df["GRADE"]
te = TargetEncoder(cols=["VIOLATION CODE", "ZIPCODE", "CUISINE DESCRIPTION"])
te.fit(df, target)
df = te.transform(df)

# Peform OHE on Boro and Critical Flag:
df = pd.get_dummies(df, columns=["BORO"])
df = pd.get_dummies(df, columns=["CRITICAL FLAG"])

print(df.columns)
df = df.drop(["DBA", 
              "BUILDING", 
              "STREET",
              "PHONE",
              "ACTION",
              "VIOLATION DESCRIPTION",
              "Latitude",
              "Longitude",
              "Community Board",
              "Council District",
              "Census Tract", 
              "RECORD DATE",
              "INSPECTION DATE",
              "INSPECTION TYPE",
              "CRITICAL FLAG_Critical",
              "BIN",
              "BBL",
              "SCORE",
              "NTA"], axis=1)
df = df.dropna()
df.head()

Index(['CAMIS', 'DBA', 'BUILDING', 'STREET', 'ZIPCODE', 'PHONE',
       'CUISINE DESCRIPTION', 'INSPECTION DATE', 'ACTION', 'VIOLATION CODE',
       'VIOLATION DESCRIPTION', 'SCORE', 'GRADE', 'GRADE DATE', 'RECORD DATE',
       'INSPECTION TYPE', 'Latitude', 'Longitude', 'Community Board',
       'Council District', 'Census Tract', 'BIN', 'BBL', 'NTA', 'BORO_0',
       'BORO_Bronx', 'BORO_Brooklyn', 'BORO_Manhattan', 'BORO_Queens',
       'BORO_Staten Island', 'CRITICAL FLAG_Critical',
       'CRITICAL FLAG_Not Applicable', 'CRITICAL FLAG_Not Critical'],
      dtype='object')


Unnamed: 0,CAMIS,ZIPCODE,CUISINE DESCRIPTION,VIOLATION CODE,GRADE,GRADE DATE,BORO_0,BORO_Bronx,BORO_Brooklyn,BORO_Manhattan,BORO_Queens,BORO_Staten Island,CRITICAL FLAG_Not Applicable,CRITICAL FLAG_Not Critical
0,41262792,2.622593,2.781846,2.910568,3,08/26/2021,0,0,0,1,0,0,0,1
2,50063071,2.793506,2.688846,2.62383,3,09/18/2018,0,0,0,1,0,0,0,0
3,40740446,2.807083,2.780662,2.488722,3,10/23/2019,0,1,0,0,0,0,0,0
4,50051826,2.586134,2.616895,2.605223,3,05/30/2017,0,0,0,0,1,0,0,0
5,50044207,2.823642,2.635659,2.910568,1,06/07/2019,0,0,1,0,0,0,0,1


In [99]:
meta_df = df.drop(columns=['GRADE', 'GRADE DATE', 'VIOLATION CODE','CRITICAL FLAG_Not Applicable', 'CRITICAL FLAG_Not Critical']).drop_duplicates()
meta_df.shape

(23550, 9)

In [14]:
# check whether CAMIS in unique ID
print("Number of restaurant: ", pd.unique(df['CAMIS']).shape)
print("Dataframe shape: ", df.shape)

Number of restaurant:  (23550,)
df shape:  (159437, 14)


In [108]:
# transform to datetime object
import datetime as dt 
df['GRADE DATE'] = [dt.datetime.strptime(str(date_string), '%m/%d/%Y') for date_string in df['GRADE DATE'].values ]
# Find records after 2019
df.drop_duplicates(inplace=True)
print("Number of records after 2019: ", sum(df['GRADE DATE'] > dt.datetime.strptime("01/01/2019", '%m/%d/%Y')))
df.reset_index(drop = True, inplace= True)

Number of records after 2019:  89504


In [109]:
print("Most recent grade date: ", df['GRADE DATE'].max())
print("Earliest grade date: ", df['GRADE DATE'].min())

Most recent grade date:  2022-03-15 00:00:00
Earliest grade date:  2013-06-07 00:00:00


In [185]:
subset = df[df.CAMIS == 41262792].sort_values(by = 'GRADE DATE')
subset[['GRADE', 'GRADE DATE']].drop_duplicates()

Unnamed: 0,GRADE,GRADE DATE
45605,3,2019-07-22
0,3,2021-08-26


In [188]:
subset = df[df.CAMIS == 30191841].sort_values(by = 'GRADE DATE')
prev_df = pd.DataFrame(index=subset.index.values)
prev_df['GRADE DATE'] = subset['GRADE DATE']
# prev_df['PREV_CRITICAL FLAG_Not Applicable'] = subset.shift(1)['CRITICAL FLAG_Not Applicable']
# prev_df['PREV_CRITICAL FLAG_Not Critical'] = subset.shift(1)['CRITICAL FLAG_Not Critical']
# find previous grading
prev_df['PREV_GRADE'] = subset.shift(1)['GRADE']
# find previous grading date and RECENCY
df_date = subset[['GRADE', 'GRADE DATE']].drop_duplicates()
df_date['PREV_GRADE DATE'] = df_date.shift(1)['GRADE DATE']
df_date['GRADE'] = df_date.shift(1)['GRADE']
df_date['RECENCY'] = df_date['GRADE DATE'] - df_date.shift(1)['GRADE DATE']
prev_df = prev_df.merge(df_date, how = 'left', on='GRADE DATE').set_index(subset.index.values)
prev_df
# append to new dataframe
# new_df = pd.concat([new_df, prev_df], axis=0)
# new_df

Unnamed: 0,GRADE DATE,PREV_GRADE,GRADE,PREV_GRADE DATE,RECENCY
2770,2019-06-06,,,NaT,NaT
103258,2019-06-06,3.0,,NaT,NaT
37292,2022-01-04,3.0,3.0,2019-06-06,943 days
38245,2022-01-04,3.0,3.0,2019-06-06,943 days
109061,2022-01-04,3.0,3.0,2019-06-06,943 days


In [163]:
subset

Unnamed: 0,CAMIS,ZIPCODE,CUISINE DESCRIPTION,VIOLATION CODE,GRADE,GRADE DATE,BORO_0,BORO_Bronx,BORO_Brooklyn,BORO_Manhattan,BORO_Queens,BORO_Staten Island,CRITICAL FLAG_Not Applicable,CRITICAL FLAG_Not Critical
45605,41262792,2.622593,2.781846,2.910568,3,2019-07-22,0,0,0,1,0,0,0,1
84239,41262792,2.622593,2.781846,2.810283,3,2019-07-22,0,0,0,1,0,0,0,0
123822,41262792,2.622593,2.781846,2.84813,3,2019-07-22,0,0,0,1,0,0,0,1
0,41262792,2.622593,2.781846,2.910568,3,2021-08-26,0,0,0,1,0,0,0,1
125070,41262792,2.622593,2.781846,2.785904,3,2021-08-26,0,0,0,1,0,0,0,1


In [149]:
# # Calculate recency: recent visit - previous visit
# camis = df['CAMIS'].unique()
# new_df = pd.DataFrame()
# for idx in camis:
#     # filter the records of specific restuarant using CAMIS
#     subset = df[df.CAMIS == idx].sort_values(by = 'GRADE DATE')
#     if subset.shape[0] == 1:
#         continue
#     prev_df = pd.DataFrame()
#     prev_df['PREV_GRADE'] = subset.shift(1)['GRADE']
#     # prev_df['PREV_CRITICAL FLAG_Not Applicable'] = subset.shift(1)['CRITICAL FLAG_Not Applicable']
#     # prev_df['PREV_CRITICAL FLAG_Not Critical'] = subset.shift(1)['CRITICAL FLAG_Not Critical']
#     prev_df['PREV_GRADE DATE'] = subset.shift(1)['GRADE DATE']
#     prev_df['RECENCY'] = subset['GRADE DATE'] - subset.shift(1)['GRADE DATE']
#     new_df = pd.concat([new_df, prev_df], axis=0)

In [169]:
# Calculate recency: recent visit - previous visit
camis = df['CAMIS'].unique()
new_df = pd.DataFrame()
for idx in camis:
    # filter the records of specific restuarant using CAMIS
    subset = df[df.CAMIS == idx].sort_values(by = 'GRADE DATE')
    if subset.shape[0] == 1:
        continue
    prev_df = pd.DataFrame(index=subset.index.values)
    prev_df['GRADE DATE'] = subset['GRADE DATE']
    # prev_df['PREV_CRITICAL FLAG_Not Applicable'] = subset.shift(1)['CRITICAL FLAG_Not Applicable']
    # prev_df['PREV_CRITICAL FLAG_Not Critical'] = subset.shift(1)['CRITICAL FLAG_Not Critical']
    # find previous grading
    prev_df['PREV_GRADE'] = subset.shift(1)['GRADE']
    # find previous grading date and RECENCY
    df_date = pd.DataFrame(subset['GRADE DATE'].unique(), columns=['GRADE DATE'])
    df_date['PREV_GRADE DATE'] = df_date.shift(1)['GRADE DATE']
    df_date['RECENCY'] = df_date['GRADE DATE'] - df_date.shift(1)['GRADE DATE']
    prev_df = prev_df.merge(df_date, how = 'left', on='GRADE DATE').set_index(subset.index.values)
    # append to new dataframe
    new_df = pd.concat([new_df, prev_df], axis=0)

In [172]:
df_concat = pd.concat([df, new_df.drop(columns=['GRADE DATE'])], axis=1)
# filter rows with date
df_concat[df['GRADE DATE'] > dt.datetime.strptime("01/01/2019", '%m/%d/%Y')]

Unnamed: 0,CAMIS,ZIPCODE,CUISINE DESCRIPTION,VIOLATION CODE,GRADE,GRADE DATE,BORO_0,BORO_Bronx,BORO_Brooklyn,BORO_Manhattan,BORO_Queens,BORO_Staten Island,CRITICAL FLAG_Not Applicable,CRITICAL FLAG_Not Critical,PREV_GRADE,PREV_GRADE DATE,RECENCY
0,41262792,2.622593,2.781846,2.910568,3,2021-08-26,0,0,0,1,0,0,0,1,3,2019-07-22,766 days
2,40740446,2.807083,2.780662,2.488722,3,2019-10-23,0,1,0,0,0,0,0,0,3,2019-06-27,118 days
4,50044207,2.823642,2.635659,2.910568,1,2019-06-07,0,0,1,0,0,0,0,1,2,2019-01-07,151 days
5,50097960,2.807083,2.780849,2.861638,3,2019-12-19,0,1,0,0,0,0,0,0,,NaT,NaT
7,50068160,2.864326,2.890200,2.488722,1,2019-02-06,0,0,1,0,0,0,0,0,3,2018-08-31,159 days
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147508,41222982,2.682526,2.688846,2.785904,3,2019-03-26,0,0,0,0,1,0,0,1,3,NaT,NaT
147512,50070206,2.773529,2.616895,2.623830,2,2019-10-23,0,0,0,0,1,0,0,0,2,2019-05-06,170 days
147513,50105556,2.793769,2.680976,2.532057,3,2021-12-17,0,1,0,0,0,0,0,0,,NaT,NaT
147514,41384689,2.773529,2.665853,2.655434,3,2019-03-19,0,0,0,0,1,0,0,0,3,NaT,NaT


In [180]:
df_concat.isnull().sum()

CAMIS                               0
ZIPCODE                             0
CUISINE DESCRIPTION                 0
VIOLATION CODE                      0
GRADE                               0
GRADE DATE                          0
BORO_0                              0
BORO_Bronx                          0
BORO_Brooklyn                       0
BORO_Manhattan                      0
BORO_Queens                         0
BORO_Staten Island                  0
CRITICAL FLAG_Not Applicable        0
CRITICAL FLAG_Not Critical          0
PREV_GRADE                      23550
PREV_GRADE DATE                 54109
RECENCY                         54109
dtype: int64

In [175]:
# Calculate frequency: time of grading within three year
pd.DataFrame(df_concat.groupby(['CAMIS']).size(), columns=["frequency"])

Unnamed: 0_level_0,frequency
CAMIS,Unnamed: 1_level_1
30075445,5
30112340,3
30191841,5
40356018,4
40356483,3
...,...
50119022,3
50119057,2
50119087,1
50119223,3


In [51]:
# treat each record as a sample.
# How do we include datetime feature? Previous grade, previous grade date interval
# What if new restaurant include? Train-test split has to be in time order.

0        2021-08-26
2        2018-09-18
3        2019-10-23
4        2017-05-30
5        2019-06-07
            ...    
334466   2019-10-23
334469   2021-12-17
334472   2019-03-19
334474   2018-02-15
334475   2019-09-26
Name: GRADE DATE, Length: 159437, dtype: datetime64[ns]

In [42]:
df.head()

Unnamed: 0,CAMIS,ZIPCODE,CUISINE DESCRIPTION,VIOLATION CODE,GRADE,GRADE DATE,BORO_0,BORO_Bronx,BORO_Brooklyn,BORO_Manhattan,BORO_Queens,BORO_Staten Island,CRITICAL FLAG_Not Applicable,CRITICAL FLAG_Not Critical
0,41262792,2.622593,2.781846,2.910568,3,2021-08-26,0,0,0,1,0,0,0,1
2,50063071,2.793506,2.688846,2.62383,3,2018-09-18,0,0,0,1,0,0,0,0
3,40740446,2.807083,2.780662,2.488722,3,2019-10-23,0,1,0,0,0,0,0,0
4,50051826,2.586134,2.616895,2.605223,3,2017-05-30,0,0,0,0,1,0,0,0
5,50044207,2.823642,2.635659,2.910568,1,2019-06-07,0,0,1,0,0,0,0,1


In [None]:
# treat each restaurant as a sample.
# How do we treat ['GRADE', 'GRADE DATE', 'VIOLATION CODE','CRITICAL FLAG_Not Applicable', 'CRITICAL FLAG_Not Critical']