In [3]:
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt

from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

from xgboost import XGBRFRegressor
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix

from sklearn.utils.class_weight import compute_sample_weight

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_validate
from sklearn.model_selection import RandomizedSearchCV

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline as imb_make_pipeline

  from pandas import MultiIndex, Int64Index


In [9]:
restaurant_csv_path = "data/DOHMH_New_York_City_Restaurant_Inspection_Results.csv"
df = pd.read_csv(restaurant_csv_path)

# Keep only inspections with a letter grade and encode A/B/C as 3/2/1.
# The filtered frame is copied so the assignment below writes to an independent
# DataFrame (no chained-assignment / SettingWithCopyWarning), and `.map` yields a
# proper integer GRADE column instead of an object column patched in place.
# The CSV's own SCORE column is no longer overwritten as scratch space; it is
# simply dropped further down, as before.
grade_to_int = {"A": 3, "B": 2, "C": 1}
grades = list(grade_to_int)
df = df[df['GRADE'].isin(grades)].copy()
df['GRADE'] = df['GRADE'].map(grade_to_int).astype(int)

# Replace violation code, zipcode and cuisine type with their average grade.
# NOTE(review): the encoder is fitted on every row, so these encoded features
# already contain target information from rows that may later end up in a test
# split -- confirm this is acceptable or refit on the training split only.
target = df["GRADE"]
te = TargetEncoder(cols=["VIOLATION CODE", "ZIPCODE", "CUISINE DESCRIPTION"])
te.fit(df, target)
df = te.transform(df)

# Perform one-hot encoding on Boro and Critical Flag
# (BORO_* columns are appended first, then CRITICAL FLAG_* columns).
df = pd.get_dummies(df, columns=["BORO", "CRITICAL FLAG"])

print(df.columns)

# Columns that are not used as features. "CRITICAL FLAG_Critical" is dropped as
# well, leaving the other two critical-flag indicator columns.
df = df.drop(["DBA",
              "BUILDING",
              "STREET",
              "PHONE",
              "ACTION",
              "VIOLATION DESCRIPTION",
              "Latitude",
              "Longitude",
              "Community Board",
              "Council District",
              "Census Tract",
              "RECORD DATE",
              "INSPECTION DATE",
              "INSPECTION TYPE",
              "CRITICAL FLAG_Critical",
              "BIN",
              "BBL",
              "SCORE",
              "NTA"], axis=1)

# Discard any row that still has a missing value in the remaining columns.
df = df.dropna()
df.head()

Index(['CAMIS', 'DBA', 'BUILDING', 'STREET', 'ZIPCODE', 'PHONE',
       'CUISINE DESCRIPTION', 'INSPECTION DATE', 'ACTION', 'VIOLATION CODE',
       'VIOLATION DESCRIPTION', 'SCORE', 'GRADE', 'GRADE DATE', 'RECORD DATE',
       'INSPECTION TYPE', 'Latitude', 'Longitude', 'Community Board',
       'Council District', 'Census Tract', 'BIN', 'BBL', 'NTA', 'BORO_0',
       'BORO_Bronx', 'BORO_Brooklyn', 'BORO_Manhattan', 'BORO_Queens',
       'BORO_Staten Island', 'CRITICAL FLAG_Critical',
       'CRITICAL FLAG_Not Applicable', 'CRITICAL FLAG_Not Critical'],
      dtype='object')


Unnamed: 0,CAMIS,ZIPCODE,CUISINE DESCRIPTION,VIOLATION CODE,GRADE,GRADE DATE,BORO_0,BORO_Bronx,BORO_Brooklyn,BORO_Manhattan,BORO_Queens,BORO_Staten Island,CRITICAL FLAG_Not Applicable,CRITICAL FLAG_Not Critical
0,41262792,2.622593,2.781846,2.910568,3,08/26/2021,0,0,0,1,0,0,0,1
2,50063071,2.793506,2.688846,2.62383,3,09/18/2018,0,0,0,1,0,0,0,0
3,40740446,2.807083,2.780662,2.488722,3,10/23/2019,0,1,0,0,0,0,0,0
4,50051826,2.586134,2.616895,2.605223,3,05/30/2017,0,0,0,0,1,0,0,0
5,50044207,2.823642,2.635659,2.910568,1,06/07/2019,0,0,1,0,0,0,0,1


In [38]:
# Drop the grade, grade-date, violation-code and critical-flag columns,
# then de-duplicate the rows that remain.
meta_df = (
    df
    .drop(columns=[
        'GRADE',
        'GRADE DATE',
        'VIOLATION CODE',
        'CRITICAL FLAG_Not Applicable',
        'CRITICAL FLAG_Not Critical',
    ])
    .drop_duplicates()
)
meta_df.shape

(23550, 9)

In [14]:
# Check whether CAMIS is a unique row ID: compare the number of distinct
# CAMIS values with the number of rows in the frame.
print("Number of restaurant: ", df['CAMIS'].unique().shape)
print("Dataframe shape: ", df.shape)

Number of restaurant:  (23550,)
df shape:  (159437, 14)


In [15]:
# Transform GRADE DATE from "MM/DD/YYYY" strings to datetime64 values.
import datetime as dt

# Vectorised parse instead of a per-row strptime loop. Unlike the loop, this is
# also safe to re-run on a column that has already been converted.
df['GRADE DATE'] = pd.to_datetime(df['GRADE DATE'], format='%m/%d/%Y')

# Give the frame a clean 0..n-1 index (earlier filtering/dropna left gaps).
df = df.reset_index(drop=True)

# Number of records graded after 2019-01-01; kept in a variable and shown as
# the cell result instead of being computed and discarded.
records_after_2019 = int((df['GRADE DATE'] > dt.datetime(2019, 1, 1)).sum())
records_after_2019

In [20]:
# Report the time span covered by the grade dates.
grade_dates = df['GRADE DATE']
print("Most recent grade date: ", grade_dates.max())
print("Earliest grade date: ", grade_dates.min())

Most recent grade date:  2022-03-15 00:00:00
Earliest grade date:  2013-06-07 00:00:00


In [81]:
# Example for one restaurant: order its rows by grade date (ascending) and
# compute the gap between each row's grade date and the previous row's.
subset = df.loc[df['CAMIS'] == 41262792].sort_values('GRADE DATE')
subset['RECENCY'] = subset['GRADE DATE'].diff()
subset

Unnamed: 0,CAMIS,ZIPCODE,CUISINE DESCRIPTION,VIOLATION CODE,GRADE,GRADE DATE,BORO_0,BORO_Bronx,BORO_Brooklyn,BORO_Manhattan,BORO_Queens,BORO_Staten Island,CRITICAL FLAG_Not Applicable,CRITICAL FLAG_Not Critical,RECENCY
49133,41262792,2.622593,2.781846,2.910568,3,2019-07-22,0,0,0,1,0,0,0,1,NaT
49134,41262792,2.622593,2.781846,2.910568,3,2019-07-22,0,0,0,1,0,0,0,1,0 days
90890,41262792,2.622593,2.781846,2.810283,3,2019-07-22,0,0,0,1,0,0,0,0,0 days
133747,41262792,2.622593,2.781846,2.84813,3,2019-07-22,0,0,0,1,0,0,0,1,0 days
0,41262792,2.622593,2.781846,2.910568,3,2021-08-26,0,0,0,1,0,0,0,1,766 days
135095,41262792,2.622593,2.781846,2.785904,3,2021-08-26,0,0,0,1,0,0,0,1,0 days


In [75]:
camis = df['CAMIS'].unique()

# Build "previous record" features for every restaurant in one pass.
# The earlier per-restaurant loop recomputed `subset` on every iteration and
# never stored it, so no feature survived the loop; it also scanned the whole
# frame once per CAMIS. Sorting once and shifting within each CAMIS group gives
# the same per-restaurant shift(1) values and keeps them.
lag_source_cols = [
    'GRADE',
    'CRITICAL FLAG_Not Applicable',
    'CRITICAL FLAG_Not Critical',
    'GRADE DATE',
]

# Stable sort so rows sharing a grade date keep a deterministic order.
df_sorted = df.sort_values(by='GRADE DATE', kind='stable')

# shift(1) inside each CAMIS group: the first row of a restaurant gets NaN/NaT.
prev_values = df_sorted.groupby('CAMIS')[lag_source_cols].shift(1).add_prefix('PREV_')

# Both frames share the same index in the same order, so a column-wise concat
# lines the rows up one-to-one.
df_lagged = pd.concat([df_sorted, prev_values], axis=1)

# Time elapsed since the restaurant's previous record.
df_lagged['RECENCY'] = df_lagged['GRADE DATE'] - df_lagged['PREV_GRADE DATE']

In [31]:
# Recency (planned): most recent visit minus previous visit.
# Frequency: number of records per restaurant (CAMIS).
# Note: no time window is applied here; every row in df is counted.
df.groupby('CAMIS').size().to_frame(name='frequency')

Unnamed: 0_level_0,frequency
CAMIS,Unnamed: 1_level_1
30075445,5
30112340,3
30191841,5
40356018,4
40356483,5
...,...
50119022,3
50119057,2
50119087,1
50119223,4


In [None]:
def calculate_recency(df: pd.DataFrame, uid_column: str, date_column: str,
                      threshold: "pd.Timestamp | datetime.datetime | None" = None) -> pd.DataFrame:
    """Add per-entity ``frequency`` and ``recency`` columns to a copy of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Input records; it is never modified.
    uid_column : str
        Column identifying the entity each row belongs to (e.g. ``"CAMIS"``).
    date_column : str
        Datetime column used for filtering and ordering.
    threshold : datetime-like, optional
        When given, only rows with ``df[date_column] > threshold`` are kept
        before anything is computed.

    Returns
    -------
    pd.DataFrame
        The (optionally filtered) rows in their original order, with
        ``frequency`` (number of retained rows sharing the row's uid) and
        ``recency`` (row date minus the previous date of the same uid,
        ``NaT`` for the earliest row of each uid).
    """
    # The annotations above are plain types / a string: the previous
    # `pd.DataFrame()` and `datetime.datetime.strptime()` were evaluated when
    # the function was defined and raised before it could ever be called.
    if threshold is not None:
        df = df[df[date_column] > threshold]

    out = df.copy()

    # Rows per uid, broadcast back onto every row of that uid.
    out["frequency"] = out[uid_column].map(out.groupby(uid_column).size())

    # Work on positional labels so duplicate index labels in `df` cannot break
    # the alignment: sort by date, diff within each uid, then restore the
    # original row order and assign by position.
    ordered = out.reset_index(drop=True).sort_values(date_column, kind="stable")
    gaps = ordered.groupby(uid_column)[date_column].diff()
    out["recency"] = gaps.sort_index().to_numpy()

    return out

In [51]:
# Treat each record as a sample.
# How do we include datetime features? E.g. the previous grade and the interval since the previous grade date.
# What if new restaurants are included? The train-test split has to be in time order.



0        2021-08-26
2        2018-09-18
3        2019-10-23
4        2017-05-30
5        2019-06-07
            ...    
334466   2019-10-23
334469   2021-12-17
334472   2019-03-19
334474   2018-02-15
334475   2019-09-26
Name: GRADE DATE, Length: 159437, dtype: datetime64[ns]

In [42]:
# Preview the first five rows of the current frame.
df.head(5)

Unnamed: 0,CAMIS,ZIPCODE,CUISINE DESCRIPTION,VIOLATION CODE,GRADE,GRADE DATE,BORO_0,BORO_Bronx,BORO_Brooklyn,BORO_Manhattan,BORO_Queens,BORO_Staten Island,CRITICAL FLAG_Not Applicable,CRITICAL FLAG_Not Critical
0,41262792,2.622593,2.781846,2.910568,3,2021-08-26,0,0,0,1,0,0,0,1
2,50063071,2.793506,2.688846,2.62383,3,2018-09-18,0,0,0,1,0,0,0,0
3,40740446,2.807083,2.780662,2.488722,3,2019-10-23,0,1,0,0,0,0,0,0
4,50051826,2.586134,2.616895,2.605223,3,2017-05-30,0,0,0,0,1,0,0,0
5,50044207,2.823642,2.635659,2.910568,1,2019-06-07,0,0,1,0,0,0,0,1


In [None]:
# Treat each restaurant as a sample.
# How do we treat the columns ['GRADE', 'GRADE DATE', 'VIOLATION CODE', 'CRITICAL FLAG_Not Applicable', 'CRITICAL FLAG_Not Critical']?