In [63]:
# Import required libraries

# eda tools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# model preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# feature selection
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# algorithms
from statsmodels.api import OLS
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier

# evaluation
from sklearn.metrics import mean_squared_error, r2_score

### Bring in the datasets

In [98]:
# Read data
inspections_df = pd.read_csv('data/food_inspections.csv')
inspections_df.dropna(subset='License #', inplace=True)
inspections_df['License #'].astype('int32')

restaurant_df = inspections_df[inspections_df['Facility Type'] == 'Restaurant'].copy()
restaurant_df = restaurant_df[~(restaurant_df['License #'] == 0)]
restaurant_df.drop(['DBA Name', 'Location'], axis=1, inplace=True)
restaurant_df = restaurant_df[~restaurant_df['Results'].isin(['Out of Business','No Entry', 'Not Ready', 'Business Not Located'])]
restaurant_df.columns = ['inspect_id', 'aka_name', 'license_num', 'facility_type', 'risk',
                           'address', 'city', 'state', 'zipcode', 'inspect_date', 'inspect_type',
                           'results', 'violations', 'lat', 'lon']
restaurant_df.head(2)

Unnamed: 0,inspect_id,aka_name,license_num,facility_type,risk,address,city,state,zipcode,inspect_date,inspect_type,results,violations,lat,lon
5,2579896,LAS ISLAS MARIAS,1943409.0,Restaurant,Risk 1 (High),2400 S PULASKI RD,CHICAGO,IL,60623.0,08/10/2023,Canvass Re-Inspection,Fail,,41.847858,-87.724795
8,2579839,QUERETACO,2918599.0,Restaurant,Risk 2 (Medium),2247 E 71ST ST,CHICAGO,IL,60649.0,08/09/2023,License,Pass,,41.766032,-87.56955


In [99]:
# Read data
business_info = pd.read_csv('data/active_licensed_businesses.csv')
business_info.drop(['ID', 'LICENSE STATUS', 'ACCOUNT NUMBER', ], axis=1, inplace=True)
business_info['ZIP CODE'] = pd.to_numeric(business_info['ZIP CODE'], errors='coerce')
business_info.dropna(subset='ZIP CODE', inplace=True)
business_info = business_info[business_info['CITY'] == 'CHICAGO']
business_info = business_info[~business_info['APPLICATION TYPE'].isin(['C_LOC', 'C_SBA', 'C_EXPA', 'C_CAPA'])]

# keep only pertinent columns
business_info = business_info[['DOING BUSINESS AS NAME','LICENSE ID', 'ADDRESS', 'WARD', 'PRECINCT', 'POLICE DISTRICT', 'LICENSE CODE', 'LICENSE DESCRIPTION', 'LICENSE NUMBER','BUSINESS ACTIVITY ID', 'BUSINESS ACTIVITY', 
               'APPLICATION TYPE','APPLICATION REQUIREMENTS COMPLETE', 'CONDITIONAL APPROVAL', 'LICENSE TERM START DATE', 
               'LICENSE TERM EXPIRATION DATE', 'LICENSE APPROVED FOR ISSUANCE', 'DATE ISSUED']]

business_info.columns = ['aka_name', 'license_id', 'address', 'ward', 'precint',
                           'police_district', 'license_code',
                           'license_description', 'license_num', 'bus_activity_id',
                           'bus_activity', 'application_type',
                           'app_complete', 'conditional_approval',
                           'license_start_date', 'license_end_date',
                           'license_issued_date', 'date_issued']
business_info.head(2)

  business_info = pd.read_csv('data/active_licensed_businesses.csv')


Unnamed: 0,aka_name,license_id,address,ward,precint,police_district,license_code,license_description,license_num,bus_activity_id,bus_activity,application_type,app_complete,conditional_approval,license_start_date,license_end_date,license_issued_date,date_issued
0,CRYSTAL BALLROOM/PLAZA BALLROOM,2858237,151 E WACKER DR PLAZA,42.0,44.0,1.0,1477,Outdoor Patio,1927,784,Sale of Liquor Outdoors on Private Property,RENEW,07/15/2022,N,09/16/2022,09/15/2024,08/05/2022,08/08/2022
1,STETSON'S,2858248,151 E WACKER DR PLAZA,42.0,44.0,1.0,1006,Retail Food Establishment,1932,775,Retail Sales of Perishable Foods,RENEW,07/15/2022,N,09/16/2022,09/15/2024,08/05/2022,08/08/2022


### Extract Violation information per inspection

In [101]:
# restaurant_df.dropna(subset=['violations'], inplace=True)
restaurant_df['violations'].fillna('-99.  No violations reported', inplace=True)
restaurant_df['violations_list'] = restaurant_df['violations'].apply(lambda x: x.split("|"))
restaurant_df['violation_count'] = restaurant_df['violations_list'].apply(lambda x: len(x))
restaurant_df.head(2)

Unnamed: 0,inspect_id,aka_name,license_num,facility_type,risk,address,city,state,zipcode,inspect_date,inspect_type,results,violations,lat,lon,violations_list,violation_count
5,2579896,LAS ISLAS MARIAS,1943409.0,Restaurant,Risk 1 (High),2400 S PULASKI RD,CHICAGO,IL,60623.0,08/10/2023,Canvass Re-Inspection,Fail,-99. No violations reported,41.847858,-87.724795,[-99. No violations reported],1
8,2579839,QUERETACO,2918599.0,Restaurant,Risk 2 (Medium),2247 E 71ST ST,CHICAGO,IL,60649.0,08/09/2023,License,Pass,-99. No violations reported,41.766032,-87.56955,[-99. No violations reported],1


In [104]:
import re

restaurant_df['violation_number'] = restaurant_df['violations_list'].apply(lambda x: [re.findall(r'\b\d+\b',i)[0] for i in x])
restaurant_df.head(3)

restaurant_df['must_comply'] = restaurant_df['violations_list'].apply(lambda x: [ 'MUST COMPLY' in i for i in x])
restaurant_df['must_comply_count'] = restaurant_df['must_comply'].apply(lambda x: sum(x))

restaurant_df['instructed_comply'] = restaurant_df['violations_list'].apply(lambda x: ['INSTRUCTED TO COMPLY' in i for i in x])
restaurant_df['instructed_comply_count'] = restaurant_df['instructed_comply'].apply(lambda x: sum(x))

restaurant_df['not_cited'] = restaurant_df['violations_list'].apply(lambda x: ['NO CITATION ISSUED' in i for i in x])
restaurant_df['citation_count'] = restaurant_df['not_cited'].apply(lambda x: len(x) - sum(x))

restaurant_df.head(2)

Unnamed: 0,inspect_id,aka_name,license_num,facility_type,risk,address,city,state,zipcode,inspect_date,...,lon,violations_list,violation_count,violation_number,must_comply,must_comply_count,instructed_comply,instructed_comply_count,not_cited,citation_count
5,2579896,LAS ISLAS MARIAS,1943409.0,Restaurant,Risk 1 (High),2400 S PULASKI RD,CHICAGO,IL,60623.0,08/10/2023,...,-87.724795,[-99. No violations reported],1,[99],[False],0,[False],0,[False],1
8,2579839,QUERETACO,2918599.0,Restaurant,Risk 2 (Medium),2247 E 71ST ST,CHICAGO,IL,60649.0,08/09/2023,...,-87.56955,[-99. No violations reported],1,[99],[False],0,[False],0,[False],1


### Create Summary Statistics

In [83]:
temp = restaurant_df.groupby(['aka_name', 'license_num']).count().iloc[:,0:2]
names = temp.index.get_level_values(0)
temp

Unnamed: 0_level_0,Unnamed: 1_level_0,inspect_id,facility_type
aka_name,license_num,Unnamed: 2_level_1,Unnamed: 3_level_1
"#1 CHINA EXPRESS , LTD",1869616.0,1,1
#1 CHOP SUEY,1970042.0,23,23
#1 CHOP SUEY,2042372.0,10,10
#1 CHOP SUEY,2137377.0,20,20
#1 CHOP SUEY,2695112.0,3,3
...,...,...,...
mr.daniel's,1899292.0,2,2
naansense,2098531.0,6,6
no signage,1292.0,2,2
the FRENCH LUNCHBOX,2511081.0,4,4


In [84]:
location_counts = []
for name in names:
    num = len(temp.xs(name, level='aka_name'))
    location_counts.append(num)

temp['chain_location_count'] = location_counts
temp = temp.rename(columns={'inspect_id': 'store_inspections'})
temp = temp.sort_values(by=['chain_location_count'], ascending=False)
temp.drop(labels='facility_type', axis=1, inplace=True)
temp

Unnamed: 0_level_0,Unnamed: 1_level_0,store_inspections,chain_location_count
aka_name,license_num,Unnamed: 2_level_1,Unnamed: 3_level_1
SUBWAY,2863267.0,2,349
SUBWAY,1947909.0,18,349
SUBWAY,1932383.0,18,349
SUBWAY,1932804.0,3,349
SUBWAY,1938179.0,34,349
...,...,...,...
KENMARE CATERING AND EVENTS,1842540.0,1,1
KENNEDY FISH CHICKEN AND GYROS,1575882.0,5,1
KENNEDY KING COLLEGE,2419579.0,4,1
KENNEDY SUB,2153140.0,10,1


In [107]:
# temp.xs('DUNKIN DONUTS', level='aka_name')

### Inspections per license

In [106]:
temp = restaurant_df.sort_values(by=['inspect_date']).groupby('license_num')['inspect_date'].apply(list)
temp2 = temp.apply(lambda x: len(x))
df = pd.concat([temp,temp2], axis=1)
df.columns=['Inspection_Date', 'Total_inspections']  


df = df.sort_values(by=['Total_inspections'], ascending=False)
df['Date_min'] = df['Inspection_Date'].apply(lambda x: min(x))
df['Date_max'] = df['Inspection_Date'].apply(lambda x: max(x))



df.head(10)

Unnamed: 0_level_0,Inspection_Date,Total_inspections,Date_min,Date_max
license_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1354323.0,"[05/23/2011, 05/23/2011, 05/23/2011, 05/23/201...",130,05/23/2011,11/16/2011
1574001.0,"[03/25/2011, 03/25/2011, 03/25/2011, 03/25/201...",68,03/25/2011,10/21/2015
60184.0,"[01/05/2012, 01/05/2021, 01/11/2011, 01/11/202...",60,01/05/2012,12/18/2012
1974745.0,"[01/12/2015, 02/08/2011, 02/08/2011, 02/25/201...",56,01/12/2015,12/01/2011
39623.0,"[01/07/2010, 01/14/2010, 01/31/2020, 02/07/202...",50,01/07/2010,12/16/2016
14616.0,"[03/25/2010, 03/25/2010, 03/25/2010, 03/25/201...",49,03/25/2010,08/31/2011
1273271.0,"[01/04/2012, 01/12/2012, 01/12/2018, 01/16/201...",49,01/04/2012,12/22/2022
4190.0,"[02/08/2016, 02/25/2010, 03/08/2019, 03/14/201...",47,02/08/2016,12/23/2011
1095992.0,"[01/10/2023, 02/18/2011, 02/25/2013, 03/02/201...",46,01/10/2023,12/09/2016
1909713.0,"[01/10/2014, 01/19/2018, 01/21/2014, 01/29/201...",46,01/10/2014,11/29/2022


### Final Dataframe Features
geoapify number of starbucks within 0.5 mile radius  
geoapify related business within 0.5 mile radius  
us census track info of income  
name  
license number  
number of inspections during most recent license period  
number of inspections during previous license period  
number of inspection during 3rd previous period  
number of inspections during 4th perious period  
business age (default start from 2010 inspection date)  
number of chains  
is chain boolean  
risk  
ward  
license code  
renew  
conditional approved  
buisness account  

### ML Idea  
Name of business is not important  

