In [1]:
# Import required libraries

# eda tools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# model preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# feature selection
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# algorithms
from statsmodels.api import OLS
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier

# evaluation
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Read data
inspections_df = pd.read_csv('data/food_inspections.csv', parse_dates=['Inspection Date'])

inspections_df.dropna(subset='License #', inplace=True)

inspections_df['License #'].astype('int32')

# Display sample data
inspections_df.head()

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Location
0,2579974,PING PONG,PING PONG,1765705.0,Restaurant,Risk 1 (High),3322-3326 N BROADWAY,CHICAGO,IL,60657.0,2023-08-11,Non-Inspection,No Entry,,41.942623,-87.644625,"(41.94262315814179, -87.64462536460103)"
1,2579958,SOUTH SHORE FOOD MART INC.,SOUTH SHORE FOOD MART,2340833.0,Grocery Store,Risk 1 (High),7900 S SOUTH SHORE DR,CHICAGO,IL,60617.0,2023-08-11,Canvass,Fail,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",41.751856,-87.54825,"(41.75185560246702, -87.54825033733893)"
2,2579936,GREAT LAKES ACADEMY,GREAT LAKES ACADEMY,3305802.0,School,Risk 1 (High),8401 S Saginaw AVE,CHICAGO,IL,60617.0,2023-08-11,Canvass Re-Inspection,Pass,,41.742635,-87.55987,"(41.74263536816081, -87.55986972603321)"
3,2579926,LUCKY STRIKE DOWNTOWN CHICAGO,LUCKY STRIKE DOWNTOWN CHICAGO,2917711.0,,Risk 3 (Low),322 E ILLINOIS ST,CHICAGO,IL,60611.0,2023-08-10,License Re-Inspection,Pass,,41.891133,-87.61916,"(41.891133170854566, -87.61916020704619)"
4,2579927,LUCKY STRIKE DOWNTOWN CHICAGO,LUCKY STRIKE DOWNTOWN CHICAGO,2917712.0,,Risk 3 (Low),322 E ILLINOIS ST,CHICAGO,IL,60611.0,2023-08-10,License Re-Inspection,Pass,,41.891133,-87.61916,"(41.891133170854566, -87.61916020704619)"


In [3]:
inspections_df.dtypes

Inspection ID               int64
DBA Name                   object
AKA Name                   object
License #                 float64
Facility Type              object
Risk                       object
Address                    object
City                       object
State                      object
Zip                       float64
Inspection Date    datetime64[ns]
Inspection Type            object
Results                    object
Violations                 object
Latitude                  float64
Longitude                 float64
Location                   object
dtype: object

In [4]:
len(inspections_df)

258045

In [5]:
temp = inspections_df['Facility Type'].value_counts()

In [6]:
temp.head(50)

Restaurant                           173206
Grocery Store                         32388
School                                16222
Children's Services Facility           5250
Bakery                                 3738
Daycare Above and Under 2 Years        3407
Daycare (2 - 6 Years)                  3083
Long Term Care                         1863
Catering                               1594
Liquor                                 1106
Mobile Food Dispenser                   923
Mobile Food Preparer                    911
Hospital                                764
Golden Diner                            723
Daycare Combo 1586                      691
Wholesale                               552
TAVERN                                  389
Shared Kitchen User (Long Term)         314
Daycare (Under 2 Years)                 312
Special Event                           286
Shared Kitchen                          219
GAS STATION                             198
BANQUET HALL                    

To be thorough, any food related business should be added including multipurpose businesses like 'Grocery/Restaurant'.  For simplicity, I will only use restaurant data.

In [7]:
restaurant_df = inspections_df[inspections_df['Facility Type'] == 'Restaurant'].copy()
restaurant_df.drop(['DBA Name', 'Location'], axis=1, inplace=True)
restaurant_df.head()

Unnamed: 0,Inspection ID,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude
0,2579974,PING PONG,1765705.0,Restaurant,Risk 1 (High),3322-3326 N BROADWAY,CHICAGO,IL,60657.0,2023-08-11,Non-Inspection,No Entry,,41.942623,-87.644625
5,2579896,LAS ISLAS MARIAS,1943409.0,Restaurant,Risk 1 (High),2400 S PULASKI RD,CHICAGO,IL,60623.0,2023-08-10,Canvass Re-Inspection,Fail,,41.847858,-87.724795
6,2579853,Lito's Taco and Empanada House,2762609.0,Restaurant,Risk 1 (High),2460 N CLARK ST,CHICAGO,IL,60614.0,2023-08-09,Canvass Re-Inspection,No Entry,,41.927067,-87.64153
7,2579910,SHU TIN XIA,2574892.0,Restaurant,Risk 1 (High),2428-2436 S WALLACE ST,CHICAGO,IL,60616.0,2023-08-09,Canvass,Out of Business,,41.848386,-87.64196
8,2579839,QUERETACO,2918599.0,Restaurant,Risk 2 (Medium),2247 E 71ST ST,CHICAGO,IL,60649.0,2023-08-09,License,Pass,,41.766032,-87.56955


In [8]:
restaurant_df.dtypes

Inspection ID               int64
AKA Name                   object
License #                 float64
Facility Type              object
Risk                       object
Address                    object
City                       object
State                      object
Zip                       float64
Inspection Date    datetime64[ns]
Inspection Type            object
Results                    object
Violations                 object
Latitude                  float64
Longitude                 float64
dtype: object

In [10]:
business_info = pd.read_csv('data/all_licensed_businesses.csv')
business_info.drop(['ID', 'LICENSE STATUS', 'ACCOUNT NUMBER', ], axis=1, inplace=True)
business_info.head()

  business_info = pd.read_csv('data/all_licensed_businesses.csv')


Unnamed: 0,LICENSE ID,SITE NUMBER,LEGAL NAME,DOING BUSINESS AS NAME,ADDRESS,CITY,STATE,ZIP CODE,WARD,PRECINCT,...,CONDITIONAL APPROVAL,LICENSE TERM START DATE,LICENSE TERM EXPIRATION DATE,LICENSE APPROVED FOR ISSUANCE,DATE ISSUED,LICENSE STATUS CHANGE DATE,SSA,LATITUDE,LONGITUDE,LOCATION
0,76522,1,THORNDALE CONSTRUCTION,THORNDALE CONSTRUCTION,11243 CHESAPEAKE PLAC 1ST,WESTCHESTER,IL,60154,,,...,N,02/16/2000,02/15/2001,12/22/2003,08/24/2009,,,,,
1,119268,1,"PAT HAMILTON, INC",PAT HAMILTON CO.,17021 S MAGNOLIA DR 1ST,HAZEL CREST,IL,60429,,,...,N,02/16/1996,02/15/1997,12/22/2003,04/12/2006,,,,,
2,2479692,2,"AMERICAN ARBITRATION ASSOCIATION, INC.","AMERICAN ARBITRATION ASSOCIATION, INC.",150 N MICHIGAN AVE 30TH 3050,CHICAGO,IL,60601,34.0,9.0,...,N,09/16/2016,09/15/2018,08/29/2016,08/30/2016,,,41.884531,-87.624602,"(41.88453096378336, -87.62460204460082)"
3,2375632,1,MEE INC,MEE INC,11845 W LARAWAY RD 1,FRANKFORT,IL,60423,,,...,N,03/16/2015,03/15/2017,03/12/2015,03/13/2015,,,,,
4,1940503,45,"FEDEX OFFICE AND PRINT SERVICES, INC.","FEDEX OFFICE AND PRINT SVCS, INC.",2301 S DR MARTIN LUTHER KING JR DR SOUTH BL,CHICAGO,IL,60616,4.0,3.0,...,N,01/16/2009,01/15/2011,01/08/2009,01/22/2009,,,,,


In [11]:
# Find features that are of little value
business_info.isna().sum()/len(business_info)*100

LICENSE ID                            0.000000
SITE NUMBER                           0.000000
LEGAL NAME                            0.000000
DOING BUSINESS AS NAME                0.017125
ADDRESS                               0.000000
CITY                                  0.000270
STATE                                 0.001082
ZIP CODE                              0.026588
WARD                                  7.501345
PRECINCT                             12.621483
WARD PRECINCT                         7.499002
POLICE DISTRICT                      12.044837
LICENSE CODE                          0.000000
LICENSE DESCRIPTION                   0.000000
BUSINESS ACTIVITY ID                 48.954909
BUSINESS ACTIVITY                    48.954909
LICENSE NUMBER                        0.000090
APPLICATION TYPE                      0.000000
APPLICATION CREATED DATE             76.708958
APPLICATION REQUIREMENTS COMPLETE     1.654047
PAYMENT DATE                          2.015645
CONDITIONAL A

In [12]:
business_info['CONDITIONAL APPROVAL'].value_counts()

N    1105249
Y       4272
Name: CONDITIONAL APPROVAL, dtype: int64

In [13]:
business_info['APPLICATION TYPE'].value_counts()

RENEW     844170
ISSUE     248420
C_LOC      14958
C_EXPA       811
C_SBA        661
C_CAPA       501
Name: APPLICATION TYPE, dtype: int64

In [14]:
# keep only pertinent columns
business_info = business_info[['DOING BUSINESS AS NAME','LICENSE ID', 'ADDRESS', 'WARD', 'PRECINCT', 'WARD PRECINCT', 'POLICE DISTRICT', 'LICENSE CODE', 'LICENSE DESCRIPTION', 'LICENSE NUMBER','BUSINESS ACTIVITY ID', 'BUSINESS ACTIVITY', 
               'APPLICATION TYPE','APPLICATION REQUIREMENTS COMPLETE', 'CONDITIONAL APPROVAL', 'LICENSE TERM START DATE', 
               'LICENSE TERM EXPIRATION DATE', 'LICENSE APPROVED FOR ISSUANCE', 'DATE ISSUED']]

In [15]:
# remove unknown Application Type
business_info = business_info[~business_info['APPLICATION TYPE'].isin(['C_LOC', 'C_SBA', 'C_EXPA', 'C_CAPA'])]

### Check to see how many matching business id's there are  
I would like to merge the datasets to obtain renewal/issue info and a few other features

In [16]:
# License # and LICENSE NUMBER are equivalent

restaurant_licenses = restaurant_df['License #'].unique()
business_licenses = business_info['LICENSE NUMBER'].unique()

print(f'Restaurant licenses: {len(restaurant_licenses)}')
print(f'Business licenses: {len(business_licenses)}')


compare=[]
for i in restaurant_licenses:
    compare.append(i in business_licenses)
    
print(f'Matches:  {len(restaurant_licenses[compare])} ({len(restaurant_licenses[compare])/len(restaurant_licenses)*100:0.0f}%)')


Restaurant licenses: 24083
Business licenses: 322425
Matches:  23336 (97%)


### Check a common Restaurant

In [17]:
restaurant_df[restaurant_df['AKA Name'] == 'SUBWAY'].sort_values(by='License #', ascending=True)

Unnamed: 0,Inspection ID,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude
94598,2300450,SUBWAY,0.0,Restaurant,Risk 1 (High),4771 N LINCOLN AVE,CHICAGO,IL,60625.0,2019-06-28,Canvass,Out of Business,,41.968506,-87.688338
8406,2293545,SUBWAY,0.0,Restaurant,Risk 1 (High),4771 N LINCOLN AVE,CHICAGO,IL,60625.0,2019-06-13,Canvass,Fail,"1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW...",41.968506,-87.688338
94955,2293900,SUBWAY,0.0,Restaurant,Risk 1 (High),4771 N LINCOLN AVE,CHICAGO,IL,60625.0,2019-06-20,Canvass Re-Inspection,Pass w/ Conditions,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",41.968506,-87.688338
224619,519380,SUBWAY,15059.0,Restaurant,Risk 1 (High),1313 W TOUHY AVE,CHICAGO,IL,60626.0,2012-02-03,Canvass Re-Inspection,Pass,,42.012710,-87.664034
115695,1955999,SUBWAY,15059.0,Restaurant,Risk 1 (High),1313 W TOUHY AVE,CHICAGO,IL,60626.0,2017-11-14,Complaint,Pass,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,42.012710,-87.664034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37107,2570591,SUBWAY,2890832.0,Restaurant,Risk 1 (High),2620 N NARRAGANSETT AVE,CHICAGO,IL,60639.0,2023-02-02,License,Pass,,41.927995,-87.785752
38592,2573846,SUBWAY,2900790.0,Restaurant,Risk 1 (High),400 N ORLEANS ST,CHICAGO,IL,60654.0,2023-04-07,License,Pass,36. THERMOMETERS PROVIDED & ACCURATE - Comment...,41.889172,-87.637110
235,2577597,SUBWAY,2906313.0,Restaurant,Risk 1 (High),1958 W PETERSON AVE,CHICAGO,IL,60660.0,2023-06-21,License,Pass,,41.990792,-87.679365
35710,2577541,SUBWAY,2917457.0,Restaurant,Risk 1 (High),1252 S HALSTED ST,CHICAGO,IL,60607.0,2023-06-20,License,Pass,"55. PHYSICAL FACILITIES INSTALLED, MAINTAINED ...",41.865599,-87.647018


In [18]:
restaurant_df.dtypes

Inspection ID               int64
AKA Name                   object
License #                 float64
Facility Type              object
Risk                       object
Address                    object
City                       object
State                      object
Zip                       float64
Inspection Date    datetime64[ns]
Inspection Type            object
Results                    object
Violations                 object
Latitude                  float64
Longitude                 float64
dtype: object

In [19]:
restaurant_df['year'] = restaurant_df['Inspection Date'].dt.strftime('%Y')

In [20]:
business_info[business_info['DOING BUSINESS AS NAME'] == 'SUBWAY'].sort_values(by='LICENSE NUMBER', ascending=True).iloc[:,0:20]

Unnamed: 0,DOING BUSINESS AS NAME,LICENSE ID,ADDRESS,WARD,PRECINCT,WARD PRECINCT,POLICE DISTRICT,LICENSE CODE,LICENSE DESCRIPTION,LICENSE NUMBER,BUSINESS ACTIVITY ID,BUSINESS ACTIVITY,APPLICATION TYPE,APPLICATION REQUIREMENTS COMPLETE,CONDITIONAL APPROVAL,LICENSE TERM START DATE,LICENSE TERM EXPIRATION DATE,LICENSE APPROVED FOR ISSUANCE,DATE ISSUED
2641,SUBWAY,2863846,1313 W TOUHY AVE,49.0,27.0,49-27,24.0,1006,Retail Food Establishment,15059,775,Retail Sales of Perishable Foods,RENEW,08/15/2022,N,10/16/2022,10/15/2024,08/16/2022,08/17/2022
20525,SUBWAY,2891924,1917 W FULLERTON AVE # 1ST,32.0,34.0,32-34,14.0,1006,Retail Food Establishment,75380,775,Retail Sales of Perishable Foods,RENEW,02/15/2023,N,04/16/2023,04/15/2025,04/04/2023,04/05/2023
44229,SUBWAY,2888244,1020 W BELMONT AVE,44.0,39.0,44-39,19.0,1006,Retail Food Establishment,1272424,775,Retail Sales of Perishable Foods,RENEW,01/15/2023,N,03/16/2023,03/15/2025,01/31/2023,02/01/2023
41768,SUBWAY,2883591,3021 N CLARK ST,44.0,6.0,44-6,19.0,1006,Retail Food Establishment,1272426,775,Retail Sales of Perishable Foods,RENEW,12/15/2022,N,02/16/2023,02/15/2025,12/21/2022,12/22/2022
6173,SUBWAY,2853315,4030 W ROOSEVELT RD,24.0,48.0,24-48,11.0,1006,Retail Food Establishment,1447811,775,Retail Sales of Perishable Foods,RENEW,06/15/2022,N,08/16/2022,08/15/2024,07/25/2022,07/26/2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42968,SUBWAY,2877825,2750 W DEVON AVE 1,50.0,19.0,50-19,24.0,1006,Retail Food Establishment,2877825,1083,Expedited Restaurant with On-Premises Consumption,ISSUE,01/10/2023,N,01/12/2023,01/15/2025,01/11/2023,01/12/2023
22740,SUBWAY,2900790,400 N ORLEANS ST 1 1,42.0,17.0,42-17,18.0,1006,Retail Food Establishment,2900790,781,Sale of Food Prepared Onsite With Dining Area,ISSUE,03/24/2023,N,04/11/2023,04/15/2025,04/10/2023,04/11/2023
17040,SUBWAY,2906313,1958 W PETERSON AVE 1ST,40.0,9.0,40-9,24.0,1006,Retail Food Establishment,2906313,781,Sale of Food Prepared Onsite With Dining Area,ISSUE,04/25/2023,N,07/26/2023,08/15/2025,07/26/2023,07/26/2023
16954,SUBWAY,2917457,1252 S HALSTED ST 1ST 5,28.0,7.0,28-7,12.0,1006,Retail Food Establishment,2917457,781,Sale of Food Prepared Onsite With Dining Area,ISSUE,06/15/2023,N,07/26/2023,08/15/2025,07/26/2023,07/26/2023


### Check other records to ensure matching by decomals on LICENSE NUMBER works

In [21]:
business_info[business_info['LICENSE NUMBER'] == 15059.0]

Unnamed: 0,DOING BUSINESS AS NAME,LICENSE ID,ADDRESS,WARD,PRECINCT,WARD PRECINCT,POLICE DISTRICT,LICENSE CODE,LICENSE DESCRIPTION,LICENSE NUMBER,BUSINESS ACTIVITY ID,BUSINESS ACTIVITY,APPLICATION TYPE,APPLICATION REQUIREMENTS COMPLETE,CONDITIONAL APPROVAL,LICENSE TERM START DATE,LICENSE TERM EXPIRATION DATE,LICENSE APPROVED FOR ISSUANCE,DATE ISSUED
2641,SUBWAY,2863846,1313 W TOUHY AVE,49.0,27.0,49-27,24.0,1006,Retail Food Establishment,15059,775,Retail Sales of Perishable Foods,RENEW,08/15/2022,N,10/16/2022,10/15/2024,08/16/2022,08/17/2022


### See if the ADDRESS matches

In [22]:
business_info[business_info['ADDRESS']=='4771 N LINCOLN AVE']

Unnamed: 0,DOING BUSINESS AS NAME,LICENSE ID,ADDRESS,WARD,PRECINCT,WARD PRECINCT,POLICE DISTRICT,LICENSE CODE,LICENSE DESCRIPTION,LICENSE NUMBER,BUSINESS ACTIVITY ID,BUSINESS ACTIVITY,APPLICATION TYPE,APPLICATION REQUIREMENTS COMPLETE,CONDITIONAL APPROVAL,LICENSE TERM START DATE,LICENSE TERM EXPIRATION DATE,LICENSE APPROVED FOR ISSUANCE,DATE ISSUED
34723,SUBWAY,2796097,4771 N LINCOLN AVE,47.0,,47-,,1006,Retail Food Establishment,2677482,781,Sale of Food Prepared Onsite With Dining Area,RENEW,06/15/2021,N,08/16/2021,08/15/2023,01/31/2022,09/07/2022


### Check range of License \#

In [23]:
restaurant_df['License #'].describe()

count    1.732060e+05
mean     1.757982e+06
std      8.773912e+05
min      0.000000e+00
25%      1.473493e+06
50%      2.048650e+06
75%      2.379895e+06
max      3.624677e+06
Name: License #, dtype: float64

In [24]:
restaurant_df['License #'].value_counts()

1354323.0    196
0.0          158
1574001.0     69
60184.0       60
1974745.0     58
            ... 
2827305.0      1
2771129.0      1
2293833.0      1
2283741.0      1
1800100.0      1
Name: License #, Length: 24083, dtype: int64

In [25]:
restaurant_df.columns

Index(['Inspection ID', 'AKA Name', 'License #', 'Facility Type', 'Risk',
       'Address', 'City', 'State', 'Zip', 'Inspection Date', 'Inspection Type',
       'Results', 'Violations', 'Latitude', 'Longitude', 'year'],
      dtype='object')

In [26]:
restaurant_df.groupby(['Address','AKA Name', 'License #','year']).count()['Risk']

Address              AKA Name                    License #  year
                     CASA CENTRAL LA POSADA      0.0        2010    1
                     EAT N RUN CHICKEN AND FISH  2442868.0  2016    1
1 E 83RD ST          SUBWAY                      1938179.0  2010    1
                                                            2011    4
                                                            2012    1
                                                                   ..
9956 S WESTERN AVE   FOX'S BEVERLY PUB           149.0      2019    2
                                                            2020    2
                                                            2021    2
                                                            2022    1
                                                            2023    2
Name: Risk, Length: 106780, dtype: int64

In [27]:
restaurant_df[restaurant_df['AKA Name'] == 'SUBWAY'].groupby(['Address','AKA Name', 'License #','year']).count()['Risk']

Address             AKA Name  License #  year
1 E 83RD ST         SUBWAY    1938179.0  2010    1
                                         2011    4
                                         2012    1
                                         2013    3
                                         2014    2
                                                ..
9518 S HALSTED ST   SUBWAY    1846937.0  2019    1
953 W WEBSTER AVE   SUBWAY    2327923.0  2014    3
                              2354661.0  2014    1
                                         2015    1
                                         2016    2
Name: Risk, Length: 2652, dtype: int64

In [28]:
bad_license_numbers = restaurant_df[restaurant_df['License #'] < 100]
len(bad_license_numbers)

247

In [29]:
bad_license_numbers

Unnamed: 0,Inspection ID,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,year
332,2576806,MINI SUPER DANY,0.0,Restaurant,Risk 1 (High),10111 S EWING AVE,CHICAGO,IL,60617.0,2023-06-05,Complaint,Fail,"1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW...",41.711605,-87.535133,2023
744,2573601,PENNINGTON'S FOOD AND SNACKS,0.0,Restaurant,Risk 1 (High),410 E 79TH ST,CHICAGO,IL,60619.0,2023-04-03,Complaint,Fail,2. CITY OF CHICAGO FOOD SERVICE SANITATION CER...,41.751234,-87.614471,2023
2859,2553296,KRISPY KREME DOUGHNUTS,0.0,Restaurant,Risk 2 (Medium),108 N STATE ST,CHICAGO,IL,60602.0,2022-03-23,Canvass,Out of Business,,41.883423,-87.628022,2022
4962,2484249,XANDO COFFEE & BAR / COSI SANDWICH BAR,9.0,Restaurant,Risk 1 (High),116 S MICHIGAN AVE,CHICAGO,IL,60603.0,2021-01-28,Canvass,Out of Business,,41.880396,-87.624502,2021
5301,2454105,,0.0,Restaurant,Risk 2 (Medium),3407 1/2 W MADISON ST,CHICAGO,IL,60624.0,2020-11-02,Canvass,Out of Business,,41.880805,-87.711375,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250938,229498,XANDO COFFEE & BAR / COSI SANDWICH BAR,62.0,Restaurant,Risk 1 (High),230 W WASHINGTON ST,CHICAGO,IL,60606.0,2010-06-28,Canvass Re-Inspection,Pass,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,41.883318,-87.634769,2010
254608,60405,CASA CENTRAL LA POSADA,0.0,Restaurant,Risk 1 (High),,,IL,,2010-04-05,Canvass,Pass,37. TOILET ROOM DOORS SELF CLOSING: DRESSING R...,,,2010
255452,68320,,0.0,Restaurant,Risk 1 (High),3522 E 100TH ST,CHICAGO,IL,60617.0,2010-03-02,Consultation,Fail,,41.713977,-87.537917,2010
256045,120273,UPS Cafeteria,0.0,Restaurant,Risk 1 (High),1400 S JEFFERSON ST,CHICAGO,IL,60607.0,2010-01-06,Canvass,Pass,33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENSI...,41.863501,-87.642210,2010


In [30]:
bad_license_numbers[bad_license_numbers['AKA Name']=='SUBWAY']

Unnamed: 0,Inspection ID,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,year
8406,2293545,SUBWAY,0.0,Restaurant,Risk 1 (High),4771 N LINCOLN AVE,CHICAGO,IL,60625.0,2019-06-13,Canvass,Fail,"1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW...",41.968506,-87.688338,2019
94598,2300450,SUBWAY,0.0,Restaurant,Risk 1 (High),4771 N LINCOLN AVE,CHICAGO,IL,60625.0,2019-06-28,Canvass,Out of Business,,41.968506,-87.688338,2019
94955,2293900,SUBWAY,0.0,Restaurant,Risk 1 (High),4771 N LINCOLN AVE,CHICAGO,IL,60625.0,2019-06-20,Canvass Re-Inspection,Pass w/ Conditions,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",41.968506,-87.688338,2019


In [31]:
business_info[business_info['ADDRESS']=='4771 N LINCOLN AVE']

Unnamed: 0,DOING BUSINESS AS NAME,LICENSE ID,ADDRESS,WARD,PRECINCT,WARD PRECINCT,POLICE DISTRICT,LICENSE CODE,LICENSE DESCRIPTION,LICENSE NUMBER,BUSINESS ACTIVITY ID,BUSINESS ACTIVITY,APPLICATION TYPE,APPLICATION REQUIREMENTS COMPLETE,CONDITIONAL APPROVAL,LICENSE TERM START DATE,LICENSE TERM EXPIRATION DATE,LICENSE APPROVED FOR ISSUANCE,DATE ISSUED
34723,SUBWAY,2796097,4771 N LINCOLN AVE,47.0,,47-,,1006,Retail Food Establishment,2677482,781,Sale of Food Prepared Onsite With Dining Area,RENEW,06/15/2021,N,08/16/2021,08/15/2023,01/31/2022,09/07/2022


In [32]:
# this should work

# name = (bad_license_numbers[bad_license_numbers['Inspection ID'] == 2576806]['Address']).values[0]

bad_license_numbers[bad_license_numbers['Address']=='4771 N LINCOLN AVE ']

Unnamed: 0,Inspection ID,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,year
8406,2293545,SUBWAY,0.0,Restaurant,Risk 1 (High),4771 N LINCOLN AVE,CHICAGO,IL,60625.0,2019-06-13,Canvass,Fail,"1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW...",41.968506,-87.688338,2019
94598,2300450,SUBWAY,0.0,Restaurant,Risk 1 (High),4771 N LINCOLN AVE,CHICAGO,IL,60625.0,2019-06-28,Canvass,Out of Business,,41.968506,-87.688338,2019
94955,2293900,SUBWAY,0.0,Restaurant,Risk 1 (High),4771 N LINCOLN AVE,CHICAGO,IL,60625.0,2019-06-20,Canvass Re-Inspection,Pass w/ Conditions,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",41.968506,-87.688338,2019


In [33]:
bad_license_numbers['Address'] = bad_license_numbers['Address'].apply(lambda x: x.strip())
bad_license_numbers[bad_license_numbers['Address']=='4771 N LINCOLN AVE']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bad_license_numbers['Address'] = bad_license_numbers['Address'].apply(lambda x: x.strip())


Unnamed: 0,Inspection ID,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,year
8406,2293545,SUBWAY,0.0,Restaurant,Risk 1 (High),4771 N LINCOLN AVE,CHICAGO,IL,60625.0,2019-06-13,Canvass,Fail,"1. PERSON IN CHARGE PRESENT, DEMONSTRATES KNOW...",41.968506,-87.688338,2019
94598,2300450,SUBWAY,0.0,Restaurant,Risk 1 (High),4771 N LINCOLN AVE,CHICAGO,IL,60625.0,2019-06-28,Canvass,Out of Business,,41.968506,-87.688338,2019
94955,2293900,SUBWAY,0.0,Restaurant,Risk 1 (High),4771 N LINCOLN AVE,CHICAGO,IL,60625.0,2019-06-20,Canvass Re-Inspection,Pass w/ Conditions,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",41.968506,-87.688338,2019


In [34]:
bad_license_numbers2 = business_info[business_info['LICENSE NUMBER'] < 100]
len(bad_license_numbers2)

0

### See if the correct License # can be obtained from business_info by matching addresses

In [35]:
business_info[business_info['DOING BUSINESS AS NAME'] == 'SUBWAY'].sort_values(by='ADDRESS', ascending=True)

Unnamed: 0,DOING BUSINESS AS NAME,LICENSE ID,ADDRESS,WARD,PRECINCT,WARD PRECINCT,POLICE DISTRICT,LICENSE CODE,LICENSE DESCRIPTION,LICENSE NUMBER,BUSINESS ACTIVITY ID,BUSINESS ACTIVITY,APPLICATION TYPE,APPLICATION REQUIREMENTS COMPLETE,CONDITIONAL APPROVAL,LICENSE TERM START DATE,LICENSE TERM EXPIRATION DATE,LICENSE APPROVED FOR ISSUANCE,DATE ISSUED
46862,SUBWAY,2865113,1 E 83RD ST 1,6.0,37.0,6-37,6.0,1006,Retail Food Establishment,1938179,775,Retail Sales of Perishable Foods,RENEW,08/15/2022,N,10/16/2022,10/15/2024,10/26/2022,03/13/2023
34834,SUBWAY,2847889,100 W 87TH ST,21.0,37.0,21-37,6.0,1006,Retail Food Establishment,1739186,775,Retail Sales of Perishable Foods,RENEW,05/15/2022,N,07/16/2022,07/15/2024,09/07/2022,09/08/2022
44229,SUBWAY,2888244,1020 W BELMONT AVE,44.0,39.0,44-39,19.0,1006,Retail Food Establishment,1272424,775,Retail Sales of Perishable Foods,RENEW,01/15/2023,N,03/16/2023,03/15/2025,01/31/2023,02/01/2023
27451,SUBWAY,2875022,10319 S PULASKI RD,19.0,12.0,19-12,22.0,1006,Retail Food Establishment,2202608,775,Retail Sales of Perishable Foods,RENEW,10/15/2022,N,12/16/2022,12/15/2024,04/18/2023,04/19/2023
46007,SUBWAY,2885331,10325 S TORRENCE AVE 1ST,10.0,30.0,10-30,4.0,1006,Retail Food Establishment,2373618,781,Sale of Food Prepared Onsite With Dining Area,RENEW,12/15/2022,N,02/16/2023,02/15/2025,02/27/2023,02/28/2023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36425,SUBWAY,2859935,800 N KEDZIE AVE LST 301,27.0,51.0,27-51,11.0,1006,Retail Food Establishment,1909605,775,Retail Sales of Perishable Foods,RENEW,07/15/2022,N,09/16/2022,09/15/2024,09/20/2022,09/21/2022
1270,SUBWAY,2849548,8331 S STEWART AVE 1,21.0,31.0,21-31,6.0,1006,Retail Food Establishment,2608389,735,Preparation of Food and Dining on Premise With...,RENEW,05/15/2022,N,07/16/2022,07/15/2024,05/23/2022,05/24/2022
28452,SUBWAY,2810324,8614 S KEDZIE AVE,18.0,10.0,18-10,8.0,1006,Retail Food Establishment,1517669,775,Retail Sales of Perishable Foods,RENEW,09/15/2021,N,11/16/2021,11/15/2023,09/28/2021,09/29/2021
22117,SUBWAY,2839061,8655 S JEFFERY BLVD 1ST,8.0,25.0,8-25,4.0,1006,Retail Food Establishment,2452423,781,Sale of Food Prepared Onsite With Dining Area,RENEW,03/15/2022,N,05/16/2022,05/15/2024,03/25/2022,03/28/2022


In [36]:
len(bad_license_numbers)

247

In [37]:
# most likely this does not work, as is, because year-to-year the restaurants change
temp = pd.merge(bad_license_numbers, business_info, left_on='Address', right_on='ADDRESS', how='left')
temp

Unnamed: 0,Inspection ID,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,...,LICENSE NUMBER,BUSINESS ACTIVITY ID,BUSINESS ACTIVITY,APPLICATION TYPE,APPLICATION REQUIREMENTS COMPLETE,CONDITIONAL APPROVAL,LICENSE TERM START DATE,LICENSE TERM EXPIRATION DATE,LICENSE APPROVED FOR ISSUANCE,DATE ISSUED
0,2576806,MINI SUPER DANY,0.0,Restaurant,Risk 1 (High),10111 S EWING AVE,CHICAGO,IL,60617.0,2023-06-05,...,2900944.0,767,Retail Sales of General Merchandise and Non-Pe...,ISSUE,03/28/2023,N,03/28/2023,10/15/2023,03/28/2023,03/28/2023
1,2573601,PENNINGTON'S FOOD AND SNACKS,0.0,Restaurant,Risk 1 (High),410 E 79TH ST,CHICAGO,IL,60619.0,2023-04-03,...,,,,,,,,,,
2,2553296,KRISPY KREME DOUGHNUTS,0.0,Restaurant,Risk 2 (Medium),108 N STATE ST,CHICAGO,IL,60602.0,2022-03-23,...,2555968.0,904,Retail Sales of General Merchandise,RENEW,01/15/2023,N,03/16/2023,03/15/2025,03/14/2023,06/20/2023
3,2553296,KRISPY KREME DOUGHNUTS,0.0,Restaurant,Risk 2 (Medium),108 N STATE ST,CHICAGO,IL,60602.0,2022-03-23,...,2677865.0,911,Retail Sales of Clothing / Accessories / Shoes,RENEW,05/15/2023,N,07/16/2023,07/15/2025,07/05/2023,07/06/2023
4,2553296,KRISPY KREME DOUGHNUTS,0.0,Restaurant,Risk 2 (Medium),108 N STATE ST,CHICAGO,IL,60602.0,2022-03-23,...,2008920.0,911,Retail Sales of Clothing / Accessories / Shoes,RENEW,09/15/2022,N,11/16/2022,11/15/2024,10/17/2022,10/18/2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
493,60405,CASA CENTRAL LA POSADA,0.0,Restaurant,Risk 1 (High),,,IL,,2010-04-05,...,,,,,,,,,,
494,68320,,0.0,Restaurant,Risk 1 (High),3522 E 100TH ST,CHICAGO,IL,60617.0,2010-03-02,...,,,,,,,,,,
495,120273,UPS Cafeteria,0.0,Restaurant,Risk 1 (High),1400 S JEFFERSON ST,CHICAGO,IL,60607.0,2010-01-06,...,2216352.0,816,Storage or Use of Hazardous Materials,RENEW,03/15/2022,N,05/16/2022,05/15/2024,04/27/2022,12/14/2022
496,197244,COSI,99.0,Restaurant,Risk 1 (High),203 N LA SALLE ST,CHICAGO,IL,60601.0,2010-02-03,...,1220988.0,725,Provide 4 or More Parking Spaces For a Fee - A...,RENEW,05/15/2022,N,07/16/2022,07/15/2024,06/30/2022,07/05/2022


In [38]:
temp.dropna(subset='LICENSE NUMBER', axis=0, inplace=True)
# temp.set_index('Inspection ID', inplace=True)
# restaurant_df.set_index('Inspection ID', inplace=True)
temp

Unnamed: 0,Inspection ID,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,...,LICENSE NUMBER,BUSINESS ACTIVITY ID,BUSINESS ACTIVITY,APPLICATION TYPE,APPLICATION REQUIREMENTS COMPLETE,CONDITIONAL APPROVAL,LICENSE TERM START DATE,LICENSE TERM EXPIRATION DATE,LICENSE APPROVED FOR ISSUANCE,DATE ISSUED
0,2576806,MINI SUPER DANY,0.0,Restaurant,Risk 1 (High),10111 S EWING AVE,CHICAGO,IL,60617.0,2023-06-05,...,2900944.0,767,Retail Sales of General Merchandise and Non-Pe...,ISSUE,03/28/2023,N,03/28/2023,10/15/2023,03/28/2023,03/28/2023
2,2553296,KRISPY KREME DOUGHNUTS,0.0,Restaurant,Risk 2 (Medium),108 N STATE ST,CHICAGO,IL,60602.0,2022-03-23,...,2555968.0,904,Retail Sales of General Merchandise,RENEW,01/15/2023,N,03/16/2023,03/15/2025,03/14/2023,06/20/2023
3,2553296,KRISPY KREME DOUGHNUTS,0.0,Restaurant,Risk 2 (Medium),108 N STATE ST,CHICAGO,IL,60602.0,2022-03-23,...,2677865.0,911,Retail Sales of Clothing / Accessories / Shoes,RENEW,05/15/2023,N,07/16/2023,07/15/2025,07/05/2023,07/06/2023
4,2553296,KRISPY KREME DOUGHNUTS,0.0,Restaurant,Risk 2 (Medium),108 N STATE ST,CHICAGO,IL,60602.0,2022-03-23,...,2008920.0,911,Retail Sales of Clothing / Accessories / Shoes,RENEW,09/15/2022,N,11/16/2022,11/15/2024,10/17/2022,10/18/2022
5,2553296,KRISPY KREME DOUGHNUTS,0.0,Restaurant,Risk 2 (Medium),108 N STATE ST,CHICAGO,IL,60602.0,2022-03-23,...,2895858.0,708,Miscellaneous Commercial Services,ISSUE,02/24/2023,N,02/27/2023,03/15/2025,02/27/2023,02/27/2023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
488,229468,COSI,2.0,Restaurant,Risk 1 (High),230 W MONROE ST,CHICAGO,IL,60606.0,2010-06-15,...,1545138.0,775,Retail Sales of Perishable Foods,RENEW,03/15/2023,N,05/16/2023,05/15/2025,05/25/2023,07/17/2023
490,277423,XANDO COFFEE & BAR / COSI SANDWICH BAR,9.0,Restaurant,Risk 1 (High),116 S MICHIGAN AVE,CHICAGO,IL,60603.0,2010-07-15,...,2785819.0,781,Sale of Food Prepared Onsite With Dining Area,RENEW,10/15/2022,N,12/16/2022,12/15/2024,11/07/2022,11/08/2022
495,120273,UPS Cafeteria,0.0,Restaurant,Risk 1 (High),1400 S JEFFERSON ST,CHICAGO,IL,60607.0,2010-01-06,...,2216352.0,816,Storage or Use of Hazardous Materials,RENEW,03/15/2022,N,05/16/2022,05/15/2024,04/27/2022,12/14/2022
496,197244,COSI,99.0,Restaurant,Risk 1 (High),203 N LA SALLE ST,CHICAGO,IL,60601.0,2010-02-03,...,1220988.0,725,Provide 4 or More Parking Spaces For a Fee - A...,RENEW,05/15/2022,N,07/16/2022,07/15/2024,06/30/2022,07/05/2022


In [39]:
restaurant_df

Unnamed: 0,Inspection ID,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,year
0,2579974,PING PONG,1765705.0,Restaurant,Risk 1 (High),3322-3326 N BROADWAY,CHICAGO,IL,60657.0,2023-08-11,Non-Inspection,No Entry,,41.942623,-87.644625,2023
5,2579896,LAS ISLAS MARIAS,1943409.0,Restaurant,Risk 1 (High),2400 S PULASKI RD,CHICAGO,IL,60623.0,2023-08-10,Canvass Re-Inspection,Fail,,41.847858,-87.724795,2023
6,2579853,Lito's Taco and Empanada House,2762609.0,Restaurant,Risk 1 (High),2460 N CLARK ST,CHICAGO,IL,60614.0,2023-08-09,Canvass Re-Inspection,No Entry,,41.927067,-87.641530,2023
7,2579910,SHU TIN XIA,2574892.0,Restaurant,Risk 1 (High),2428-2436 S WALLACE ST,CHICAGO,IL,60616.0,2023-08-09,Canvass,Out of Business,,41.848386,-87.641960,2023
8,2579839,QUERETACO,2918599.0,Restaurant,Risk 2 (Medium),2247 E 71ST ST,CHICAGO,IL,60649.0,2023-08-09,License,Pass,,41.766032,-87.569550,2023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
258044,63258,CHICKIES,1848186.0,Restaurant,Risk 2 (Medium),2839 S PULASKI RD,CHICAGO,IL,60623.0,2010-02-10,Complaint,Fail,"6. HANDS WASHED AND CLEANED, GOOD HYGIENIC PRA...",41.839389,-87.724271,2010
258045,215227,WENDY'S,1145755.0,Restaurant,Risk 1 (High),7601 S CICERO AVE,CHICAGO,IL,60652.0,2010-02-22,Complaint-Fire Re-inspection,Pass,,41.754660,-87.741385,2010
258049,114321,HOTEL CHICAGO,1680526.0,Restaurant,Risk 1 (High),333 N DEARBORN ST,CHICAGO,IL,60654.0,2010-02-04,Canvass,Pass w/ Conditions,8. SANITIZING RINSE FOR EQUIPMENT AND UTENSILS...,41.888414,-87.629411,2010
258051,70292,EL GORDO RESTAURANT,31100.0,Restaurant,Risk 1 (High),5746 W BELMONT AVE,CHICAGO,IL,60634.0,2010-01-19,Canvass,Pass,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,41.938627,-87.770830,2010


In [40]:
temp


Unnamed: 0,Inspection ID,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,...,LICENSE NUMBER,BUSINESS ACTIVITY ID,BUSINESS ACTIVITY,APPLICATION TYPE,APPLICATION REQUIREMENTS COMPLETE,CONDITIONAL APPROVAL,LICENSE TERM START DATE,LICENSE TERM EXPIRATION DATE,LICENSE APPROVED FOR ISSUANCE,DATE ISSUED
0,2576806,MINI SUPER DANY,0.0,Restaurant,Risk 1 (High),10111 S EWING AVE,CHICAGO,IL,60617.0,2023-06-05,...,2900944.0,767,Retail Sales of General Merchandise and Non-Pe...,ISSUE,03/28/2023,N,03/28/2023,10/15/2023,03/28/2023,03/28/2023
2,2553296,KRISPY KREME DOUGHNUTS,0.0,Restaurant,Risk 2 (Medium),108 N STATE ST,CHICAGO,IL,60602.0,2022-03-23,...,2555968.0,904,Retail Sales of General Merchandise,RENEW,01/15/2023,N,03/16/2023,03/15/2025,03/14/2023,06/20/2023
3,2553296,KRISPY KREME DOUGHNUTS,0.0,Restaurant,Risk 2 (Medium),108 N STATE ST,CHICAGO,IL,60602.0,2022-03-23,...,2677865.0,911,Retail Sales of Clothing / Accessories / Shoes,RENEW,05/15/2023,N,07/16/2023,07/15/2025,07/05/2023,07/06/2023
4,2553296,KRISPY KREME DOUGHNUTS,0.0,Restaurant,Risk 2 (Medium),108 N STATE ST,CHICAGO,IL,60602.0,2022-03-23,...,2008920.0,911,Retail Sales of Clothing / Accessories / Shoes,RENEW,09/15/2022,N,11/16/2022,11/15/2024,10/17/2022,10/18/2022
5,2553296,KRISPY KREME DOUGHNUTS,0.0,Restaurant,Risk 2 (Medium),108 N STATE ST,CHICAGO,IL,60602.0,2022-03-23,...,2895858.0,708,Miscellaneous Commercial Services,ISSUE,02/24/2023,N,02/27/2023,03/15/2025,02/27/2023,02/27/2023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
488,229468,COSI,2.0,Restaurant,Risk 1 (High),230 W MONROE ST,CHICAGO,IL,60606.0,2010-06-15,...,1545138.0,775,Retail Sales of Perishable Foods,RENEW,03/15/2023,N,05/16/2023,05/15/2025,05/25/2023,07/17/2023
490,277423,XANDO COFFEE & BAR / COSI SANDWICH BAR,9.0,Restaurant,Risk 1 (High),116 S MICHIGAN AVE,CHICAGO,IL,60603.0,2010-07-15,...,2785819.0,781,Sale of Food Prepared Onsite With Dining Area,RENEW,10/15/2022,N,12/16/2022,12/15/2024,11/07/2022,11/08/2022
495,120273,UPS Cafeteria,0.0,Restaurant,Risk 1 (High),1400 S JEFFERSON ST,CHICAGO,IL,60607.0,2010-01-06,...,2216352.0,816,Storage or Use of Hazardous Materials,RENEW,03/15/2022,N,05/16/2022,05/15/2024,04/27/2022,12/14/2022
496,197244,COSI,99.0,Restaurant,Risk 1 (High),203 N LA SALLE ST,CHICAGO,IL,60601.0,2010-02-03,...,1220988.0,725,Provide 4 or More Parking Spaces For a Fee - A...,RENEW,05/15/2022,N,07/16/2022,07/15/2024,06/30/2022,07/05/2022


In [41]:
# Need to re-index both dataframes

# restaurant_df['License #'] = restaurant_df.replace(restaurant_df['License #'], dict(temp['LICENSE NUMBER']), inplace=True)

In [56]:
restaurant_df[restaurant_df['AKA Name'] == 'SUBWAY'].sort_values(by='License #', ascending=True)
restaurant_df.fillna('-99.  Unknown', inplace=True)

In [57]:
restaurant_df['License #'].value_counts()

1354323.0    196
0.0          158
1574001.0     69
60184.0       60
1974745.0     58
            ... 
2827305.0      1
2771129.0      1
2293833.0      1
2283741.0      1
1800100.0      1
Name: License #, Length: 24083, dtype: int64

In [58]:
# Solution :  Remove License # under 100 becasue they are probably incorrect

In [59]:
restaurant_df['Results'].value_counts()

Pass                    88166
Fail                    32544
Pass w/ Conditions      29606
Out of Business         12351
No Entry                 8451
Not Ready                2062
Business Not Located       26
Name: Results, dtype: int64

In [60]:
temp = restaurant_df.groupby(['AKA Name', 'License #']).count().iloc[:,0:2]

names = temp.index.get_level_values(0)
temp

Unnamed: 0_level_0,Unnamed: 1_level_0,Inspection ID,Facility Type
AKA Name,License #,Unnamed: 2_level_1,Unnamed: 3_level_1
"#1 CHINA EXPRESS , LTD",1869616.0,2,2
#1 CHOP SUEY,1970042.0,26,26
#1 CHOP SUEY,2042372.0,11,11
#1 CHOP SUEY,2137377.0,20,20
#1 CHOP SUEY,2695112.0,4,4
...,...,...,...
naansense,2098531.0,8,8
no signage,1292.0,3,3
the FRENCH LUNCHBOX,2511081.0,5,5
vitino pizzeria,2009515.0,1,1


In [61]:
len(temp.xs('1 N WACKER KITCHEN AND BAR', level='AKA Name'))


2

In [62]:
location_counts = []
for name in names:
    num = len(temp.xs(name, level='AKA Name'))
    location_counts.append(num)

temp['locations'] = location_counts
temp.drop(labels='Facility Type', axis=1, inplace=True)
temp = temp.sort_values(by=['locations'], ascending=False)

In [63]:
temp

Unnamed: 0_level_0,Unnamed: 1_level_0,Inspection ID,locations
AKA Name,License #,Unnamed: 2_level_1,Unnamed: 3_level_1
-99. Unknown,2368190.0,5,388
-99. Unknown,2293822.0,8,388
-99. Unknown,2289715.0,1,388
-99. Unknown,2289533.0,3,388
-99. Unknown,2284667.0,7,388
...,...,...,...
KENTUCKY FRIED CHICKEN 521038,1816808.0,10,1
KENTUCKY FRIED CHICKEN 521039,1816810.0,11,1
KENTUCKY FRIED CHICKEN 521040,1816811.0,11,1
KENTUCKY FRIED CHICKEN 521041,1816812.0,6,1


In [64]:
temp.xs('DUNKIN DONUTS', level='AKA Name')

Unnamed: 0_level_0,Inspection ID,locations
License #,Unnamed: 1_level_1,Unnamed: 2_level_1
2569804.0,6,188
1677626.0,8,188
2642143.0,8,188
2569844.0,3,188
2642263.0,1,188
...,...,...
1913870.0,14,188
1913868.0,24,188
2890954.0,1,188
13801.0,14,188


In [65]:
restaurant_df['Violations_List'] = restaurant_df['Violations'].apply(lambda x: x.split("|"))
restaurant_df['Number_violations'] = restaurant_df['Violations_List'].apply(lambda x: len(x))
restaurant_df.head(2)

Unnamed: 0,Inspection ID,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,year,Violations_List,Number_violations
0,2579974,PING PONG,1765705.0,Restaurant,Risk 1 (High),3322-3326 N BROADWAY,CHICAGO,IL,60657.0,2023-08-11,Non-Inspection,No Entry,-99. Unknown,41.942623,-87.644625,2023,[-99. Unknown],1
5,2579896,LAS ISLAS MARIAS,1943409.0,Restaurant,Risk 1 (High),2400 S PULASKI RD,CHICAGO,IL,60623.0,2023-08-10,Canvass Re-Inspection,Fail,-99. Unknown,41.847858,-87.724795,2023,[-99. Unknown],1


In [67]:
import re

restaurant_df['violation_number'] = restaurant_df['Violations_List'].apply(lambda x: [re.findall(r'\b\d+\b',i)[0] for i in x])
restaurant_df.head(3)

restaurant_df['must_comply'] = restaurant_df['Violations_List'].apply(lambda x: [ 'MUST COMPLY' in i for i in x])
restaurant_df['must_comply_count'] = restaurant_df['must_comply'].apply(lambda x: sum(x))

restaurant_df['instructed_comply'] = restaurant_df['Violations_List'].apply(lambda x: ['INSTRUCTED TO COMPLY' in i for i in x])
restaurant_df['instructed_comply_count'] = restaurant_df['instructed_comply'].apply(lambda x: sum(x))

restaurant_df['not_cited'] = restaurant_df['Violations_List'].apply(lambda x: ['NO CITATION ISSUED' in i for i in x])
restaurant_df['citation_count'] = restaurant_df['not_cited'].apply(lambda x: len(x) - sum(x))

restaurant_df.head(2)

Unnamed: 0,Inspection ID,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,...,year,Violations_List,Number_violations,violation_number,must_comply,must_comply_count,instructed_comply,instructed_comply_count,not_cited,citation_count
0,2579974,PING PONG,1765705.0,Restaurant,Risk 1 (High),3322-3326 N BROADWAY,CHICAGO,IL,60657.0,2023-08-11,...,2023,[-99. Unknown],1,[99],[False],0,[False],0,[False],1
5,2579896,LAS ISLAS MARIAS,1943409.0,Restaurant,Risk 1 (High),2400 S PULASKI RD,CHICAGO,IL,60623.0,2023-08-10,...,2023,[-99. Unknown],1,[99],[False],0,[False],0,[False],1


In [69]:
temp = restaurant_df.groupby('License #').count()['AKA Name'].sort_values(ascending=False)
temp[temp > 1]

License #
1354323.0    196
0.0          158
1574001.0     69
60184.0       60
1974745.0     58
            ... 
1933087.0      2
2601610.0      2
2535860.0      2
2464561.0      2
2500745.0      2
Name: AKA Name, Length: 20082, dtype: int64

In [70]:
len(restaurant_df)

173206

In [73]:
temp = restaurant_df.sort_values(by=['Inspection Date']).groupby('License #')['Inspection Date'].apply(list)
temp2 = temp.apply(lambda x: len(x))
df = pd.concat([temp,temp2], axis=1)
df.columns=['Inspection Date', 'Total_inspections']  


df = df.sort_values(by=['Total_inspections'], ascending=False)
df['Date_min'] = df['Inspection Date'].apply(lambda x: min(x))
df['Date_max'] = df['Inspection Date'].apply(lambda x: max(x))



df.head(10)

Unnamed: 0_level_0,Inspection Date,Total_inspections,Date_min,Date_max
License #,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1354323.0,"[2010-06-07 00:00:00, 2010-06-07 00:00:00, 201...",196,2010-06-07,2013-11-15
0.0,"[2010-01-06 00:00:00, 2010-03-02 00:00:00, 201...",158,2010-01-06,2023-07-19
1574001.0,"[2010-04-05 00:00:00, 2010-04-05 00:00:00, 201...",69,2010-04-05,2017-08-10
60184.0,"[2010-01-19 00:00:00, 2010-03-15 00:00:00, 201...",60,2010-01-19,2023-07-19
1974745.0,"[2011-02-08 00:00:00, 2011-02-08 00:00:00, 201...",58,2011-02-08,2017-11-07
39623.0,"[2010-01-07 00:00:00, 2010-01-14 00:00:00, 201...",50,2010-01-07,2023-05-08
1273271.0,"[2010-06-07 00:00:00, 2010-07-14 00:00:00, 201...",49,2010-06-07,2023-06-20
14616.0,"[2010-03-25 00:00:00, 2010-03-25 00:00:00, 201...",49,2010-03-25,2011-08-31
4190.0,"[2010-02-25 00:00:00, 2010-06-16 00:00:00, 201...",47,2010-02-25,2023-07-24
1490035.0,"[2010-02-24 00:00:00, 2011-01-13 00:00:00, 201...",46,2010-02-24,2017-09-15


In [74]:
restaurant_df[restaurant_df['License #']==2637249]

Unnamed: 0,Inspection ID,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,...,year,Violations_List,Number_violations,violation_number,must_comply,must_comply_count,instructed_comply,instructed_comply_count,not_cited,citation_count
9506,2252616,CASA DEL SOL,2637249.0,Restaurant,Risk 1 (High),8918 S COMMERCIAL AVE,CHICAGO,IL,60617.0,2019-01-15,...,2019,"[3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL ...",18,"[3, 5, 25, 29, 32, 36, 36, 38, 39, 48, 51, 54,...","[False, False, False, False, False, False, Fal...",0,"[False, False, False, False, False, False, Fal...",0,"[True, True, True, False, False, True, False, ...",13
79930,2355665,CASA DEL SOL,2637249.0,Restaurant,Risk 1 (High),8918 S COMMERCIAL AVE,CHICAGO,IL,60617.0,2019-12-19,...,2019,[-99. Unknown],1,[99],[False],0,[False],0,[False],1
98305,2268587,CASA DEL SOL,2637249.0,Restaurant,Risk 1 (High),8918 S COMMERCIAL AVE,CHICAGO,IL,60617.0,2019-02-26,...,2019,"[3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL ...",4,"[3, 5, 57, 58]","[False, False, False, False]",0,"[False, False, False, False]",0,"[False, False, False, False]",4
98884,2243833,CASA DEL SOL,2637249.0,Restaurant,Risk 1 (High),8918 S COMMERCIAL AVE,CHICAGO,IL,60617.0,2018-12-26,...,2018,"[3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL ...",18,"[3, 5, 25, 29, 32, 36, 36, 38, 39, 48, 51, 54,...","[False, False, False, False, False, False, Fal...",0,"[False, False, False, False, False, False, Fal...",0,"[True, True, True, False, False, True, False, ...",13
101149,2253115,CASA DEL SOL,2637249.0,Restaurant,Risk 1 (High),8918 S COMMERCIAL AVE,CHICAGO,IL,60617.0,2019-01-25,...,2019,"[3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL ...",8,"[3, 5, 25, 29, 32, 36, 57, 58]","[False, False, False, False, False, False, Fal...",0,"[False, False, False, False, False, False, Fal...",0,"[True, True, True, False, False, True, False, ...",4
101370,2252398,CASA DEL SOL,2637249.0,Restaurant,Risk 1 (High),8918 S COMMERCIAL AVE,CHICAGO,IL,60617.0,2019-01-11,...,2019,"[3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL ...",18,"[3, 5, 25, 29, 32, 36, 36, 38, 39, 48, 51, 54,...","[False, False, False, False, False, False, Fal...",0,"[False, False, False, False, False, False, Fal...",0,"[True, True, True, False, False, True, False, ...",13


In [None]:
# Convert Risk into numerical values
risk_mapping = {
    'Risk 1 (High)': 3,
    'Risk 2 (Medium)': 2,
    'Risk 3 (Low)': 1
}
restaurant_df['Risk_numeric'] = restaurant_df['Risk'].map(risk_mapping)

# Convert Results into dummy variables
results_dummies = pd.get_dummies(restaurant_df['Results'], prefix='Results')
results_dummies = results_dummies * 1  # Convert True/False to 1/0 if needed
restaurant_df = pd.concat([restaurant_df, results_dummies], axis=1)

# Drop boolean columns
for col in restaurant_df.columns:
    if restaurant_df[col].dtype == bool:
        restaurant_df.drop(col, axis=1, inplace=True)

restaurant_df.head()

In [None]:
# Drop non-numeric columns (like restaurant names, addresses, etc.)
# This step assumes that such columns are of 'object' dtype. Modify if needed.
X_df = restaurant_df.select_dtypes(exclude=['object']).drop("Results_Fail", axis=1)

# For columns with a small number of unique values, use label encoding
# You might want to adjust the threshold (e.g., 10) based on your data
for col in X_df.columns:
    if len(X_df[col].unique()) <= 10:
        le = LabelEncoder()
        X_df[col] = le.fit_transform(X_df[col])

bestfeatures = SelectKBest(score_func=chi2, k=5)
fit = bestfeatures.fit(X_df,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X_df.columns)

featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Features','Score']

print(featureScores.nlargest(5,'Score'))

In [None]:
# Using the top features for prediction
X = restaurant_df[['License_ID', 'Risk_numeric']]
y = restaurant_df["Results_Fail"]

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Create & fit model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate R^2 score
r2 = r2_score(y_test, y_pred)
print(f"R-squared value: {r2}")

### Check for inspections occuring on the same day
If small then leave as is
If large then drop all duplicates except the last one

In [75]:
# Are there multiple inspections for the same day for the same license number? 
restaurant_df.groupby(['inspect_date','license_num', 'aka_name']).count()['inspect_id'].sort_values(ascending=False)

KeyError: 'inspect_date'

In [None]:
# # drop duplicates ignoring the inspection id
# columns = list(restaurant_df.columns[1:])

# not including lists due to error, not including inspect_type because multiple complaints of different kinds could be submitted
restaurant_df.drop_duplicates(subset=['aka_name',
                                     'license_num',
                                     'facility_type',
                                     'risk',
                                     'address',
                                     'city',
                                     'state',
                                     'zipcode',
                                     'inspect_date',
                                     'results',
                                     'lat',
                                     'lon'], 
                              keep='last',
                              inplace=True)

Check license number consistency

In [1]:
# Is the company and address the same but different license number?
temp = restaurant_df.groupby(['aka_name','address','license_num']).count()[['inspect_id','zipcode']].sort_values(by='inspect_id', ascending=False)
temp

NameError: name 'restaurant_df' is not defined

In [None]:
temp[temp.index.get_level_values('aka_name') == 'SAPORI']

In [None]:
# two methods that are the same
#temp.loc[('SAPORI', slice(None), slice(None))]
subset_temp = temp.xs('SAPORI', level='aka_name')
subset_temp

In [None]:
# counts number of licence_num per business name
t4 = subset_temp.groupby(level='address').count()['inspect_id']
t4.values[0]

In [None]:
# number of businesses per address
t = temp.groupby(level='address').count().sort_values(by='inspect_id', ascending=False)['inspect_id']
t.index[0]

# Note:  11601 W TOUHY AVE I think represents the entire O'Hare Airport

In [None]:
location_counts = []

names = temp.index.get_level_values(0)
for name in names:
    num = len(temp.xs(name, level='aka_name'))
    location_counts.append(num)

# temp2 = pd.Series(location_counts, names)
# pd.DataFrame({temp, temp}])
temp['duplicate_licenses'] = location_counts
temp.sort_values(by='duplicate_licenses', ascending=False)

In [None]:
restaurant_df[restaurant_df['address']=='11601 W TOUHY AVE']