**U1 (Target Use Case):**   
One could use this dataset to investigate whether different neighborhoods (indicated by zip code or geographic regions) have distinct food safety patterns, such as passing rate, complaint rate, risks, violations, and violation types, and further stratify the data by other factors (such as facility types and time range).

In [1]:
import pandas as pd
import numpy as np
import re
import string

In [2]:
input_data = pd.read_csv("data/open_refine_processed/Food-Inspections-openrefine.csv")
input_data.head()

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Facility Type Cleaned,Risk,Address,City,State,Zip,Inspection Date,Inspection Date Transformed,Inspection Type,Inspection Type Cleaned,Results,Violations,Latitude,Longitude,Location
0,2079132,MARRIOT MARQUIS CHICAGO,MARRIOT MARQUIS CHICAGO,2517328.0,Restaurant,restaurant,Risk 1 (High),2121 S PRAIRIE AVE,CHICAGO,IL,60616.0,08/28/2017,2017-08-28,License,license,Pass,,41.853651,-87.620534,"(41.853650885040594, -87.62053358114167)"
1,2079129,JET'S PIZZA,JET'S PIZZA,2522268.0,Restaurant,restaurant,Risk 2 (Medium),1025 W MADISON ST,CHICAGO,IL,60607.0,08/28/2017,2017-08-28,License,license,Not Ready,,41.881572,-87.653052,"(41.88157249576794, -87.65305233593274)"
2,2079125,ROOM 1520,ROOM 1520,2446638.0,Special Event,special event,Risk 3 (Low),301 N JUSTINE ST,CHICAGO,IL,60607.0,08/28/2017,2017-08-28,License Re-Inspection,license re-inspection,Not Ready,8. SANITIZING RINSE FOR EQUIPMENT AND UTENSILS...,41.886577,-87.665328,"(41.88657652150854, -87.66532812140231)"
3,2079123,MARRIOT MARQUIS CHICAGO,MARRIOT MARQUIS CHICAGO,2517338.0,Restaurant,restaurant,Risk 1 (High),2121 S PRAIRIE AVE,CHICAGO,IL,60616.0,08/28/2017,2017-08-28,License,license,Pass,"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR...",41.853651,-87.620534,"(41.853650885040594, -87.62053358114167)"
4,2079105,CHARTWELLS,CICS WEST BELDEN CAMPUS,2549079.0,CHARTER SCHOOL,school,Risk 1 (High),2245 N MCVICKER AVE,CHICAGO,IL,60639.0,08/28/2017,2017-08-28,License Re-Inspection,license re-inspection,Pass,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,41.921675,-87.776711,"(41.921675488910864, -87.7767113569357)"


In [3]:
input_data = input_data.drop(columns=['Facility Type', 'Inspection Date', 'Inspection Type']).rename(columns={
    'Facility Type Cleaned': 'Facility Type', 
    'Inspection Date Transformed': 'Inspection Date', 
    'Inspection Type Cleaned': 'Inspection Type'
})
input_data.head()

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Location
0,2079132,MARRIOT MARQUIS CHICAGO,MARRIOT MARQUIS CHICAGO,2517328.0,restaurant,Risk 1 (High),2121 S PRAIRIE AVE,CHICAGO,IL,60616.0,2017-08-28,license,Pass,,41.853651,-87.620534,"(41.853650885040594, -87.62053358114167)"
1,2079129,JET'S PIZZA,JET'S PIZZA,2522268.0,restaurant,Risk 2 (Medium),1025 W MADISON ST,CHICAGO,IL,60607.0,2017-08-28,license,Not Ready,,41.881572,-87.653052,"(41.88157249576794, -87.65305233593274)"
2,2079125,ROOM 1520,ROOM 1520,2446638.0,special event,Risk 3 (Low),301 N JUSTINE ST,CHICAGO,IL,60607.0,2017-08-28,license re-inspection,Not Ready,8. SANITIZING RINSE FOR EQUIPMENT AND UTENSILS...,41.886577,-87.665328,"(41.88657652150854, -87.66532812140231)"
3,2079123,MARRIOT MARQUIS CHICAGO,MARRIOT MARQUIS CHICAGO,2517338.0,restaurant,Risk 1 (High),2121 S PRAIRIE AVE,CHICAGO,IL,60616.0,2017-08-28,license,Pass,"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR...",41.853651,-87.620534,"(41.853650885040594, -87.62053358114167)"
4,2079105,CHARTWELLS,CICS WEST BELDEN CAMPUS,2549079.0,school,Risk 1 (High),2245 N MCVICKER AVE,CHICAGO,IL,60639.0,2017-08-28,license re-inspection,Pass,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,41.921675,-87.776711,"(41.921675488910864, -87.7767113569357)"


## 1. basic data exploration and cleaning

**[exploration]**   
Check data types and convert data types:

In [4]:
input_data.dtypes

Inspection ID        int64
DBA Name            object
AKA Name            object
License #          float64
Facility Type       object
Risk                object
Address             object
City                object
State               object
Zip                float64
Inspection Date     object
Inspection Type     object
Results             object
Violations          object
Latitude           float64
Longitude          float64
Location            object
dtype: object

In [5]:
input_data['Inspection ID'] = input_data['Inspection ID'].astype(str)
input_data['License #'] = input_data['License #'].astype(str)
input_data['License #'] = input_data['License #'].str.strip('.0')
input_data['Zip'] = input_data['Zip'].astype(str)
input_data['Zip'] = input_data['Zip'].str.strip('.0')
input_data['Inspection Date'] = pd.to_datetime(input_data['Inspection Date'])
input_data.dtypes

Inspection ID              object
DBA Name                   object
AKA Name                   object
License #                  object
Facility Type              object
Risk                       object
Address                    object
City                       object
State                      object
Zip                        object
Inspection Date    datetime64[ns]
Inspection Type            object
Results                    object
Violations                 object
Latitude                  float64
Longitude                 float64
Location                   object
dtype: object

**[exploration]**   
Check missing values:

In [6]:
input_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153810 entries, 0 to 153809
Data columns (total 17 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   Inspection ID    153810 non-null  object        
 1   DBA Name         153810 non-null  object        
 2   AKA Name         151267 non-null  object        
 3   License #        153810 non-null  object        
 4   Facility Type    149250 non-null  object        
 5   Risk             153744 non-null  object        
 6   Address          153810 non-null  object        
 7   City             153651 non-null  object        
 8   State            153802 non-null  object        
 9   Zip              153810 non-null  object        
 10  Inspection Date  153810 non-null  datetime64[ns]
 11  Inspection Type  153809 non-null  object        
 12  Results          153810 non-null  object        
 13  Violations       123012 non-null  object        
 14  Latitude         153

In [7]:
input_data.isnull().sum()

Inspection ID          0
DBA Name               0
AKA Name            2543
License #              0
Facility Type       4560
Risk                  66
Address                0
City                 159
State                  8
Zip                    0
Inspection Date        0
Inspection Type        1
Results                0
Violations         30798
Latitude             544
Longitude            544
Location             544
dtype: int64

**[exploration]**  
Check whether the "location" column has the same information with the "Latitude" and "Longitude" columns:

In [8]:
def check_location(x):
    result = True
    e = 10e-5
    if (not pd.isnull(x['Location'])) and len(x['Location']) > 0:
        la, lo = x['Location'].strip('(|)').split(',')
        la, lo = float(la), float(lo)
        result = (abs(la - x['Latitude']) < e) and (abs(lo - x['Longitude']) < e)
    return result
    
sum(input_data.apply(check_location, axis=1)) == len(input_data)

True

**[cleaning]**  
So let's remove the "location" column since it does not provide any additional information:

In [9]:
input_data.drop(columns='Location', inplace=True)

In [10]:
input_data.head()

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude
0,2079132,MARRIOT MARQUIS CHICAGO,MARRIOT MARQUIS CHICAGO,2517328,restaurant,Risk 1 (High),2121 S PRAIRIE AVE,CHICAGO,IL,60616,2017-08-28,license,Pass,,41.853651,-87.620534
1,2079129,JET'S PIZZA,JET'S PIZZA,2522268,restaurant,Risk 2 (Medium),1025 W MADISON ST,CHICAGO,IL,60607,2017-08-28,license,Not Ready,,41.881572,-87.653052
2,2079125,ROOM 1520,ROOM 1520,2446638,special event,Risk 3 (Low),301 N JUSTINE ST,CHICAGO,IL,60607,2017-08-28,license re-inspection,Not Ready,8. SANITIZING RINSE FOR EQUIPMENT AND UTENSILS...,41.886577,-87.665328
3,2079123,MARRIOT MARQUIS CHICAGO,MARRIOT MARQUIS CHICAGO,2517338,restaurant,Risk 1 (High),2121 S PRAIRIE AVE,CHICAGO,IL,60616,2017-08-28,license,Pass,"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR...",41.853651,-87.620534
4,2079105,CHARTWELLS,CICS WEST BELDEN CAMPUS,2549079,school,Risk 1 (High),2245 N MCVICKER AVE,CHICAGO,IL,60639,2017-08-28,license re-inspection,Pass,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,41.921675,-87.776711


**[cleaning]**  
Clean up strings:

In [11]:
def clean_strings(x):
    return x.upper().replace('.', '').replace(',', '').replace("'", '').strip() if (not pd.isnull(x)) else x

for column in ['DBA Name', 'AKA Name', 'Address']:
    input_data[column] = input_data[column].apply(clean_strings)

input_data.head()

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude
0,2079132,MARRIOT MARQUIS CHICAGO,MARRIOT MARQUIS CHICAGO,2517328,restaurant,Risk 1 (High),2121 S PRAIRIE AVE,CHICAGO,IL,60616,2017-08-28,license,Pass,,41.853651,-87.620534
1,2079129,JETS PIZZA,JETS PIZZA,2522268,restaurant,Risk 2 (Medium),1025 W MADISON ST,CHICAGO,IL,60607,2017-08-28,license,Not Ready,,41.881572,-87.653052
2,2079125,ROOM 1520,ROOM 1520,2446638,special event,Risk 3 (Low),301 N JUSTINE ST,CHICAGO,IL,60607,2017-08-28,license re-inspection,Not Ready,8. SANITIZING RINSE FOR EQUIPMENT AND UTENSILS...,41.886577,-87.665328
3,2079123,MARRIOT MARQUIS CHICAGO,MARRIOT MARQUIS CHICAGO,2517338,restaurant,Risk 1 (High),2121 S PRAIRIE AVE,CHICAGO,IL,60616,2017-08-28,license,Pass,"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR...",41.853651,-87.620534
4,2079105,CHARTWELLS,CICS WEST BELDEN CAMPUS,2549079,school,Risk 1 (High),2245 N MCVICKER AVE,CHICAGO,IL,60639,2017-08-28,license re-inspection,Pass,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,41.921675,-87.776711


**[exploaration]**  
Further check if there are different variations of Addresses for the same facility and location (Latitude & Longitude):

In [12]:
temp = input_data[['DBA Name','AKA Name', 'Address', 'City', 'State', 'Zip', 'Latitude','Longitude']].drop_duplicates()
address_duplicated = temp.groupby(
    by=['DBA Name','AKA Name', 'City', 'State', 'Zip', 'Latitude', 'Longitude'], dropna=False)\
    .filter(lambda x: len(x) > 1).sort_values(by='AKA Name', ascending=False)
address_duplicated

Unnamed: 0,DBA Name,AKA Name,Address,City,State,Zip,Latitude,Longitude
13530,YOUNG SCHOLARS DEVELOPMENTAL INSTITUTE,YOUNG SCHOLARS DEVELOPMENTAL INSTITUTE,3038 - 3046 W 59TH ST,CHICAGO,IL,60629,41.786411,-87.700009
112142,YOUNG SCHOLARS DEVELOPMENTAL INSTITUTE,YOUNG SCHOLARS DEVELOPMENTAL INSTITUTE,3038 W 59TH ST,CHICAGO,IL,60629,41.786411,-87.700009
23888,WHIZ KIDS NURSERY CENTER INC,WHIZ KIDS NURSERY CENTER INC,514-522 W 103RD ST,CHICAGO,IL,60628,41.707113,-87.636204
125266,WHIZ KIDS NURSERY CENTER INC,WHIZ KIDS NURSERY CENTER INC,514-518 W 103RD ST,CHICAGO,IL,60628,41.707113,-87.636204
147287,WHALE FISH & CHICKEN,WHALE FISH & CHICKEN,5650 W DIVISION ST,CHICAGO,IL,60651,41.902223,-87.767688
...,...,...,...,...,...,...,...,...
130658,3 RS ORIENTAL FOOD MART,3 RS ORIENTAL FOOD MART,2712-14 W MONTROSE AVE,CHICAGO,IL,60618,41.961371,-87.696669
111443,PAPA TS ITALIANO,,2843 W 63RD ST,INACTIVE,IL,60629,41.778981,-87.695090
111478,PAPA TS ITALIANO,,2843 W 63 ST,INACTIVE,IL,60629,41.778981,-87.695090
134985,OBAMA FOOD MART,,1800 E 79TH ST BLDG,CHICAGO,IL,60649,41.751655,-87.580828


**[cleaning]**  
Let's clean up the "Address" column for these situations. We'll just use the Address from the first record for the duplicated records:

In [13]:
address_dedup = address_duplicated.drop_duplicates(subset=['DBA Name','AKA Name', 'City', 'State', 'Zip', 'Latitude', 'Longitude'], keep='first')
address_dedup

Unnamed: 0,DBA Name,AKA Name,Address,City,State,Zip,Latitude,Longitude
13530,YOUNG SCHOLARS DEVELOPMENTAL INSTITUTE,YOUNG SCHOLARS DEVELOPMENTAL INSTITUTE,3038 - 3046 W 59TH ST,CHICAGO,IL,60629,41.786411,-87.700009
23888,WHIZ KIDS NURSERY CENTER INC,WHIZ KIDS NURSERY CENTER INC,514-522 W 103RD ST,CHICAGO,IL,60628,41.707113,-87.636204
147287,WHALE FISH & CHICKEN,WHALE FISH & CHICKEN,5650 W DIVISION ST,CHICAGO,IL,60651,41.902223,-87.767688
19545,WAKAMONO,WAKAMONO,3313-3319 N BROADWAY,CHICAGO,IL,60657,41.942313,-87.644322
11261,UNCOMMON GROUND,UNCOMMON GROUND,3800-3804 N CLARK ST,CHICAGO,IL,60613,41.951064,-87.659795
...,...,...,...,...,...,...,...,...
101751,ALWAYS THAI,ALWAYS THAI,1825 W IRVING PARK RD,CHICAGO,IL,60613,41.954093,-87.675195
59294,7-ELEVEN,7-ELEVEN,535 S STATE ST BLDG,CHICAGO,IL,60605,41.875011,-87.627475
130661,3 RS ORIENTAL FOOD MART,3 RS ORIENTAL FOOD MART,2712-2714 W MONTROSE AVE,CHICAGO,IL,60618,41.961371,-87.696669
111443,PAPA TS ITALIANO,,2843 W 63RD ST,INACTIVE,IL,60629,41.778981,-87.695090


In [14]:
address_to_check = list(set(address_duplicated['Address']))

def clean_address(x):
    address = x['Address']
    if address in address_to_check:
        dba, aka, city, state, zip_, la, lo = x['DBA Name'], x['AKA Name'], x['City'], x['State'], x['Zip'], x['Latitude'], x['Longitude']
        temp = address_dedup[(address_dedup['DBA Name']==dba) &
                     (pd.isnull(address_dedup['AKA Name']) | (address_dedup['AKA Name']==aka)) &
                     (address_dedup['City']==city) &
                     (address_dedup['State']==state) &
                     (address_dedup['Zip']==zip_) &
                     (address_dedup['Latitude']==la) &
                     (address_dedup['Longitude']==lo)]
        if len(temp) == 1:
            address = temp['Address'].values[0]

    return address
        
input_data['Address'] = input_data.apply(clean_address, axis=1)
input_data

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude
0,2079132,MARRIOT MARQUIS CHICAGO,MARRIOT MARQUIS CHICAGO,2517328,restaurant,Risk 1 (High),2121 S PRAIRIE AVE,CHICAGO,IL,60616,2017-08-28,license,Pass,,41.853651,-87.620534
1,2079129,JETS PIZZA,JETS PIZZA,2522268,restaurant,Risk 2 (Medium),1025 W MADISON ST,CHICAGO,IL,60607,2017-08-28,license,Not Ready,,41.881572,-87.653052
2,2079125,ROOM 1520,ROOM 1520,2446638,special event,Risk 3 (Low),301 N JUSTINE ST,CHICAGO,IL,60607,2017-08-28,license re-inspection,Not Ready,8. SANITIZING RINSE FOR EQUIPMENT AND UTENSILS...,41.886577,-87.665328
3,2079123,MARRIOT MARQUIS CHICAGO,MARRIOT MARQUIS CHICAGO,2517338,restaurant,Risk 1 (High),2121 S PRAIRIE AVE,CHICAGO,IL,60616,2017-08-28,license,Pass,"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR...",41.853651,-87.620534
4,2079105,CHARTWELLS,CICS WEST BELDEN CAMPUS,2549079,school,Risk 1 (High),2245 N MCVICKER AVE,CHICAGO,IL,60639,2017-08-28,license re-inspection,Pass,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,41.921675,-87.776711
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153805,67732,WOLCOTTS,TROQUET,1992039,restaurant,Risk 1 (High),1834 W MONTROSE AVE,CHICAGO,IL,60613,2010-01-04,license re-inspection,Pass,2. FACILITIES TO MAINTAIN PROPER TEMPERATURE -...,41.961606,-87.675967
153806,67757,DUNKIN DONUTS/BASKIN-ROBBINS,DUNKIN DONUTS/BASKIN-ROBBINS,1380279,restaurant,Risk 2 (Medium),100 W RANDOLPH ST,CHICAGO,IL,60601,2010-01-04,tag removal,Pass,,41.884586,-87.631010
153807,52234,CAFE 608,CAFE 608,2013328,restaurant,Risk 1 (High),608 W BARRY AVE,CHICAGO,IL,60657,2010-01-04,license re-inspection,Pass,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,41.938007,-87.644755
153808,70269,MRDANIELS,MRDANIELS,1899292,restaurant,Risk 1 (High),5645 W BELMONT AVE,CHICAGO,IL,60634,2010-01-04,license re-inspection,Pass,2. FACILITIES TO MAINTAIN PROPER TEMPERATURE -...,41.938443,-87.768318


Confirm the change:

In [15]:
temp = input_data[['DBA Name','AKA Name', 'Address', 'City', 'State', 'Zip', 'Latitude','Longitude']].drop_duplicates()
temp.groupby(
    by=['DBA Name','AKA Name', 'City', 'State', 'Zip', 'Latitude', 'Longitude'], dropna=False)\
    .filter(lambda x: len(x) > 1).sort_values(by='AKA Name', ascending=False)

Unnamed: 0,DBA Name,AKA Name,Address,City,State,Zip,Latitude,Longitude


**[exploration]**  
Check the Risk column:  
For the same facility, do they have conflicting Risk status?  
Note that it is possible the Risk level could change by time. I checked and confirmed that even on the same date, most of the following records still have conflicting Risk status.  

In [20]:
temp = input_data[['License #', 'DBA Name', 'AKA Name', 'Address', 'Risk']].drop_duplicates()
duplicated_risk = temp[temp.duplicated(subset=['License #', 'DBA Name', 'AKA Name', 'Address'], keep=False)].sort_values(by='License #')
duplicated_risk

Unnamed: 0,License #,DBA Name,AKA Name,Address,Risk
127712,,FLAV-O-RICIOUS,FLAV-O-RICIOUS,600 E GRAND AVE,Risk 3 (Low)
127627,,FLAV-O-RICIOUS,FLAV-O-RICIOUS,600 E GRAND AVE,Risk 2 (Medium)
35736,1578736.0,MT ARARAT DAYCARE CENTER,MT ARARAT DAYCARE CENTER,6514-6516 W HIGGINS AVE,Risk 1 (High)
35751,1578736.0,MT ARARAT DAYCARE CENTER,MT ARARAT DAYCARE CENTER,6514-6516 W HIGGINS AVE,Risk 3 (Low)
33799,1579115.0,TURKISH CUISINE AND BAKERY,TURKISH CUISINE AND BAKERY,5605-5609 N CLARK ST,Risk 1 (High)
85087,1579115.0,TURKISH CUISINE AND BAKERY,TURKISH CUISINE AND BAKERY,5605-5609 N CLARK ST,Risk 3 (Low)
131692,1739775.0,ROTISSERIE ENT & ANNETS,ROTISSERIE ENT & ANNETS,2009 N BISSELL ST,Risk 2 (Medium)
131690,1739775.0,ROTISSERIE ENT & ANNETS,ROTISSERIE ENT & ANNETS,2009 N BISSELL ST,Risk 3 (Low)
2313,1947909.0,SUBWAY,SUBWAY,6450 S PULASKI RD,Risk 1 (High)
115550,1947909.0,SUBWAY,SUBWAY,6450 S PULASKI RD,Risk 3 (Low)


**[cleaning]**  
Let's clean up the "Risk" column for these situations:  
- Infer and add back missing values: For facilities that have a certain level and "NaN", we'll replace the "NaN" with the Risk level.  
- Make conflicting values invalid: For facilities that have multiple non-"NaN" levels, we'll replace all of them with "NaN".

In [30]:
def combine_risk(x):
    non_nan_risks = list(set(x) - set([np.nan]))
    return non_nan_risks[0] if len(non_nan_risks) == 1 else np.nan

risk_dedup = duplicated_risk.groupby(by=['License #', 'DBA Name', 'AKA Name', 'Address'], dropna=False)['Risk'].agg(combine_risk).rename('Risk').reset_index()
risk_dedup

Unnamed: 0,License #,DBA Name,AKA Name,Address,Risk
0,,FLAV-O-RICIOUS,FLAV-O-RICIOUS,600 E GRAND AVE,
1,1578736.0,MT ARARAT DAYCARE CENTER,MT ARARAT DAYCARE CENTER,6514-6516 W HIGGINS AVE,
2,1579115.0,TURKISH CUISINE AND BAKERY,TURKISH CUISINE AND BAKERY,5605-5609 N CLARK ST,
3,1739775.0,ROTISSERIE ENT & ANNETS,ROTISSERIE ENT & ANNETS,2009 N BISSELL ST,
4,1947909.0,SUBWAY,SUBWAY,6450 S PULASKI RD,
5,1954774.0,WELLS STREET POPCORN,WELLS STREET POPCORN,2804 N CLARK ST,
6,1974918.0,ATINOS PIZZA,ATINOS PIZZA,4610 W DIVERSEY AVE,
7,2013962.0,KENDALL COLLEGE,LAB KITCHEN B-ROOM 120B,900 N NORTH BRANCH ST,
8,2031677.0,PORK SHOPPE,PORK SHOPPE,2755 W BELMONT AVE,
9,203223.0,MORE CUPCAKES,MORE CUPCAKES,1 E DELAWARE PL,


In [31]:
license_to_check, dba_to_check, address_to_check = list(risk_dedup['License #']), list(risk_dedup['DBA Name']), list(risk_dedup['Address'])

def clean_risk(x):
    license, dba, aka, address, risk = x['License #'], x['DBA Name'], x['AKA Name'], x['Address'], x['Risk']
    if (license in license_to_check) and (dba in dba_to_check) and (address in address_to_check):
        temp = risk_dedup[(risk_dedup['License #']==license) &
                     (risk_dedup['DBA Name']==dba) &
                     (pd.isnull(risk_dedup['AKA Name']) | (risk_dedup['AKA Name']==aka)) &
                     (risk_dedup['Address']==address)]
        if len(temp) == 1:
            risk = temp['Risk'].values[0]

    return risk
        
input_data['Risk'] = input_data.apply(clean_risk, axis=1)
input_data

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude
0,2079132,MARRIOT MARQUIS CHICAGO,MARRIOT MARQUIS CHICAGO,2517328,restaurant,Risk 1 (High),2121 S PRAIRIE AVE,CHICAGO,IL,60616,2017-08-28,license,Pass,,41.853651,-87.620534
1,2079129,JETS PIZZA,JETS PIZZA,2522268,restaurant,Risk 2 (Medium),1025 W MADISON ST,CHICAGO,IL,60607,2017-08-28,license,Not Ready,,41.881572,-87.653052
2,2079125,ROOM 1520,ROOM 1520,2446638,special event,Risk 3 (Low),301 N JUSTINE ST,CHICAGO,IL,60607,2017-08-28,license re-inspection,Not Ready,8. SANITIZING RINSE FOR EQUIPMENT AND UTENSILS...,41.886577,-87.665328
3,2079123,MARRIOT MARQUIS CHICAGO,MARRIOT MARQUIS CHICAGO,2517338,restaurant,Risk 1 (High),2121 S PRAIRIE AVE,CHICAGO,IL,60616,2017-08-28,license,Pass,"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR...",41.853651,-87.620534
4,2079105,CHARTWELLS,CICS WEST BELDEN CAMPUS,2549079,school,Risk 1 (High),2245 N MCVICKER AVE,CHICAGO,IL,60639,2017-08-28,license re-inspection,Pass,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,41.921675,-87.776711
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153805,67732,WOLCOTTS,TROQUET,1992039,restaurant,Risk 1 (High),1834 W MONTROSE AVE,CHICAGO,IL,60613,2010-01-04,license re-inspection,Pass,2. FACILITIES TO MAINTAIN PROPER TEMPERATURE -...,41.961606,-87.675967
153806,67757,DUNKIN DONUTS/BASKIN-ROBBINS,DUNKIN DONUTS/BASKIN-ROBBINS,1380279,restaurant,Risk 2 (Medium),100 W RANDOLPH ST,CHICAGO,IL,60601,2010-01-04,tag removal,Pass,,41.884586,-87.631010
153807,52234,CAFE 608,CAFE 608,2013328,restaurant,Risk 1 (High),608 W BARRY AVE,CHICAGO,IL,60657,2010-01-04,license re-inspection,Pass,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,41.938007,-87.644755
153808,70269,MRDANIELS,MRDANIELS,1899292,restaurant,Risk 1 (High),5645 W BELMONT AVE,CHICAGO,IL,60634,2010-01-04,license re-inspection,Pass,2. FACILITIES TO MAINTAIN PROPER TEMPERATURE -...,41.938443,-87.768318


Confirm the change:

In [32]:
temp = input_data[['License #', 'DBA Name', 'AKA Name', 'Address', 'Risk']].drop_duplicates()
temp[temp.duplicated(subset=['License #', 'DBA Name', 'AKA Name', 'Address'], keep=False)].sort_values(by='License #')

Unnamed: 0,License #,DBA Name,AKA Name,Address,Risk


**[exploration]**  
Check that all inspection IDs are unique:

In [33]:
len(set(input_data['Inspection ID'])) == len(list(input_data['Inspection ID']))

True

**[exploration]**  
However, if we ignore the inspection ID, there seem to be duplicated records:

In [34]:
input_data[input_data[list(input_data)[1:]].duplicated(keep=False)]

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude
2084,2069297,PICOSITO GRILL-BAR,PICOSITO GRILL-BAR,2158267,restaurant,Risk 1 (High),3553-3559 E 100TH ST,CHICAGO,IL,60617,2017-06-21,canvass,No Entry,,41.713767,-87.536796
2138,2064677,PICOSITO GRILL-BAR,PICOSITO GRILL-BAR,2158267,restaurant,Risk 1 (High),3553-3559 E 100TH ST,CHICAGO,IL,60617,2017-06-21,canvass,No Entry,,41.713767,-87.536796
3648,2015324,METAL HAVEN GRILL,METAL HAVEN GRILL,2457868,restaurant,Risk 1 (High),3835 W FULLERTON AVE,CHICAGO,IL,60647,2017-05-22,canvass,Out of Business,,41.924416,-87.723320
3694,2050813,METAL HAVEN GRILL,METAL HAVEN GRILL,2457868,restaurant,Risk 1 (High),3835 W FULLERTON AVE,CHICAGO,IL,60647,2017-05-22,canvass,Out of Business,,41.924416,-87.723320
7507,2010211,LOOMIS FOOD MARKET,LOOMIS FOOD MARKET,2004423,grocery,Risk 2 (Medium),6859 S LOOMIS BLVD,CHICAGO,IL,60636,2017-03-14,complaint,No Entry,,41.768716,-87.658922
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152903,68271,DE ARCOS PIZZA & RESTAURANT,DE ARCOS PIZZA & RESTAURANT,1577283,restaurant,Risk 1 (High),2832 E 87TH ST,CHICAGO,IL,60617,2010-01-22,tag removal,Pass,,41.737447,-87.555039
153108,88223,ASPIRA HAUGAN,ASPIRA HAUGAN MIDDLE SCHOOL,66351,school,Risk 1 (High),3729 W LELAND AVE,CHICAGO,IL,60625,2010-01-20,canvass,Pass,,41.966378,-87.721826
153109,88222,ASPIRA HAUGAN,ASPIRA HAUGAN MIDDLE SCHOOL,66351,school,Risk 1 (High),3729 W LELAND AVE,CHICAGO,IL,60625,2010-01-20,canvass,Pass,,41.966378,-87.721826
153113,158274,ARGENTINA FOODS,ARGENTINA FOODS,57047,grocery,Risk 2 (Medium),4500 S WOOD ST,CHICAGO,IL,60609,2010-01-20,out of business,Fail,,41.812105,-87.670072


**[cleaning]**  
There are 175 records that contain duplicated information. For example, inspection #2069297 and #2064677 are exactly the same except for the inspection ID itself. For the purpose of our target use case, we don't need those repeated records, so we'll remove them:

In [35]:
input_data = input_data.drop_duplicates(subset=list(input_data)[1:], ignore_index=True)
input_data

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude
0,2079132,MARRIOT MARQUIS CHICAGO,MARRIOT MARQUIS CHICAGO,2517328,restaurant,Risk 1 (High),2121 S PRAIRIE AVE,CHICAGO,IL,60616,2017-08-28,license,Pass,,41.853651,-87.620534
1,2079129,JETS PIZZA,JETS PIZZA,2522268,restaurant,Risk 2 (Medium),1025 W MADISON ST,CHICAGO,IL,60607,2017-08-28,license,Not Ready,,41.881572,-87.653052
2,2079125,ROOM 1520,ROOM 1520,2446638,special event,Risk 3 (Low),301 N JUSTINE ST,CHICAGO,IL,60607,2017-08-28,license re-inspection,Not Ready,8. SANITIZING RINSE FOR EQUIPMENT AND UTENSILS...,41.886577,-87.665328
3,2079123,MARRIOT MARQUIS CHICAGO,MARRIOT MARQUIS CHICAGO,2517338,restaurant,Risk 1 (High),2121 S PRAIRIE AVE,CHICAGO,IL,60616,2017-08-28,license,Pass,"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR...",41.853651,-87.620534
4,2079105,CHARTWELLS,CICS WEST BELDEN CAMPUS,2549079,school,Risk 1 (High),2245 N MCVICKER AVE,CHICAGO,IL,60639,2017-08-28,license re-inspection,Pass,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,41.921675,-87.776711
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153712,67732,WOLCOTTS,TROQUET,1992039,restaurant,Risk 1 (High),1834 W MONTROSE AVE,CHICAGO,IL,60613,2010-01-04,license re-inspection,Pass,2. FACILITIES TO MAINTAIN PROPER TEMPERATURE -...,41.961606,-87.675967
153713,67757,DUNKIN DONUTS/BASKIN-ROBBINS,DUNKIN DONUTS/BASKIN-ROBBINS,1380279,restaurant,Risk 2 (Medium),100 W RANDOLPH ST,CHICAGO,IL,60601,2010-01-04,tag removal,Pass,,41.884586,-87.631010
153714,52234,CAFE 608,CAFE 608,2013328,restaurant,Risk 1 (High),608 W BARRY AVE,CHICAGO,IL,60657,2010-01-04,license re-inspection,Pass,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,41.938007,-87.644755
153715,70269,MRDANIELS,MRDANIELS,1899292,restaurant,Risk 1 (High),5645 W BELMONT AVE,CHICAGO,IL,60634,2010-01-04,license re-inspection,Pass,2. FACILITIES TO MAINTAIN PROPER TEMPERATURE -...,41.938443,-87.768318


**[exploration]** 

In [39]:
num_inspection = len(input_data)
print(f"How many inspection records are there in total? {num_inspection}")
num_license = len(set(input_data['License #']))
print(f"How many unique license numbers are there? {num_license}")
num_facility = len(input_data[['License #', 'DBA Name', 'AKA Name', 'Facility Type', 'Address']].drop_duplicates())
print(f"How many unique food facilities are there? {num_facility}")
num_dates = len(set(input_data['Inspection Date']))
print(f"How many unique inspection dates are there? {num_dates}")

How many inspection records are there in total? 153717
How many unique license numbers are there? 32752
How many unique food facilities are there? 34037
How many unique inspection dates are there? 1946


## Try to better understand the data

**[exploration]**  
### 1. The same License # can have multiple DBA Name:

In [40]:
temp = input_data[(input_data['License #']!='') & (input_data['License #']!='nan') & (~pd.isnull(input_data['License #']))][[
    'License #', 'DBA Name']].drop_duplicates()
temp[temp.duplicated(subset='License #', keep=False)].sort_values(by='License #')

Unnamed: 0,License #,DBA Name
2031,1000774,RECLAIMED BAR AND RESTAURANT
13475,1000774,CRABBBY KIMS
112542,104,VITO & NICKS LOUNGE
65434,104,MIARAS CONTL PASTRY DELI INC
91724,1042664,SPOT 6
...,...,...
15801,9648,CAFE #10
134717,968,CAFE BERNARD
3618,968,CHEZ MOI
2002,99,FALCOS PIZZA


### 2. The same License # can have multiple addresses:

In [41]:
temp = input_data[(input_data['License #']!='') & (input_data['License #']!='nan') & (~pd.isnull(input_data['License #']))][[
    'License #', 'Address', 'City', 'State', 'Zip', 'Latitude','Longitude']].drop_duplicates()
temp[temp.duplicated(subset='License #', keep=False)].sort_values(by='License #')

Unnamed: 0,License #,Address,City,State,Zip,Latitude,Longitude
65434,104,7051-7053 W ADDISON ST,CHICAGO,IL,60634,41.945234,-87.802927
112542,104,8433-8435 S PULASKI RD,CHICAGO,IL,60652,41.739329,-87.721440
18082,1045301,1400 W RANDOLPH ST,CHICAGO,IL,60607,41.884274,-87.662433
98090,1045301,160 N LOOMIS ST,CHICAGO,IL,60607,41.883991,-87.662525
136005,1093945,2321 W LAWRENSE AVE,CHICAGO,IL,60625,41.968527,-87.687315
...,...,...,...,...,...,...,...
18290,845,11 E HUBBARD ST,CHICAGO,IL,60611,41.889964,-87.627507
99450,901,6647 S HALSTED ST,CHICAGO,IL,60621,41.772902,-87.644461
137479,901,3835 W 26TH ST,CHICAGO,IL,60623,41.844188,-87.720950
10657,99,203 N LA SALLE ST,CHICAGO,IL,60601,41.885822,-87.632304


Note: even the same set of 'License #', 'DBA Name', 'AKA Name', 'Facility Type' can still have different Address/Latitude/Longitude:

In [42]:
temp = input_data[(input_data['License #']!='') & (input_data['License #']!='nan') & (~pd.isnull(input_data['License #']))][[
    'License #', 'DBA Name', 'AKA Name', 'Facility Type', 'Address', 'City', 'State', 'Zip', 'Latitude','Longitude']].drop_duplicates()
temp[temp.duplicated(subset=['License #', 'DBA Name', 'AKA Name', 'Facility Type'], keep=False)].sort_values(by='License #')

Unnamed: 0,License #,DBA Name,AKA Name,Facility Type,Address,City,State,Zip,Latitude,Longitude
3425,1142125,POTBELLY SANDWICH WORKS LLC,POTBELLY SANDWICH WORKS LLC,restaurant,5700 S CICERO AVE,CHICAGO,IL,60638.0,41.789329,-87.741646
77705,1142125,POTBELLY SANDWICH WORKS LLC,POTBELLY SANDWICH WORKS LLC,restaurant,5800 S CICERO AVE,CHICAGO,IL,60638.0,41.787652,-87.74156
3196,13976,ALDI INC #64,ALDI INC #64,grocery,1753 N MILWAUKEE AVE,CHICAGO,IL,60647.0,41.913342,-87.681822
3210,13976,ALDI INC #64,ALDI INC #64,grocery,1767 N MILWAUKEE AVE,CHICAGO,IL,60647.0,41.913588,-87.682203
106612,1476455,THUNDERBIRD CATERING,THUNDERBIRD CATERING,mobile food preparer,1204 W 36TH PL,CHICAGO,IL,60609.0,41.828094,-87.655854
108307,1476455,THUNDERBIRD CATERING,THUNDERBIRD CATERING,mobile food preparer,1204 W 35 ST,CHICAGO,IL,60609.0,41.830726,-87.656012
120570,1743553,STARBUCKS COFFEE #10957,STARBUCKS COFFEE #10957,restaurant,150 N MICHIGAN AVE,CHICAGO,IL,60601.0,41.884531,-87.624602
3243,1743553,STARBUCKS COFFEE #10957,STARBUCKS COFFEE #10957,restaurant,150 E RANDOLPH ST,CHICAGO,IL,60601.0,41.884628,-87.622613
7737,188865,TASTY PLACE,TASTY PLACE,bakery,2339 S WENTWORTH AVE,CHICAGO,IL,60616.0,41.849788,-87.631797
98419,188865,TASTY PLACE,TASTY PLACE,bakery,2101 S CHINA PL,CHICAGO,IL,60616.0,41.854105,-87.634059


### 3. The same license number and DBA Name can have multiple AKA Names.   
For example, there are 81 distinct DBA Name/facility type combinations for license #14616 ILLINOIS SPORTSERVICE INC:

In [43]:
temp = input_data[['License #', 'DBA Name', 'AKA Name', 'Facility Type']].drop_duplicates().groupby(
    by=['License #', 'DBA Name'], dropna=False).size().rename('count').reset_index()\
    .sort_values(by='count', ascending=False).reset_index(drop=True)
temp.head(20)

Unnamed: 0,License #,DBA Name,count
0,14616,ILLINOIS SPORTSERVICE INC,81
1,1974745,THE UNITED CENTER,45
2,1354323,SPORTSERVICE SOLDIER FIELD,45
3,1574001,LEVY RESTAURANTS AT WRIGLEY FIELD,19
4,1932251,MCCORMICK PLACE,16
5,2013962,KENDALL COLLEGE,13
6,20481,THE GREAT STATE FARE,8
7,1354323,SPORT SERVICE SOLDIER FIELD,6
8,1574002,LEVY RESTAURANTS AT WRIGLEY FIELD,5
9,1932257,MCCORMICK PLACE,5


There are 178 such License #/DBA Name combinations that have more than 1 DBA Name/facility type combinations:

In [44]:
len(temp[temp['count']>1])

178

Take a closer look at license #14616:

In [46]:
input_data[(input_data['License #']=='14616') & (input_data['DBA Name']=='ILLINOIS SPORTSERVICE INC')][
    ['License #', 'DBA Name', 'AKA Name', 'Facility Type']].drop_duplicates()

Unnamed: 0,License #,DBA Name,AKA Name,Facility Type
2209,14616,ILLINOIS SPORTSERVICE INC,GUARANTEED RATE FIELD,stadium
123192,14616,ILLINOIS SPORTSERVICE INC,MAIN COMMISSARY,commissary
123200,14616,ILLINOIS SPORTSERVICE INC,SHERM LOLLARS GUARD THE PLATE GRILLE STAND #524,restaurant
128176,14616,ILLINOIS SPORTSERVICE INC,TEX-MEX 3 - STEAK (#544),kiosk
128419,14616,ILLINOIS SPORTSERVICE INC,BBQ BRISKET (#531),kiosk
...,...,...,...,...
149924,14616,ILLINOIS SPORTSERVICE INC,DIPPING DOTS (SEC #132),restaurant
149950,14616,ILLINOIS SPORTSERVICE INC,GO GO WHITE SOX,restaurant
149955,14616,ILLINOIS SPORTSERVICE INC,NACHO 8 (#544),restaurant
149973,14616,ILLINOIS SPORTSERVICE INC,HOT DOG VIENNA BEEF 7 (#557),restaurant


### 4. The same address can have multiple license numbers

In [47]:
temp = input_data[(input_data['License #']!='') & (input_data['License #']!='nan') & (~pd.isnull(input_data['License #']))][[
    'License #', 'Address', 'City', 'State', 'Zip', 'Latitude','Longitude']].drop_duplicates()
temp[temp.duplicated(subset=['Address', 'City', 'State', 'Zip', 'Latitude','Longitude'], keep=False)].sort_values(by='Address')

Unnamed: 0,License #,Address,City,State,Zip,Latitude,Longitude
10019,2215538,1 E 113TH ST,CHICAGO,IL,60628,41.688848,-87.622894
116232,21702,1 E 113TH ST,CHICAGO,IL,60628,41.688848,-87.622894
116234,1800876,1 E 113TH ST,CHICAGO,IL,60628,41.688848,-87.622894
1704,1904686,1 E DELAWARE PL,CHICAGO,IL,60611,41.899031,-87.628192
23599,2397758,1 E DELAWARE PL,CHICAGO,IL,60611,41.899031,-87.628192
...,...,...,...,...,...,...,...
88096,2013312,9917 S EWING AVE,CHICAGO,IL,60617,41.715244,-87.535130
9089,3247017,9928 S CRANDON AVE FL,CHICAGO,IL,60617,41.714402,-87.567167
75412,2046663,9928 S CRANDON AVE FL,CHICAGO,IL,60617,41.714402,-87.567167
83366,2093742,9939 S HALSTED ST,CHICAGO,IL,60628,41.713035,-87.642832


### 5. Even the same (?) food facility can have different license numbers!

In [48]:
temp = input_data[[
    'DBA Name','AKA Name','License #','Facility Type','Address','City','State','Zip',
    'Latitude','Longitude']].drop_duplicates()
facility_counts = temp.groupby(
    by=['DBA Name','AKA Name','Facility Type','Address','City','State','Zip','Latitude','Longitude'], dropna=False)\
    .size().rename('count').reset_index().sort_values(by='count', ascending=False)
facility_counts.head(20)

Unnamed: 0,DBA Name,AKA Name,Facility Type,Address,City,State,Zip,Latitude,Longitude,count
27185,TRIPLE A SERVICES INC,TRIPLE A SERVICES INC,mobile food preparer,2637 S THROOP ST FL,CHICAGO,IL,60608,41.844671,-87.654962,43
19441,PALETERIA Y NEVERIA LA MEXICANA YOGURT AND CHURRO,LA MEXICANA TAMALE CART,mobile food preparer,4171 S ARCHER AVE,CHICAGO,IL,60632,41.817618,-87.698258,16
27676,VHC LOWER LLC,3RD FLOOR - BANQUET/MEETING ROOMS(LL KITCHEN),restaurant,203 N WABASH AVE,CHICAGO,IL,60601,41.885828,-87.626138,15
23291,SOHO HOUSE,SOHO HOUSE,restaurant,113-125 N GREEN ST,CHICAGO,IL,60607,41.883557,-87.648432,10
19383,PALETERIA AZTECA #2,,mobile food preparer,3119 W CERMAK RD,CHICAGO,IL,60623,41.851676,-87.703696,10
27899,WALDORF ASTORIA CHICAGO,WALDORF ASTORIA CHICAGO,restaurant,11 E WALTON ST,CHICAGO,IL,60611,41.899835,-87.627799,9
9517,FLAT TOP GRILL,FLAT TOP GRILL,restaurant,30 S WABASH AVE,CHICAGO,IL,60603,41.881335,-87.626335,8
19327,PACIFIC LANGHAM CHICAGO CORPORATION,THE LANGHAM CHICAGO,restaurant,330 N WABASH AVE,CHICAGO,IL,60611,41.887939,-87.627036,7
19380,PALETERIA AZTECA #2,PALETERIA AZTECA #2,mobile food preparer,3119 W CERMAK RD,CHICAGO,IL,60623,41.851676,-87.703696,7
16670,MARRIOT MARQUIS CHICAGO,MARRIOT MARQUIS CHICAGO,restaurant,2121 S PRAIRIE AVE,CHICAGO,IL,60616,41.853651,-87.620534,7


In [49]:
print(f"{len(facility_counts[facility_counts['count']>1])} such facilities are involved!")
print(f"That's {round(100*len(facility_counts[facility_counts['count']>1])/len(facility_counts), 2)}% of all facilities!!")

4015 such facilities are involved!
That's 13.83% of all facilities!!


Note that some of them have the facility type of "Mobile Prepared Food Vendor". Some of the "Restaurant" facilities seem to be hotels, or within a complex.   
In the following example, most of them have the facility type of "Mobile Prepared Food Vendor":

In [51]:
input_data[input_data['AKA Name']=='LA MEXICANA TAMALE CART'][
    ['License #', 'DBA Name','AKA Name','Facility Type','Address','City','State','Zip',
     'Latitude','Longitude','Inspection Date']
].drop_duplicates()

Unnamed: 0,License #,DBA Name,AKA Name,Facility Type,Address,City,State,Zip,Latitude,Longitude,Inspection Date
5099,2506389,PALETERIA Y NEVERIA LA MEXICANA YOGURT AND CHURRO,LA MEXICANA TAMALE CART,mobile food preparer,4171 S ARCHER AVE,CHICAGO,IL,60632,41.817618,-87.698258,2017-01-10
11717,2506388,PALETERIA Y NEVERIA LA MEXICANA YOGURT AND CHURRO,LA MEXICANA TAMALE CART,mobile food preparer,4171 S ARCHER AVE,CHICAGO,IL,60632,41.817618,-87.698258,2017-01-10
11720,2506235,PALETERIA Y NEVERIA LA MEXICANA YOGURT AND CHURRO,LA MEXICANA TAMALE CART,mobile food preparer,4171 S ARCHER AVE,CHICAGO,IL,60632,41.817618,-87.698258,2017-01-10
11721,2506234,PALETERIA Y NEVERIA LA MEXICANA YOGURT AND CHURRO,LA MEXICANA TAMALE CART,mobile food preparer,4171 S ARCHER AVE,CHICAGO,IL,60632,41.817618,-87.698258,2017-01-10
11725,2506233,PALETERIA Y NEVERIA LA MEXICANA YOGURT AND CHURRO,LA MEXICANA TAMALE CART,mobile food preparer,4171 S ARCHER AVE,CHICAGO,IL,60632,41.817618,-87.698258,2017-01-10
11727,2506231,PALETERIA Y NEVERIA LA MEXICANA YOGURT AND CHURRO,LA MEXICANA TAMALE CART,mobile food preparer,4171 S ARCHER AVE,CHICAGO,IL,60632,41.817618,-87.698258,2017-01-10
11729,250623,PALETERIA Y NEVERIA LA MEXICANA YOGURT AND CHURRO,LA MEXICANA TAMALE CART,mobile food preparer,4171 S ARCHER AVE,CHICAGO,IL,60632,41.817618,-87.698258,2017-01-10
11731,2506222,PALETERIA Y NEVERIA LA MEXICANA YOGURT AND CHURRO,LA MEXICANA TAMALE CART,mobile food preparer,4171 S ARCHER AVE,CHICAGO,IL,60632,41.817618,-87.698258,2017-01-10
11735,250622,PALETERIA Y NEVERIA LA MEXICANA YOGURT AND CHURRO,LA MEXICANA TAMALE CART,mobile food preparer,4171 S ARCHER AVE,CHICAGO,IL,60632,41.817618,-87.698258,2017-01-10
11737,2506221,PALETERIA Y NEVERIA LA MEXICANA YOGURT AND CHURRO,LA MEXICANA TAMALE CART,mobile food preparer,4171 S ARCHER AVE,CHICAGO,IL,60632,41.817618,-87.698258,2017-01-10


**[exploration]**  
### 6. Are there any redundant inspection entries on the same day for the same facility?

In [53]:
temp2 = input_data[['Inspection ID', 'License #', 'DBA Name', 'AKA Name', 'Inspection Date', 'Inspection Type']]
inspection_counts2 = temp2.groupby(by=['License #', 'DBA Name', 'AKA Name', 'Inspection Date', 'Inspection Type'], dropna=False).size().rename('count').reset_index()\
    .sort_values(by='count', ascending=False)
inspection_counts2.head(10)

Unnamed: 0,License #,DBA Name,AKA Name,Inspection Date,Inspection Type,count
12931,1354323,SPORTSERVICE SOLDIER FIELD,GRIDIRON GRILL,2010-06-07,canvass,5
12933,1354323,SPORTSERVICE SOLDIER FIELD,GRIDIRON GRILL,2011-05-23,canvass,3
12892,1354323,SPORTSERVICE SOLDIER FIELD,1ST DOWN FRANKS,2010-06-07,canvass,3
12899,1354323,SPORTSERVICE SOLDIER FIELD,2ND CITY PIZZA,2010-06-07,canvass,3
16591,14616,ILLINOIS SPORTSERVICE INC,MOOSES DOGS AND POLISH 10 (#538),2010-03-25,canvass,2
23926,157883,BUFFET CASTLE,BUFFET CASTLE,2013-04-18,canvass,2
24426,1592201,6 DEGREES,6 DEGREES,2016-09-09,canvass,2
66801,2048785,TRUMP INTERNATIONAL HOTEL,SIXTEEN,2017-05-09,canvass,2
137502,3727,ALL AMERICAN NURSING HOME,ALL AMERICAN NURSING HOME,2010-02-25,canvass,2
139166,40155,POPEYES CHICKEN/INDIAN SWAAD,POPEYES CHICKEN/INDIAN SWAAD,2010-05-20,complaint re-inspection,2


In [54]:
print("How many unique food facilities/inspection date combinations have multiple inspections on that date?")
num_facility_date_multiple_inspections = len(inspection_counts2[inspection_counts2['count']>1])
num_facility_date_comb = len(inspection_counts2)
print(f"{num_facility_date_multiple_inspections} out of {num_facility_date_comb} ({round(100*num_facility_date_multiple_inspections/num_facility_date_comb, 2)}%)")

print("How many inspection records are involved ?")
num_inspection_involved = inspection_counts2[inspection_counts2['count']>1]['count'].sum()
print(f"{num_inspection_involved} out of {num_inspection} ({round(100*num_inspection_involved/num_inspection, 2)}%)")

print("How many unique license numbers are involved?")
num_license_involved = len(inspection_counts2[inspection_counts2['count']>1][['License #']].drop_duplicates())
print(f"{num_license_involved} out of {num_license} ({round(100*num_license_involved/num_license, 2)}%)")

print("How many unique food facilities are involved?")
num_facility_involved = len(inspection_counts2[inspection_counts2['count']>1][['License #', 'AKA Name']].drop_duplicates())
print(f"{num_facility_involved} out of {num_facility} ({round(100*num_facility_involved/num_facility, 2)}%)")

print("How many unique inspection dates are involved?")
num_dates_involved = len(inspection_counts2[inspection_counts2['count']>1][['Inspection Date']].drop_duplicates())
print(f"{num_dates_involved} out of {num_dates} ({round(100*num_dates_involved/num_dates, 2)}%)")

How many unique food facilities/inspection date combinations have multiple inspections on that date?
393 out of 153318 (0.26%)
How many inspection records are involved ?
792 out of 153717 (0.52%)
How many unique license numbers are involved?
370 out of 32752 (1.13%)
How many unique food facilities are involved?
381 out of 34037 (1.12%)
How many unique inspection dates are involved?
336 out of 1946 (17.27%)


What's happening there? Take one example to look into the details:

In [55]:
input_data[(input_data['License #']=='1354323') & (input_data['AKA Name']=='GRIDIRON GRILL') & (input_data['Inspection Date']=='2010-06-07')]

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude
146342,250633,SPORTSERVICE SOLDIER FIELD,GRIDIRON GRILL,1354323,restaurant,Risk 2 (Medium),1410 S MUSEUM CAMPUS DR,CHICAGO,IL,60605,2010-06-07,canvass,Pass,"34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GOO...",41.862628,-87.615031
146350,197469,SPORTSERVICE SOLDIER FIELD,GRIDIRON GRILL,1354323,restaurant,Risk 2 (Medium),1410 S MUSEUM CAMPUS DR,CHICAGO,IL,60605,2010-06-07,canvass,Pass,33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENSI...,41.862628,-87.615031
146358,112420,SPORTSERVICE SOLDIER FIELD,GRIDIRON GRILL,1354323,restaurant,Risk 2 (Medium),1410 S MUSEUM CAMPUS DR,CHICAGO,IL,60605,2010-06-07,canvass,Pass,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,41.862628,-87.615031
146363,160390,SPORTSERVICE SOLDIER FIELD,GRIDIRON GRILL,1354323,restaurant,Risk 2 (Medium),1410 S MUSEUM CAMPUS DR,CHICAGO,IL,60605,2010-06-07,canvass,Pass,"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR...",41.862628,-87.615031
146406,197476,SPORTSERVICE SOLDIER FIELD,GRIDIRON GRILL,1354323,restaurant,Risk 2 (Medium),1410 S MUSEUM CAMPUS DR,CHICAGO,IL,60605,2010-06-07,canvass,Pass,"34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GOO...",41.862628,-87.615031


OK. It looks like there are indeed multiple inspections on the same date. Although the results are the same ("Pass"), the detailed "Violations" notes are different! In this case, **we will need to collapse these inspections**. (We'll do this in a later section!)  
Also note that some of the violations have different comments although they share the same violation code:

In [96]:
input_data.loc[146428, 'Violations']

'34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GOOD REPAIR, COVING INSTALLED, DUST-LESS CLEANING METHODS USED - Comments: The floors shall be constructed per code, be smooth and easily cleaned, and be kept clean and in good repair. \n \nMUST DETAIL CLEAN FLOORS UNDER GRILL AND REMOVE FOOD DEBRIS AND GREASE BUILD UP.  '

In [97]:
input_data.loc[146493, 'Violations']

'34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GOOD REPAIR, COVING INSTALLED, DUST-LESS CLEANING METHODS USED - Comments: The floors shall be constructed per code, be smooth and easily cleaned, and be kept clean and in good repair. \nSTANDING WATER NEXT TO GREASE TRAP. INSTRUCTED TO CLEAN AND MAINTAIN. | 35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTRUCTED PER CODE: GOOD REPAIR, SURFACES CLEAN AND DUST-LESS CLEANING METHODS - Comments: The walls and ceilings shall be in good repair and easily cleaned. \nPEELING PAINT NOTED ON WALL NEXT TO DEEP FRYER, INSTRUCTED MANAGER TO REMOVE AND MAINTAIN.'

Let's also consider address, results and violatoins:  
Now there are no duplicates!

In [61]:
temp = input_data[['Inspection ID', 'License #', 'DBA Name', 'AKA Name', 'Facility Type', 'Address', 'Inspection Date', 'Inspection Type', 'Results', 'Violations']]
temp.duplicated(subset=list(temp)[1:], keep=False).sum()

0

## 2. Further data cleanup and split into multiple tables

### Facility

In [None]:
input_data[['DBA Name', 'AKA Name', 'License #', 'Facility Type', 'Address', 'City', 'State', 'Zip', 'Latitude','Longitude']]

### Facility_Risk

In [134]:
temp = input_data[['DBA Name', 'AKA Name', 'Address','Zip', 'Risk', 'Inspection Date']].drop_duplicates()
temp[temp.duplicated(subset=['DBA Name', 'AKA Name', 'Address','Zip', 'Inspection Date'], keep=False)]

Unnamed: 0,DBA Name,AKA Name,Address,Zip,Risk,Inspection Date
30,NANDO'S PERI-PERI,NANDO'S PERI-PERI,117 E LAKE ST,60601,Risk 3 (Low),2017-08-25
33,NANDO'S PERI-PERI,NANDO'S PERI-PERI,117 E LAKE ST,60601,Risk 1 (High),2017-08-25
52,THE PRESS ROOM,THE PRESS ROOM,1134 W WASHINGTON BLVD,60607,Risk 3 (Low),2017-08-24
54,THE PRESS ROOM,THE PRESS ROOM,1134 W WASHINGTON BLVD,60607,Risk 1 (High),2017-08-24
149,THE PRESS ROOM,THE PRESS ROOM,1134 W WASHINGTON BLVD,60607,Risk 3 (Low),2017-08-22
...,...,...,...,...,...,...
153143,FLO & SANTO,FLO & SANTO,1310 S WABASH AVE,60605,Risk 3 (Low),2010-01-19
153352,"WISHCO, INC.","WISHCO, INC.",1546 W 35TH ST,60609,Risk 3 (Low),2010-01-13
153353,"WISHCO, INC.","WISHCO, INC.",1546 W 35TH ST,60609,Risk 1 (High),2010-01-13
153608,HONG KONG MARKET,HONG KONG MARKET,520 W 24TH PL,60616,Risk 3 (Low),2010-01-07


In [112]:
temp = input_data[input_data['License #']!=''][['License #', 'Risk']].drop_duplicates()
temp[temp.duplicated(subset='License #', keep=False)].sort_values(by='License #')

Unnamed: 0,License #,Risk
65459,104,Risk 1 (High)
112603,104,Risk 3 (Low)
1729,1042664,Risk 1 (High)
91775,1042664,Risk 3 (Low)
4405,1094,Risk 1 (High)
...,...,...
37903,8425,Risk 2 (Medium)
137559,901,Risk 1 (High)
99502,901,Risk 3 (Low)
578,,Risk 2 (Medium)


In [117]:
temp = input_data[~pd.isnull(input_data['Risk'])][['License #', 'DBA Name', 'AKA Name', 'Risk']].drop_duplicates()
temp[temp.duplicated(subset=['License #', 'DBA Name', 'AKA Name'], keep=False)].sort_values(by='License #')

Unnamed: 0,License #,DBA Name,AKA Name,Risk
127712,,FLAV-O-RICIOUS,FLAV-O-RICIOUS,Risk 3 (Low)
127627,,FLAV-O-RICIOUS,FLAV-O-RICIOUS,Risk 2 (Medium)
33799,1579115.0,TURKISH CUISINE AND BAKERY,TURKISH CUISINE AND BAKERY,Risk 1 (High)
85087,1579115.0,TURKISH CUISINE AND BAKERY,TURKISH CUISINE AND BAKERY,Risk 3 (Low)
2313,1947909.0,SUBWAY,SUBWAY,Risk 1 (High)
115550,1947909.0,SUBWAY,SUBWAY,Risk 3 (Low)
80181,1954774.0,Wells Street Popcorn,Wells Street Popcorn,Risk 2 (Medium)
80180,1954774.0,Wells Street Popcorn,Wells Street Popcorn,Risk 3 (Low)
87107,1974918.0,ATINO'S PIZZA,ATINO'S PIZZA,Risk 3 (Low)
87109,1974918.0,ATINO'S PIZZA,ATINO'S PIZZA,Risk 1 (High)


In [116]:
temp = input_data[~pd.isnull(input_data['Risk'])][['License #', 'DBA Name', 'AKA Name', 'Inspection Date', 'Risk']].drop_duplicates()
temp[temp.duplicated(subset=['License #', 'DBA Name', 'AKA Name', 'Inspection Date'], keep=False)].sort_values(by='License #')

Unnamed: 0,License #,DBA Name,AKA Name,Inspection Date,Risk
85087,1579115,TURKISH CUISINE AND BAKERY,TURKISH CUISINE AND BAKERY,2013-08-26,Risk 3 (Low)
85099,1579115,TURKISH CUISINE AND BAKERY,TURKISH CUISINE AND BAKERY,2013-08-26,Risk 1 (High)
115550,1947909,SUBWAY,SUBWAY,2012-02-02,Risk 3 (Low)
115558,1947909,SUBWAY,SUBWAY,2012-02-02,Risk 1 (High)
80180,1954774,Wells Street Popcorn,Wells Street Popcorn,2013-11-07,Risk 3 (Low)
80181,1954774,Wells Street Popcorn,Wells Street Popcorn,2013-11-07,Risk 2 (Medium)
87107,1974918,ATINO'S PIZZA,ATINO'S PIZZA,2013-07-22,Risk 3 (Low)
87109,1974918,ATINO'S PIZZA,ATINO'S PIZZA,2013-07-22,Risk 1 (High)
69370,2013962,KENDALL COLLEGE,LAB KITCHEN B-ROOM 120B.,2013-09-09,Risk 3 (Low)
84339,2013962,KENDALL COLLEGE,LAB KITCHEN B-ROOM 120B.,2013-09-09,Risk 1 (High)


In [136]:
temp = input_data[~pd.isnull(input_data['Risk'])][['License #', 'DBA Name', 'AKA Name', 'Address', 'Inspection Date', 'Risk']].drop_duplicates()
temp[temp.duplicated(subset=['License #', 'DBA Name', 'AKA Name', 'Address', 'Inspection Date'], keep=False)].sort_values(by='License #')

Unnamed: 0,License #,DBA Name,AKA Name,Address,Inspection Date,Risk
85087,1579115,TURKISH CUISINE AND BAKERY,TURKISH CUISINE AND BAKERY,5605-5609 N CLARK ST,2013-08-26,Risk 3 (Low)
85099,1579115,TURKISH CUISINE AND BAKERY,TURKISH CUISINE AND BAKERY,5605-5609 N CLARK ST,2013-08-26,Risk 1 (High)
115550,1947909,SUBWAY,SUBWAY,6450 S PULASKI RD,2012-02-02,Risk 3 (Low)
115558,1947909,SUBWAY,SUBWAY,6450 S PULASKI RD,2012-02-02,Risk 1 (High)
80180,1954774,Wells Street Popcorn,Wells Street Popcorn,2804 N CLARK ST,2013-11-07,Risk 3 (Low)
80181,1954774,Wells Street Popcorn,Wells Street Popcorn,2804 N CLARK ST,2013-11-07,Risk 2 (Medium)
87107,1974918,ATINO'S PIZZA,ATINO'S PIZZA,4610 W DIVERSEY AVE,2013-07-22,Risk 3 (Low)
87109,1974918,ATINO'S PIZZA,ATINO'S PIZZA,4610 W DIVERSEY AVE,2013-07-22,Risk 1 (High)
69370,2013962,KENDALL COLLEGE,LAB KITCHEN B-ROOM 120B.,900 N NORTH BRANCH ST,2013-09-09,Risk 3 (Low)
84339,2013962,KENDALL COLLEGE,LAB KITCHEN B-ROOM 120B.,900 N NORTH BRANCH ST,2013-09-09,Risk 1 (High)


In [137]:
temp = input_data[~pd.isnull(input_data['Risk'])][['License #', 'DBA Name', 'AKA Name', 'Address', 'Risk']].drop_duplicates()
temp[temp.duplicated(subset=['License #', 'DBA Name', 'AKA Name', 'Address'], keep=False)].sort_values(by='License #')

Unnamed: 0,License #,DBA Name,AKA Name,Address,Risk
127712,,FLAV-O-RICIOUS,FLAV-O-RICIOUS,600 E GRAND AVE,Risk 3 (Low)
127627,,FLAV-O-RICIOUS,FLAV-O-RICIOUS,600 E GRAND AVE,Risk 2 (Medium)
33799,1579115.0,TURKISH CUISINE AND BAKERY,TURKISH CUISINE AND BAKERY,5605-5609 N CLARK ST,Risk 1 (High)
85087,1579115.0,TURKISH CUISINE AND BAKERY,TURKISH CUISINE AND BAKERY,5605-5609 N CLARK ST,Risk 3 (Low)
2313,1947909.0,SUBWAY,SUBWAY,6450 S PULASKI RD,Risk 1 (High)
115550,1947909.0,SUBWAY,SUBWAY,6450 S PULASKI RD,Risk 3 (Low)
80180,1954774.0,Wells Street Popcorn,Wells Street Popcorn,2804 N CLARK ST,Risk 3 (Low)
80181,1954774.0,Wells Street Popcorn,Wells Street Popcorn,2804 N CLARK ST,Risk 2 (Medium)
87107,1974918.0,ATINO'S PIZZA,ATINO'S PIZZA,4610 W DIVERSEY AVE,Risk 3 (Low)
87109,1974918.0,ATINO'S PIZZA,ATINO'S PIZZA,4610 W DIVERSEY AVE,Risk 1 (High)


In [115]:
temp = input_data[~pd.isnull(input_data['Risk'])][['License #', 'DBA Name', 'Risk']].drop_duplicates()
temp[temp.duplicated(subset=['License #', 'DBA Name'], keep=False)].sort_values(by='License #')

Unnamed: 0,License #,DBA Name,Risk
127712,,FLAV-O-RICIOUS,Risk 3 (Low)
127627,,FLAV-O-RICIOUS,Risk 2 (Medium)
149961,1120537,LEVY RESTAURANTS AT U.S CELLULAR FIELD,Risk 1 (High)
127783,1120537,LEVY RESTAURANTS AT U.S CELLULAR FIELD,Risk 2 (Medium)
95475,135245,ILLINOIS INSTITUTE OF TECHNOLOGY,Risk 1 (High)
...,...,...,...
47119,32394,MART PLAZA CATERING CORP,Risk 1 (High)
36142,37979,BISMARCK ENTERPRISES/UNITED,Risk 2 (Medium)
84078,37979,BISMARCK ENTERPRISES/UNITED,Risk 1 (High)
120491,6121,MACY'S,Risk 2 (Medium)


### Inspection
(Id, Date, Type, Result)

In [None]:
input_data[['Inspection ID', 'Inspection Date', 'Inspection Type', 'Results']]

### Violation and  Inspection_Violation

In [64]:
violation_df = input_data[~pd.isnull(input_data['Violations'])][['Inspection ID', 'Violations']].reset_index(drop=True)
violation_df

Unnamed: 0,Inspection ID,Violations
0,2079125,8. SANITIZING RINSE FOR EQUIPMENT AND UTENSILS...
1,2079123,"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR..."
2,2079105,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...
3,2079104,2. FACILITIES TO MAINTAIN PROPER TEMPERATURE -...
4,2079091,2. FACILITIES TO MAINTAIN PROPER TEMPERATURE -...
...,...,...
123007,67733,2. FACILITIES TO MAINTAIN PROPER TEMPERATURE -...
123008,67732,2. FACILITIES TO MAINTAIN PROPER TEMPERATURE -...
123009,52234,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...
123010,70269,2. FACILITIES TO MAINTAIN PROPER TEMPERATURE -...


In [10]:
input_data['Violations'].values[2]

'8. SANITIZING RINSE FOR EQUIPMENT AND UTENSILS:  CLEAN, PROPER TEMPERATURE, CONCENTRATION, EXPOSURE TIME - Comments: NO DISH WASHING FACILITIES ON SITE, (NO THREE COMPARTMENT SINK, WITH GREASE TRAP, OR DISHMACHINE), INSTRUCTED TO PROVIDE, | 11. ADEQUATE NUMBER, CONVENIENT, ACCESSIBLE, DESIGNED, AND MAINTAINED - Comments:  NO EXPOSED HAND SINK FOR REAR SERVICE AREA, INSTRUCTED TO PROVIDE,\n | 18. NO EVIDENCE OF RODENT OR INSECT OUTER OPENINGS PROTECTED/RODENT PROOFED, A WRITTEN LOG SHALL BE MAINTAINED AVAILABLE TO THE INSPECTORS - Comments: NO LICENSE PEST CONTROL LOG BOOK AT THIS TIME OF INSPECTION, INSTRUCTED TO PROVIDE, UPON NEXT VISIT,'

In [65]:
violation_df = violation_df.assign(
    Violations=violation_df['Violations'].str.split(' \| ')).explode('Violations').reset_index(drop=True)
violation_df.head(10)

Unnamed: 0,Inspection ID,Violations
0,2079125,8. SANITIZING RINSE FOR EQUIPMENT AND UTENSILS...
1,2079125,"11. ADEQUATE NUMBER, CONVENIENT, ACCESSIBLE, D..."
2,2079125,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...
3,2079123,"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR..."
4,2079123,"34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GOO..."
5,2079105,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...
6,2079105,31. CLEAN MULTI-USE UTENSILS AND SINGLE SERVIC...
7,2079105,"34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GOO..."
8,2079105,38. VENTILATION: ROOMS AND EQUIPMENT VENTED AS...
9,2079105,"41. PREMISES MAINTAINED FREE OF LITTER, UNNECE..."


In [44]:
violation_df.loc[0, 'Violations']

'8. SANITIZING RINSE FOR EQUIPMENT AND UTENSILS:  CLEAN, PROPER TEMPERATURE, CONCENTRATION, EXPOSURE TIME - Comments: NO DISH WASHING FACILITIES ON SITE, (NO THREE COMPARTMENT SINK, WITH GREASE TRAP, OR DISHMACHINE), INSTRUCTED TO PROVIDE,'

In [45]:
violation_df.loc[2, 'Violations']

'18. NO EVIDENCE OF RODENT OR INSECT OUTER OPENINGS PROTECTED/RODENT PROOFED, A WRITTEN LOG SHALL BE MAINTAINED AVAILABLE TO THE INSPECTORS - Comments: NO LICENSE PEST CONTROL LOG BOOK AT THIS TIME OF INSPECTION, INSTRUCTED TO PROVIDE, UPON NEXT VISIT,'

In [46]:
violation_df.loc[5, 'Violations']

'18. NO EVIDENCE OF RODENT OR INSECT OUTER OPENINGS PROTECTED/RODENT PROOFED, A WRITTEN LOG SHALL BE MAINTAINED AVAILABLE TO THE INSPECTORS - Comments: VIOLATION CORRECTED'

In [47]:
violation_df.loc[12, 'Violations']

'21. * CERTIFIED FOOD MANAGER ON SITE WHEN POTENTIALLY HAZARDOUS FOODS ARE  PREPARED AND SERVED - Comments: NO ORIGINAL CHICAGO FOOD SANITATION CERTIFICATE POSTED. INSTRUCTED MANAGER TO PROVIDE ORIGINAL FOOD SANITATION CERTIFICATE. COPIES ARE NOT ACCEPTABLE. SERIOUS VIOLATION 7-38-012A'

In [49]:
violation_df.loc[30, 'Violations']

'21. * CERTIFIED FOOD MANAGER ON SITE WHEN POTENTIALLY HAZARDOUS FOODS ARE  PREPARED AND SERVED - Comments: NO CITY OF CHICAGO CERTIFIED FOOD MANAGER ON SITE DURING INITIAL INSPECTION WHILE FOODS WERE BEING PREPARED. CERTIFIED MANAGER WAS TELEPHONED AT THE INITIAL INSPECTION AND ARRIVED THIRTY MINUTES LATER. INSTRUCTED A CITY OF CHICAGO CERTIFIED MANAGER MUST BE ON SITE AT ALL TIMES WHILE FOODS ARE PREPARED. SERIOUS VIOLATION. 7-38-012.'

In [66]:
def split_violation(violation):
    violation, comment = violation.split(' - Comments: ')
    code, description = violation.split('. ', 1)
    return code, description, comment
violation_df['code'], violation_df['description'], violation_df['comment'] = zip(*violation_df['Violations'].apply(split_violation))
violation_df = violation_df.drop(columns='Violations').drop_duplicates().reset_index(drop=True)
violation_df

Unnamed: 0,Inspection ID,code,description,comment
0,2079125,8,SANITIZING RINSE FOR EQUIPMENT AND UTENSILS: ...,"NO DISH WASHING FACILITIES ON SITE, (NO THREE ..."
1,2079125,11,"ADEQUATE NUMBER, CONVENIENT, ACCESSIBLE, DESIG...","NO EXPOSED HAND SINK FOR REAR SERVICE AREA, I..."
2,2079125,18,NO EVIDENCE OF RODENT OR INSECT OUTER OPENINGS...,NO LICENSE PEST CONTROL LOG BOOK AT THIS TIME ...
3,2079123,35,"WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTRUCTE...",MUST PROVIDE OVERHEAD PROTECTION FOR COFFEE/TE...
4,2079123,34,"FLOORS: CONSTRUCTED PER CODE, CLEANED, GOOD RE...",FLOORS AT COFFEE/TEA STATIONS MUST BE MADE SMO...
...,...,...,...,...
568577,104236,36,LIGHTING: REQUIRED MINIMUM FOOT-CANDLES OF LIG...,"All rooms in which food or drink is prepared, ..."
568578,104236,37,TOILET ROOM DOORS SELF CLOSING: DRESSING ROOMS...,None of the operations connected with food est...
568579,104236,38,VENTILATION: ROOMS AND EQUIPMENT VENTED AS REQ...,The flow of air discharged from kitchen fans s...
568580,104236,39,LINEN: CLEAN AND SOILED PROPERLY STORED,"Soiled and clean linens, coats, and aprons sha..."


In [67]:
violation_df[['Inspection ID', 'code']].duplicated().sum()

7599

In [68]:
violations = violation_df[['code', 'description']].drop_duplicates().reset_index(drop=True)
violations

Unnamed: 0,code,description
0,8,SANITIZING RINSE FOR EQUIPMENT AND UTENSILS: ...
1,11,"ADEQUATE NUMBER, CONVENIENT, ACCESSIBLE, DESIG..."
2,18,NO EVIDENCE OF RODENT OR INSECT OUTER OPENINGS...
3,35,"WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTRUCTE..."
4,34,"FLOORS: CONSTRUCTED PER CODE, CLEANED, GOOD RE..."
5,31,CLEAN MULTI-USE UTENSILS AND SINGLE SERVICE AR...
6,38,VENTILATION: ROOMS AND EQUIPMENT VENTED AS REQ...
7,41,"PREMISES MAINTAINED FREE OF LITTER, UNNECESSAR..."
8,2,FACILITIES TO MAINTAIN PROPER TEMPERATURE
9,21,* CERTIFIED FOOD MANAGER ON SITE WHEN POTENTIA...


In [71]:
inspection_violation = violation_df[['Inspection ID', 'code', 'comment']]
inspection_violation

Unnamed: 0,Inspection ID,code,comment
0,2079125,8,"NO DISH WASHING FACILITIES ON SITE, (NO THREE ..."
1,2079125,11,"NO EXPOSED HAND SINK FOR REAR SERVICE AREA, I..."
2,2079125,18,NO LICENSE PEST CONTROL LOG BOOK AT THIS TIME ...
3,2079123,35,MUST PROVIDE OVERHEAD PROTECTION FOR COFFEE/TE...
4,2079123,34,FLOORS AT COFFEE/TEA STATIONS MUST BE MADE SMO...
...,...,...,...
568577,104236,36,"All rooms in which food or drink is prepared, ..."
568578,104236,37,None of the operations connected with food est...
568579,104236,38,The flow of air discharged from kitchen fans s...
568580,104236,39,"Soiled and clean linens, coats, and aprons sha..."


Note that a certain inspection id/violation code combination can have multiple comments. For example:

In [73]:
inspection_violation.groupby(by=['Inspection ID', 'code']).size().rename('count').reset_index()\
    .sort_values(by='count', ascending=False).reset_index(drop=True)

Unnamed: 0,Inspection ID,code,count
0,1764036,34,9
1,1975476,34,8
2,1547061,35,7
3,1501481,32,7
4,1234494,32,6
...,...,...,...
560978,1459941,28,1
560979,1459941,21,1
560980,1459940,41,1
560981,1459940,35,1


In [75]:
inspection_violation[(inspection_violation['Inspection ID']== '1764036') & (inspection_violation['code'] == '34')]

Unnamed: 0,Inspection ID,code,comment
99332,1764036,34,REGROUT THE FLOOR IN FOOD PREP AREA ALL THE WA...
99333,1764036,34,DETAIL CLEAN FLOOR INSIDE WALK-IN COOLER AT BO...
99334,1764036,34,OBSERVED STANDING WATER ON FLOOR ALONG FRONT ...
99336,1764036,34,FLOOR ALONG WALLBASE AND IN THE CORNERS IN NEE...
99337,1764036,34,MUST CLEAN FLOOR ALONG WALLS AND IN THE CORNER...
99338,1764036,34,MUST CLEAN FLOOR ALONG WALLS AND CORNERS AT BO...
99341,1764036,34,OBSERVED FRONT LINE WATER STANDING ON FLOOR AL...
99342,1764036,34,MUST CLEAN FLOOR ALONG THE WALLS AND IN THE CO...
99343,1764036,34,MUST CLEAN FLOOR ALONG THE WALLS AND IN CORNER...


### Inspection_Risk

In [85]:
risk_df = input_data[~pd.isnull(input_data['Risk'])][['Inspection ID', 'Risk']].reset_index(drop=True)
risk_df

Unnamed: 0,Inspection ID,Risk
0,2079132,Risk 1 (High)
1,2079129,Risk 2 (Medium)
2,2079125,Risk 3 (Low)
3,2079123,Risk 1 (High)
4,2079105,Risk 1 (High)
...,...,...
153739,67732,Risk 1 (High)
153740,67757,Risk 2 (Medium)
153741,52234,Risk 1 (High)
153742,70269,Risk 1 (High)


In [103]:
risk_df['Risk'].value_counts()

Risk 1 (High)      107351
Risk 2 (Medium)     31845
Risk 3 (Low)        14529
All                    19
Name: Risk, dtype: int64

In [81]:
list(re.compile(r'Risk (\d+) \((\w+)\)').match('Risk 2 (Medium)').groups())

['2', 'Medium']

In [98]:
def extract_risk(s):
    matched = re.compile(r'\w+ (?P<code>\d+) \((?P<level>\w+)\)').match(s)
    if matched:
        return matched.group('code'), matched.group('level')
    else:
        return '', ''
    
print(extract_risk('Risk 2 (Medium)') ) 
print(extract_risk('Risk2(Medium)') ) 

('2', 'Medium')
('', '')


In [99]:
risk_df['code'], risk_df['level'] = zip(*risk_df['Risk'].apply(extract_risk))

In [100]:
risk_df

Unnamed: 0,Inspection ID,Risk,code,level
0,2079132,Risk 1 (High),1,High
1,2079129,Risk 2 (Medium),2,Medium
2,2079125,Risk 3 (Low),3,Low
3,2079123,Risk 1 (High),1,High
4,2079105,Risk 1 (High),1,High
...,...,...,...,...
153739,67732,Risk 1 (High),1,High
153740,67757,Risk 2 (Medium),2,Medium
153741,52234,Risk 1 (High),1,High
153742,70269,Risk 1 (High),1,High


In [101]:
risk_df[risk_df['code']=='']

Unnamed: 0,Inspection ID,Risk,code,level
161,2078759,All,,
201,2078685,All,,
300,2078476,All,,
391,2078260,All,,
3139,2059435,All,,
4210,2050229,All,,
4672,2049776,All,,
6243,2010064,All,,
58570,1501748,All,,
67584,1474120,All,,


### Inspection_Facility