**U1 (Target Use Case):**   
One could use this dataset to investigate whether different neighborhoods (indicated by zip code or geographic regions) have distinct food safety patterns, such as passing rate, complaint rate, risks, violations, and violation types, and further stratify the data by other factors (such as facility types and time range).

In [77]:
import pandas as pd
import numpy as np
import re

**Note:** later we will need to replace the path to read the intermediate data after OpenRefine.

In [3]:
input_data = pd.read_csv("raw_input/Food_Inspections.csv")
input_data.head()

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Location
0,2079132,MARRIOT MARQUIS CHICAGO,MARRIOT MARQUIS CHICAGO,2517328.0,Restaurant,Risk 1 (High),2121 S PRAIRIE AVE,CHICAGO,IL,60616.0,8/28/17,License,Pass,,41.853651,-87.620534,"(41.853650885040594, -87.62053358114167)"
1,2079129,JET'S PIZZA,JET'S PIZZA,2522268.0,Restaurant,Risk 2 (Medium),1025 W MADISON ST,CHICAGO,IL,60607.0,8/28/17,License,Not Ready,,41.881572,-87.653052,"(41.88157249576794, -87.65305233593274)"
2,2079125,ROOM 1520,ROOM 1520,2446638.0,Special Event,Risk 3 (Low),301 N JUSTINE ST,CHICAGO,IL,60607.0,8/28/17,License Re-Inspection,Not Ready,8. SANITIZING RINSE FOR EQUIPMENT AND UTENSILS...,41.886577,-87.665328,"(41.88657652150854, -87.66532812140231)"
3,2079123,MARRIOT MARQUIS CHICAGO,MARRIOT MARQUIS CHICAGO,2517338.0,Restaurant,Risk 1 (High),2121 S PRAIRIE AVE,CHICAGO,IL,60616.0,8/28/17,License,Pass,"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR...",41.853651,-87.620534,"(41.853650885040594, -87.62053358114167)"
4,2079105,CHARTWELLS,CICS WEST BELDEN CAMPUS,2549079.0,CHARTER SCHOOL,Risk 1 (High),2245 N MCVICKER AVE,CHICAGO,IL,60639.0,8/28/17,License Re-Inspection,Pass,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,41.921675,-87.776711,"(41.921675488910864, -87.7767113569357)"


In [4]:
input_data.dtypes

Inspection ID        int64
DBA Name            object
AKA Name            object
License #          float64
Facility Type       object
Risk                object
Address             object
City                object
State               object
Zip                float64
Inspection Date     object
Inspection Type     object
Results             object
Violations          object
Latitude           float64
Longitude          float64
Location            object
dtype: object

In [5]:
input_data['Inspection ID'] = input_data['Inspection ID'].astype(str)
input_data['License #'] = input_data['License #'].astype(str)
input_data['License #'] = input_data['License #'].str.strip('.0')
input_data['Zip'] = input_data['Zip'].astype(str)
input_data['Zip'] = input_data['Zip'].str.strip('.0')
input_data['Inspection Date'] = pd.to_datetime(input_data['Inspection Date'])
input_data.dtypes

Inspection ID              object
DBA Name                   object
AKA Name                   object
License #                  object
Facility Type              object
Risk                       object
Address                    object
City                       object
State                      object
Zip                        object
Inspection Date    datetime64[ns]
Inspection Type            object
Results                    object
Violations                 object
Latitude                  float64
Longitude                 float64
Location                   object
dtype: object

In [6]:
input_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153810 entries, 0 to 153809
Data columns (total 17 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   Inspection ID    153810 non-null  object        
 1   DBA Name         153810 non-null  object        
 2   AKA Name         151267 non-null  object        
 3   License #        153810 non-null  object        
 4   Facility Type    149250 non-null  object        
 5   Risk             153744 non-null  object        
 6   Address          153810 non-null  object        
 7   City             153651 non-null  object        
 8   State            153802 non-null  object        
 9   Zip              153810 non-null  object        
 10  Inspection Date  153810 non-null  datetime64[ns]
 11  Inspection Type  153809 non-null  object        
 12  Results          153810 non-null  object        
 13  Violations       123012 non-null  object        
 14  Latitude         153

In [7]:
input_data.isnull().sum()

Inspection ID          0
DBA Name               0
AKA Name            2543
License #              0
Facility Type       4560
Risk                  66
Address                0
City                 159
State                  8
Zip                    0
Inspection Date        0
Inspection Type        1
Results                0
Violations         30798
Latitude             544
Longitude            544
Location             544
dtype: int64

In [65]:
input_data.head()

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Location
0,2079132,MARRIOT MARQUIS CHICAGO,MARRIOT MARQUIS CHICAGO,2517328,Restaurant,Risk 1 (High),2121 S PRAIRIE AVE,CHICAGO,IL,60616,2017-08-28,License,Pass,,41.853651,-87.620534,"(41.853650885040594, -87.62053358114167)"
1,2079129,JET'S PIZZA,JET'S PIZZA,2522268,Restaurant,Risk 2 (Medium),1025 W MADISON ST,CHICAGO,IL,60607,2017-08-28,License,Not Ready,,41.881572,-87.653052,"(41.88157249576794, -87.65305233593274)"
2,2079125,ROOM 1520,ROOM 1520,2446638,Special Event,Risk 3 (Low),301 N JUSTINE ST,CHICAGO,IL,60607,2017-08-28,License Re-Inspection,Not Ready,8. SANITIZING RINSE FOR EQUIPMENT AND UTENSILS...,41.886577,-87.665328,"(41.88657652150854, -87.66532812140231)"
3,2079123,MARRIOT MARQUIS CHICAGO,MARRIOT MARQUIS CHICAGO,2517338,Restaurant,Risk 1 (High),2121 S PRAIRIE AVE,CHICAGO,IL,60616,2017-08-28,License,Pass,"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR...",41.853651,-87.620534,"(41.853650885040594, -87.62053358114167)"
4,2079105,CHARTWELLS,CICS WEST BELDEN CAMPUS,2549079,CHARTER SCHOOL,Risk 1 (High),2245 N MCVICKER AVE,CHICAGO,IL,60639,2017-08-28,License Re-Inspection,Pass,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,41.921675,-87.776711,"(41.921675488910864, -87.7767113569357)"


In [66]:
len(set(input_data['Inspection ID'])) == len(list(input_data['Inspection ID']))

True

In [67]:
# double check that there are no duplicates
len(input_data.drop_duplicates()) == len(input_data)

True

In [11]:
list(input_data)

['Inspection ID',
 'DBA Name',
 'AKA Name',
 'License #',
 'Facility Type',
 'Risk',
 'Address',
 'City',
 'State',
 'Zip',
 'Inspection Date',
 'Inspection Type',
 'Results',
 'Violations',
 'Latitude',
 'Longitude',
 'Location']

In [14]:
num_inspection = len(input_data)
print(f"How many inspection records are there in total? {num_inspection}")
num_license = len(set(input_data['License #']))
print(f"How many unique license numbers are there? {num_license}")
num_facility = len(input_data[['License #', 'AKA Name']].drop_duplicates())
print(f"How many unique food facilities are there? {num_facility}")
num_dates = len(set(input_data['Inspection Date']))
print(f"How many unique inspection dates are there? {num_dates}")

How many inspection records are there in total? 153810
How many unique license numbers are there? 32752
How many unique food facilities are there? 33886
How many unique inspection dates are there? 1946


## 1. data cleaning

In [10]:
input_data['Violations'].values[2]

'8. SANITIZING RINSE FOR EQUIPMENT AND UTENSILS:  CLEAN, PROPER TEMPERATURE, CONCENTRATION, EXPOSURE TIME - Comments: NO DISH WASHING FACILITIES ON SITE, (NO THREE COMPARTMENT SINK, WITH GREASE TRAP, OR DISHMACHINE), INSTRUCTED TO PROVIDE, | 11. ADEQUATE NUMBER, CONVENIENT, ACCESSIBLE, DESIGNED, AND MAINTAINED - Comments:  NO EXPOSED HAND SINK FOR REAR SERVICE AREA, INSTRUCTED TO PROVIDE,\n | 18. NO EVIDENCE OF RODENT OR INSECT OUTER OPENINGS PROTECTED/RODENT PROOFED, A WRITTEN LOG SHALL BE MAINTAINED AVAILABLE TO THE INSPECTORS - Comments: NO LICENSE PEST CONTROL LOG BOOK AT THIS TIME OF INSPECTION, INSTRUCTED TO PROVIDE, UPON NEXT VISIT,'

### **Note:** The same license number and DBA Name can have multiple AKA Names, for example:

In [26]:
temp = input_data[['License #', 'DBA Name', 'AKA Name', 'Facility Type']].drop_duplicates().groupby(
    by=['License #', 'DBA Name'], dropna=False).size().rename('count').reset_index()\
    .sort_values(by='count', ascending=False).reset_index(drop=True)
temp

Unnamed: 0,License #,DBA Name,count
0,14616,ILLINOIS SPORTSERVICE INC,58
1,1354323,SPORTSERVICE SOLDIER FIELD,45
2,1974745,THE UNITED CENTER,45
3,1574001,LEVY RESTAURANTS AT WRIGLEY FIELD,19
4,14616,ILLINOIS SPORTSERVICE INC.,17
...,...,...,...
33640,1981546,ST.NICHOLAS OF TOLENTINE,1
33641,1981525,REACH CHRISTIAN ACADEMY,1
33642,1981517,ST GENEVIEVE SCHOOL,1
33643,1981489,MONTCLARE SENIOR RESIDENCES,1


In [27]:
len(temp[temp['count']>1])

164

In [70]:
input_data[(input_data['License #']=='14616')][['License #', 'DBA Name', 'AKA Name', 'Facility Type']].drop_duplicates()

Unnamed: 0,License #,DBA Name,AKA Name,Facility Type
2210,14616,"ILLINOIS SPORTSERVICE, INC.",GUARANTEED RATE FIELD,STADIUM
123258,14616,ILLINOIS SPORTSERVICE INC.,MAIN COMMISSARY,COMMISSARY
123266,14616,ILLINOIS SPORTSERVICE INC.,SHERM LOLLARS GUARD THE PLATE GRILLE STAND #524,Restaurant
128247,14616,"ILLINOIS SPORTSERVICE, INC.",TEX-MEX 3 - STEAK (#544),KIOSK
128494,14616,ILLINOIS SPORTSERVICE INC.,BBQ BRISKET (#531),KIOSK
...,...,...,...,...
150012,14616,ILLINOIS SPORTSERVICE INC,DIPPING DOTS (SEC. #132),Restaurant
150038,14616,ILLINOIS SPORTSERVICE INC,GO GO WHITE SOX,Restaurant
150043,14616,ILLINOIS SPORTSERVICE INC,NACHO 8 (#544),Restaurant
150061,14616,ILLINOIS SPORTSERVICE INC,HOT DOG VIENNA BEEF 7 (#557),Restaurant


### Are there any redundant inspection entries on the same day for the same facility?

In [12]:
temp2 = input_data[['Inspection ID', 'License #', 'DBA Name', 'AKA Name', 'Inspection Date', 'Inspection Type']]
inspection_counts2 = temp2.groupby(by=['License #', 'DBA Name', 'AKA Name', 'Inspection Date', 'Inspection Type'], dropna=False).size().rename('count').reset_index()\
    .sort_values(by='count', ascending=False)
inspection_counts2.head(20)

Unnamed: 0,License #,DBA Name,AKA Name,Inspection Date,Inspection Type,count
12956,1354323,SPORTSERVICE SOLDIER FIELD,GRIDIRON GRILL,2013-11-14,Canvass,7
12952,1354323,SPORTSERVICE SOLDIER FIELD,GRIDIRON GRILL,2010-06-07,Canvass,5
60260,2013962,KENDALL COLLEGE,LAB KITCHEN B-ROOM 120B.,2013-09-09,Canvass,3
12937,1354323,SPORTSERVICE SOLDIER FIELD,BREWS BROTHER,2010-06-07,Canvass,3
12916,1354323,SPORTSERVICE SOLDIER FIELD,1ST DOWN FRANKS,2013-11-14,Canvass,3
12924,1354323,SPORTSERVICE SOLDIER FIELD,2ND CITY PIZZA,2013-11-14,Canvass,3
12913,1354323,SPORTSERVICE SOLDIER FIELD,1ST DOWN FRANKS,2010-06-07,Canvass,3
8373,1246854,SAVE-A-LOT #855,SAVE-A-LOT #855,2014-12-22,Canvass,3
12920,1354323,SPORTSERVICE SOLDIER FIELD,2ND CITY PIZZA,2010-06-07,Canvass,3
26790,1621492,SUBWAY,SUBWAY,2010-03-22,Suspected Food Poisoning,3


In [15]:
print("How many unique food facilities/inspection date combinations have multiple inspections on that date?")
num_facility_date_multiple_inspections = len(inspection_counts2[inspection_counts2['count']>1])
num_facility_date_comb = len(inspection_counts2)
print(f"{num_facility_date_multiple_inspections} out of {num_facility_date_comb} ({round(100*num_facility_date_multiple_inspections/num_facility_date_comb, 2)}%)")

print("How many inspection records are involved ?")
num_inspection_involved = inspection_counts2[inspection_counts2['count']>1]['count'].sum()
print(f"{num_inspection_involved} out of {num_inspection} ({round(100*num_inspection_involved/num_inspection, 2)}%)")

print("How many unique license numbers are involved?")
num_license_involved = len(inspection_counts2[inspection_counts2['count']>1][['License #']].drop_duplicates())
print(f"{num_license_involved} out of {num_license} ({round(100*num_license_involved/num_license, 2)}%)")

print("How many unique food facilities are involved?")
num_facility_involved = len(inspection_counts2[inspection_counts2['count']>1][['License #', 'AKA Name']].drop_duplicates())
print(f"{num_facility_involved} out of {num_facility} ({round(100*num_facility_involved/num_facility, 2)}%)")

print("How many unique inspection dates are involved?")
num_dates_involved = len(inspection_counts2[inspection_counts2['count']>1][['Inspection Date']].drop_duplicates())
print(f"{num_dates_involved} out of {num_dates} ({round(100*num_dates_involved/num_dates, 2)}%)")

How many unique food facilities/inspection date combinations have multiple inspections on that date?
285 out of 153506 (0.19%)
How many inspection records are involved ?
589 out of 153810 (0.38%)
How many unique license numbers are involved?
247 out of 32752 (0.75%)
How many unique food facilities are involved?
261 out of 33886 (0.77%)
How many unique inspection dates are involved?
234 out of 1946 (12.02%)


What's happening there? Take one example to look into the details:

In [86]:
input_data[(input_data['License #']=='1354323') & (input_data['AKA Name']=='GRIDIRON GRILL') & (input_data['Inspection Date']=='11/14/13')]

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Location
79430,1375402,SPORTSERVICE SOLDIER FIELD,GRIDIRON GRILL,1354323,Restaurant,Risk 2 (Medium),1410 S MUSEUM CAMPUS DR,CHICAGO,IL,60605,2013-11-14,Canvass,Out of Business,,41.862628,-87.615031,"(41.862628294037634, -87.61503109411332)"
79725,1375455,SPORTSERVICE SOLDIER FIELD,GRIDIRON GRILL,1354323,Restaurant,Risk 2 (Medium),1410 S MUSEUM CAMPUS DR,CHICAGO,IL,60605,2013-11-14,Canvass,Out of Business,,41.862628,-87.615031,"(41.862628294037634, -87.61503109411332)"
79730,1375451,SPORTSERVICE SOLDIER FIELD,GRIDIRON GRILL,1354323,Restaurant,Risk 2 (Medium),1410 S MUSEUM CAMPUS DR,CHICAGO,IL,60605,2013-11-14,Canvass,Out of Business,,41.862628,-87.615031,"(41.862628294037634, -87.61503109411332)"
79827,1375416,SPORTSERVICE SOLDIER FIELD,GRIDIRON GRILL,1354323,Restaurant,Risk 2 (Medium),1410 S MUSEUM CAMPUS DR,CHICAGO,IL,60605,2013-11-14,Canvass,Out of Business,,41.862628,-87.615031,"(41.862628294037634, -87.61503109411332)"
79831,1375414,SPORTSERVICE SOLDIER FIELD,GRIDIRON GRILL,1354323,Restaurant,Risk 2 (Medium),1410 S MUSEUM CAMPUS DR,CHICAGO,IL,60605,2013-11-14,Canvass,Out of Business,,41.862628,-87.615031,"(41.862628294037634, -87.61503109411332)"
79840,1375409,SPORTSERVICE SOLDIER FIELD,GRIDIRON GRILL,1354323,Restaurant,Risk 2 (Medium),1410 S MUSEUM CAMPUS DR,CHICAGO,IL,60605,2013-11-14,Canvass,Out of Business,,41.862628,-87.615031,"(41.862628294037634, -87.61503109411332)"
79842,1375407,SPORTSERVICE SOLDIER FIELD,GRIDIRON GRILL,1354323,Restaurant,Risk 2 (Medium),1410 S MUSEUM CAMPUS DR,CHICAGO,IL,60605,2013-11-14,Canvass,Out of Business,,41.862628,-87.615031,"(41.862628294037634, -87.61503109411332)"


We see that all of the Results are "Out of Business"!

In [11]:
temp2_ = input_data[['Inspection ID', 'License #', 'DBA Name', 'AKA Name', 'Inspection Date', 'Inspection Type', 'Results', 'Violations']]
inspection_counts2_ = temp2_.groupby(by=['License #', 'DBA Name', 'AKA Name', 'Inspection Date', 'Inspection Type', 'Results', 'Violations'], dropna=False).size().rename('count').reset_index()\
    .sort_values(by='count', ascending=False)
inspection_counts2_.head(20)

Unnamed: 0,License #,DBA Name,AKA Name,Inspection Date,Inspection Type,Results,Violations,count
12995,1354323.0,SPORTSERVICE SOLDIER FIELD,GRIDIRON GRILL,2013-11-14,Canvass,Out of Business,,7
12955,1354323.0,SPORTSERVICE SOLDIER FIELD,2ND CITY PIZZA,2013-11-14,Canvass,Out of Business,,3
69824,2065013.0,FRANK'S FRANKS,FRANK'S FRANKS,2012-02-16,Canvass,Out of Business,,3
12994,1354323.0,SPORTSERVICE SOLDIER FIELD,GRIDIRON GRILL,2011-05-24,Canvass,Pass,,3
8389,1246854.0,SAVE-A-LOT #855,SAVE-A-LOT #855,2014-12-22,Canvass,Out of Business,,3
12945,1354323.0,SPORTSERVICE SOLDIER FIELD,1ST DOWN FRANKS,2013-11-14,Canvass,Out of Business,,3
60369,2013962.0,KENDALL COLLEGE,LAB KITCHEN B-ROOM 120B.,2013-09-09,Canvass,Out of Business,,3
130427,26334.0,PARADISE GARDEN PRODUCE INC,PARADISE GARDEN PRODUCE INC,2014-12-04,Canvass,Out of Business,,2
58052,2002692.0,CHICAGO MINI MART,,2016-12-08,Canvass,Out of Business,,2
148334,66351.0,ASPIRA HAUGAN,ASPIRA HAUGAN MIDDLE SCHOOL,2010-01-20,Canvass,Pass,,2


Let's exclude the "Out of Business" Results:

In [10]:
temp3 = input_data[['Inspection ID', 'License #', 'DBA Name', 'AKA Name', 'Inspection Date', 'Inspection Type', 'Results']]
inspection_counts3 = temp3[temp3['Results'] != 'Out of Business'].groupby(
    by=['License #', 'DBA Name', 'AKA Name', 'Inspection Date', 'Inspection Type'], dropna=False)\
    .size().rename('count').reset_index().sort_values(by='count', ascending=False)
inspection_counts3.head(20)

Unnamed: 0,License #,DBA Name,AKA Name,Inspection Date,Inspection Type,count
11458,1354323,SPORTSERVICE SOLDIER FIELD,GRIDIRON GRILL,2010-06-07,Canvass,5
11461,1354323,SPORTSERVICE SOLDIER FIELD,GRIDIRON GRILL,2011-05-24,Canvass,3
11460,1354323,SPORTSERVICE SOLDIER FIELD,GRIDIRON GRILL,2011-05-23,Canvass,3
11448,1354323,SPORTSERVICE SOLDIER FIELD,BREWS BROTHER,2010-06-07,Canvass,3
23495,1621492,SUBWAY,SUBWAY,2010-03-22,Suspected Food Poisoning,3
11435,1354323,SPORTSERVICE SOLDIER FIELD,2ND CITY PIZZA,2010-06-07,Canvass,3
11430,1354323,SPORTSERVICE SOLDIER FIELD,1ST DOWN FRANKS,2010-06-07,Canvass,3
59566,2049216,URBAN INDIA RESTAURANT,URBAN INDIA RESTAURANT,2010-08-10,License,2
45495,1959931,Chicago DQ Grill and Chill,Chicago DQ Grill and Chill,2012-03-01,Canvass,2
68224,2102689,Yuki Hana Japanese Cuisine & S,Yuki Hana,2011-12-01,Task Force Liquor 1475,2


In [16]:
print("How many unique food facilities/inspection date combinations have multiple inspections on that date?")
num_facility_date_multiple_inspections = len(inspection_counts3[inspection_counts3['count']>1])
num_facility_date_comb = len(inspection_counts3)
print(f"{num_facility_date_multiple_inspections} out of {num_facility_date_comb} ({round(100*num_facility_date_multiple_inspections/num_facility_date_comb, 2)}%)")

print("How many inspection records are involved ?")
num_inspection_involved = inspection_counts3[inspection_counts3['count']>1]['count'].sum()
print(f"{num_inspection_involved} out of {num_inspection} ({round(100*num_inspection_involved/num_inspection, 2)}%)")

print("How many unique license numbers are involved?")
num_license_involved = len(inspection_counts3[inspection_counts3['count']>1][['License #']].drop_duplicates())
print(f"{num_license_involved} out of {num_license} ({round(100*num_license_involved/num_license, 2)}%)")

print("How many unique food facilities are involved?")
num_facility_involved = len(inspection_counts3[inspection_counts3['count']>1][['License #', 'AKA Name']].drop_duplicates())
print(f"{num_facility_involved} out of {num_facility} ({round(100*num_facility_involved/num_facility, 2)}%)")

print("How many unique inspection dates are involved?")
num_dates_involved = len(inspection_counts3[inspection_counts3['count']>1][['Inspection Date']].drop_duplicates())
print(f"{num_dates_involved} out of {num_dates} ({round(100*num_dates_involved/num_dates, 2)}%)")

How many unique food facilities/inspection date combinations have multiple inspections on that date?
190 out of 139817 (0.14%)
How many inspection records are involved ?
389 out of 153810 (0.25%)
How many unique license numbers are involved?
160 out of 32752 (0.49%)
How many unique food facilities are involved?
172 out of 33886 (0.51%)
How many unique inspection dates are involved?
157 out of 1946 (8.07%)


take an example:

In [89]:
input_data[(input_data['License #']=='1354323') & (input_data['AKA Name']=='GRIDIRON GRILL') & (input_data['Inspection Date']=='6/7/10')]

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Location
146428,250633,SPORTSERVICE SOLDIER FIELD,GRIDIRON GRILL,1354323,Restaurant,Risk 2 (Medium),1410 S MUSEUM CAMPUS DR,CHICAGO,IL,60605,2010-06-07,Canvass,Pass,"34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GOO...",41.862628,-87.615031,"(41.862628294037634, -87.61503109411332)"
146436,197469,SPORTSERVICE SOLDIER FIELD,GRIDIRON GRILL,1354323,Restaurant,Risk 2 (Medium),1410 S MUSEUM CAMPUS DR,CHICAGO,IL,60605,2010-06-07,Canvass,Pass,33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENSI...,41.862628,-87.615031,"(41.862628294037634, -87.61503109411332)"
146445,112420,SPORTSERVICE SOLDIER FIELD,GRIDIRON GRILL,1354323,Restaurant,Risk 2 (Medium),1410 S MUSEUM CAMPUS DR,CHICAGO,IL,60605,2010-06-07,Canvass,Pass,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...,41.862628,-87.615031,"(41.862628294037634, -87.61503109411332)"
146450,160390,SPORTSERVICE SOLDIER FIELD,GRIDIRON GRILL,1354323,Restaurant,Risk 2 (Medium),1410 S MUSEUM CAMPUS DR,CHICAGO,IL,60605,2010-06-07,Canvass,Pass,"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR...",41.862628,-87.615031,"(41.862628294037634, -87.61503109411332)"
146493,197476,SPORTSERVICE SOLDIER FIELD,GRIDIRON GRILL,1354323,Restaurant,Risk 2 (Medium),1410 S MUSEUM CAMPUS DR,CHICAGO,IL,60605,2010-06-07,Canvass,Pass,"34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GOO...",41.862628,-87.615031,"(41.862628294037634, -87.61503109411332)"


The "Violations" are different!

In [96]:
input_data.loc[146428, 'Violations']

'34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GOOD REPAIR, COVING INSTALLED, DUST-LESS CLEANING METHODS USED - Comments: The floors shall be constructed per code, be smooth and easily cleaned, and be kept clean and in good repair. \n \nMUST DETAIL CLEAN FLOORS UNDER GRILL AND REMOVE FOOD DEBRIS AND GREASE BUILD UP.  '

In [97]:
input_data.loc[146493, 'Violations']

'34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GOOD REPAIR, COVING INSTALLED, DUST-LESS CLEANING METHODS USED - Comments: The floors shall be constructed per code, be smooth and easily cleaned, and be kept clean and in good repair. \nSTANDING WATER NEXT TO GREASE TRAP. INSTRUCTED TO CLEAN AND MAINTAIN. | 35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTRUCTED PER CODE: GOOD REPAIR, SURFACES CLEAN AND DUST-LESS CLEANING METHODS - Comments: The walls and ceilings shall be in good repair and easily cleaned. \nPEELING PAINT NOTED ON WALL NEXT TO DEEP FRYER, INSTRUCTED MANAGER TO REMOVE AND MAINTAIN.'

Let's also consider results and violatoins: 

In [9]:
temp4 = input_data[['Inspection ID', 'License #', 'DBA Name', 'AKA Name', 'Inspection Date', 'Inspection Type', 'Results', 'Violations']]
inspection_counts4 = temp4[temp4['Results'] != 'Out of Business'].groupby(
    by=['License #', 'DBA Name', 'AKA Name', 'Inspection Date', 'Inspection Type', 'Results', 'Violations'], dropna=False)\
    .size().rename('count').reset_index().sort_values(by='count', ascending=False)
inspection_counts4.head(20)

Unnamed: 0,License #,DBA Name,AKA Name,Inspection Date,Inspection Type,Results,Violations,count
11496,1354323.0,SPORTSERVICE SOLDIER FIELD,GRIDIRON GRILL,2011-05-24,Canvass,Pass,,3
98422,2320537.0,TACO BELL #30409,TACO BELL,2014-04-03,License,Fail,,2
23548,1621492.0,SUBWAY,SUBWAY,2010-03-22,Suspected Food Poisoning,Pass,,2
11476,1354323.0,SPORTSERVICE SOLDIER FIELD,BREWS BROTHER,2010-06-07,Canvass,Pass,,2
77896,2158267.0,PICOSITO GRILL-BAR,PICOSITO GRILL-BAR,2017-06-21,Canvass,No Entry,,2
11465,1354323.0,SPORTSERVICE SOLDIER FIELD,2ND CITY PIZZA,2011-05-24,Canvass,Pass,,2
110774,2437593.0,DUNKIN DONUTS,DUNKIN DONUTS,2016-05-19,License,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,2
51717,2003567.0,LEONIDAS CAFE CHOCOLATERIE,LEONIDAS CAFE CHOCOLATERIE,2010-03-31,License,Pass,,2
11457,1354323.0,SPORTSERVICE SOLDIER FIELD,1ST DOWN FRANKS,2011-05-24,Canvass,Pass,,2
7610,1248767.0,SATAY,SATAY,2014-08-01,Canvass,No Entry,,2


In [17]:
len(inspection_counts4[inspection_counts4['count']>1])

40

In [8]:
# remove the exclusion of "out of business" inspections
temp5 = input_data[['Inspection ID', 'License #', 'DBA Name', 'AKA Name', 'Inspection Date', 'Inspection Type', 'Results', 'Violations']]
inspection_counts5 = temp5.groupby(
    by=['License #', 'DBA Name', 'AKA Name', 'Inspection Date', 'Inspection Type', 'Results', 'Violations'], dropna=False)\
    .size().rename('count').reset_index().sort_values(by='count', ascending=False)
inspection_counts5.head(20)

Unnamed: 0,License #,DBA Name,AKA Name,Inspection Date,Inspection Type,Results,Violations,count
12995,1354323.0,SPORTSERVICE SOLDIER FIELD,GRIDIRON GRILL,2013-11-14,Canvass,Out of Business,,7
12955,1354323.0,SPORTSERVICE SOLDIER FIELD,2ND CITY PIZZA,2013-11-14,Canvass,Out of Business,,3
69824,2065013.0,FRANK'S FRANKS,FRANK'S FRANKS,2012-02-16,Canvass,Out of Business,,3
12994,1354323.0,SPORTSERVICE SOLDIER FIELD,GRIDIRON GRILL,2011-05-24,Canvass,Pass,,3
8389,1246854.0,SAVE-A-LOT #855,SAVE-A-LOT #855,2014-12-22,Canvass,Out of Business,,3
12945,1354323.0,SPORTSERVICE SOLDIER FIELD,1ST DOWN FRANKS,2013-11-14,Canvass,Out of Business,,3
60369,2013962.0,KENDALL COLLEGE,LAB KITCHEN B-ROOM 120B.,2013-09-09,Canvass,Out of Business,,3
130427,26334.0,PARADISE GARDEN PRODUCE INC,PARADISE GARDEN PRODUCE INC,2014-12-04,Canvass,Out of Business,,2
58052,2002692.0,CHICAGO MINI MART,,2016-12-08,Canvass,Out of Business,,2
148334,66351.0,ASPIRA HAUGAN,ASPIRA HAUGAN MIDDLE SCHOOL,2010-01-20,Canvass,Pass,,2


In [19]:
print("How many unique food facilities/inspection date/result combinations have multiple inspections on that date?")
num_facility_date_multiple_inspections = len(inspection_counts5[inspection_counts5['count']>1])
num_facility_date_comb = len(inspection_counts5)
print(f"{num_facility_date_multiple_inspections} out of {num_facility_date_comb} ({round(100*num_facility_date_multiple_inspections/num_facility_date_comb, 2)}%)")

print("How many inspection records are involved ?")
num_inspection_involved = inspection_counts3[inspection_counts5['count']>1]['count'].sum()
print(f"{num_inspection_involved} out of {num_inspection} ({round(100*num_inspection_involved/num_inspection, 2)}%)")

print("How many unique license numbers are involved?")
num_license_involved = len(inspection_counts5[inspection_counts5['count']>1][['License #']].drop_duplicates())
print(f"{num_license_involved} out of {num_license} ({round(100*num_license_involved/num_license, 2)}%)")

print("How many unique food facilities are involved?")
num_facility_involved = len(inspection_counts5[inspection_counts5['count']>1][['License #', 'AKA Name']].drop_duplicates())
print(f"{num_facility_involved} out of {num_facility} ({round(100*num_facility_involved/num_facility, 2)}%)")

print("How many unique inspection dates are involved?")
num_dates_involved = len(inspection_counts5[inspection_counts5['count']>1][['Inspection Date']].drop_duplicates())
print(f"{num_dates_involved} out of {num_dates} ({round(100*num_dates_involved/num_dates, 2)}%)")

How many unique food facilities/inspection date/result combinations have multiple inspections on that date?
92 out of 153707 (0.06%)
How many inspection records are involved ?
91 out of 153810 (0.06%)
How many unique license numbers are involved?
76 out of 32752 (0.23%)
How many unique food facilities are involved?
85 out of 33886 (0.25%)
How many unique inspection dates are involved?
77 out of 1946 (3.96%)


  import sys


take an example:

In [21]:
input_data[(input_data['License #']=='1354323') 
           & (input_data['AKA Name']=='GRIDIRON GRILL') 
           & (input_data['Inspection Date']=='2011-05-24')
           & (input_data['Results']=='Pass')].reset_index(drop=True)

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Location
0,567417,SPORTSERVICE SOLDIER FIELD,GRIDIRON GRILL,1354323,Restaurant,Risk 2 (Medium),1410 S MUSEUM CAMPUS DR,CHICAGO,IL,60605,2011-05-24,Canvass,Pass,,41.862628,-87.615031,"(41.862628294037634, -87.61503109411332)"
1,567415,SPORTSERVICE SOLDIER FIELD,GRIDIRON GRILL,1354323,Restaurant,Risk 2 (Medium),1410 S MUSEUM CAMPUS DR,CHICAGO,IL,60605,2011-05-24,Canvass,Pass,,41.862628,-87.615031,"(41.862628294037634, -87.61503109411332)"
2,569358,SPORTSERVICE SOLDIER FIELD,GRIDIRON GRILL,1354323,Restaurant,Risk 2 (Medium),1410 S MUSEUM CAMPUS DR,CHICAGO,IL,60605,2011-05-24,Canvass,Pass,,41.862628,-87.615031,"(41.862628294037634, -87.61503109411332)"


### Different license numbers for the same food facility?

Example:

In [29]:
input_data[input_data['AKA Name']=='LA MEXICANA TAMALE CART'][
    ['DBA Name','AKA Name','License #','Facility Type','Address','City','State','Zip',
     'Latitude','Longitude','Location', 'Inspection Date']
].drop_duplicates()

Unnamed: 0,DBA Name,AKA Name,License #,Facility Type,Address,City,State,Zip,Latitude,Longitude,Location,Inspection Date
5101,PALETERIA Y NEVERIA LA MEXICANA YOGURT AND CHURRO,LA MEXICANA TAMALE CART,2506389,Mobile Prepared Food Vendor,4171 S ARCHER AVE,CHICAGO,IL,60632,41.817618,-87.698258,"(41.81761758884696, -87.69825774800141)",2017-01-10
11720,PALETERIA Y NEVERIA LA MEXICANA YOGURT AND CHURRO,LA MEXICANA TAMALE CART,2506388,Mobile Prepared Food Vendor,4171 S ARCHER AVE,CHICAGO,IL,60632,41.817618,-87.698258,"(41.81761758884696, -87.69825774800141)",2017-01-10
11723,PALETERIA Y NEVERIA LA MEXICANA YOGURT AND CHURRO,LA MEXICANA TAMALE CART,2506235,Mobile Prepared Food Vendor,4171 S ARCHER AVE,CHICAGO,IL,60632,41.817618,-87.698258,"(41.81761758884696, -87.69825774800141)",2017-01-10
11724,PALETERIA Y NEVERIA LA MEXICANA YOGURT AND CHURRO,LA MEXICANA TAMALE CART,2506234,Mobile Prepared Food Vendor,4171 S ARCHER AVE,CHICAGO,IL,60632,41.817618,-87.698258,"(41.81761758884696, -87.69825774800141)",2017-01-10
11728,PALETERIA Y NEVERIA LA MEXICANA YOGURT AND CHURRO,LA MEXICANA TAMALE CART,2506233,Mobile Prepared Food Vendor,4171 S ARCHER AVE,CHICAGO,IL,60632,41.817618,-87.698258,"(41.81761758884696, -87.69825774800141)",2017-01-10
11730,PALETERIA Y NEVERIA LA MEXICANA YOGURT AND CHURRO,LA MEXICANA TAMALE CART,2506231,Mobile Prepared Food Vendor,4171 S ARCHER AVE,CHICAGO,IL,60632,41.817618,-87.698258,"(41.81761758884696, -87.69825774800141)",2017-01-10
11732,PALETERIA Y NEVERIA LA MEXICANA YOGURT AND CHURRO,LA MEXICANA TAMALE CART,250623,Mobile Prepared Food Vendor,4171 S ARCHER AVE,CHICAGO,IL,60632,41.817618,-87.698258,"(41.81761758884696, -87.69825774800141)",2017-01-10
11734,PALETERIA Y NEVERIA LA MEXICANA YOGURT AND CHURRO,LA MEXICANA TAMALE CART,2506222,Mobile Prepared Food Vendor,4171 S ARCHER AVE,CHICAGO,IL,60632,41.817618,-87.698258,"(41.81761758884696, -87.69825774800141)",2017-01-10
11738,PALETERIA Y NEVERIA LA MEXICANA YOGURT AND CHURRO,LA MEXICANA TAMALE CART,250622,Mobile Prepared Food Vendor,4171 S ARCHER AVE,CHICAGO,IL,60632,41.817618,-87.698258,"(41.81761758884696, -87.69825774800141)",2017-01-10
11740,PALETERIA Y NEVERIA LA MEXICANA YOGURT AND CHURRO,LA MEXICANA TAMALE CART,2506221,Mobile Prepared Food Vendor,4171 S ARCHER AVE,CHICAGO,IL,60632,41.817618,-87.698258,"(41.81761758884696, -87.69825774800141)",2017-01-10


Note that most of them have the facility type of "Mobile Prepared Food Vendor".

In [30]:
temp = input_data[[
    'DBA Name','AKA Name','License #','Facility Type','Address','City','State','Zip',
    'Latitude','Longitude','Location']].drop_duplicates()
facility_counts = temp.groupby(
    by=['DBA Name','AKA Name','Facility Type','Address','City','State','Zip','Latitude','Longitude','Location'], dropna=False)\
    .size().rename('count').reset_index().sort_values(by='count', ascending=False)
facility_counts.head(20)

Unnamed: 0,DBA Name,AKA Name,Facility Type,Address,City,State,Zip,Latitude,Longitude,Location,count
27287,"TRIPLE A SERVICES, INC.","TRIPLE A SERVICES, INC.",Mobile Food Dispenser,2637 S THROOP ST,CHICAGO,IL,60608,41.844671,-87.654962,"(41.84467072147077, -87.6549622241458)",39
19563,PALETERIA Y NEVERIA LA MEXICANA YOGURT AND CHURRO,LA MEXICANA TAMALE CART,Mobile Prepared Food Vendor,4171 S ARCHER AVE,CHICAGO,IL,60632,41.817618,-87.698258,"(41.81761758884696, -87.69825774800141)",16
27864,"VHC LOWER, LLC",3RD FLOOR - BANQUET/MEETING ROOMS(LL KITCHEN),Restaurant,203 N WABASH AVE,CHICAGO,IL,60601,41.885828,-87.626138,"(41.885828292802344, -87.62613826248786)",15
23368,SOHO HOUSE,SOHO HOUSE,Restaurant,113-125 N GREEN ST,CHICAGO,IL,60607,41.883557,-87.648432,"(41.883557097965046, -87.64843239611488)",10
19505,PALETERIA AZTECA #2,,Mobile Food Dispenser,3119 W CERMAK RD,CHICAGO,IL,60623,41.851676,-87.703696,"(41.85167565466277, -87.70369582116497)",10
28092,WALDORF ASTORIA CHICAGO,WALDORF ASTORIA CHICAGO,Restaurant,11 E WALTON ST,CHICAGO,IL,60611,41.899835,-87.627799,"(41.89983492074242, -87.6277991723728)",9
9554,FLAT TOP GRILL,FLAT TOP GRILL,Restaurant,30 S WABASH AVE,CHICAGO,IL,60603,41.881335,-87.626335,"(41.88133480594687, -87.62633546646605)",8
19451,PACIFIC LANGHAM CHICAGO CORPORATION,THE LANGHAM CHICAGO,Restaurant,330 N WABASH AVE,CHICAGO,IL,60611,41.887939,-87.627036,"(41.88793874358949, -87.62703564148782)",7
16748,MARRIOT MARQUIS CHICAGO,MARRIOT MARQUIS CHICAGO,Restaurant,2121 S PRAIRIE AVE,CHICAGO,IL,60616,41.853651,-87.620534,"(41.853650885040594, -87.62053358114167)",7
19502,PALETERIA AZTECA #2,PALETERIA AZTECA #2,Mobile Food Dispenser,3119 W CERMAK RD,CHICAGO,IL,60623,41.851676,-87.703696,"(41.85167565466277, -87.70369582116497)",7


In [33]:
len(facility_counts[facility_counts['count']>1])

3909

In [34]:
len(facility_counts[facility_counts['count']>1])/len(facility_counts)

0.13361361771944216

Note that some of them have the facility type of "Mobile Prepared Food Vendor". Some of the "Restaurant" facilities seem to be hotels, or within a complex.

## 2. split the dataframe into multiple tables

In [11]:
list(input_data)

['Inspection ID',
 'DBA Name',
 'AKA Name',
 'License #',
 'Facility Type',
 'Risk',
 'Address',
 'City',
 'State',
 'Zip',
 'Inspection Date',
 'Inspection Type',
 'Results',
 'Violations',
 'Latitude',
 'Longitude',
 'Location']

### Inspection
(Id, Date, Type, Result)

In [None]:
input_data[['Inspection ID', 'Inspection Date', 'Inspection Type', 'Results']]

### Facility

In [None]:
input_data[['DBA Name', 'AKA Name', 'License #', 'Facility Type', 'Address', 'City', 'State', 'Zip', 'Latitude','Longitude']]

The same License # can have multiple DBA Name?

In [122]:
temp = input_data[(input_data['License #']!='') & (input_data['License #']!='nan') & (~pd.isnull(input_data['License #']))][[
    'License #', 'DBA Name']].drop_duplicates()
temp[temp.duplicated(subset='License #', keep=False)].sort_values(by='License #')

Unnamed: 0,License #,DBA Name
2031,1000774,RECLAIMED BAR AND RESTAURANT
13480,1000774,Crabbby Kim's
112603,104,VITO & NICK'S LOUNGE
65459,104,MIARA'S CONT'L PASTRY DELI INC
17601,1042664,BLOKES & BIRDS
...,...,...
19217,9648,CAFFE ITALIA
3619,968,CHEZ MOI
134794,968,CAFE BERNARD
10660,99,XANDO COFFEE & BAR / COSI SANDWICH BAR


The same License # can have multiple addresses?

In [123]:
temp = input_data[(input_data['License #']!='') & (input_data['License #']!='nan') & (~pd.isnull(input_data['License #']))][[
    'License #', 'Address', 'City', 'State', 'Zip', 'Latitude','Longitude']].drop_duplicates()
temp[temp.duplicated(subset='License #', keep=False)].sort_values(by='License #')

Unnamed: 0,License #,Address,City,State,Zip,Latitude,Longitude
65459,104,7051-7053 W ADDISON ST,CHICAGO,IL,60634,41.945234,-87.802927
112603,104,8433-8435 S PULASKI RD,CHICAGO,IL,60652,41.739329,-87.721440
98142,1045301,160 N LOOMIS ST,CHICAGO,IL,60607,41.883991,-87.662525
18089,1045301,1400 W RANDOLPH ST,CHICAGO,IL,60607,41.884274,-87.662433
99286,1093945,2321 W LAWRENCE AVE,CHICAGO,IL,60625,41.968527,-87.687315
...,...,...,...,...,...,...,...
140385,845,820 S MICHIGAN AVE,CHICAGO,IL,60605,41.871577,-87.624310
137559,901,3835 W 26TH ST,CHICAGO,IL,60623,41.844188,-87.720950
99502,901,6647 S HALSTED ST,CHICAGO,IL,60621,41.772902,-87.644461
10660,99,203 N LA SALLE ST,CHICAGO,IL,60601,41.885822,-87.632304


In [135]:
temp = input_data[(input_data['License #']!='') & (input_data['License #']!='nan') & (~pd.isnull(input_data['License #']))][[
    'License #', 'Address', 'City', 'State', 'Zip', 'Latitude','Longitude']].drop_duplicates()
temp[temp.duplicated(subset=['Address', 'City', 'State', 'Zip', 'Latitude','Longitude'], keep=False)].sort_values(by='Address')

Unnamed: 0,License #,Address,City,State,Zip,Latitude,Longitude
10022,2215538,1 E 113TH ST,CHICAGO,IL,60628,41.688848,-87.622894
116298,21702,1 E 113TH ST,CHICAGO,IL,60628,41.688848,-87.622894
116300,1800876,1 E 113TH ST,CHICAGO,IL,60628,41.688848,-87.622894
42462,1843205,1 E DELAWARE PL,CHICAGO,IL,60611,41.899031,-87.628192
108154,203223,1 E DELAWARE PL,CHICAGO,IL,60611,41.899031,-87.628192
...,...,...,...,...,...,...,...
88145,2013312,9917 S EWING AVE,CHICAGO,IL,60617,41.715244,-87.535130
75440,2046663,9928 S CRANDON AVE FL,CHICAGO,IL,60617,41.714402,-87.567167
9092,3247017,9928 S CRANDON AVE FL,CHICAGO,IL,60617,41.714402,-87.567167
15768,1998729,9939 S HALSTED ST,CHICAGO,IL,60628,41.713035,-87.642832


In [134]:
temp = input_data[['DBA Name', 'AKA Name', 'Address','Zip', 'Risk', 'Inspection Date']].drop_duplicates()
temp[temp.duplicated(subset=['DBA Name', 'AKA Name', 'Address','Zip', 'Inspection Date'], keep=False)]

Unnamed: 0,DBA Name,AKA Name,Address,Zip,Risk,Inspection Date
30,NANDO'S PERI-PERI,NANDO'S PERI-PERI,117 E LAKE ST,60601,Risk 3 (Low),2017-08-25
33,NANDO'S PERI-PERI,NANDO'S PERI-PERI,117 E LAKE ST,60601,Risk 1 (High),2017-08-25
52,THE PRESS ROOM,THE PRESS ROOM,1134 W WASHINGTON BLVD,60607,Risk 3 (Low),2017-08-24
54,THE PRESS ROOM,THE PRESS ROOM,1134 W WASHINGTON BLVD,60607,Risk 1 (High),2017-08-24
149,THE PRESS ROOM,THE PRESS ROOM,1134 W WASHINGTON BLVD,60607,Risk 3 (Low),2017-08-22
...,...,...,...,...,...,...
153143,FLO & SANTO,FLO & SANTO,1310 S WABASH AVE,60605,Risk 3 (Low),2010-01-19
153352,"WISHCO, INC.","WISHCO, INC.",1546 W 35TH ST,60609,Risk 3 (Low),2010-01-13
153353,"WISHCO, INC.","WISHCO, INC.",1546 W 35TH ST,60609,Risk 1 (High),2010-01-13
153608,HONG KONG MARKET,HONG KONG MARKET,520 W 24TH PL,60616,Risk 3 (Low),2010-01-07


In [124]:
list(input_data)

['Inspection ID',
 'DBA Name',
 'AKA Name',
 'License #',
 'Facility Type',
 'Risk',
 'Address',
 'City',
 'State',
 'Zip',
 'Inspection Date',
 'Inspection Type',
 'Results',
 'Violations',
 'Latitude',
 'Longitude',
 'Location']

### Facility_Risk

In [112]:
temp = input_data[input_data['License #']!=''][['License #', 'Risk']].drop_duplicates()
temp[temp.duplicated(subset='License #', keep=False)].sort_values(by='License #')

Unnamed: 0,License #,Risk
65459,104,Risk 1 (High)
112603,104,Risk 3 (Low)
1729,1042664,Risk 1 (High)
91775,1042664,Risk 3 (Low)
4405,1094,Risk 1 (High)
...,...,...
37903,8425,Risk 2 (Medium)
137559,901,Risk 1 (High)
99502,901,Risk 3 (Low)
578,,Risk 2 (Medium)


In [117]:
temp = input_data[~pd.isnull(input_data['Risk'])][['License #', 'DBA Name', 'AKA Name', 'Risk']].drop_duplicates()
temp[temp.duplicated(subset=['License #', 'DBA Name', 'AKA Name'], keep=False)].sort_values(by='License #')

Unnamed: 0,License #,DBA Name,AKA Name,Risk
127712,,FLAV-O-RICIOUS,FLAV-O-RICIOUS,Risk 3 (Low)
127627,,FLAV-O-RICIOUS,FLAV-O-RICIOUS,Risk 2 (Medium)
33799,1579115.0,TURKISH CUISINE AND BAKERY,TURKISH CUISINE AND BAKERY,Risk 1 (High)
85087,1579115.0,TURKISH CUISINE AND BAKERY,TURKISH CUISINE AND BAKERY,Risk 3 (Low)
2313,1947909.0,SUBWAY,SUBWAY,Risk 1 (High)
115550,1947909.0,SUBWAY,SUBWAY,Risk 3 (Low)
80181,1954774.0,Wells Street Popcorn,Wells Street Popcorn,Risk 2 (Medium)
80180,1954774.0,Wells Street Popcorn,Wells Street Popcorn,Risk 3 (Low)
87107,1974918.0,ATINO'S PIZZA,ATINO'S PIZZA,Risk 3 (Low)
87109,1974918.0,ATINO'S PIZZA,ATINO'S PIZZA,Risk 1 (High)


In [116]:
temp = input_data[~pd.isnull(input_data['Risk'])][['License #', 'DBA Name', 'AKA Name', 'Inspection Date', 'Risk']].drop_duplicates()
temp[temp.duplicated(subset=['License #', 'DBA Name', 'AKA Name', 'Inspection Date'], keep=False)].sort_values(by='License #')

Unnamed: 0,License #,DBA Name,AKA Name,Inspection Date,Risk
85087,1579115,TURKISH CUISINE AND BAKERY,TURKISH CUISINE AND BAKERY,2013-08-26,Risk 3 (Low)
85099,1579115,TURKISH CUISINE AND BAKERY,TURKISH CUISINE AND BAKERY,2013-08-26,Risk 1 (High)
115550,1947909,SUBWAY,SUBWAY,2012-02-02,Risk 3 (Low)
115558,1947909,SUBWAY,SUBWAY,2012-02-02,Risk 1 (High)
80180,1954774,Wells Street Popcorn,Wells Street Popcorn,2013-11-07,Risk 3 (Low)
80181,1954774,Wells Street Popcorn,Wells Street Popcorn,2013-11-07,Risk 2 (Medium)
87107,1974918,ATINO'S PIZZA,ATINO'S PIZZA,2013-07-22,Risk 3 (Low)
87109,1974918,ATINO'S PIZZA,ATINO'S PIZZA,2013-07-22,Risk 1 (High)
69370,2013962,KENDALL COLLEGE,LAB KITCHEN B-ROOM 120B.,2013-09-09,Risk 3 (Low)
84339,2013962,KENDALL COLLEGE,LAB KITCHEN B-ROOM 120B.,2013-09-09,Risk 1 (High)


In [136]:
temp = input_data[~pd.isnull(input_data['Risk'])][['License #', 'DBA Name', 'AKA Name', 'Address', 'Inspection Date', 'Risk']].drop_duplicates()
temp[temp.duplicated(subset=['License #', 'DBA Name', 'AKA Name', 'Address', 'Inspection Date'], keep=False)].sort_values(by='License #')

Unnamed: 0,License #,DBA Name,AKA Name,Address,Inspection Date,Risk
85087,1579115,TURKISH CUISINE AND BAKERY,TURKISH CUISINE AND BAKERY,5605-5609 N CLARK ST,2013-08-26,Risk 3 (Low)
85099,1579115,TURKISH CUISINE AND BAKERY,TURKISH CUISINE AND BAKERY,5605-5609 N CLARK ST,2013-08-26,Risk 1 (High)
115550,1947909,SUBWAY,SUBWAY,6450 S PULASKI RD,2012-02-02,Risk 3 (Low)
115558,1947909,SUBWAY,SUBWAY,6450 S PULASKI RD,2012-02-02,Risk 1 (High)
80180,1954774,Wells Street Popcorn,Wells Street Popcorn,2804 N CLARK ST,2013-11-07,Risk 3 (Low)
80181,1954774,Wells Street Popcorn,Wells Street Popcorn,2804 N CLARK ST,2013-11-07,Risk 2 (Medium)
87107,1974918,ATINO'S PIZZA,ATINO'S PIZZA,4610 W DIVERSEY AVE,2013-07-22,Risk 3 (Low)
87109,1974918,ATINO'S PIZZA,ATINO'S PIZZA,4610 W DIVERSEY AVE,2013-07-22,Risk 1 (High)
69370,2013962,KENDALL COLLEGE,LAB KITCHEN B-ROOM 120B.,900 N NORTH BRANCH ST,2013-09-09,Risk 3 (Low)
84339,2013962,KENDALL COLLEGE,LAB KITCHEN B-ROOM 120B.,900 N NORTH BRANCH ST,2013-09-09,Risk 1 (High)


In [137]:
temp = input_data[~pd.isnull(input_data['Risk'])][['License #', 'DBA Name', 'AKA Name', 'Address', 'Risk']].drop_duplicates()
temp[temp.duplicated(subset=['License #', 'DBA Name', 'AKA Name', 'Address'], keep=False)].sort_values(by='License #')

Unnamed: 0,License #,DBA Name,AKA Name,Address,Risk
127712,,FLAV-O-RICIOUS,FLAV-O-RICIOUS,600 E GRAND AVE,Risk 3 (Low)
127627,,FLAV-O-RICIOUS,FLAV-O-RICIOUS,600 E GRAND AVE,Risk 2 (Medium)
33799,1579115.0,TURKISH CUISINE AND BAKERY,TURKISH CUISINE AND BAKERY,5605-5609 N CLARK ST,Risk 1 (High)
85087,1579115.0,TURKISH CUISINE AND BAKERY,TURKISH CUISINE AND BAKERY,5605-5609 N CLARK ST,Risk 3 (Low)
2313,1947909.0,SUBWAY,SUBWAY,6450 S PULASKI RD,Risk 1 (High)
115550,1947909.0,SUBWAY,SUBWAY,6450 S PULASKI RD,Risk 3 (Low)
80180,1954774.0,Wells Street Popcorn,Wells Street Popcorn,2804 N CLARK ST,Risk 3 (Low)
80181,1954774.0,Wells Street Popcorn,Wells Street Popcorn,2804 N CLARK ST,Risk 2 (Medium)
87107,1974918.0,ATINO'S PIZZA,ATINO'S PIZZA,4610 W DIVERSEY AVE,Risk 3 (Low)
87109,1974918.0,ATINO'S PIZZA,ATINO'S PIZZA,4610 W DIVERSEY AVE,Risk 1 (High)


In [115]:
temp = input_data[~pd.isnull(input_data['Risk'])][['License #', 'DBA Name', 'Risk']].drop_duplicates()
temp[temp.duplicated(subset=['License #', 'DBA Name'], keep=False)].sort_values(by='License #')

Unnamed: 0,License #,DBA Name,Risk
127712,,FLAV-O-RICIOUS,Risk 3 (Low)
127627,,FLAV-O-RICIOUS,Risk 2 (Medium)
149961,1120537,LEVY RESTAURANTS AT U.S CELLULAR FIELD,Risk 1 (High)
127783,1120537,LEVY RESTAURANTS AT U.S CELLULAR FIELD,Risk 2 (Medium)
95475,135245,ILLINOIS INSTITUTE OF TECHNOLOGY,Risk 1 (High)
...,...,...,...
47119,32394,MART PLAZA CATERING CORP,Risk 1 (High)
36142,37979,BISMARCK ENTERPRISES/UNITED,Risk 2 (Medium)
84078,37979,BISMARCK ENTERPRISES/UNITED,Risk 1 (High)
120491,6121,MACY'S,Risk 2 (Medium)


### Violation and  Inspection_Violation

In [64]:
violation_df = input_data[~pd.isnull(input_data['Violations'])][['Inspection ID', 'Violations']].reset_index(drop=True)
violation_df

Unnamed: 0,Inspection ID,Violations
0,2079125,8. SANITIZING RINSE FOR EQUIPMENT AND UTENSILS...
1,2079123,"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR..."
2,2079105,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...
3,2079104,2. FACILITIES TO MAINTAIN PROPER TEMPERATURE -...
4,2079091,2. FACILITIES TO MAINTAIN PROPER TEMPERATURE -...
...,...,...
123007,67733,2. FACILITIES TO MAINTAIN PROPER TEMPERATURE -...
123008,67732,2. FACILITIES TO MAINTAIN PROPER TEMPERATURE -...
123009,52234,32. FOOD AND NON-FOOD CONTACT SURFACES PROPERL...
123010,70269,2. FACILITIES TO MAINTAIN PROPER TEMPERATURE -...


In [10]:
input_data['Violations'].values[2]

'8. SANITIZING RINSE FOR EQUIPMENT AND UTENSILS:  CLEAN, PROPER TEMPERATURE, CONCENTRATION, EXPOSURE TIME - Comments: NO DISH WASHING FACILITIES ON SITE, (NO THREE COMPARTMENT SINK, WITH GREASE TRAP, OR DISHMACHINE), INSTRUCTED TO PROVIDE, | 11. ADEQUATE NUMBER, CONVENIENT, ACCESSIBLE, DESIGNED, AND MAINTAINED - Comments:  NO EXPOSED HAND SINK FOR REAR SERVICE AREA, INSTRUCTED TO PROVIDE,\n | 18. NO EVIDENCE OF RODENT OR INSECT OUTER OPENINGS PROTECTED/RODENT PROOFED, A WRITTEN LOG SHALL BE MAINTAINED AVAILABLE TO THE INSPECTORS - Comments: NO LICENSE PEST CONTROL LOG BOOK AT THIS TIME OF INSPECTION, INSTRUCTED TO PROVIDE, UPON NEXT VISIT,'

In [65]:
violation_df = violation_df.assign(
    Violations=violation_df['Violations'].str.split(' \| ')).explode('Violations').reset_index(drop=True)
violation_df.head(10)

Unnamed: 0,Inspection ID,Violations
0,2079125,8. SANITIZING RINSE FOR EQUIPMENT AND UTENSILS...
1,2079125,"11. ADEQUATE NUMBER, CONVENIENT, ACCESSIBLE, D..."
2,2079125,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...
3,2079123,"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR..."
4,2079123,"34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GOO..."
5,2079105,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...
6,2079105,31. CLEAN MULTI-USE UTENSILS AND SINGLE SERVIC...
7,2079105,"34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GOO..."
8,2079105,38. VENTILATION: ROOMS AND EQUIPMENT VENTED AS...
9,2079105,"41. PREMISES MAINTAINED FREE OF LITTER, UNNECE..."


In [44]:
violation_df.loc[0, 'Violations']

'8. SANITIZING RINSE FOR EQUIPMENT AND UTENSILS:  CLEAN, PROPER TEMPERATURE, CONCENTRATION, EXPOSURE TIME - Comments: NO DISH WASHING FACILITIES ON SITE, (NO THREE COMPARTMENT SINK, WITH GREASE TRAP, OR DISHMACHINE), INSTRUCTED TO PROVIDE,'

In [45]:
violation_df.loc[2, 'Violations']

'18. NO EVIDENCE OF RODENT OR INSECT OUTER OPENINGS PROTECTED/RODENT PROOFED, A WRITTEN LOG SHALL BE MAINTAINED AVAILABLE TO THE INSPECTORS - Comments: NO LICENSE PEST CONTROL LOG BOOK AT THIS TIME OF INSPECTION, INSTRUCTED TO PROVIDE, UPON NEXT VISIT,'

In [46]:
violation_df.loc[5, 'Violations']

'18. NO EVIDENCE OF RODENT OR INSECT OUTER OPENINGS PROTECTED/RODENT PROOFED, A WRITTEN LOG SHALL BE MAINTAINED AVAILABLE TO THE INSPECTORS - Comments: VIOLATION CORRECTED'

In [47]:
violation_df.loc[12, 'Violations']

'21. * CERTIFIED FOOD MANAGER ON SITE WHEN POTENTIALLY HAZARDOUS FOODS ARE  PREPARED AND SERVED - Comments: NO ORIGINAL CHICAGO FOOD SANITATION CERTIFICATE POSTED. INSTRUCTED MANAGER TO PROVIDE ORIGINAL FOOD SANITATION CERTIFICATE. COPIES ARE NOT ACCEPTABLE. SERIOUS VIOLATION 7-38-012A'

In [49]:
violation_df.loc[30, 'Violations']

'21. * CERTIFIED FOOD MANAGER ON SITE WHEN POTENTIALLY HAZARDOUS FOODS ARE  PREPARED AND SERVED - Comments: NO CITY OF CHICAGO CERTIFIED FOOD MANAGER ON SITE DURING INITIAL INSPECTION WHILE FOODS WERE BEING PREPARED. CERTIFIED MANAGER WAS TELEPHONED AT THE INITIAL INSPECTION AND ARRIVED THIRTY MINUTES LATER. INSTRUCTED A CITY OF CHICAGO CERTIFIED MANAGER MUST BE ON SITE AT ALL TIMES WHILE FOODS ARE PREPARED. SERIOUS VIOLATION. 7-38-012.'

In [66]:
def split_violation(violation):
    violation, comment = violation.split(' - Comments: ')
    code, description = violation.split('. ', 1)
    return code, description, comment
violation_df['code'], violation_df['description'], violation_df['comment'] = zip(*violation_df['Violations'].apply(split_violation))
violation_df = violation_df.drop(columns='Violations').drop_duplicates().reset_index(drop=True)
violation_df

Unnamed: 0,Inspection ID,code,description,comment
0,2079125,8,SANITIZING RINSE FOR EQUIPMENT AND UTENSILS: ...,"NO DISH WASHING FACILITIES ON SITE, (NO THREE ..."
1,2079125,11,"ADEQUATE NUMBER, CONVENIENT, ACCESSIBLE, DESIG...","NO EXPOSED HAND SINK FOR REAR SERVICE AREA, I..."
2,2079125,18,NO EVIDENCE OF RODENT OR INSECT OUTER OPENINGS...,NO LICENSE PEST CONTROL LOG BOOK AT THIS TIME ...
3,2079123,35,"WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTRUCTE...",MUST PROVIDE OVERHEAD PROTECTION FOR COFFEE/TE...
4,2079123,34,"FLOORS: CONSTRUCTED PER CODE, CLEANED, GOOD RE...",FLOORS AT COFFEE/TEA STATIONS MUST BE MADE SMO...
...,...,...,...,...
568577,104236,36,LIGHTING: REQUIRED MINIMUM FOOT-CANDLES OF LIG...,"All rooms in which food or drink is prepared, ..."
568578,104236,37,TOILET ROOM DOORS SELF CLOSING: DRESSING ROOMS...,None of the operations connected with food est...
568579,104236,38,VENTILATION: ROOMS AND EQUIPMENT VENTED AS REQ...,The flow of air discharged from kitchen fans s...
568580,104236,39,LINEN: CLEAN AND SOILED PROPERLY STORED,"Soiled and clean linens, coats, and aprons sha..."


In [67]:
violation_df[['Inspection ID', 'code']].duplicated().sum()

7599

In [68]:
violations = violation_df[['code', 'description']].drop_duplicates().reset_index(drop=True)
violations

Unnamed: 0,code,description
0,8,SANITIZING RINSE FOR EQUIPMENT AND UTENSILS: ...
1,11,"ADEQUATE NUMBER, CONVENIENT, ACCESSIBLE, DESIG..."
2,18,NO EVIDENCE OF RODENT OR INSECT OUTER OPENINGS...
3,35,"WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTRUCTE..."
4,34,"FLOORS: CONSTRUCTED PER CODE, CLEANED, GOOD RE..."
5,31,CLEAN MULTI-USE UTENSILS AND SINGLE SERVICE AR...
6,38,VENTILATION: ROOMS AND EQUIPMENT VENTED AS REQ...
7,41,"PREMISES MAINTAINED FREE OF LITTER, UNNECESSAR..."
8,2,FACILITIES TO MAINTAIN PROPER TEMPERATURE
9,21,* CERTIFIED FOOD MANAGER ON SITE WHEN POTENTIA...


In [71]:
inspection_violation = violation_df[['Inspection ID', 'code', 'comment']]
inspection_violation

Unnamed: 0,Inspection ID,code,comment
0,2079125,8,"NO DISH WASHING FACILITIES ON SITE, (NO THREE ..."
1,2079125,11,"NO EXPOSED HAND SINK FOR REAR SERVICE AREA, I..."
2,2079125,18,NO LICENSE PEST CONTROL LOG BOOK AT THIS TIME ...
3,2079123,35,MUST PROVIDE OVERHEAD PROTECTION FOR COFFEE/TE...
4,2079123,34,FLOORS AT COFFEE/TEA STATIONS MUST BE MADE SMO...
...,...,...,...
568577,104236,36,"All rooms in which food or drink is prepared, ..."
568578,104236,37,None of the operations connected with food est...
568579,104236,38,The flow of air discharged from kitchen fans s...
568580,104236,39,"Soiled and clean linens, coats, and aprons sha..."


Note that a certain inspection id/violation code combination can have multiple comments. For example:

In [73]:
inspection_violation.groupby(by=['Inspection ID', 'code']).size().rename('count').reset_index()\
    .sort_values(by='count', ascending=False).reset_index(drop=True)

Unnamed: 0,Inspection ID,code,count
0,1764036,34,9
1,1975476,34,8
2,1547061,35,7
3,1501481,32,7
4,1234494,32,6
...,...,...,...
560978,1459941,28,1
560979,1459941,21,1
560980,1459940,41,1
560981,1459940,35,1


In [75]:
inspection_violation[(inspection_violation['Inspection ID']== '1764036') & (inspection_violation['code'] == '34')]

Unnamed: 0,Inspection ID,code,comment
99332,1764036,34,REGROUT THE FLOOR IN FOOD PREP AREA ALL THE WA...
99333,1764036,34,DETAIL CLEAN FLOOR INSIDE WALK-IN COOLER AT BO...
99334,1764036,34,OBSERVED STANDING WATER ON FLOOR ALONG FRONT ...
99336,1764036,34,FLOOR ALONG WALLBASE AND IN THE CORNERS IN NEE...
99337,1764036,34,MUST CLEAN FLOOR ALONG WALLS AND IN THE CORNER...
99338,1764036,34,MUST CLEAN FLOOR ALONG WALLS AND CORNERS AT BO...
99341,1764036,34,OBSERVED FRONT LINE WATER STANDING ON FLOOR AL...
99342,1764036,34,MUST CLEAN FLOOR ALONG THE WALLS AND IN THE CO...
99343,1764036,34,MUST CLEAN FLOOR ALONG THE WALLS AND IN CORNER...


### Inspection_Risk

In [85]:
risk_df = input_data[~pd.isnull(input_data['Risk'])][['Inspection ID', 'Risk']].reset_index(drop=True)
risk_df

Unnamed: 0,Inspection ID,Risk
0,2079132,Risk 1 (High)
1,2079129,Risk 2 (Medium)
2,2079125,Risk 3 (Low)
3,2079123,Risk 1 (High)
4,2079105,Risk 1 (High)
...,...,...
153739,67732,Risk 1 (High)
153740,67757,Risk 2 (Medium)
153741,52234,Risk 1 (High)
153742,70269,Risk 1 (High)


In [103]:
risk_df['Risk'].value_counts()

Risk 1 (High)      107351
Risk 2 (Medium)     31845
Risk 3 (Low)        14529
All                    19
Name: Risk, dtype: int64

In [81]:
list(re.compile(r'Risk (\d+) \((\w+)\)').match('Risk 2 (Medium)').groups())

['2', 'Medium']

In [98]:
def extract_risk(s):
    matched = re.compile(r'\w+ (?P<code>\d+) \((?P<level>\w+)\)').match(s)
    if matched:
        return matched.group('code'), matched.group('level')
    else:
        return '', ''
    
print(extract_risk('Risk 2 (Medium)') ) 
print(extract_risk('Risk2(Medium)') ) 

('2', 'Medium')
('', '')


In [99]:
risk_df['code'], risk_df['level'] = zip(*risk_df['Risk'].apply(extract_risk))

In [100]:
risk_df

Unnamed: 0,Inspection ID,Risk,code,level
0,2079132,Risk 1 (High),1,High
1,2079129,Risk 2 (Medium),2,Medium
2,2079125,Risk 3 (Low),3,Low
3,2079123,Risk 1 (High),1,High
4,2079105,Risk 1 (High),1,High
...,...,...,...,...
153739,67732,Risk 1 (High),1,High
153740,67757,Risk 2 (Medium),2,Medium
153741,52234,Risk 1 (High),1,High
153742,70269,Risk 1 (High),1,High


In [101]:
risk_df[risk_df['code']=='']

Unnamed: 0,Inspection ID,Risk,code,level
161,2078759,All,,
201,2078685,All,,
300,2078476,All,,
391,2078260,All,,
3139,2059435,All,,
4210,2050229,All,,
4672,2049776,All,,
6243,2010064,All,,
58570,1501748,All,,
67584,1474120,All,,


In [89]:
risk_df['Risk'].apply(extract_risk)

AttributeError: 'NoneType' object has no attribute 'groups'

In [None]:
>>> import re
... 
... input_example = "Today at 12:30 PM on Rakesh's Echo"
... regexp_1 = re.compile(r'(?P<day>\w+) at (?P<time>(\d+):(\d+) (\w+)) on (?P<place>\w+)')
... re_match = regexp_1.match(input_example)
>>> list(re_match.groups())
['Today', '12:30 PM', '12', '30', 'PM', 'Rakesh']
>>> re_match.group('day')
'Today'
>>> re_match.group('time')
'12:30 PM'
>>> re_match.group('place')
'Rakesh'

### Inspection_Facility