# Data Integrity Check and Comparison

## 1. Connect to the database:
Connect to the database using SQLite

In [1]:
import pandas as pd
import sqlite3
con = sqlite3.connect("data/db/food_inspections.db")

Check the tables in the database

In [2]:
facility_test = pd.read_sql_query('select * from facility limit 5;', con)
facility_test

Unnamed: 0,id,name_dba,name_aka,license_id,facility_type,address,zip
0,0,#1 CHINA EXPRESS LTD,#1 CHINA EXPRESS LTD,1869616,restaurant,4248 S WENTWORTH AVE,60609
1,1,#1 CHOP SUEY,#1 CHOP SUEY,2042372,restaurant,1907 W 87TH ST,60620
2,2,#1 CHOP SUEY,#1 CHOP SUEY,1970042,restaurant,4307 W IRVING PARK RD,60641
3,3,#1 CHOP SUEY RESTAURANT,#1 CHOP SUEY RESTAURANT,1739799,restaurant,4307 W IRVING PARK RD,60641
4,4,#1 DELI INC,31 DELI INC,2501188,grocery,6901 S NORMAL BLVD,60621


In [3]:
address_test = pd.read_sql_query('select * from address limit 5;', con)
address_test

Unnamed: 0,street,zip,city,state,latitude,longitude
0,2121 S PRAIRIE AVE,60616,CHICAGO,IL,41.853651,-87.620534
1,1025 W MADISON ST,60607,CHICAGO,IL,41.881572,-87.653052
2,301 N JUSTINE ST,60607,CHICAGO,IL,41.886577,-87.665328
3,2245 N MCVICKER AVE,60639,CHICAGO,IL,41.921675,-87.776711
4,4355 S COTTAGE GROVE AVE,60653,CHICAGO,IL,41.815197,-87.606531


In [4]:
risk_test = pd.read_sql_query('select * from risk;', con)
risk_test

Unnamed: 0,code,level
0,0,All
1,1,High
2,2,Medium
3,3,Low


In [5]:
facility_risk_test = pd.read_sql_query('select * from facility_risk limit 5;', con)
facility_risk_test

Unnamed: 0,facility_id,risk_code
0,0,1
1,1,1
2,2,1
3,3,1
4,4,2


In [6]:
inspection_test = pd.read_sql_query('select * from inspection limit 5;', con)
inspection_test

Unnamed: 0,id,date,type,result
0,67732,2010-01-04,license re-inspection,Pass
1,67733,2010-01-04,license re-inspection,Pass
2,52234,2010-01-04,license re-inspection,Pass
3,67757,2010-01-04,tag removal,Pass
4,70269,2010-01-04,license re-inspection,Pass


In [7]:
facility_inspection_test = pd.read_sql_query('select * from facility_inspection limit 5;', con)
facility_inspection_test

Unnamed: 0,facility_id,inspection_id
0,0,467259
1,0,1285263
2,1,1296904
3,1,1395782
4,1,277104


In [8]:
violation_test = pd.read_sql_query('select * from violation limit 5;', con)
violation_test

Unnamed: 0,code,description
0,1,"SOURCE SOUND CONDITION, NO SPOILAGE, FOODS PRO..."
1,2,FACILITIES TO MAINTAIN PROPER TEMPERATURE
2,3,POTENTIALLY HAZARDOUS FOOD MEETS TEMPERATURE R...
3,4,SOURCE OF CROSS CONTAMINATION CONTROLLED I.E. ...
4,5,PERSONNEL WITH INFECTIONS RESTRICTED: NO OPEN ...


In [9]:
inspection_violation_test = pd.read_sql_query('select * from inspection_violation limit 5;', con)
inspection_violation_test

Unnamed: 0,inspection_id,violation_code,violation_comment
0,44247,30,All food not stored in the original container ...
1,44247,32,OBSERVED TORN DOOR GASKET ON DOOR OF 'CHILL IS...
2,44247,33,All food and non-food contact surfaces of equi...
3,44247,34,"The floors shall be constructed per code, be s..."
4,44247,37,Toilet rooms shall be completely enclosed and ...


## 2. Comparison and Integrity Check

This section compares the cleaned data with the original data.

First, we need to load the original data.

In [10]:
orig_data = pd.read_csv("data/original/Food_Inspections.csv")
orig_data.head()


Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Location
0,2079132,MARRIOT MARQUIS CHICAGO,MARRIOT MARQUIS CHICAGO,2517328.0,Restaurant,Risk 1 (High),2121 S PRAIRIE AVE,CHICAGO,IL,60616.0,08/28/2017,License,Pass,,41.853651,-87.620534,"(41.853650885040594, -87.62053358114167)"
1,2079129,JET'S PIZZA,JET'S PIZZA,2522268.0,Restaurant,Risk 2 (Medium),1025 W MADISON ST,CHICAGO,IL,60607.0,08/28/2017,License,Not Ready,,41.881572,-87.653052,"(41.88157249576794, -87.65305233593274)"
2,2079125,ROOM 1520,ROOM 1520,2446638.0,Special Event,Risk 3 (Low),301 N JUSTINE ST,CHICAGO,IL,60607.0,08/28/2017,License Re-Inspection,Not Ready,8. SANITIZING RINSE FOR EQUIPMENT AND UTENSILS...,41.886577,-87.665328,"(41.88657652150854, -87.66532812140231)"
3,2079123,MARRIOT MARQUIS CHICAGO,MARRIOT MARQUIS CHICAGO,2517338.0,Restaurant,Risk 1 (High),2121 S PRAIRIE AVE,CHICAGO,IL,60616.0,08/28/2017,License,Pass,"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR...",41.853651,-87.620534,"(41.853650885040594, -87.62053358114167)"
4,2079105,CHARTWELLS,CICS WEST BELDEN CAMPUS,2549079.0,CHARTER SCHOOL,Risk 1 (High),2245 N MCVICKER AVE,CHICAGO,IL,60639.0,08/28/2017,License Re-Inspection,Pass,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,41.921675,-87.776711,"(41.921675488910864, -87.7767113569357)"


### 2.1. Address Integrity Check
#### 2.1.1. City
The original data have un-uniformed format , and have lots of invalid city names such as 'CCHICAGO', 'CHICAGOCHICAGO', 'CHCHICAGO', 'CHICAGOI', 'CHCICAGO', '312CHICAGO'.

In [11]:
address_city_original = orig_data['City'].value_counts()
address_city_original

CHICAGO                 153090
Chicago                    258
chicago                     77
CCHICAGO                    39
SCHAUMBURG                  18
MAYWOOD                     14
ELK GROVE VILLAGE           12
CHicago                     10
SKOKIE                       8
INACTIVE                     8
CHESTNUT STREET              8
EVANSTON                     7
CICERO                       6
CHICAGOCHICAGO               6
CHCHICAGO                    6
CHARLES A HAYES              6
WORTH                        5
ELMHURST                     5
OAK PARK                     4
CALUMET CITY                 4
SUMMIT                       4
EAST HAZEL CREST             3
ALSIP                        3
CHICAGOI                     3
NILES NILES                  3
CHCICAGO                     3
SCHILLER PARK                3
CHICAGO HEIGHTS              2
BERWYN                       2
BLUE ISLAND                  2
STREAMWOOD                   2
NAPERVILLE                   2
312CHICA

The cleaned city column have uniform upper case format.
And invalid city names are cleaned.

In [12]:
address_city_integrity = pd.read_sql_query('select city from address where city != "" group by city;', con)
address_city_integrity


Unnamed: 0,city
0,ALSIP
1,BANNOCKBURNDEERFIELD
2,BERWYN
3,BLOOMINGDALE
4,BLUE ISLAND
5,BOLINGBROOK
6,BRIDEVIEW
7,BROADVIEW
8,CALUMET CITY
9,CHICAGO


#### 2.1.2. Duplicate address for the same location

In the original data, there are different variations of addresses for the same location (latitude & longitude) and city/state/zip:

In [13]:
temp = orig_data[['Address', 'Latitude', 'Longitude']].drop_duplicates()
address_duplicated_original = temp.groupby(
    by=['Latitude', 'Longitude'], dropna=False)\
    .filter(lambda x: len(x) > 1).sort_values(by=['Latitude', 'Address'], ascending=False)

address_duplicated_original
#with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#    display(address_duplicated)

Unnamed: 0,Address,Latitude,Longitude
115964,1636 W Howard ST,42.019491,-87.672224
6882,1636 W HOWARD ST,42.019491,-87.672224
83917,1632 W howard BLDG,42.019490,-87.672057
25889,1632 W HOWARD ST,42.019490,-87.672057
77277,7410 N clark BLDG,42.016328,-87.675398
...,...,...,...
37884,12201 S HALSTED ST,41.672270,-87.641576
136481,12760 S Halsted ST,41.661467,-87.641508
69411,12760 S HALSTED ST,41.661467,-87.641508
100604,13703 S Indiana BLDG,41.646554,-87.617083


The cleaned data shall have unique address, latitude and longitude. The following query checks if we can duplicated addresses.

In [14]:
address_count_cleaned = pd.read_sql_query\
    ('select street, latitude, longitude, count(*) from address where latitude != "" group by latitude, longitude  having count(*) > 1;', con)
address_count_cleaned


Unnamed: 0,street,latitude,longitude,count(*)


#### 2.1.3. City Inference

In the original data, same latitude and longitude have conflict city names.
- Some locations have a certain city and "NaN". We inferred the missing NaN value for this scenario.
- For Some locations that have conflict city names. We chose the one with a higher count.
We inferred the missing NaN

In [15]:
temp = orig_data.groupby(by=['Latitude', 'Longitude', 'City'], dropna=False).size().rename('count').reset_index()
duplicated_city = temp.groupby(by=['Latitude', 'Longitude'], dropna=False)\
    .filter(lambda x: len(x) > 1).sort_values(by=['Latitude', 'Longitude', 'count'], ascending=False)
duplicated_city

Unnamed: 0,Latitude,Longitude,City,count
15705,42.008536,-87.914428,CHICAGO,1874
15706,42.008536,-87.914428,,10
15299,41.997452,-87.687581,CHICAGO,3
15300,41.997452,-87.687581,CHICAGOI,3
14856,41.985066,-87.668949,CHICAGO,11
...,...,...,...,...
1719,41.751592,-87.568606,,1
1629,41.751030,-87.613649,CHICAGO,5
1630,41.751030,-87.613649,,1
1426,41.746236,-87.643766,,12


The cleaned data shows no conflicts:

In [16]:
city_count_cleaned = pd.read_sql_query\
    ('select city, latitude, longitude, count(*) from address where latitude != "" group by latitude, longitude  having count(*) > 1;', con)
city_count_cleaned

Unnamed: 0,city,latitude,longitude,count(*)


#### 2.1.4. Zip code inference
In the original data, same latitude and longitude have conflict zip codes.
- Some locations have a certain zip and "NaN". We inferred the missing NaN value for this scenario.
- For Some locations that have conflict zip numbers. We chose the one with a higher count.
We inferred the missing NaN

In [17]:
temp = orig_data.groupby(by=['Latitude', 'Longitude', 'Zip'], dropna=False).size().rename('count').reset_index()
duplicated_zip = temp.groupby(by=['Latitude', 'Longitude'], dropna=False)\
    .filter(lambda x: len(x) > 1).sort_values(by=['Latitude', 'Longitude', 'count'], ascending=False)
duplicated_zip

Unnamed: 0,Latitude,Longitude,Zip,count
15719,42.008536,-87.914428,60666.0,1881
15718,42.008536,-87.914428,60606.0,3
14556,41.977421,-87.654869,60613.0,1
14557,41.977421,-87.654869,60640.0,1
14446,41.975801,-87.713499,60625.0,11
...,...,...,...,...
1719,41.751592,-87.568606,,1
1629,41.751030,-87.613649,60619.0,5
1630,41.751030,-87.613649,,1
7,41.648818,-87.617080,60827.0,2


The cleaned zip code data does not have the above conflicts.

In [18]:
zip_count_cleaned = pd.read_sql_query\
    ('select zip, latitude, longitude, count(*) from address where latitude != "" group by latitude, longitude  having count(*) > 1;', con)
zip_count_cleaned

Unnamed: 0,zip,latitude,longitude,count(*)


#### 2.1.5. state and city Integrity
In the original data, the state and city was not identical to the same zip code.

In [19]:
temp = orig_data[~orig_data['Zip'].isnull()].groupby(by=['Zip', 'State', 'City'], dropna=False).size().rename('Count').reset_index()
duplicated_city_state = temp.groupby(by='Zip', dropna=False)\
    .filter(lambda x: len(x) > 1).sort_values(by=['Zip', 'Count'], ascending=False)
duplicated_city_state



Unnamed: 0,Zip,State,City,Count
156,60707.0,IL,CHICAGO,882
157,60707.0,IL,Chicago,11
153,60666.0,IL,CHICAGO,1892
154,60666.0,IL,,10
151,60661.0,IL,CHICAGO,2063
...,...,...,...,...
38,60601.0,IL,Chicago,5
29,60461.0,IL,OLYMPIA FIELDS,1
30,60461.0,IL,OOLYMPIA FIELDS,1
12,60153.0,IL,MAYWOOD,14


The cleaned state and city data are identical to the zip code.

In [20]:
state_city_count_cleaned = pd.read_sql_query\
    ('select zip, city, state, count(*) from (select distinct zip,  city,  state from address where zip != "")  group by zip having count(*) > 1;', con)
state_city_count_cleaned


Unnamed: 0,zip,city,state,count(*)


### 2.2. Risk Integrity Check
#### 2.2.1. Risk level
In original data, the risk level can be different for the same facility in a same day.

In [21]:
temp = orig_data[['License #', 'DBA Name', 'AKA Name', 'Address', 'Risk']].drop_duplicates()
duplicated_risk = temp[temp.duplicated(subset=['License #', 'DBA Name', 'AKA Name', 'Address'], keep=False)].sort_values(by='License #')
duplicated_risk

Unnamed: 0,License #,DBA Name,AKA Name,Address,Risk
127712,0.0,FLAV-O-RICIOUS,FLAV-O-RICIOUS,600 E GRAND AVE,Risk 3 (Low)
127627,0.0,FLAV-O-RICIOUS,FLAV-O-RICIOUS,600 E GRAND AVE,Risk 2 (Medium)
33799,1579115.0,TURKISH CUISINE AND BAKERY,TURKISH CUISINE AND BAKERY,5605-5609 N CLARK ST,Risk 1 (High)
85087,1579115.0,TURKISH CUISINE AND BAKERY,TURKISH CUISINE AND BAKERY,5605-5609 N CLARK ST,Risk 3 (Low)
2313,1947909.0,SUBWAY,SUBWAY,6450 S PULASKI RD,Risk 1 (High)
115550,1947909.0,SUBWAY,SUBWAY,6450 S PULASKI RD,Risk 3 (Low)
80181,1954774.0,Wells Street Popcorn,Wells Street Popcorn,2804 N CLARK ST,Risk 2 (Medium)
80180,1954774.0,Wells Street Popcorn,Wells Street Popcorn,2804 N CLARK ST,Risk 3 (Low)
87109,1974918.0,ATINO'S PIZZA,ATINO'S PIZZA,4610 W DIVERSEY AVE,Risk 1 (High)
87107,1974918.0,ATINO'S PIZZA,ATINO'S PIZZA,4610 W DIVERSEY AVE,Risk 3 (Low)


 The cleaned data has no duplicated risk for the same facility.


In [22]:
risk_cleaned = pd.read_sql_query\
    ('select facility_id, risk_code, count(*) from facility_risk group by facility_id,risk_code having count(*) > 1;', con)
risk_cleaned

Unnamed: 0,facility_id,risk_code,count(*)


### 2.3 Violation Integrity Check

There are two types of conclusions: conclusive result and non-conclusive result:
- conclusive results includes 'Pass', 'Fail', 'Pass w/ Conditions';
- non-conclusive results includes 'Out of Business', 'No Entry', 'Not Ready', 'Business Not Located'

In the original report, we found that for some non-conclusive inspection, it still have some violation code.

In [23]:
conclusive_results = ['Pass', 'Fail', 'Pass w/ Conditions']
non_conclusive_results = ['Out of Business', 'No Entry', 'Not Ready', 'Business Not Located']

In [24]:
orig_data[orig_data['Results'].isin(non_conclusive_results) & (~orig_data['Violations'].isnull())]


Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Location
2,2079125,ROOM 1520,ROOM 1520,2446638.0,Special Event,Risk 3 (Low),301 N JUSTINE ST,CHICAGO,IL,60607.0,08/28/2017,License Re-Inspection,Not Ready,8. SANITIZING RINSE FOR EQUIPMENT AND UTENSILS...,41.886577,-87.665328,"(41.88657652150854, -87.66532812140231)"
11,2079076,THE ART OF CHICKEN,THE ART OF CHICKEN,2202818.0,Restaurant,Risk 1 (High),2041 N WESTERN AVE,CHICAGO,IL,60647.0,08/25/2017,Complaint,No Entry,"34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GOO...",41.918742,-87.687301,"(41.918741718038916, -87.68730089444456)"
38,2079101,THE PARTY ROOM,THE PARTY ROOM,2534843.0,Catering,Risk 1 (High),10232 S VINCENNES AVE,CHICAGO,IL,60643.0,08/25/2017,License Re-Inspection,Not Ready,2. FACILITIES TO MAINTAIN PROPER TEMPERATURE -...,41.707675,-87.655848,"(41.70767490603119, -87.65584840757958)"
129,2078837,SILOM12,SILOM12 THAI BUCKTOWN,2064938.0,Restaurant,Risk 1 (High),1846-1848 N MILWAUKEE AVE,CHICAGO,IL,60647.0,08/22/2017,Canvass,No Entry,"30. FOOD IN ORIGINAL CONTAINER, PROPERLY LABEL...",41.914901,-87.684667,"(41.91490097692986, -87.68466742765311)"
316,2078880,PARTS AND LABOR,PARTS AND LABOR,2240274.0,Restaurant,Risk 1 (High),2700 N MILWAUKEE AVE,CHICAGO,IL,60647.0,08/15/2017,Canvass,No Entry,"35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTR...",41.930191,-87.709475,"(41.930190942014534, -87.70947504959446)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109860,1215137,BUBS GYROS,BUB'S GYROS,2069295.0,Restaurant,Risk 1 (High),5800 W IRVING PARK RD,CHICAGO,IL,60634.0,05/24/2012,Complaint Re-Inspection,Out of Business,24. DISH WASHING FACILITIES: PROPERLY DESIGNED...,41.953228,-87.771956,"(41.95322815649395, -87.77195584555682)"
115737,545787,HONEY BEAR NURSERY & KINDERGAR,HONEY BEAR NURSERY & KINDERGAR,5664.0,Daycare (2 - 6 Years),Risk 1 (High),419 W 95TH ST,CHICAGO,IL,60628.0,01/31/2012,Canvass Re-Inspection,Out of Business,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,41.721474,-87.634264,"(41.72147380165719, -87.63426431837996)"
117134,545725,THE BURGER BAR,THE BURGER BAR,2064712.0,Restaurant,Risk 2 (Medium),622 E 71ST ST,CHICAGO,IL,60619.0,01/03/2012,Complaint Re-Inspection,Out of Business,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,41.765919,-87.609521,"(41.765919379539135, -87.60952093374088)"
127908,517377,KITH & KIN,WEBSTER STREET CAFE,1799703.0,Restaurant,Risk 1 (High),1119-1121 W WEBSTER AVE,CHICAGO,IL,60614.0,06/01/2011,Complaint Re-Inspection,Out of Business,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,41.921577,-87.656727,"(41.92157698377766, -87.65672651542604)"


After cleaning, we found that for non_conclusive results, there is no violation code.


In [25]:
non_conclusive_cleaned = pd.read_sql_query\
    ("""select id from Inspection join Inspection_Violation on id = inspection_id where result = "Out of Business" or result = "No Entry" or result = "Not Ready" or result = "Business Not Located" ;""", con)

non_conclusive_cleaned

Unnamed: 0,id
