In [1]:
import pandas as pd
import numpy as np
import pickle 
from matplotlib import pyplot as plt
import warnings
from pyjarowinkler import distance
warnings.filterwarnings("ignore")

# Reading the data

In [2]:
# Snake-case names for the 22 columns of the raw CSV, in file order.
raw_columns = [
    'inspection_id', 'dba_name', 'aka_name', 'license',
    'facility_type', 'risk', 'address', 'city', 'state',
    'zip', 'inspection_date', 'inspection_type', 'results',
    'violations', 'latitude', 'longitude', 'location', 'historical_wards_2013_2015',
    'zip_codes', 'community_areas', 'census_tracts', 'wards'
]
# Columns discarded right after loading.
unused_columns = [
    'historical_wards_2013_2015', 'zip_codes', 'community_areas',
    'census_tracts', 'wards'
]

chicago = pd.read_csv('./data/food-inspections.csv')
chicago.columns = raw_columns
# One drop call instead of five in-place ones.
chicago = chicago.drop(unused_columns, axis=1)

# `columns` is reused by the duplicate check further down; deriving it from the
# two lists above keeps it in sync with the columns actually kept.
columns = [name for name in raw_columns if name not in unused_columns]
chicago.head()

FileNotFoundError: [Errno 2] File b'./chicago-food-inspections/food-inspections.csv' does not exist: b'./chicago-food-inspections/food-inspections.csv'

# Examination of the data
In this part, the following columns of the data are examined and cleaned:
'Inspection ID', 'DBA Name', 'AKA Name', 'License #', 'Facility Type',
'Risk', 'Address', 'City', 'State'

## inspection_id
Some inspections have been inserted twice in the dataset and need to be deleted to avoid duplicates.

In [3]:
inspection_id = chicago['inspection_id']

We first verify that no entry has a null id

In [4]:
# Number of inspections whose id is missing
inspection_id.isnull().sum()

0

Then, we check that inspections sharing the same id were indeed duplicates and delete the redundant entries

In [5]:
# Inspection ids that appear more than once in the dataset.
id_counts = inspection_id.value_counts()
doublons = id_counts[id_counts > 1].index

# Ids whose two rows agree on every column, i.e. true duplicates.
redundance = []
for doublon in doublons:
    temp = chicago[chicago['inspection_id'] == doublon]

    if len(temp) != 2:
        print('The id', doublon, 'has more than 2 entries!') # Checking that there are no more than 1 duplicate
        continue

    first, second = temp.iloc[0], temp.iloc[1]
    redundant = True
    for column in columns: # Checking all columns are the same for both entries
        if first[column] != second[column]:
            # NaN != NaN, so two missing values must still count as equal.
            # pd.isnull accepts any scalar, whereas np.isnan raises TypeError
            # as soon as the differing values are strings.
            if not (pd.isnull(first[column]) and pd.isnull(second[column])):
                print('Two entries have the same id', doublon, 'but different entries at', column )
                display(temp)
                redundant = False
                break

    if redundant:
        redundance.append(doublon)
    else:
        print(doublon)

In [8]:
print(len(chicago))
print(len(doublons))
# Flag every occurrence of a repeated inspection id except the last one.
# The flags come from the current frame rather than the earlier `doublons`
# list, so re-running this cell removes nothing more (previously a second run
# dropped the surviving copy too). Ids with more than two rows are also
# reduced to a single row.
# NOTE(review): as before, rows are removed whether or not the pair was
# confirmed identical by the check above.
extra_copies = chicago['inspection_id'].duplicated(keep='last')
indices = list(chicago.index[extra_copies])
chicago.drop(indices, axis=0, inplace=True)
print(len(chicago))

195212
128
195084


In [9]:
# Checkpoint: persist the de-duplicated dataframe
with open('./chicago-food-inspections/chicago_data.pickle', 'wb') as pickle_file:
    pickle.dump(chicago, pickle_file)

## DBA name

In [10]:
# Number of entries whose DBA name is missing
chicago['dba_name'].isnull().sum()

0

In [11]:
# Lower-case the DBA names so that the same name written with different cases is counted once
dba_names = chicago['dba_name']
print('Number of unique restaurants in the dataset when the names are case sensitive:', dba_names.nunique(dropna=False))
chicago['dba_name'] = dba_names.str.lower()
print('Number of unique restaurants when the names are not case sensitive:', chicago['dba_name'].nunique(dropna=False))

Number of unique restaurants in the dataset when the names are case sensitive: 27519
Number of unique restaurants when the names are not case sensitive: 27233


In [12]:
# Checkpoint: persist the dataframe with lower-cased DBA names
with open('./chicago-food-inspections/chicago_data.pickle', 'wb') as pickle_file:
    pickle.dump(chicago, pickle_file)

## AKA Name
In this part, we cast all aka names into lower case, and replace the null entries by their DBA names

In [13]:
chicago['aka_name'].unique()

array(['TODD APPEL BAR AND COCKTAIL', 'Sayre Language Acad.',
       'Hitch Elementary School', ..., 'SAFAH FOOD & LIQUOR INC',
       'MAKIA FOOD', 'RAINBOW GROCERY'], dtype=object)

In [14]:
missing_aka = chicago['aka_name'].isnull().sum()
print('Number of entries without an AKA name:', missing_aka)

Number of entries without an AKA name: 2451


In [15]:
# Replacing null entries with their dba name.
# fillna aligns on the index, so each missing AKA name receives the DBA name
# of the same row. Unlike splitting the frame and appending the halves, this
# keeps the original row order, avoids assigning into a slice copy, and does
# not rely on DataFrame.append (removed in pandas 2.0).
chicago['aka_name'] = chicago['aka_name'].fillna(chicago['dba_name'])

In [16]:
# Compare the number of distinct AKA names before and after lower-casing, then keep the lower-cased version
aka_lower = chicago['aka_name'].str.lower()
print('Number of unique names with case sensitivity:', chicago['aka_name'].nunique(dropna=False))
print('Number of unique names without case sensitivity:', aka_lower.nunique(dropna=False))
chicago['aka_name'] = aka_lower

Number of unique names with case sensitivity: 27188
Number of unique names without case sensitivity: 26712


In [17]:
# Checkpoint: persist the dataframe with cleaned AKA names
with open('./chicago-food-inspections/chicago_data.pickle', 'wb') as pickle_file:
    pickle.dump(chicago, pickle_file)

## License Number
Some businesses don't have a license number. In this dataframe, we create a boolean column indicating whether the business has a license number.

In [18]:
missing_license = chicago['license'].isnull().sum()
print('There are ', missing_license, 'entries without a license number')

There are  17 entries without a license number


In [19]:
# True where a license number is present, False where it is missing
chicago['has_license'] = chicago['license'].notnull()

In [20]:
# Checkpoint: persist the dataframe including the has_license column
with open('./chicago-food-inspections/chicago_data.pickle', 'wb') as pickle_file:
    pickle.dump(chicago, pickle_file)

## Facility Type
Some types of facilities only contain one establishment, either because it is a very niche category, or because it was entered in a way that similar entries don't match because of spelling or specificity. In order to group those categories, we match the facility types that have 40 entries or fewer to the most similar main category according to the Jaro-Winkler distance. If none of the main categories is a match (score not greater than 0.70), we place them into misc (miscellaneous).

In [40]:
# Turn NaN values into a string to be able to operate on the column,
# then standardise the facility types to lower case categories.
# (The two bare expressions that used to precede this were never displayed
# and had no effect.)
chicago['facility_type'] = chicago['facility_type'].fillna('not available').str.lower()

In [41]:
len(chicago)

195084

In [42]:
n_facility_types = chicago['facility_type'].nunique(dropna=False)
print('Number of facility types before standardising:', n_facility_types)

Number of facility types before standardising: 441


In [43]:
(chicago['facility_type'].unique())

array(['grocery store', 'school', 'restaurant',
       "children's services facility", 'daycare above and under 2 years',
       'bakery', 'mobile food preparer', 'daycare (2 - 6 years)',
       'not available', 'ice cream', 'long term care',
       'pop-up establishment host-tier ii', 'liquor', 'live poultry',
       'tavern', 'paleteria', 'catering', 'mobile food dispenser',
       'childrens services facility', 'brewery', 'gym', 'charter school',
       'banquet', "childern's service facility", 'golden diner',
       'grocery store /pharmacy', 'daycare combo 1586', 'assisted living',
       'hospital', 'airport lounge', "1023 children's services facility",
       'private school', 'cooking school', 'daycare (under 2 years)',
       'mobile frozen desserts vendor', 'daycare night', 'sushi counter',
       'banquet hall', 'after school program', 'dollar tree',
       'gas station', 'grocery/restaurant', 'fitness center',
       'grocery & restaurant', 'event space', 'commisary restaur

In [44]:
# A facility type with at most this many rows is treated as a minority category.
MAX_SMALL_TYPE_COUNT = 40
# A minority type is merged into a main type only if its score is strictly
# above this value (the code treats a higher score as a closer match).
MIN_MATCH_SCORE = 0.70

facility_count = chicago['facility_type'].value_counts()

# Retrieving the main categories, and identifying the smallest ones
facility_types = [
    facility for facility, count in facility_count.items()
    if count > MAX_SMALL_TYPE_COUNT
]
small_types = {}

# Matching the small categories to the principal ones
for small, count in facility_count.items():
    if count > MAX_SMALL_TYPE_COUNT:
        continue

    # Start from the fallback and only replace it with a main category that
    # scores strictly higher than anything seen so far: this selects the first
    # best-scoring category above the threshold, and falls back to 'misc'
    # when nothing qualifies (including when there is no main category).
    best_match, best_score = 'misc', MIN_MATCH_SCORE
    for facility in facility_types:
        score = distance.get_jaro_distance(small, facility)
        if score > best_score:
            best_match, best_score = facility, score
    small_types[small] = best_match


In [45]:
print('Minority categories and their match in the main category pool:')
print(small_types)

Minority categories and their match in the main category pool:
{'store': 'misc', 'restaurant/bar': 'restaurant', 'church': 'misc', 'rooftop': 'misc', 'ice cream shop': 'coffee shop', "1023 childern's services facility": "children's services facility", 'church kitchen': 'shared kitchen', 'cooking school': 'charter school', 'commissary': 'misc', 'pop-up establishment host-tier ii': 'misc', 'culinary school': 'charter school', 'assisted living': 'misc', "1023-children's services facility": "children's services facility", 'bar': 'bakery', 'grocery & restaurant': 'grocery/restaurant', 'restaurant/grocery store': 'restaurant', 'roof tops': 'misc', 'restaurant/gas station': 'restaurant', 'mobile desserts vendor': 'mobile frozen desserts vendor', 'theater': 'shelter', 'nursing home': 'misc', 'roof top': 'misc', 'gas station/mini mart': 'gas station', 'grocery store/gas station': 'grocery store', 'paleteria': 'cafeteria', 'after school program': 'private school', 'wrigley roof top': 'misc', 'ho

In [46]:
# Replacing the minority categories by majority ones in the dataframe

chicago['new_facility_type'] = chicago['facility_type']
def get_new_facility(x):
    """Return the category `x` is mapped to in `small_types`, or `x` itself when it has no entry there."""
    return small_types.get(x, x)
    
chicago['new_facility_type'] = chicago['new_facility_type'].apply(get_new_facility)

In [48]:
n_new_types = chicago['new_facility_type'].nunique(dropna=False)
print('Number of facilities in the dataset after matching minority types to majority types:', n_new_types)

Number of facilities in the dataset after matching minority types to majority types: 43


In [50]:
# Checkpoint: persist the dataframe including the new_facility_type column
with open('./chicago-food-inspections/chicago_data.pickle', 'wb') as pickle_file:
    pickle.dump(chicago, pickle_file)

## Risk
We fill NaN values with 'Risk -1 (None)' and change the risk 'All' to 'Risk 4 (All)', so that the risk level can be extracted numerically later if needed.


In [51]:
# Reload the dataframe from the checkpoint file (only unpickle files you trust)
with open('./chicago-food-inspections/chicago_data.pickle', 'rb') as pickle_file:
    chicago = pickle.load(pickle_file)

In [52]:
# Missing risk levels become an explicit 'Risk -1 (None)' category.
chicago['risk'] = chicago['risk'].fillna('Risk -1 (None)')
# Series.replace matches whole values, so only entries equal to 'All' are
# rewritten. The previous substring replacement also matched the 'All' inside
# 'Risk 4 (All)', which corrupted the column when the cell was run twice.
chicago['risk'] = chicago['risk'].replace('All', 'Risk 4 (All)')

In [53]:
chicago['risk'].unique()

array(['Risk 2 (Medium)', 'Risk 1 (High)', 'Risk 3 (Low)',
       'Risk -1 (None)', 'Risk 4 (All)'], dtype=object)

In [54]:
# Checkpoint: persist the dataframe with the normalised risk column
with open('./chicago-food-inspections/chicago_data.pickle', 'wb') as pickle_file:
    pickle.dump(chicago, pickle_file)

## Address

In [55]:
# Reload the dataframe from the checkpoint file (only unpickle files you trust)
with open('./chicago-food-inspections/chicago_data.pickle', 'rb') as pickle_file:
    chicago = pickle.load(pickle_file)

In [56]:
chicago['city'].unique()

array(['CHICAGO', nan, 'chicago', 'Chicago', 'GRIFFITH', 'NEW YORK',
       'SCHAUMBURG', 'ELMHURST', 'ALGONQUIN', 'NEW HOLSTEIN', 'CCHICAGO',
       'NILES NILES', 'EVANSTON', 'CHICAGO.', 'CHESTNUT STREET',
       'LANSING', 'CHICAGOCHICAGO', 'WILMETTE', 'WHEATON',
       'CHICAGOHICAGO', 'ROSEMONT', 'CHicago', 'CALUMET CITY',
       'PLAINFIELD', 'HIGHLAND PARK', 'PALOS PARK', 'ELK GROVE VILLAGE',
       'CICERO', 'BRIDGEVIEW', 'OAK PARK', 'MAYWOOD', 'LAKE BLUFF',
       '312CHICAGO', 'SCHILLER PARK', 'SKOKIE', 'BEDFORD PARK',
       'BANNOCKBURNDEERFIELD', 'CHCICAGO', 'BLOOMINGDALE', 'Norridge',
       'CHARLES A HAYES', 'CHCHICAGO', 'CHICAGOI', 'SUMMIT',
       'OOLYMPIA FIELDS', 'WESTMONT', 'CHICAGO HEIGHTS', 'JUSTICE',
       'TINLEY PARK', 'LOMBARD', 'EAST HAZEL CREST', 'COUNTRY CLUB HILLS',
       'STREAMWOOD', 'BOLINGBROOK', 'INACTIVE', 'BERWYN', 'BURNHAM',
       'DES PLAINES', 'LAKE ZURICH', 'OLYMPIA FIELDS', 'OAK LAWN',
       'BLUE ISLAND', 'GLENCOE', 'FRANKFORT', 'NAPERVI

In [57]:
def split_address(address):
    """Split a street address into its number, direction and remaining part.

    Parameters
    ----------
    address : str
        Address such as '123 N MAIN ST'. Anything that is not a string
        (e.g. a NaN for a missing address) is treated as an empty address.

    Returns
    -------
    str
        The three parts joined as 'number*direction*rest'. Parts that are
        absent from the input are returned as empty strings, so this function
        always inserts exactly two '*' separators.
    """
    if not isinstance(address, str):
        return '**'
    # Splitting on runs of whitespace (at most twice) keeps repeated or
    # surrounding spaces from producing empty fields and shifting the parts.
    parts = address.strip().split(None, 2)
    # Pad short addresses instead of failing on a missing direction or street.
    parts += [''] * (3 - len(parts))
    nr, cardinal, reste = parts
    return nr + '*' + cardinal + '*' + reste

In [58]:
# Encode every address as 'number*direction*rest', then expand the pieces into one column each.
# NOTE(review): the resulting frame is not written back to `chicago` in the cells shown here.
temp = chicago['address'].apply(split_address).str.split('*', expand=True)


In [59]:
# Checkpoint: persist the dataframe after the address section
with open('./chicago-food-inspections/chicago_data.pickle', 'wb') as pickle_file:
    pickle.dump(chicago, pickle_file)

## City

In [60]:
# Distinct city values (a missing value counts as one) before and after lower-casing
print(chicago['city'].nunique(dropna=False))
chicago['city'] = chicago['city'].str.lower()
print(chicago['city'].nunique(dropna=False))

71
66


In [61]:
# Misspellings of 'chicago' present in the lower-cased city column.
chicago_typos = [
    'cchicago', 'chicago.', 'chicagochicago', 'chicagohicago',
    '312chicago', 'chicagoi', 'chchicago', 'chcicago',
]
# One whole-value replacement pass instead of eight chained ones.
chicago['city'] = chicago['city'].replace(chicago_typos, 'chicago')
# Entries without a city are set to 'chicago'.
chicago['city'] = chicago['city'].fillna('chicago')

In [62]:
np.sort(chicago['city'].unique())

array(['algonquin', 'alsip', 'bannockburndeerfield', 'bedford park',
       'berwyn', 'bloomingdale', 'blue island', 'bolingbrook',
       'bridgeview', 'broadview', 'burnham', 'calumet city',
       'charles a hayes', 'chestnut street', 'chicago', 'chicago heights',
       'cicero', 'country club hills', 'des plaines', 'east hazel crest',
       'elk grove village', 'elmhurst', 'evanston', 'evergreen park',
       'frankfort', 'glencoe', 'griffith', 'highland park', 'inactive',
       'justice', 'lake bluff', 'lake zurich', 'lansing', 'lombard',
       'maywood', 'naperville', 'new holstein', 'new york', 'niles niles',
       'norridge', 'oak lawn', 'oak park', 'olympia fields',
       'oolympia fields', 'palos park', 'plainfield', 'rosemont',
       'schaumburg', 'schiller park', 'skokie', 'streamwood', 'summit',
       'tinley park', 'westmont', 'wheaton', 'wilmette', 'worth'],
      dtype=object)

In [63]:
# Lower-cased values of the first column of the reference list, as a NumPy array
cities = (
    pd.read_csv('./chicago-food-inspections/listsChicago.csv', sep=';', header=None)[0]
    .str.lower()
    .values
)

In [64]:
# Split the distinct city values into those found in the reference list
# (`ins`) and those that are not (`outs`), in order of first appearance.
known_cities = set(cities)
outs = []
ins = []
for city in chicago['city'].unique():
    if city in known_cities:
        ins.append(city)
    else:
        outs.append(city)

# Keep only the rows of recognised cities with a single vectorised filter,
# rather than re-filtering the whole frame once per unknown city.
# NOTE(review): rows with a missing city, if any remained, would be dropped here.
chicago = chicago[chicago['city'].isin(ins)]

In [65]:
print('Here are the real cities from Chicago!')
print(ins)

Here are the real cities from Chicago!
['chicago', 'cicero', 'bridgeview', 'oak park', 'maywood', 'bedford park', 'berwyn', 'oak lawn', 'broadview', 'evergreen park']


## State
 We make sure that all our remaining entries are from Chicago in Illinois! 

In [66]:
# Show the distinct state values, then set the missing ones to 'IL'
print(chicago['state'].unique())
chicago['state'] = chicago['state'].fillna('IL')
    

['IL' nan]


In [67]:
print(chicago['state'].unique())

['IL']
