## Food Inspection Feature Development

### Potential Features  

1.  name  
1.  license number  
1.  result (pass/fail)
1.  business age (default start from 2010 inspection date)  
1.  *number of chains / is_chain boolean 
1.  risk  
1.  ward / neighborhood
1.  license code  
1.  renew  
1.  conditional approved  
1.  business activity
1.  *number of (pass/fail) inspections during each of the 1st, 2nd, 3rd, and 4th most recent license periods
    * license-period data can have errors, so it may be simpler to count years from year_min instead
1.  *Geoapify: number of Starbucks within a 0.5 mile radius  
1.  *Geoapify: number of related businesses within a 0.5 mile radius  
1.  *US Census tract income information  

In [None]:
# # Code formatter
# # !pip3 install nb_black
# %load_ext nb_black

In [1]:
# Import required libraries

# eda tools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt  
import re

# hide jupyter lab warnings
# NOTE(review): with no category/module arguments this filter silences every
# warning for the rest of the session, including pandas deprecation warnings.
import warnings
warnings.filterwarnings('ignore')

# expand the number of dataframe columns visible
pd.options.display.max_columns = 100

# make sound when this code executes: Audio(sound_file, autoplay=True)
from IPython.display import Audio
# path to the sound clip, relative to the notebook's working directory
sound_file = './sound/chord.wav'


In [2]:
# display package information (versions of the imported libraries)
# !conda install -c conda-forge session-info
import session_info
session_info.show()

In [None]:
def date_select(row):
    """Pick the best available license date, in a fixed order of preference.

    The candidate fields are tried in this order and the first non-missing
    value wins: ``license_start_date``, ``license_issued_date``,
    ``app_complete``, ``date_issued``, ``license_end_date``.

    Parameters
    ----------
    row : pandas.DataFrame or row-like object
        Either a whole DataFrame holding the five date columns (this is what
        ``DataFrame.assign(col=lambda x: date_select(x))`` passes in), or a
        single row (a Series from ``apply(axis=1)``, a namedtuple from
        ``itertuples``) exposing the five fields as attributes.

    Returns
    -------
    pandas.Series or scalar
        For a DataFrame, a Series aligned to its index with the first
        non-missing date per row (NaT where every candidate is missing).
        For a single row, the first non-missing value, or
        ``np.datetime64('nat')`` when all five are missing.
    """
    candidates = (
        'license_start_date',
        'license_issued_date',
        'app_complete',
        'date_issued',
        'license_end_date',
    )

    # Whole-frame call: str(<Series>) is never 'NaT', so a string comparison
    # would always select the first column. Coalesce column by column instead.
    if isinstance(row, pd.DataFrame):
        selected = row[candidates[0]]
        for column in candidates[1:]:
            selected = selected.fillna(row[column])
        return selected

    # Single-row call: return the first candidate that is not missing.
    for column in candidates:
        value = getattr(row, column)
        if not pd.isna(value):
            return value
    return np.datetime64('nat')

## Clean Food Safety Inspection Data

In [None]:
# Read data
inspections_df = pd.read_csv(
    'data/original/food_inspections.csv',
    parse_dates=['Inspection Date'],
)
print(f'Original number of records: {len(inspections_df)}')

# Inspections without a license number cannot be joined to the license table.
inspections_df = inspections_df.dropna(subset=['License #'])
# astype() returns a new Series, so the result must be assigned back for the
# conversion to take effect (64-bit keeps large license numbers from overflowing).
inspections_df['License #'] = inspections_df['License #'].astype('int64')

# Keep restaurants with a real license number whose inspection actually took place.
not_inspected_results = ['Out of Business', 'No Entry', 'Not Ready', 'Business Not Located']
is_restaurant = inspections_df['Facility Type'] == 'Restaurant'
has_license = inspections_df['License #'] != 0
was_inspected = ~inspections_df['Results'].isin(not_inspected_results)
restaurant_df = inspections_df[is_restaurant & has_license & was_inspected].copy()
restaurant_df = restaurant_df.drop(columns=['DBA Name', 'Location'])

# Positional rename: assumes exactly these 15 columns remain, in this order.
restaurant_df.columns = ['inspect_id', 'aka_name', 'license_num', 'facility_type', 'risk',
                         'address', 'city', 'state', 'zipcode', 'inspect_date', 'inspect_type',
                         'results', 'violations', 'lat', 'lon']

# strip white space from object types
text_columns = restaurant_df.select_dtypes(['object']).columns
restaurant_df[text_columns] = restaurant_df[text_columns].apply(lambda col: col.str.strip())

# Missing violation text is replaced by a sentinel entry instead of dropping the row.
# Assign the result back: fillna(inplace=True) on a column selection is not
# guaranteed to modify the parent frame.
# NOTE(review): the sentinel is counted as one entry below, so violation_count and
# vl_citation_count are 1 (and violation_number is ['99']) for these rows - confirm
# this is what downstream features expect.
restaurant_df['violations'] = restaurant_df['violations'].fillna('-99.  No violations reported')
restaurant_df['violations_list'] = restaurant_df['violations'].apply(lambda text: text.split('|'))
restaurant_df['violation_count'] = restaurant_df['violations_list'].apply(lambda items: len(items))

# First standalone integer of each violation entry (raises IndexError if an entry has none).
restaurant_df['violation_number'] = restaurant_df['violations_list'].apply(
    lambda items: [re.findall(r'\b\d+\b', item)[0] for item in items]
)

# Per-violation flags for the inspector's instruction phrases, plus their totals.
restaurant_df['vl_must_comply_list'] = restaurant_df['violations_list'].apply(
    lambda items: ['MUST COMPLY' in item for item in items]
)
restaurant_df['vl_must_comply_count'] = restaurant_df['vl_must_comply_list'].apply(lambda flags: sum(flags))

restaurant_df['vl_instructed_comply_list'] = restaurant_df['violations_list'].apply(
    lambda items: ['INSTRUCTED TO COMPLY' in item for item in items]
)
restaurant_df['vl_instructed_comply_count'] = restaurant_df['vl_instructed_comply_list'].apply(lambda flags: sum(flags))

# Citation count = entries that do NOT carry the 'NO CITATION ISSUED' phrase.
restaurant_df['vl_not_cited_list'] = restaurant_df['violations_list'].apply(
    lambda items: ['NO CITATION ISSUED' in item for item in items]
)
restaurant_df['vl_citation_count'] = restaurant_df['vl_not_cited_list'].apply(lambda flags: len(flags) - sum(flags))

# Calendar parts of the inspection date (year is later used as a merge key).
restaurant_df['year'] = restaurant_df['inspect_date'].dt.year.astype('int')
restaurant_df['month'] = restaurant_df['inspect_date'].dt.month.astype('int')

# Collapse repeated records of the same inspection outcome, keeping the last one.
restaurant_df = restaurant_df.drop_duplicates(
    subset=['aka_name', 'license_num', 'facility_type', 'risk', 'address', 'city', 'state',
            'zipcode', 'inspect_date', 'results', 'lat', 'lon'],
    keep='last',
)

print(f'Number of records after cleaning:  {len(restaurant_df)}')

restaurant_df.head(2)

In [None]:
# Sanity check: column dtypes of the cleaned inspection table
restaurant_df.dtypes

In [None]:
# Sanity check: (rows, columns) of the cleaned inspection table
restaurant_df.shape

## Clean Business License Data

In [None]:
# Read data
license_date_columns = [
    'PAYMENT DATE',
    'APPLICATION REQUIREMENTS COMPLETE',
    'APPLICATION CREATED DATE',
    'LICENSE TERM START DATE',
    'LICENSE TERM EXPIRATION DATE',
    'LICENSE APPROVED FOR ISSUANCE',
    'DATE ISSUED',
]
business_info = pd.read_csv('data/original/all_licensed_businesses.csv', parse_dates=license_date_columns)
# Note:  APPLICATION CREATED DATE has NaT values as warned below

print(f'Original number of records: {len(business_info)}')

business_info = business_info.drop(columns=['ID', 'LICENSE STATUS', 'ACCOUNT NUMBER'])
# Non-numeric zip codes become NaN and are dropped together with missing ones.
business_info['ZIP CODE'] = pd.to_numeric(business_info['ZIP CODE'], errors='coerce')
business_info = business_info.dropna(subset=['ZIP CODE'])
business_info = business_info[business_info['CITY'] == 'CHICAGO']
business_info = business_info[~business_info['APPLICATION TYPE'].isin(['C_LOC', 'C_SBA', 'C_EXPA', 'C_CAPA'])]

# keep only pertinent columns
business_info = business_info[['DOING BUSINESS AS NAME', 'LICENSE ID', 'ADDRESS', 'WARD', 'PRECINCT',
                               'POLICE DISTRICT', 'LICENSE CODE', 'LICENSE DESCRIPTION', 'LICENSE NUMBER',
                               'BUSINESS ACTIVITY ID', 'BUSINESS ACTIVITY', 'APPLICATION TYPE',
                               'APPLICATION REQUIREMENTS COMPLETE', 'CONDITIONAL APPROVAL',
                               'LICENSE TERM START DATE', 'LICENSE TERM EXPIRATION DATE',
                               'LICENSE APPROVED FOR ISSUANCE', 'DATE ISSUED']].copy()

# Positional rename; the order must match the selection above.
business_info.columns = ['aka_name', 'license_id', 'address', 'ward', 'precint',
                         'police_district', 'license_code',
                         'license_description', 'license_num', 'bus_activity_id',
                         'bus_activity', 'application_type',
                         'app_complete', 'conditional_approval',
                         'license_start_date', 'license_end_date',
                         'license_issued_date', 'date_issued']

# fixes most missing start date issues (assign passes the whole frame to date_select)
business_info = business_info.assign(approx_start_date=date_select)

# fixes about 2500 start date issues - date_issued is not as good of a metric - often off by 1 year
# .copy() so the fallback is written to an independent frame, not a view of business_info
missing_start = business_info[business_info['approx_start_date'].isna()].copy()
missing_start['approx_start_date'] = missing_start['date_issued']
# Swap the patched rows in; they move to the end, which matters for keep='last' below.
business_info = pd.concat([business_info.drop(index=missing_start.index), missing_start], axis=0)

# Year kept as text here (NaN where the date is missing); converted to int further down.
business_info['year'] = business_info['approx_start_date'].dt.strftime('%Y')

# strip white space from object types
text_columns = business_info.select_dtypes(['object']).columns
business_info[text_columns] = business_info[text_columns].apply(lambda col: col.str.strip())

# create unique license num, year combo
dedupe_keys = ['aka_name', 'address', 'ward', 'precint',
               'police_district', 'license_code', 'license_description', 'license_num',
               'bus_activity_id', 'bus_activity', 'conditional_approval', 'year']
business_info = business_info.drop_duplicates(subset=dedupe_keys, keep='last')

# Rows that still share a (license_num, year) pair, ordered by business name.
shares_license_year = business_info.duplicated(subset=['license_num', 'year'], keep=False)
repeated = business_info[shares_license_year].sort_values(by='aka_name', ascending=True)

# Give every repeated row of a license the activity of the last row (in the order
# above) that has a text bus_activity; licenses without any such row end up NaN.
has_activity = repeated['bus_activity'].map(lambda value: isinstance(value, str)).astype(bool)
activity_by_license = (
    repeated[has_activity]
    .drop_duplicates(subset='license_num', keep='last')
    .set_index('license_num')
)
repeated['bus_activity_id'] = repeated['license_num'].map(activity_by_license['bus_activity_id'])
repeated['bus_activity'] = repeated['license_num'].map(activity_by_license['bus_activity'])

# Swap the harmonised rows in, then dedupe again now that their activity fields agree.
business_info = pd.concat([business_info.drop(index=repeated.index), repeated], axis=0)
business_info = business_info.drop_duplicates(subset=dedupe_keys, keep='last')

# remove all remaining missing date data - removes < 100 records
business_info = business_info.dropna(subset=['approx_start_date'])

# remove anything without an aka_name
business_info = business_info.dropna(subset=['aka_name'])

business_info['year'] = business_info['year'].astype('int')

print(f'Number of records after cleaning:  {len(business_info)}')

business_info = business_info[['aka_name', 'license_id', 'address', 'ward', 'precint',
                               'police_district', 'license_code', 'license_description', 'license_num',
                               'bus_activity_id', 'bus_activity', 'application_type',
                               'conditional_approval', 'approx_start_date', 'year']].copy()

# 48% of the business_activity and business_activity_id are NaN
# Assign back rather than fillna(inplace=True) on a column selection.
business_info['bus_activity_id'] = business_info['bus_activity_id'].fillna('Unknown')
business_info['bus_activity'] = business_info['bus_activity'].fillna('Unknown')

# Calculate business age: first and last license year per name/address pair
year_span = (
    business_info.groupby(['aka_name', 'address'])['year']
    .agg(year_min='min', year_max='max')
    .reset_index()
)

# Combine summarized results with the original dataframe
business_info = pd.merge(business_info, year_span, on=['aka_name', 'address'], how='left')
# Age = years between this record's start date and the pair's first license year.
business_info['bus_age'] = business_info['approx_start_date'].dt.year - business_info['year_min']

business_info.head(2)

In [None]:
# Sanity check: column dtypes of the cleaned license table
business_info.dtypes

In [None]:
# Sanity check: (rows, columns) of the cleaned license table
business_info.shape

## Merge Both Datasets Together

In [None]:
# display() is needed to render an object that is not the last expression of a cell
from IPython.display import display

# check that there are no duplicates in the right merge table based on the keys - this will cause duplication
business_info_nodupes = business_info.drop_duplicates(subset=['license_num', 'year'], keep='first')
# business_info_nodupes.groupby(['license_num','year']).count()['aka_name'].sort_values(ascending=False)
# # shows that there are only single records for each license number and year combination.

# Merge shows it still has 150k rows but some are NaN values
restaurant_df = pd.merge(restaurant_df, business_info_nodupes, on=['license_num', 'year'], how='left')
# restaurant_df.shape

# Strategy to combine more records is to reduce the specificity of the merge
# First isolate the NaN values (> 8 NaN values in a row)
# Merge to this limited dataframe on license number alone, then drop those indexes from the original
# dataframe and add the updated data to the original
# Important note:  make sure that the right table of the merge has no duplicates for the merging keys -
# this prevents duplication of records in the original dataframe
# NOTE(review): "more than 8 NaN in a row" is a heuristic for "no license match" - confirm the
# threshold still holds if columns are added or removed.

# the first merge was successful for 16,000 rows but 79,000 rows were still mostly NaN
inspection_columns = ['inspect_id', 'aka_name_x', 'license_num', 'facility_type', 'risk',
                      'address_x', 'city', 'state', 'zipcode', 'inspect_date', 'inspect_type',
                      'results', 'violations', 'lat', 'lon', 'violations_list',
                      'violation_count', 'violation_number', 'vl_must_comply_list',
                      'vl_must_comply_count', 'vl_instructed_comply_list', 'vl_instructed_comply_count',
                      'vl_not_cited_list', 'vl_citation_count', 'year', 'month']
mostly_nan = restaurant_df.isna().sum(axis=1) > 8
# reset_index keeps the original row labels in an 'index' column so they survive the merge
unmatched = restaurant_df.loc[mostly_nan, inspection_columns].reset_index()

# # this verifies that the right table does not have duplicates
business_info_nodupes2 = business_info_nodupes.drop_duplicates(subset=['license_num'], keep='first')
# business_info_nodupes2.groupby(['license_num']).count()['aka_name'].sort_values(ascending=False)

# the second merge was successful for 77,000 rows but 1,500 rows were still mostly NaN
rematched = pd.merge(unmatched, business_info_nodupes2, on=['license_num'], how='left').set_index('index')

# Keep the inspection year and align the license-side names with the first merge's suffixes.
rematched = rematched.drop(columns='year_y').rename(
    columns={'year_x': 'year', 'aka_name': 'aka_name_y', 'address': 'address_y'}
)

# remove the re-merged records from restaurant_df and add the updated records
restaurant_df = pd.concat([restaurant_df.drop(index=rematched.index), rematched], axis=0)

# remove the final mismatches since there should be a match on license number
still_unmatched = rematched[rematched.isna().sum(axis=1) > 8]
restaurant_df = restaurant_df.drop(index=still_unmatched.index)

# drop non-name records ~700 records
restaurant_df = restaurant_df.dropna(subset=['aka_name_x'])

# create a new license number for every name/address combination
# ngroup numbers the groups 0..n-1 in sorted key order, i.e. the same ids as enumerating the
# sorted group index, without aggregating every column or looping over rows.
# NOTE(review): assumes address_x is never missing - confirm.
restaurant_df['license_alias'] = restaurant_df.groupby(['aka_name_x', 'address_x']).ngroup()

# add number of chains that exist: distinct license aliases (locations) sharing a name
restaurant_df['number_of_chains'] = (
    restaurant_df.groupby('aka_name_x')['license_alias'].transform('nunique')
)

# signal that the cell has finished; a bare Audio(...) expression in the middle of a cell is
# never rendered, so it has to be displayed explicitly to play
display(Audio(sound_file, autoplay=True))

restaurant_df.head()

In [None]:
# Sanity check: (rows, columns) of the merged inspection/license table
restaurant_df.shape

In [None]:
# Sanity check: column dtypes of the merged inspection/license table
restaurant_df.dtypes

In [None]:
# Note:  approx_start_date should probably not be used as a feature; the other license-derived categorical columns could also be wrong

In [None]:
# # export to csv
# restaurant_df.to_csv('./data/manipulated/combined_data.csv')