In [40]:
import numpy as np
import pandas as pd
from numpy import nan
import math

In [2]:
df = pd.read_csv('data/Restaurant_Scores_-_LIVES_Standard.csv')

In [3]:
df.head()

Unnamed: 0,business_id,business_name,business_address,business_city,business_state,business_postal_code,business_latitude,business_longitude,business_location,business_phone_number,inspection_id,inspection_date,inspection_score,inspection_type,violation_id,violation_description,risk_category
0,1757,Dar Bar Pakistani/Indian Cusine,1412 Polk St,San Francisco,CA,94109,37.789784,-122.420455,"(37.789784, -122.420455)",,1757_20170928,9/28/2017 0:00,86.0,Routine - Unscheduled,1757_20170928_103131,Moderate risk vermin infestation,Moderate Risk
1,4864,DRAGON CITY BAKERY & CAFE,2367 MISSION St,San Francisco,CA,94110,37.759174,-122.419066,"(37.759174, -122.419066)",14155830000.0,4864_20161206,12/6/2016 0:00,84.0,Routine - Unscheduled,4864_20161206_103157,Food safety certificate or food handler card n...,Low Risk
2,79782,Deli 23,2449 23rd St,San Francisco,CA,94110,,,,,79782_20160503,5/3/2016 0:00,92.0,Routine - Unscheduled,79782_20160503_103120,Moderate risk food holding temperature,Moderate Risk
3,73840,L'acajou Bakery and Cafe,498 09th St Ste. C,San Francisco,CA,94103,,,,,73840_20171207,12/7/2017 0:00,71.0,Routine - Unscheduled,73840_20171207_103105,Improper cooling methods,High Risk
4,76437,Sweetheart Cafe,909 Grant Ave,San Francisco,CA,94108,,,,,76437_20160329,3/29/2016 0:00,76.0,Routine - Unscheduled,76437_20160329_103113,Sewage or wastewater contamination,High Risk


## Basic information about the data

In [26]:
print('shape of data: {}'.format(df.shape))

shape of data: (51297, 17)


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51297 entries, 0 to 51296
Data columns (total 17 columns):
business_id              51297 non-null int64
business_name            51297 non-null object
business_address         51297 non-null object
business_city            51297 non-null object
business_state           51297 non-null object
business_postal_code     50010 non-null object
business_latitude        30296 non-null float64
business_longitude       30296 non-null float64
business_location        30296 non-null object
business_phone_number    15917 non-null float64
inspection_id            51297 non-null object
inspection_date          51297 non-null object
inspection_score         38135 non-null float64
inspection_type          51297 non-null object
violation_id             38728 non-null object
violation_description    38728 non-null object
risk_category            38728 non-null object
dtypes: float64(4), int64(1), object(12)
memory usage: 6.7+ MB


In [25]:
colnames = df.columns
for colname in colnames:
    count = df[colname].count()
    print('{}: {}'.format(colname, count))

business_id: 51297
business_name: 51297
business_address: 51297
business_city: 51297
business_state: 51297
business_postal_code: 50010
business_latitude: 30296
business_longitude: 30296
business_location: 30296
business_phone_number: 15917
inspection_id: 51297
inspection_date: 51297
inspection_score: 38135
inspection_type: 51297
violation_id: 38728
violation_description: 38728
risk_category: 38728


### Summary of the data
1. The total number of rows is 51297. 
2. Violations id's, violation descriptions, and risk categories are 38728. 
3. 1287 postal codes are missing: 50010
4. Lots of latitudes, longitues, and location are missing: 30296
5. Lots of inspection scores are missing: 38135
6. Violation id's are missing for reinspection/followup, new construction, new ownership,

### Look for inspection types for missing violation id's

In [51]:
df2 = df[['violation_id', 'inspection_type']]

In [83]:
mask = df2['violation_id'].isnull()

In [89]:
df3 = df2[mask]
inspect_types = set()
for atype in df3['inspection_type']:
    inspect_types.add(atype)

In [92]:
print("Inpection types for missing violation id's: \n{}".format(inspect_types))

Inpection types for missing violation id's: 
{'Routine - Unscheduled', 'New Ownership - Followup', 'New Construction', 'Structural Inspection', 'Community Health Assessment', 'Administrative or Document Review', 'Complaint', 'Non-inspection site visit', 'New Ownership', 'Routine - Scheduled', 'Home Environmental Assessment', 'Special Event', 'Complaint Reinspection/Followup', 'Foodborne Illness Investigation', 'Reinspection/Followup'}


## Remove the missing violation id rows. 

In [107]:
# They are not the ones issued with violations.

In [139]:
mask_viol = df['violation_id'].isnull()

In [140]:
mask_viol.shape

(51297,)

In [141]:
df_viol = df[~mask_viol]

In [142]:
df_viol.shape

(38728, 17)

In [151]:
df_viol.head()

Unnamed: 0,business_id,business_name,business_address,business_city,business_state,business_postal_code,business_latitude,business_longitude,business_location,business_phone_number,inspection_id,inspection_date,inspection_score,inspection_type,violation_id,violation_description,risk_category
0,1757,Dar Bar Pakistani/Indian Cusine,1412 Polk St,San Francisco,CA,94109,37.789784,-122.420455,"(37.789784, -122.420455)",,1757_20170928,9/28/2017 0:00,86.0,Routine - Unscheduled,1757_20170928_103131,Moderate risk vermin infestation,Moderate Risk
1,4864,DRAGON CITY BAKERY & CAFE,2367 MISSION St,San Francisco,CA,94110,37.759174,-122.419066,"(37.759174, -122.419066)",14155830000.0,4864_20161206,12/6/2016 0:00,84.0,Routine - Unscheduled,4864_20161206_103157,Food safety certificate or food handler card n...,Low Risk
2,79782,Deli 23,2449 23rd St,San Francisco,CA,94110,,,,,79782_20160503,5/3/2016 0:00,92.0,Routine - Unscheduled,79782_20160503_103120,Moderate risk food holding temperature,Moderate Risk
3,73840,L'acajou Bakery and Cafe,498 09th St Ste. C,San Francisco,CA,94103,,,,,73840_20171207,12/7/2017 0:00,71.0,Routine - Unscheduled,73840_20171207_103105,Improper cooling methods,High Risk
4,76437,Sweetheart Cafe,909 Grant Ave,San Francisco,CA,94108,,,,,76437_20160329,3/29/2016 0:00,76.0,Routine - Unscheduled,76437_20160329_103113,Sewage or wastewater contamination,High Risk


## Clean up the violation id's. 

In [106]:
# 1757_20170928_103131: The first two series of numbers are the inspection id.

In [143]:
df_clean = df_viol[:]

In [144]:
a = df_clean['violation_id']

In [145]:
df_clean.head()

Unnamed: 0,business_id,business_name,business_address,business_city,business_state,business_postal_code,business_latitude,business_longitude,business_location,business_phone_number,inspection_id,inspection_date,inspection_score,inspection_type,violation_id,violation_description,risk_category
0,1757,Dar Bar Pakistani/Indian Cusine,1412 Polk St,San Francisco,CA,94109,37.789784,-122.420455,"(37.789784, -122.420455)",,1757_20170928,9/28/2017 0:00,86.0,Routine - Unscheduled,1757_20170928_103131,Moderate risk vermin infestation,Moderate Risk
1,4864,DRAGON CITY BAKERY & CAFE,2367 MISSION St,San Francisco,CA,94110,37.759174,-122.419066,"(37.759174, -122.419066)",14155830000.0,4864_20161206,12/6/2016 0:00,84.0,Routine - Unscheduled,4864_20161206_103157,Food safety certificate or food handler card n...,Low Risk
2,79782,Deli 23,2449 23rd St,San Francisco,CA,94110,,,,,79782_20160503,5/3/2016 0:00,92.0,Routine - Unscheduled,79782_20160503_103120,Moderate risk food holding temperature,Moderate Risk
3,73840,L'acajou Bakery and Cafe,498 09th St Ste. C,San Francisco,CA,94103,,,,,73840_20171207,12/7/2017 0:00,71.0,Routine - Unscheduled,73840_20171207_103105,Improper cooling methods,High Risk
4,76437,Sweetheart Cafe,909 Grant Ave,San Francisco,CA,94108,,,,,76437_20160329,3/29/2016 0:00,76.0,Routine - Unscheduled,76437_20160329_103113,Sewage or wastewater contamination,High Risk


In [146]:
a[0].split('_')[2]

'103131'

In [147]:
L_vid = []
for id in a:
    vid = id.split('_')[2]
    L_vid.append(vid)
ps_vid = pd.Series(L_vid)

In [152]:
df_clean.loc[:,'short_violation_id'] = ps_vid

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [153]:
df_clean.head()

Unnamed: 0,business_id,business_name,business_address,business_city,business_state,business_postal_code,business_latitude,business_longitude,business_location,business_phone_number,inspection_id,inspection_date,inspection_score,inspection_type,violation_id,violation_description,risk_category,short_violation_id
0,1757,Dar Bar Pakistani/Indian Cusine,1412 Polk St,San Francisco,CA,94109,37.789784,-122.420455,"(37.789784, -122.420455)",,1757_20170928,9/28/2017 0:00,86.0,Routine - Unscheduled,1757_20170928_103131,Moderate risk vermin infestation,Moderate Risk,103131
1,4864,DRAGON CITY BAKERY & CAFE,2367 MISSION St,San Francisco,CA,94110,37.759174,-122.419066,"(37.759174, -122.419066)",14155830000.0,4864_20161206,12/6/2016 0:00,84.0,Routine - Unscheduled,4864_20161206_103157,Food safety certificate or food handler card n...,Low Risk,103157
2,79782,Deli 23,2449 23rd St,San Francisco,CA,94110,,,,,79782_20160503,5/3/2016 0:00,92.0,Routine - Unscheduled,79782_20160503_103120,Moderate risk food holding temperature,Moderate Risk,103120
3,73840,L'acajou Bakery and Cafe,498 09th St Ste. C,San Francisco,CA,94103,,,,,73840_20171207,12/7/2017 0:00,71.0,Routine - Unscheduled,73840_20171207_103105,Improper cooling methods,High Risk,103105
4,76437,Sweetheart Cafe,909 Grant Ave,San Francisco,CA,94108,,,,,76437_20160329,3/29/2016 0:00,76.0,Routine - Unscheduled,76437_20160329_103113,Sewage or wastewater contamination,High Risk,103113


## Creating features of violations for 6 different time periods: 
i) 8/2017 - 10/2017: first period  
ii) 5/2017 - 7/2017: second period  
iii) 2/2017 - 4/2017: third period  
iv) 11/2016 - 1/2017: fourth period  
v) 5/2016 - 10/2016: fifth period  
vi) 3/2015 - 4/2016: sixth period  