# Data preparation and Exploration

Prepare data from the [restaurant inspections](https://data.cityofnewyork.us/Health/DOHMH-New-York-City-Restaurant-Inspection-Results/xx67-kt59) data set. Note: the data was exported as CSV.
This notebook runs after an initial preparation step [StartingVisualizations].

Compute some descriptive statistics for the data set.

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Basic Description for data
Some descriptives, complete table

Uses data transformed in DataPreparation.ipynb

Here, I only consider top 4 years.

And two types of cycle inspections: initial inspections and re-inspections.

In [2]:
# Read the cycle-inspection table from CycleInspections_out.csv
# (comma-separated, parsed with the python engine) and show how many
# rows were loaded.
CycleInspections_out = pd.read_csv(
    'CycleInspections_out.csv',
    sep=',',
    engine='python',
)
CycleInspections_out.shape[0]

326871

## From Inspections table to violation types
From inspections with cycle inspections.

Add violation groups, taken from the two digits in the violation code.

In [3]:
# Derive the violation group: the first two consecutive digits found in the
# violation code (e.g. '02G' -> '02'); codes with no such digits yield NaN.
# The pattern is a raw string so that '\d' reaches the regex engine as-is
# instead of being read as an (invalid) string escape sequence.
# expand=False returns a Series, which is what a single new column needs.
CycleInspections_out['violation groups'] = (
    CycleInspections_out['VIOLATION CODE'].str.extract(r'(\d\d)', expand=False)
)
CycleInspections_out.head()

Unnamed: 0.1,Unnamed: 0,CAMIS,date,BORO,DBA,INSPECTION TYPE,SCORE,GRADE DATE,RECORD DATE,year,month,day,VIOLATION CODE,VIOLATION DESCRIPTION,violation groups
0,1,41312955,2014-04-01,QUEENS,RIPE JUICE BAR & GRILL,Cycle Inspection / Re-inspection,12.0,04/01/2014,07/15/2017,2014,4,1,02G,Cold food item held above 41Ã‚Âº F (smoked fis...,2
1,2,41601691,2015-10-26,BROOKLYN,WAZA SUSHI,Cycle Inspection / Initial Inspection,25.0,,07/15/2017,2015,10,26,10H,Proper sanitization not provided for utensil w...,10
2,3,50043431,2017-05-19,MANHATTAN,SEATTLE CAFE,Cycle Inspection / Initial Inspection,35.0,,07/15/2017,2017,5,19,10F,Non-food contact surface improperly constructe...,10
3,4,50001580,2015-12-01,STATEN ISLAND,CIRO PIZZA CAFE,Cycle Inspection / Re-inspection,8.0,12/01/2015,07/15/2017,2015,12,1,02G,Cold food item held above 41Ã‚Âº F (smoked fis...,2
4,6,41722020,2017-04-12,BRONX,2 BROS PIZZA,Cycle Inspection / Initial Inspection,22.0,,07/15/2017,2017,4,12,04A,Food Protection Certificate not held by superv...,4


In [4]:
# Keep only the identifying columns, the inspection type and the violation group.
violation_columns = ['CAMIS', 'date', 'BORO', 'year', 'INSPECTION TYPE', 'violation groups']
cycle_violations = CycleInspections_out[violation_columns]
cycle_violations.head()

Unnamed: 0,CAMIS,date,BORO,year,INSPECTION TYPE,violation groups
0,41312955,2014-04-01,QUEENS,2014,Cycle Inspection / Re-inspection,2
1,41601691,2015-10-26,BROOKLYN,2015,Cycle Inspection / Initial Inspection,10
2,50043431,2017-05-19,MANHATTAN,2017,Cycle Inspection / Initial Inspection,10
3,50001580,2015-12-01,STATEN ISLAND,2015,Cycle Inspection / Re-inspection,2
4,41722020,2017-04-12,BRONX,2017,Cycle Inspection / Initial Inspection,4


In [5]:
# Index every violation row by restaurant (CAMIS), inspection date, borough
# and year. The result is bound back to the name rather than mutated with
# inplace=True, so the cell produces a new frame from its input.
cycle_violations = cycle_violations.set_index(['CAMIS', 'date', 'BORO', 'year'])

In [6]:
cycle_violations.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,INSPECTION TYPE,violation groups
CAMIS,date,BORO,year,Unnamed: 4_level_1,Unnamed: 5_level_1
41312955,2014-04-01,QUEENS,2014,Cycle Inspection / Re-inspection,2
41601691,2015-10-26,BROOKLYN,2015,Cycle Inspection / Initial Inspection,10
50043431,2017-05-19,MANHATTAN,2017,Cycle Inspection / Initial Inspection,10
50001580,2015-12-01,STATEN ISLAND,2015,Cycle Inspection / Re-inspection,2
41722020,2017-04-12,BRONX,2017,Cycle Inspection / Initial Inspection,4


In [7]:
# Select the rows whose inspection type is a cycle re-inspection.
is_re_inspection = cycle_violations['INSPECTION TYPE'] == 'Cycle Inspection / Re-inspection'
cycle_re_inspections = cycle_violations.loc[is_re_inspection]
cycle_re_inspections.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,INSPECTION TYPE,violation groups
CAMIS,date,BORO,year,Unnamed: 4_level_1,Unnamed: 5_level_1
41312955,2014-04-01,QUEENS,2014,Cycle Inspection / Re-inspection,2
50001580,2015-12-01,STATEN ISLAND,2015,Cycle Inspection / Re-inspection,2
41713457,2016-06-24,MANHATTAN,2016,Cycle Inspection / Re-inspection,2
41382811,2017-03-01,BRONX,2017,Cycle Inspection / Re-inspection,4
40937528,2015-11-16,STATEN ISLAND,2015,Cycle Inspection / Re-inspection,8


In [8]:
# Select the rows whose inspection type is a cycle initial inspection.
is_initial_inspection = cycle_violations['INSPECTION TYPE'] == 'Cycle Inspection / Initial Inspection'
cycle_init_inspections = cycle_violations.loc[is_initial_inspection]
cycle_init_inspections.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,INSPECTION TYPE,violation groups
CAMIS,date,BORO,year,Unnamed: 4_level_1,Unnamed: 5_level_1
41601691,2015-10-26,BROOKLYN,2015,Cycle Inspection / Initial Inspection,10
50043431,2017-05-19,MANHATTAN,2017,Cycle Inspection / Initial Inspection,10
41722020,2017-04-12,BRONX,2017,Cycle Inspection / Initial Inspection,4
41049159,2015-05-08,MANHATTAN,2015,Cycle Inspection / Initial Inspection,4
41469234,2016-02-17,QUEENS,2016,Cycle Inspection / Initial Inspection,8


Keep only the group of the reported violation. 

Add a dummy column that marks the violation as present.

In [9]:
# Drop the inspection-type column and flag every remaining row with a 1 in
# 'violation present'.
cycle_re = (
    cycle_re_inspections
    .drop(columns='INSPECTION TYPE')
    .assign(**{'violation present': 1})
)
cycle_re.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,violation groups,violation present
CAMIS,date,BORO,year,Unnamed: 4_level_1,Unnamed: 5_level_1
41312955,2014-04-01,QUEENS,2014,2,1
50001580,2015-12-01,STATEN ISLAND,2015,2,1
41713457,2016-06-24,MANHATTAN,2016,2,1
41382811,2017-03-01,BRONX,2017,4,1
40937528,2015-11-16,STATEN ISLAND,2015,8,1


In [10]:
# Drop the inspection-type column and flag every remaining row with a 1 in
# 'violation present'.
cycle_init = (
    cycle_init_inspections
    .drop(columns='INSPECTION TYPE')
    .assign(**{'violation present': 1})
)
cycle_init.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,violation groups,violation present
CAMIS,date,BORO,year,Unnamed: 4_level_1,Unnamed: 5_level_1
41601691,2015-10-26,BROOKLYN,2015,10,1
50043431,2017-05-19,MANHATTAN,2017,10,1
41722020,2017-04-12,BRONX,2017,4,1
41049159,2015-05-08,MANHATTAN,2015,4,1
41469234,2016-02-17,QUEENS,2016,8,1


In [11]:
# Reshape to one row per (CAMIS, date, BORO, year) and one column per
# violation group. Summing the 'violation present' dummy gives the number of
# reported violations in each group.
cyc_re_long = cycle_re.pivot_table(
    values='violation present',
    index=['CAMIS', 'date', 'BORO', 'year'],
    columns=['violation groups'],
    aggfunc='sum',
)

In [12]:
# Turn the (CAMIS, date, BORO, year) index levels back into ordinary columns.
# The result is bound back to the name rather than mutated with inplace=True.
cyc_re_long = cyc_re_long.reset_index()
cyc_re_long.head()

violation groups,CAMIS,date,BORO,year,02,03,04,05,06,07,08,09,10,22
0,30112340,2014-07-01,BROOKLYN,2014,1.0,,1.0,,1.0,,,,,
1,30112340,2014-12-30,BROOKLYN,2014,,,,,1.0,,,,1.0,
2,30112340,2016-04-30,BROOKLYN,2016,,,,,1.0,,1.0,,2.0,
3,30112340,2016-10-27,BROOKLYN,2016,,,1.0,,,,1.0,,,
4,30191841,2015-09-21,MANHATTAN,2015,,,,,1.0,,,,1.0,


In [13]:
# Reshape to one row per (CAMIS, date, BORO, year) and one column per
# violation group. Summing the 'violation present' dummy gives the number of
# reported violations in each group.
# The assignment starts at column 0: a top-level statement must not be
# indented, otherwise Python raises "IndentationError: unexpected indent".
cyc_init_long = cycle_init.pivot_table(
    index=['CAMIS', 'date', 'BORO', 'year'],
    columns=['violation groups'],
    values='violation present',
    aggfunc='sum',
)

In [14]:
# Turn the (CAMIS, date, BORO, year) index levels back into ordinary columns.
# The result is bound back to the name rather than mutated with inplace=True.
cyc_init_long = cyc_init_long.reset_index()
cyc_init_long.head()

violation groups,CAMIS,date,BORO,year,02,03,04,05,06,07,08,09,10,22
0,30075445,2015-02-09,BRONX,2015,,,,,1.0,,,,,
1,30075445,2016-02-18,BRONX,2016,,,1.0,,,,1.0,,,
2,30075445,2017-05-18,BRONX,2017,,,,,1.0,,,,1.0,
3,30112340,2014-11-13,BROOKLYN,2014,,,1.0,,1.0,,,,1.0,
4,30112340,2015-05-07,BROOKLYN,2015,,,1.0,,,,,,1.0,


In [15]:
# Pair each initial-inspection row with the re-inspection rows that share the
# same CAMIS and year (inner join). Columns present in both tables receive the
# default suffixes: _x for the initial inspection, _y for the re-inspection.
# The chronological order of each pair is not checked here.
two_cycles = cyc_init_long.merge(
    right=cyc_re_long,
    on=['CAMIS', 'year'],
    how='inner',
)
two_cycles.head()

violation groups,CAMIS,date_x,BORO_x,year,02_x,03_x,04_x,05_x,06_x,07_x,...,02_y,03_y,04_y,05_y,06_y,07_y,08_y,09_y,10_y,22_y
0,30112340,2014-11-13,BROOKLYN,2014,,,1.0,,1.0,,...,1.0,,1.0,,1.0,,,,,
1,30112340,2014-11-13,BROOKLYN,2014,,,1.0,,1.0,,...,,,,,1.0,,,,1.0,
2,30112340,2016-10-03,BROOKLYN,2016,,,1.0,,2.0,,...,,,,,1.0,,1.0,,2.0,
3,30112340,2016-10-03,BROOKLYN,2016,,,1.0,,2.0,,...,,,1.0,,,,1.0,,,
4,30191841,2015-08-31,MANHATTAN,2015,,,1.0,,,,...,,,,,1.0,,,,1.0,


In [16]:
# Keep only correctly ordered pairs: the initial inspection (date_x) comes
# strictly before the re-inspection (date_y).
# TODO: still to check whether there are valid pairs that differ only by a
# change of year.
initial_before_re = two_cycles['date_x'] < two_cycles['date_y']
same_year_inspections = two_cycles.loc[initial_before_re]
same_year_inspections.head()

violation groups,CAMIS,date_x,BORO_x,year,02_x,03_x,04_x,05_x,06_x,07_x,...,02_y,03_y,04_y,05_y,06_y,07_y,08_y,09_y,10_y,22_y
1,30112340,2014-11-13,BROOKLYN,2014,,,1.0,,1.0,,...,,,,,1.0,,,,1.0,
3,30112340,2016-10-03,BROOKLYN,2016,,,1.0,,2.0,,...,,,1.0,,,,1.0,,,
4,30191841,2015-08-31,MANHATTAN,2015,,,1.0,,,,...,,,,,1.0,,,,1.0,
5,40356151,2014-04-11,QUEENS,2014,1.0,,2.0,1.0,,,...,,,,,2.0,,,,,
6,40356151,2014-04-11,QUEENS,2014,1.0,,2.0,1.0,,,...,,,1.0,,1.0,,1.0,,,


In [17]:
# Number of initial/re-inspection pairs that remain after the ordering filter.
same_year_inspections.shape[0]

42169

In [18]:
# Preview the first 20 pairs ordered by restaurant and initial-inspection date.
same_year_inspections.sort_values(by=['CAMIS', 'date_x']).head(20)

violation groups,CAMIS,date_x,BORO_x,year,02_x,03_x,04_x,05_x,06_x,07_x,...,02_y,03_y,04_y,05_y,06_y,07_y,08_y,09_y,10_y,22_y
1,30112340,2014-11-13,BROOKLYN,2014,,,1.0,,1.0,,...,,,,,1.0,,,,1.0,
3,30112340,2016-10-03,BROOKLYN,2016,,,1.0,,2.0,,...,,,1.0,,,,1.0,,,
4,30191841,2015-08-31,MANHATTAN,2015,,,1.0,,,,...,,,,,1.0,,,,1.0,
5,40356151,2014-04-11,QUEENS,2014,1.0,,2.0,1.0,,,...,,,,,2.0,,,,,
6,40356151,2014-04-11,QUEENS,2014,1.0,,2.0,1.0,,,...,,,1.0,,1.0,,1.0,,,
8,40356151,2014-10-03,QUEENS,2014,,,2.0,,,,...,,,1.0,,1.0,,1.0,,,
9,40356151,2015-04-24,QUEENS,2015,,,1.0,,,,...,,,1.0,,,,,,1.0,
10,40359705,2015-04-01,BROOKLYN,2015,2.0,,,,,,...,,,,,1.0,,,,2.0,
11,40359705,2015-04-01,BROOKLYN,2015,2.0,,,,,,...,,,1.0,,,,1.0,,,
13,40359705,2015-08-20,BROOKLYN,2015,1.0,,1.0,,2.0,,...,,,1.0,,,,1.0,,,


In [23]:
# Keep only the per-group violation counts: drop the restaurant id, both
# dates, both borough columns and the year.
non_count_columns = ['CAMIS', 'date_x', 'date_y', 'BORO_x', 'BORO_y', 'year']
table_inspections = same_year_inspections.drop(columns=non_count_columns)
table_inspections.head()

violation groups,02_x,03_x,04_x,05_x,06_x,07_x,08_x,09_x,10_x,22_x,02_y,03_y,04_y,05_y,06_y,07_y,08_y,09_y,10_y,22_y
1,,,1.0,,1.0,,,,1.0,,,,,,1.0,,,,1.0,
3,,,1.0,,2.0,,1.0,,1.0,,,,1.0,,,,1.0,,,
4,,,1.0,,,,1.0,,1.0,,,,,,1.0,,,,1.0,
5,1.0,,2.0,1.0,,,,,,,,,,,2.0,,,,,
6,1.0,,2.0,1.0,,,,,,,,,1.0,,1.0,,1.0,,,


In [None]:
# make a summary table
