In [1]:
import os

import numpy as np
import pandas as pd
import sqlalchemy

In [2]:
# The CSV file was uploaded to an Amazon AWS RDS MySQL server as a database table.
# Here we build the connection URL and connect to that server.
# The following cells look at the normalization operations done on the table.
#
# Credentials are read from the environment so that no secret is stored in the
# notebook (a password that was ever committed here should be rotated):
#   required: ORCHARD_DB_USER, ORCHARD_DB_PASSWORD
#   optional: ORCHARD_DB_HOST, ORCHARD_DB_PORT, ORCHARD_DB_NAME
# NOTE: the values are inserted into the URL verbatim, so a password containing
# URL-reserved characters (e.g. '@', '/', ':') must be percent-encoded first.
#
# The schema name is part of the URL so that every connection handed out by the
# engine's pool resolves unqualified table names against the same schema,
# instead of relying on a separate USE statement run on a single connection.

db_url = "mysql://{user}:{password}@{host}:{port}/{database}".format(
    user=os.environ["ORCHARD_DB_USER"],
    password=os.environ["ORCHARD_DB_PASSWORD"],
    host=os.environ.get("ORCHARD_DB_HOST", "orchard.cvt6t7xaljhq.us-east-1.rds.amazonaws.com"),
    port=os.environ.get("ORCHARD_DB_PORT", "3306"),
    database=os.environ.get("ORCHARD_DB_NAME", "orchard_schema_1"),
)
engine = sqlalchemy.create_engine(db_url)


In [9]:
# Ask the server for the list of databases; the name is bound to the raw
# result object returned by engine.execute, not yet to a list of names.
existing_databases = engine.execute("SHOW DATABASES;")

In [12]:
# Flatten the result rows into a plain list of database names (first column of
# each row). The query is issued inside this cell instead of consuming the
# name bound by an earlier cell, so re-running the cell always gives the same
# list; iterating the already-flattened list a second time would instead pick
# the first character of every name.
existing_databases = [row[0] for row in engine.execute("SHOW DATABASES;")]

In [13]:
# Show the database names found on the server
existing_databases

['information_schema',
 'innodb',
 'mysql',
 'orchard_schema_1',
 'performance_schema',
 'sys']

In [15]:
# Switch the default schema so that later unqualified table names resolve to it.
# NOTE(review): USE only affects the connection it runs on, and the engine hands
# out pooled connections; presumably later queries reuse that same connection -
# confirm, or put the schema name in the connection URL instead.
db= engine.execute("USE orchard_schema_1") # select database

In [20]:
# So far we have created a connection to the AWS database and listed the existing databases.
# Next we are going to look at the individual tables.
# Queries can also be written to create and modify tables.

In [29]:
# Pull a five-row preview of the raw inspection data from the source table.
query = (
    "\n"
    "select * from orchard_schema_1.NYC_rest_data_sample\n"
    "limit 5\n"
)
original_table = pd.read_sql_query(sql=query, con=engine)

In [31]:
# Display the preview of the original table (the query above is limited to 5 rows)
original_table

Unnamed: 0,CAMIS,DBA,BORO,BUILDING,STREET,ZIPCODE,PHONE,CUISINE DESCRIPTION,INSPECTION DATE,ACTION,VIOLATION CODE,VIOLATION DESCRIPTION,CRITICAL FLAG,SCORE,GRADE,GRADE DATE,RECORD DATE,INSPECTION TYPE
0,40373938,IHOP,BRONX,5655,BROADWAY,10463,7185494565,American,08/16/2016,Violations were cited in the following area(s).,04L,Evidence of mice or live mice present in facil...,Critical,15,,,10/18/2016,Cycle Inspection / Initial Inspection
1,41217775,S. WAN CAFE,MANHATTAN,85,ELDRIDGE STREET,10002,2129668269,Chinese,09/25/2013,Violations were cited in the following area(s).,10F,Non-food contact surface improperly constructe...,Not Critical,17,,,10/18/2016,Cycle Inspection / Initial Inspection
2,40808056,FEI TENG BAKERY,BROOKLYN,4007,9 AVENUE,11232,7188715889,Bakery,07/17/2014,Violations were cited in the following area(s).,06C,Food not protected from potential source of co...,Critical,8,A,07/17/2014,10/18/2016,Cycle Inspection / Re-inspection
3,40585040,CAFE RAKKA,MANHATTAN,81,ST MARKS PLACE,10003,2129829166,Mediterranean,05/29/2014,Violations were cited in the following area(s).,08A,Facility not vermin proof. Harborage or condit...,Not Critical,11,A,05/29/2014,10/18/2016,Cycle Inspection / Re-inspection
4,40694764,MANNY'S GRILL,BROOKLYN,1089,GRAND STREET,11211,7184868225,American,08/09/2016,Violations were cited in the following area(s).,08A,Facility not vermin proof. Harborage or condit...,Not Critical,24,,,10/18/2016,Cycle Inspection / Initial Inspection


In [32]:
# The queries written below extract normalized tables from the main table.
# There are four subtables:

# BUSINESSES TABLE-   details of each business
# INSPECTIONS TABLE-  details of each inspection on every business
# GRADINGS TABLE-     details of each grade issued to a business
# VIOLATIONS TABLE-   violation codes and their descriptions

# The inspection and grade dates are converted from MM/DD/YYYY text to DATE
# values for future consistency and proper comparison (RECORD_DATE is carried
# over unchanged).
#
# NOTE: this cell only defines the SQL text; nothing in it sends a statement to
# the server. Each string holds exactly one statement, and the strings come in
# pairs: an odd-numbered "create table ... as select" followed by the
# even-numbered "alter table" that adds the key to the table just created.


# One row per distinct combination of business attributes.
query1 = """
create table Businesses AS
select
distinct CAMIS,
DBA,
BORO,
BUILDING,
STREET,
ZIPCODE,
PHONE,
`CUISINE DESCRIPTION` AS CUISINE_DESCRIPTION,
`RECORD DATE` AS RECORD_DATE
from NYC_rest_data_sample
"""
query2= """
alter table Businesses add Primary Key(CAMIS)
"""

# One row per distinct (violation code, description) pair.
query3 = """
create table `Violations` as
select
distinct `violation code` as violation_code,
`violation description` as violation_description
from NYC_rest_data_sample
"""
# MySQL changes a column type with MODIFY COLUMN ("alter column ... <type>" is
# not MySQL syntax), and both changes are combined into a single ALTER TABLE so
# the string stays one statement.
# presumably violation_code comes out of the create-as-select as a TEXT column,
# which cannot be used as a key without a length - TODO confirm
query4= """
alter table Violations
modify column violation_code varchar(20),
add Primary Key (violation_code)
"""

# One row per source row; inspect_date is rebuilt as YYYY-MM-DD from the
# MM/DD/YYYY text (chars 7-10 = year, 1-2 = month, 4-5 = day) and cast to DATE.
query5 = """
create table Inspections as
select
CAMIS,
cast(concat(substr(`inspection date`, 7,4),'-',substr(`inspection date`, 1,2),'-',substr(`inspection date`, 4,2)) as date) as inspect_date,
ACTION,
`VIOLATION CODE` as violation_code,
`CRITICAL FLAG` as critical_flag,
SCORE,
`INSPECTION TYPE` as inspection_type
from NYC_rest_data_sample
"""
# NOTE(review): this key requires at most one source row per business per
# inspection date; if one inspection can cite several violations the statement
# will fail with a duplicate-key error - confirm against the data.
query6= """
alter table Inspections add primary key (camis, inspect_date)
"""

# Rows without a grade date are skipped. DISTINCT collapses source rows that
# repeat the same (CAMIS, grade date, grade), which would otherwise violate the
# primary key added by query8.
query7 = """
create table Gradings as
select distinct
CAMIS,
cast(concat(substr(`grade date`, 7,4),'-',substr(`grade date`, 1,2),'-',substr(`grade date`, 4,2)) as date) as grade_date,
grade
from NYC_rest_data_sample
where `grade date` != ''
"""
query8= """
alter table Gradings add primary key (camis, grade_date)
"""




In [36]:
# Snapshot of the Businesses table: fetch its first five rows into a DataFrame.
query = (
    "\n"
    "select * from Businesses\n"
    "limit 5\n"
)
businesses_table = pd.read_sql_query(sql=query, con=engine)

In [37]:
# Display the Businesses snapshot
businesses_table

Unnamed: 0,CAMIS,DBA,BORO,BUILDING,STREET,ZIPCODE,PHONE,CUISINE_DESCRIPTION,RECORD_DATE
0,40362715,THE COUNTRY CAFE,MANHATTAN,60,WALL STREET,10005,3474279132,Sandwiches/Salads/Mixed Buffet,10/18/2016
1,40363644,DOMINO'S,MANHATTAN,464,3 AVENUE,10016,2125450200,Pizza,10/18/2016
2,40365454,JOE & PAT'S PIZZERIA,STATEN ISLAND,1758,VICTORY BOULEVARD,10314,7189810887,Pizza/Italian,10/18/2016
3,40367677,CAPITOL RESTAURANT,MANHATTAN,4933,BROADWAY,10034,2129425090,American,10/18/2016
4,40369017,PALM TOO,MANHATTAN,840,2 AVENUE,10017,2126975198,American,10/18/2016


In [41]:
# Snapshot of the Inspections table: fetch its first five rows into a DataFrame.
query = (
    "\n"
    "select * from Inspections\n"
    "limit 5\n"
)
inspections_table = pd.read_sql_query(sql=query, con=engine)

In [42]:
# Display the Inspections snapshot
inspections_table

Unnamed: 0,CAMIS,inspect_date,ACTION,violation_code,critical_flag,SCORE,inspection_type
0,40362715,2015-07-15,Violations were cited in the following area(s).,02B,Critical,19,Cycle Inspection / Initial Inspection
1,40362715,2016-09-06,Violations were cited in the following area(s).,06C,Critical,33,Cycle Inspection / Initial Inspection
2,40363644,2014-03-06,Violations were cited in the following area(s).,10B,Not Critical,11,Cycle Inspection / Re-inspection
3,40365454,2016-02-09,Violations were cited in the following area(s).,05D,Critical,26,Cycle Inspection / Initial Inspection
4,40367677,2016-09-29,Violations were cited in the following area(s).,08A,Not Critical,12,Cycle Inspection / Initial Inspection


In [45]:
# Snapshot of the Gradings table: fetch its first five rows into a DataFrame.
query = (
    "\n"
    "select * from Gradings\n"
    "limit 5\n"
)
gradings_table = pd.read_sql_query(sql=query, con=engine)

In [46]:
# Display the Gradings snapshot
gradings_table

Unnamed: 0,CAMIS,grade_date,grade
0,40363644,2014-03-06,A
1,40367677,2016-09-29,A
2,40376029,2013-03-22,A
3,40381720,2016-09-19,B
4,40386554,2014-07-15,A


In [67]:
# Snapshot of the Violations table: fetch its first ten rows into a DataFrame.
query = (
    "\n"
    "select * from Violations\n"
    "limit 10\n"
)
violations_table = pd.read_sql_query(sql=query, con=engine)

In [50]:
# Display the Violations snapshot
violations_table

Unnamed: 0,violation_code,violation_description
0,04L,Evidence of mice or live mice present in facil...
1,10F,Non-food contact surface improperly constructe...
2,06C,Food not protected from potential source of co...
3,08A,Facility not vermin proof. Harborage or condit...
4,09B,Thawing procedures improper.
5,15L,Smoke free workplace smoking policy inadequate...
6,,
7,04N,Filth flies or food/refuse/sewage-associated (...
8,09A,Canned food product observed dented and not se...
9,02G,Cold food item held above 41?? F (smoked fish ...


In [95]:
# Get some of my friend's favorite places
# He likes places that have
# an inspection score below 15
# only a grade of A
# in recent gradings that are from 2016 and no earlier
#
# NOTE(review): this comment used to say "less than 10" while the query filters
# on score < 15; the query's cutoff is kept here - confirm which was intended.
#
# Join notes:
# - Inner joins are used because the WHERE clause filters on columns of both
#   Inspections and Gradings, which discards unmatched rows anyway.
# - Inspections and Gradings are matched on the date as well as on CAMIS, so the
#   score that is tested belongs to the inspection that produced the grade, and
#   a business is not repeated once for every inspection/grading combination.
#   assumes a grade carries the same date as its inspection, as in the sample
#   rows shown earlier - TODO confirm
# - AND replaces &&, which MySQL has deprecated.

query= """
select
Businesses.CAMIS as CAMIS,
Businesses.DBA as NAME,
Businesses.ZIPCODE,
Businesses.PHONE,
Gradings.grade as Grade,
Gradings.grade_date as `Grade Date`
from
Businesses
inner join Inspections
    on Businesses.CAMIS = Inspections.CAMIS
inner join Gradings
    on Inspections.CAMIS = Gradings.CAMIS
    and Inspections.inspect_date = Gradings.grade_date
where Inspections.score < 15
  and Gradings.grade = 'A'
  and Gradings.grade_date > '2015-12-31'
order by Inspections.score DESC
limit 10
"""
favorites_table = pd.read_sql_query(query,engine)

In [96]:
# A sample of some places that are likely to be my friend's favorites
favorites_table

Unnamed: 0,CAMIS,NAME,ZIPCODE,PHONE,Grade,Grade Date
0,41221190,PANADERIA COATZINGO,11372,7184294160,A,2016-06-21
1,41320866,WO HOP 17,10013,2122672536,A,2016-05-20
2,50017041,JUST SALAD,10001,2123555808,A,2016-03-16
3,50050835,SUBWAY,11377,7187799166,A,2016-09-16
4,40367677,CAPITOL RESTAURANT,10034,2129425090,A,2016-09-29
5,40704305,LA DOLCE ITALIA BAKERY,11375,7182685297,A,2016-01-13
6,41008329,NATIONAL BAKERY,10472,7188422396,A,2016-01-19
7,41042416,CITY CINEMAS 123,10022,2128329054,A,2016-10-03
8,41074629,DOMINO'S,10460,7185424993,A,2016-08-04
9,41187602,INDIGO INDIAN BISTRO,10023,2125793900,A,2016-04-18
