In [None]:
import numpy as np
import pandas as pd
import csv

In [None]:
###############################
# LOAD THE OPEN NYC MAIN FILE #
###############################

In [None]:
# Location of the DOHMH restaurant inspection results export (CSV).
nyc_file = "/Users/frimpter/Documents/data_science/ru_datascience_bootcamp/Project 2/Data/DOHMH_New_York_City_Restaurant_Inspection_Results.csv"

# Read the raw inspection rows as UTF-8 text.
nyc_df = pd.read_csv(filepath_or_buffer=nyc_file, encoding="utf-8")

# Combine building number and street name into one "ADDRESS" column;
# later cells merge on the lower-cased version of this column.
nyc_df = nyc_df.assign(ADDRESS=nyc_df["BUILDING"] + " " + nyc_df["STREET"])

# Uncomment to preview:
# nyc_df.head()

In [None]:
# Normalise the column labels: trim whitespace, lower-case, spaces -> "_".
tidy_labels = nyc_df.columns.str.strip()
tidy_labels = tidy_labels.str.lower()
nyc_df.columns = tidy_labels.str.replace(' ', '_')

# Handy while exploring the violation text:
# nyc_df["violation_description"].unique()
# nyc_df["violation_description"].value_counts()
# nyc_df["violation_code"].value_counts()

# Add one flag column per keyword, named after the keyword itself.
# Each flag records whether the violation description contains that keyword
# (matching is case-sensitive, as written).
descriptions = nyc_df['violation_description']
for keyword in ('mice', 'rats', 'roach', 'flies', 'hand', 'sanitiz'):
    nyc_df[keyword] = descriptions.str.contains(keyword)

# nyc_df.head()

In [None]:
# Turn the keyword flags (True / False / missing) into numbers (1 / 0 / NaN)
# so they can be summed per cuisine later on.
#
# Only the six flag columns are converted. The previous whole-frame
# `replace({True: 1, False: 0}, regex=True)` scanned every column of the
# inspection data, passed `regex=True` although the keys are booleans (not
# patterns), and relied on pandas implicitly down-casting the result to a
# numeric dtype. An explicit cast gives a numeric dtype deterministically and
# makes the cell safe to re-run.
flag_columns = ["mice", "rats", "roach", "flies", "hand", "sanitiz"]
nyc_df[flag_columns] = nyc_df[flag_columns].astype(float)
# nyc_df.head()

In [None]:
#####################################
# CREATE THE SIDEWALK MAP DATA FILE #
#####################################

In [None]:
# Reduce the inspection data to the handful of columns the sidewalk map uses.
brief_columns = ["dba", "address", "cuisine_description", "grade", "grade_date"]
nyc_brief = nyc_df.loc[:, brief_columns]

# nyc_brief.head()

In [None]:
# Read the sidewalk cafe licence export and cut it down to the needed fields.

sidewalk_file = "/Users/frimpter/Documents/data_science/ru_datascience_bootcamp/Project 2/Data/Sidewalk_Caf_Licenses_and_Applications.csv"

sidewalk_df = pd.read_csv(filepath_or_buffer=sidewalk_file, encoding="utf-8")

# Build the same "BUILDING STREET" address string used for the inspection data.
sidewalk_df = sidewalk_df.assign(ADDRESS=sidewalk_df["BUILDING"] + " " + sidewalk_df["STREET"])

# Keep only the licence fields used downstream.
kept_columns = ["BUSINESS_NAME", "ADDRESS", "CITY", "ZIP", "LATITUDE", "LONGITUDE", "LIC_STATUS", "SWC_TYPE"]
sidewalk_df = sidewalk_df.loc[:, kept_columns]

# Normalise the labels: trim whitespace, lower-case, spaces -> "_".
sidewalk_labels = sidewalk_df.columns.str.strip().str.lower()
sidewalk_df.columns = sidewalk_labels.str.replace(' ', '_')

# sidewalk_df.head()

In [None]:
# CREATE A CONDENSED JSON FILE TO MAP SIDEWALK RESTAURANTS
#
# Joins the condensed inspection rows to the sidewalk licences on the street
# address, then keeps one row per restaurant location carrying its most
# recently dated grade.

map_df = pd.merge(nyc_brief, sidewalk_df, on="address", how="inner")

# MANAGE NaNs -- a row missing ANY field (grade, coordinates, ...) is dropped.
map_df = map_df.dropna(axis=0, how="any")
#map_df.shape

# REMOVE INACTIVE SIDEWALK SEATING LICENSES
# This must happen BEFORE de-duplicating. The merge yields one row per
# (inspection row x licence row) for an address, so if the single row kept by
# drop_duplicates happened to carry an inactive licence, a restaurant that
# also holds an active licence would vanish from the map.
map_df = map_df[map_df["lic_status"] != "Inactive"].copy()

# SORT BY MOST RECENT GRADE DATE
# A stable sort keeps tied rows in their merge order, so the row chosen below
# is reproducible from run to run.
map_df["grade_date"] = pd.to_datetime(map_df["grade_date"])
#map_df.dtypes
map_df = map_df.sort_values("grade_date", ascending=False, kind="mergesort")

# KEEP ONLY MOST RECENT GRADE DATE
# De-duplicate per (name, address): using the name alone collapsed every
# location of a multi-location business into a single map point.
map_df = map_df.drop_duplicates(subset=["dba", "address"], keep="first")

# CLEAN UP THE CUISINE DESCRIPTIONS
# Raw string: "\(" and "\&" are regex escapes, not Python string escapes.
map_df = map_df.replace({"CafÃ©/Coffee/Tea":"Cafe/Coffee/Tea", r"Latin \(Cuban, Dominican, Puerto Rican, South \& Central American\)":"Latin"}, regex=True)

# RENAME COLUMNS FOR JSON
# NOTE(review): reset_index() without drop=True also exports the old row
# labels as an "index" field -- kept as-is in case the map code reads it.
map_df = map_df.rename(columns={"cuisine_description":"cuisine", "grade_date":"date"}).reset_index()

# map_df.head()

# SMALL FILE - MAKE A CSV TO INSPECT MANUALLY
#map_df.to_csv("SIDEWALK_MAP.csv", index=False, header=True)

# EXPORT JSON FILE
# map_df.to_json("SIDEWALK.json", orient="records", date_format="iso")

In [None]:
##################################
# SIDEWALK VIOLATIONS CHART DATA #
##################################

In [None]:
# Join every inspection/violation row to the sidewalk licences on address.
chart_df = pd.merge(nyc_df, sidewalk_df, on="address", how="inner")

# MANAGE NaNs
# NOTE(review): how="any" looks at every column of the merged frame, including
# columns that are not selected into final_chart below.
chart_df = chart_df.dropna(axis=0, how="any")
#chart_df.shape

# REMOVE INACTIVE SIDEWALK SEATING LICENSES
chart_df = chart_df[chart_df.lic_status != "Inactive"]

# CLEAN UP THE CUISINE DESCRIPTIONS
# Raw string: "\(" and "\&" are regex escapes, not Python string escapes.
chart_df = chart_df.replace({"CafÃ©/Coffee/Tea":"Cafe/Coffee/Tea", r"Latin \(Cuban, Dominican, Puerto Rican, South \& Central American\)":"Latin"}, regex=True)

# MATCH THE CATEGORIES TO THOSE IN THE MAP (AS CODED IN map.js)
chart_df["cuisine"] = chart_df["cuisine_description"].replace({"Greek":"Mediterranean", "Chinese":"Asian", "Korean":"Asian", "Japanese":"Asian", "Thai":"Asian", "Indian":"Asian", "Bakery":"BakeryCafe","Donuts":"BakeryCafe", "Cafe/Coffee/Tea":"BakeryCafe", "Pizza":"Italian", "Pizza/Italian":"Italian", "Italian/Italian":"Italian", "Spanish":"Mexican", "Latin":"Mexican"}, regex=True)

# CHANGE THE REMAINING TO "Other"
# One vectorised pass: any value that is not exactly one of the final
# categories becomes "Other".
# The previous row-by-row loop ran a full-column regex replace for every
# non-matching row (quadratic in the row count). Because each replace
# substituted substrings, the result depended on row order: once "Sandwiches"
# had been replaced, "Sandwiches/Salads/Mixed Buffet" became
# "Other/Salads/Mixed Buffet" and never collapsed to "Other".
final_cuisines = ["American","Asian","BakeryCafe","French","Italian","Mediterranean","Mexican","Other"]
cuisines = chart_df["cuisine"]
chart_df["cuisine"] = cuisines.where(cuisines.isin(final_cuisines), "Other")

# REVIEW DF
#chart_df.shape
#chart_df.head()

# NEW DF LIMITED TO THE COLUMNS WE NEED
final_chart = chart_df[["camis","dba","boro","cuisine","inspection_date","action","violation_code","violation_description","critical_flag","score","grade","grade_date","mice","rats","roach","flies","hand","sanitiz","lic_status","swc_type"]]

# final_chart.head()

In [None]:
# FORMAT DF TO CHART REQUIREMENTS BY CUISINE TYPE
#
# Produces one row per cuisine with: summed score and keyword flags, the row
# count ("total"), the mean score, and each keyword's share of the rows in %.

flag_cols = ["mice", "rats", "roach", "flies", "hand", "sanitiz"]
value_cols = ["score"] + flag_cols

# MAKE A (REAL) COPY of just the columns that get aggregated.
# Previously this was an alias of final_chart, and groupby().sum() was then
# applied to every column -- including IDs, names, dates and free text, which
# is meaningless for those columns and very expensive on recent pandas, where
# string columns are concatenated rather than skipped.
chart_values = final_chart[["cuisine", "dba"] + value_cols].copy()
chart_values[value_cols] = chart_values[value_cols].astype(float)

by_cuisine = chart_values.groupby("cuisine", as_index=False)

# PULL OUT THE COUNTS FOR EACH CUISINE TYPE TO BECOME A TOTAL COLUMN (TO CALC PERCENTAGES)
counts = by_cuisine.count()

# GROUPBY CUISINE TYPE AS A RUNNING COLUMN
swv = by_cuisine[value_cols].sum().round(1)

# Both frames come from the same groupby, so their rows line up by position.
swv["total"] = counts["dba"]

swv["mean_score"] = (swv["score"] / swv["total"]).round(1)

# Share of each cuisine's violation rows that mention the keyword, in percent.
for flag in flag_cols:
    swv[flag + "_pct"] = (swv[flag] / swv["total"]*100).round(1)

# CALCULATE AN "Other" CATEGORY
swv["subtotal"] = swv["mice_pct"] + swv["rats_pct"] + swv["roach_pct"] + swv["flies_pct"] + swv["hand_pct"] + swv["sanitiz_pct"]
swv["other"] = 100 - swv["subtotal"]

# REVIEW
# swv

In [None]:
# Shape the per-cuisine percentages for the JSON export: keep the cuisine
# label plus the six percentage columns, renamed to their chart labels.

pct_labels = {
    "mice_pct": "MICE",
    "rats_pct": "RATS",
    "roach_pct": "ROACHES",
    "flies_pct": "FLIES",
    "hand_pct": "HANDS",
    "sanitiz_pct": "SANITIZATION",
}
swchart = swv[["cuisine", *pct_labels]].rename(columns=pct_labels)

# swchart
# swchart.to_json("/Users/frimpter/Documents/data_science/ru_datascience_bootcamp/Project 2/SWCHART.json", orient="records")

In [None]:
#############################
# HEATMAP FROM ALL NYC DATA #
#############################

In [None]:

# TRIM THE COLUMNS
nyc_df = nyc_df[["camis","dba","boro","cuisine_description","inspection_date","violation_code","violation_description","critical_flag","score","grade","grade_date","mice","rats","roach","flies","hand","sanitiz"]]

# CLEAN AND ALIGN CUISINE DESCRIPTIONS
# Raw string: "\(" and "\&" are regex escapes, not Python string escapes.
nyc_df = nyc_df.replace({"CafÃ©/Coffee/Tea":"Cafe/Coffee/Tea", r"Latin \(Cuban, Dominican, Puerto Rican, South \& Central American\)":"Latin"}, regex=True)

# MATCH THE CATEGORIES TO THOSE IN THE MAP (AS CODED IN map2.js)
nyc_df["cuisine"] = nyc_df["cuisine_description"].replace({"Greek":"Mediterranean", "Chinese":"Asian", "Korean":"Asian", "Japanese":"Asian", "Thai":"Asian", "Indian":"Asian", "Bakery":"BakeryCafe","Donuts":"BakeryCafe", "Cafe/Coffee/Tea":"BakeryCafe", "Pizza":"Italian", "Pizza/Italian":"Italian", "Italian/Italian":"Italian", "Spanish":"Mexican", "Latin":"Mexican"}, regex=True)

# CHANGE THE REMAINING TO "Other"
# One vectorised pass: any value that is not exactly one of the final
# categories (missing values included) becomes "Other".
# The previous loop ran a full-column regex replace once per non-matching ROW
# of the whole inspection file. Its substring substitutions also made the
# outcome depend on row order, leaving values such as
# "Other/Salads/Mixed Buffet" that belong to none of the final categories.
final_cuisines = ["American","Asian","BakeryCafe","French","Italian","Mediterranean","Mexican","Other"]
cuisines = nyc_df["cuisine"]
nyc_df["cuisine"] = cuisines.where(cuisines.isin(final_cuisines), "Other")

# The raw description is no longer needed once "cuisine" exists.
nyc_df = nyc_df.drop(columns="cuisine_description")

nyc_df["inspection_date"] = pd.to_datetime(nyc_df["inspection_date"])
# nyc_df.dtypes

# nyc_df.head()

In [None]:
# CREATE A GROUPBY AND CLEAN IT UP
#
# Result: exactly one row per (borough, cuisine) pair -- 5 boroughs x 8
# cuisines, borough-major, labelled 0..39 -- holding the mean inspection score
# rounded to one decimal. The next cell slices this frame by those row labels.

heat_boros = ["BRONX", "BROOKLYN", "MANHATTAN", "QUEENS", "STATEN ISLAND"]
heat_cuisines = ["American", "Asian", "BakeryCafe", "French", "Italian", "Mediterranean", "Mexican", "Other"]

# Average only the score column; the frame also holds text columns, for which
# a mean is undefined.
heat_df = nyc_df.groupby(["boro","cuisine"], as_index=False)["score"].mean().round(1)

# Drop the Missing Boro data row(s) by VALUE rather than by the hard-coded row
# label 24, which is only right for one particular version of the data file.
# Upper-casing first makes the match independent of how boroughs are cased.
heat_df["boro"] = heat_df["boro"].str.upper()
heat_df = heat_df[heat_df["boro"].isin(heat_boros)]

# Any borough/cuisine pair absent from the data (e.g. no French restaurant in
# Staten Island) gets a score of 0 -- a numeric 0, not the string "0", so the
# score column stays numeric.
full_grid = pd.MultiIndex.from_product([heat_boros, heat_cuisines], names=["boro", "cuisine"])
heat_df = heat_df.set_index(["boro", "cuisine"]).reindex(full_grid, fill_value=0)

# Back to plain columns with a fresh 0..39 row index. (The old code called
# reset_index() but discarded the result, leaving gaps in the row labels.)
heat_df = heat_df.reset_index()

# heat_df

In [None]:
# PULL THE DATA TO FORMAT A DF FOR THE HEATMAP
#
# Result: one row per cuisine (alphabetical) with one score column per borough.
# The table is built by pivoting on the cuisine/borough VALUES, not by slicing
# fixed row-label ranges (0:7, 8:15, ...). Those ranges only line up when
# heat_df is labelled exactly 0..39; with any gap in the labels the Queens and
# Staten Island columns were filled from the wrong rows.

heat_columns = ["BRONX", "BROOKLYN", "MANHATTAN", "QUEENS", "STATEN_ISLAND"]

# Make sure scores are numeric even if a placeholder was stored as text.
heat_scores = heat_df.assign(score=pd.to_numeric(heat_df["score"]))

# Rows = cuisine, columns = borough, cells = mean score.
heat_wide = heat_scores.pivot(index="cuisine", columns="boro", values="score")

# Borough names become column labels: upper-case, spaces -> "_".
heat_wide.columns = heat_wide.columns.str.upper().str.replace(" ", "_")

# LOAD EACH SERIES INTO ITS OWN COLUMN
newheat = heat_wide[heat_columns].reset_index()
newheat.columns.name = None

# STORE EACH SERIES (kept so the per-borough names remain available)
bronx = newheat["BRONX"]
brooklyn = newheat["BROOKLYN"]
manhattan = newheat["MANHATTAN"]
queens = newheat["QUEENS"]
statenisland = newheat["STATEN_ISLAND"]

# newheat

In [None]:
# Serialise the borough score columns as "split"-oriented JSON.

heat_export_columns = ["BRONX", "BROOKLYN", "MANHATTAN", "QUEENS", "STATEN_ISLAND"]
heat_matrix = newheat[heat_export_columns]

# In-memory JSON string, for previewing.
heatlist = heat_matrix.to_json(orient="split")

# Write the same JSON to disk. to_json returns None when given a path, so
# heatlist is None after this line.
heatlist = heat_matrix.to_json("/Users/frimpter/Documents/data_science/ru_datascience_bootcamp/Project 2/HEAT.json", orient="split")


In [None]:
##########################
# ROLLING AVERAGE SCORES #
##########################

In [None]:
# Build a lean frame for the rolling-average chart: cuisine, inspection date
# and score only, without incomplete rows, ordered oldest inspection first.
# nyc_df.head()

# Row counts noted by the author: (370367, 3) >> after dropna (350330, 3)
line_df = (
    nyc_df[["cuisine", "inspection_date", "score"]]
    .dropna(axis=0, how="any")
    .sort_values("inspection_date", ascending=True)
)

# line_df.head()

In [None]:
# One row per (cuisine, inspection date): the mean of that day's scores,
# rounded to one decimal place.
daily_groups = line_df.groupby(["cuisine", "inspection_date"], as_index=False)
group_df = daily_groups.mean().round(1)

# Trial of .rolling for the rolling average:
# group_df["rolling"] = group_df["score"].rolling(5).mean().round(1)

# group_df

In [None]:
# For every cuisine add three columns -- <prefix>_date, <prefix>_score and
# <prefix>_roll -- holding that cuisine's inspection dates, daily mean scores
# and 100-row rolling mean of those scores (rounded to one decimal).
# Each cuisine's values are re-labelled from 0 so they fill the top rows of
# the frame; the rolling mean is taken per cuisine column, so cuisines never
# mix inside a window.

cuisine_prefixes = [
    ("Am", "American"),
    ("As", "Asian"),
    ("BC", "BakeryCafe"),
    ("Fr", "French"),
    ("It", "Italian"),
    ("Me", "Mediterranean"),
    ("Mx", "Mexican"),
    ("Ot", "Other"),
]

for prefix, cuisine_name in cuisine_prefixes:
    is_cuisine = group_df["cuisine"] == cuisine_name
    group_df[prefix + "_date"] = group_df.loc[is_cuisine, "inspection_date"].reset_index(drop=True)
    group_df[prefix + "_score"] = group_df.loc[is_cuisine, "score"].reset_index(drop=True)
    group_df[prefix + "_roll"] = group_df[prefix + "_score"].rolling(100).mean().round(1)

# The long-format source columns are not part of the export.
group_df = group_df.drop(columns=["cuisine", "inspection_date", "score"])

# group_df.head(25)

In [None]:
# Write the per-cuisine rolling-average table to disk as JSON records, with
# dates in ISO format.
rolling_json_path = "/Users/frimpter/Documents/data_science/ru_datascience_bootcamp/Project 2/ROLLING.json"
group_df.to_json(rolling_json_path, orient="records", date_format="iso")