# Requirements
- install Python 3.9
- install the MySQL server on macOS via Homebrew: `brew install mysql`
- start the MySQL server: `brew services start mysql`
- install the Python packages this notebook imports: `pip3 install pymysql pandas`
- run this notebook (e.g., with VS Code's Jupyter extension)

# Links
- installing python3.9: https://www.python.org/downloads/
- installing homebrew: https://brew.sh/
- installing VSCode and the jupyter extension: https://marketplace.visualstudio.com/items?itemName=ms-toolsai.jupyter

# Imports

In [103]:
import pymysql
import pandas as pd

# Open Connection

In [104]:

# Connect to the MySQL server on localhost as root (empty password).
# DictCursor is requested so fetched rows come back as dictionaries.
connection = pymysql.connect(
    host='127.0.0.1',
    user='root',
    password='',
    cursorclass=pymysql.cursors.DictCursor,
)
cursor = connection.cursor()

# Make sure the working database exists, then select it on this connection
# so the tables are created and stored there.
cursor.execute("CREATE DATABASE IF NOT EXISTS mydatabase")
connection.select_db("mydatabase")

# Sanity check that the statements above worked: calling
# print(cursor.fetchall()) after this query should list "mydatabase".
cursor.execute("SHOW DATABASES")

5

# Extraction of Data from APIs and CSVs

In [105]:
# EXTRACTION
# Collision records are read straight from the data.cityofnewyork.us CSV
# endpoint; only the columns named in `features` are kept, in this order.
features = [
    "collision_id",
    "crash_time",
    "borough",
    "zip_code",
    "contributing_factor_vehicle_1",
    "contributing_factor_vehicle_2",
    "number_of_persons_injured",
    "number_of_persons_killed",
]
collision_data = pd.read_csv(
    "https://data.cityofnewyork.us/resource/h9gi-nx95.csv"
).loc[:, features]


In [106]:
# Load the local Uber CSV and keep only the columns named in `features`.
features = [
    "pickup_dt",
    "pickups",
    "spd",
    "vsb",
    "temp",
    "pcp01",
    "pcp06",
    "pcp24",
    "sd",
    "borough",
    "hday",
]
uber_data = pd.read_csv("uber_nyc_enriched.csv").loc[:, features]

# Prepend an id column that numbers the rows 0..n-1 in file order.
uber_data.insert(0, "uber_id", list(range(len(uber_data))))

In [107]:
# Load the local bar-locations CSV and keep only the columns named in
# `features`.
features = [
    "Incident Zip",
    "Borough",
    "Location Type",
    "num_calls",
]
noise_incident_data = pd.read_csv("bar_locations.csv").loc[:, features]

# Prepend an id column that numbers the rows 0..n-1 in file order.
noise_incident_data.insert(
    0, "incident_id", list(range(len(noise_incident_data)))
)

# Transformation of Data

In [108]:
# Split the pickup timestamp into separate date and time columns.
# The timestamp is parsed once and reused for both columns; pop() removes
# the raw 'pickup_dt' column from the frame in the same step.
pickup_ts = pd.to_datetime(uber_data.pop('pickup_dt'), format='%m/%d/%y %H:%M')
uber_data['dates'] = pickup_ts.dt.date
uber_data['time'] = pickup_ts.dt.time

# Remove every row that has a missing value in any column.
uber_data = uber_data.dropna()

# Remove the rows whose borough is 'EWR'. The explicit copy() makes the
# filtered frame independent of the frame it was sliced from, so the column
# assignment below does not raise pandas' SettingWithCopyWarning.
uber_data = uber_data[uber_data.borough != 'EWR'].copy()

# Upper-case the borough names so they match the other dataframes.
uber_data['borough'] = uber_data['borough'].str.upper()

# Cast the columns to their final types in a single call.
uber_data = uber_data.astype({
    'uber_id': int,
    'pickups': int,
    'borough': str,
    'hday': str,
    'dates': str,
    'time': str,
})

# Last expression of the cell: displays the transformed frame.
uber_data

Unnamed: 0,uber_id,pickups,spd,vsb,temp,pcp01,pcp06,pcp24,sd,borough,hday,dates,time
0,0,152,5.0,10.0,30.0,0.0,0.0,0.0,0.0,BRONX,Y,2015-01-01,01:00:00
1,1,1519,5.0,10.0,30.0,0.0,0.0,0.0,0.0,BROOKLYN,Y,2015-01-01,01:00:00
3,3,5258,5.0,10.0,30.0,0.0,0.0,0.0,0.0,MANHATTAN,Y,2015-01-01,01:00:00
4,4,405,5.0,10.0,30.0,0.0,0.0,0.0,0.0,QUEENS,Y,2015-01-01,01:00:00
5,5,6,5.0,10.0,30.0,0.0,0.0,0.0,0.0,STATEN ISLAND,Y,2015-01-01,01:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
29094,29094,67,7.0,10.0,75.0,0.0,0.0,0.0,0.0,BRONX,N,2015-06-30,23:00:00
29095,29095,990,7.0,10.0,75.0,0.0,0.0,0.0,0.0,BROOKLYN,N,2015-06-30,23:00:00
29097,29097,3828,7.0,10.0,75.0,0.0,0.0,0.0,0.0,MANHATTAN,N,2015-06-30,23:00:00
29098,29098,580,7.0,10.0,75.0,0.0,0.0,0.0,0.0,QUEENS,N,2015-06-30,23:00:00


In [109]:
# Discard every collision row that has a missing value in any column.
collision_data.dropna(inplace=True)

# Parse the crash times with the '%H:%M' format and keep only the
# time-of-day component.
crash_ts = pd.to_datetime(collision_data['crash_time'], format='%H:%M')
collision_data['crash_time'] = crash_ts.dt.time

# Cast the columns to their final types in a single call.
collision_data = collision_data.astype({
    'collision_id': int,
    'crash_time': str,
    'borough': str,
    'zip_code': int,
    'contributing_factor_vehicle_1': str,
    'contributing_factor_vehicle_2': str,
    'number_of_persons_injured': int,
    'number_of_persons_killed': int,
})

# Last expression of the cell: displays the transformed frame.
collision_data

Unnamed: 0,collision_id,crash_time,borough,zip_code,contributing_factor_vehicle_1,contributing_factor_vehicle_2,number_of_persons_injured,number_of_persons_killed
3,4407811,16:00:00,BROOKLYN,11222,Following Too Closely,Unspecified,0,0
6,4408019,17:30:00,QUEENS,11106,Driver Inattention/Distraction,Unspecified,0,0
13,4136992,22:50:00,BROOKLYN,11201,Passing or Lane Usage Improper,Unspecified,0,0
15,4395664,14:50:00,BRONX,10461,Unspecified,Unspecified,0,0
17,4403773,22:20:00,BROOKLYN,11234,Driver Inexperience,Unspecified,1,0
...,...,...,...,...,...,...,...,...
992,4408495,17:29:00,BROOKLYN,11203,Unspecified,Unspecified,0,0
993,4408458,20:55:00,STATEN ISLAND,10305,Driver Inattention/Distraction,Unspecified,0,0
995,4407937,07:25:00,QUEENS,11379,Backing Unsafely,Unspecified,0,0
997,4407966,16:04:00,QUEENS,11435,Driver Inattention/Distraction,Unspecified,0,0


In [110]:
# Remove the rows whose Borough is 'Unspecified' and cast the columns to
# their final types. The casts are applied with one astype() call on the
# filtered frame, which returns a new DataFrame; assigning columns one by one
# on the filtered slice instead would raise pandas' SettingWithCopyWarning.
noise_incident_data = noise_incident_data[
    noise_incident_data.Borough != 'Unspecified'
].astype({
    'incident_id': int,
    'Incident Zip': int,
    'Borough': str,
    'Location Type': str,
    'num_calls': int,
})

# Last expression of the cell: displays the transformed frame.
noise_incident_data

Unnamed: 0,incident_id,Incident Zip,Borough,Location Type,num_calls
0,0,10308,STATEN ISLAND,Club/Bar/Restaurant,40
1,1,10012,MANHATTAN,Club/Bar/Restaurant,18
2,2,10308,STATEN ISLAND,Club/Bar/Restaurant,21
3,3,10034,MANHATTAN,Club/Bar/Restaurant,160
4,4,11220,BROOKLYN,Club/Bar/Restaurant,17
...,...,...,...,...,...
2435,2435,11211,BROOKLYN,Club/Bar/Restaurant,16
2436,2436,11104,QUEENS,Club/Bar/Restaurant,17
2437,2437,10012,MANHATTAN,Club/Bar/Restaurant,17
2438,2438,10304,STATEN ISLAND,Club/Bar/Restaurant,11
