# Requirements
- Install Python 3.9.
- Install the MySQL server on macOS via Homebrew: `brew install mysql`
- Start the MySQL server: `brew services start mysql`
- Install the Python dependencies (the MySQL client and pandas) into the same environment the notebook kernel uses: `pip3 install pymysql pandas`
- Run this notebook (e.g., with VSCode's Jupyter extension).

# Links
- installing python3.9: https://www.python.org/downloads/
- installing homebrew: https://brew.sh/
- installing VSCode and the jupyter extension: https://marketplace.visualstudio.com/items?itemName=ms-toolsai.jupyter

# Imports

In [4]:
import pymysql
import pandas as pd

ModuleNotFoundError: No module named 'pymysql'

# Open Connection

In [None]:

# Settings for the local MySQL server: root user with an empty password.
connection_settings = {
    'host': '127.0.0.1',
    'user': 'root',
    'password': '',
    'cursorclass': pymysql.cursors.DictCursor,
}

connection = pymysql.connect(**connection_settings)
cursor = connection.cursor()

# Make sure the working database exists, then make it the active database
# for every statement that follows on this connection.
cursor.execute("CREATE DATABASE IF NOT EXISTS mydatabase")
connection.select_db("mydatabase")

# Sanity check that the server answers queries. To inspect the result, run
# print(cursor.fetchall()) and confirm that "mydatabase" is listed.
cursor.execute("SHOW DATABASES")

# Extraction of Data from APIs and CSVs

In [None]:
# EXTRACTION
# Download the NYC collision dataset as CSV and keep only the columns used
# by the rest of the notebook.
# NOTE(review): the request carries no row-limit parameter, so the endpoint
# presumably returns only a default-sized sample -- confirm this is intended.
collision_csv_url = "https://data.cityofnewyork.us/resource/h9gi-nx95.csv"
collision_data = pd.read_csv(collision_csv_url)

features = [
    "collision_id",
    "crash_time",
    "borough",
    "zip_code",
    "contributing_factor_vehicle_1",
    "contributing_factor_vehicle_2",
    "number_of_persons_injured",
    "number_of_persons_killed",
]
collision_data = collision_data.loc[:, features]


In [None]:
# Load the Uber pickups dataset from the local CSV file and keep only the
# columns used by the rest of the notebook.
uber_data = pd.read_csv("uber_nyc_enriched.csv")

# "pickup_dt" must be part of the selection: the transformation step reads
# uber_data['pickup_dt'] to derive the date and time columns, and would raise
# a KeyError if the column were dropped here.
features = [
    "pickup_dt",
    "pickups",
    "spd",
    "vsb",
    "temp",
    "pcp01",
    "pcp06",
    "pcp24",
    "sd",
    "borough",
    "hday",
]
uber_data = uber_data.loc[:, features]

# Prepend a surrogate key column numbering the rows 0..len-1.
uber_data.insert(0, "uber_id", range(len(uber_data)))

In [None]:
# Load the noise-incident dataset from the local CSV file and keep only the
# columns used by the rest of the notebook.
noise_incident_data = pd.read_csv("bar_locations.csv")

features = [
    "Incident Zip",
    "Borough",
    "Location Type",
    "num_calls",
]
noise_incident_data = noise_incident_data.loc[:, features]

# Prepend a surrogate key column numbering the rows 0..len-1.
incident_ids = list(range(len(noise_incident_data)))
noise_incident_data.insert(0, "incident_id", incident_ids)

# Transformation of Data

In [None]:
# Split the pickup timestamp into separate date and time columns.
# The column is parsed once and the result reused for both derived columns.
# Requires uber_data to still contain the 'pickup_dt' column.
# NOTE(review): the format string assumes values like "1/31/15 13:00" --
# confirm it matches the timestamps actually stored in the CSV.
pickup_timestamps = pd.to_datetime(uber_data['pickup_dt'], format='%m/%d/%y %H:%M')
uber_data['dates'] = pickup_timestamps.dt.date
uber_data['time'] = pickup_timestamps.dt.time
del uber_data['pickup_dt']

# Remove rows that contain any NaN value.
uber_data.dropna(inplace=True)

# Remove the unnecessary 'EWR' borough. The explicit copy makes the filtered
# frame independent of the original, so the column assignment below does not
# trigger pandas' SettingWithCopyWarning.
uber_data = uber_data[uber_data.borough != 'EWR'].copy()

# Upper-case the borough names so they match the other dataframes.
uber_data['borough'] = uber_data['borough'].str.upper()

uber_data

In [None]:
# Discard every row that has at least one missing value. This must happen
# before the integer conversion below, since NaN cannot be cast to int.
collision_data.dropna(inplace=True)

# Turn the crash_time strings (hours:minutes) into datetime.time objects.
parsed_crash_times = pd.to_datetime(collision_data['crash_time'], format='%H:%M')
collision_data['crash_time'] = parsed_crash_times.dt.time

# Change the zip_code data type to int.
zip_codes = collision_data['zip_code']
collision_data['zip_code'] = zip_codes.astype(int)

collision_data

In [None]:
# Keep only the incidents whose borough is known, i.e. drop the rows where
# Borough holds the placeholder value 'Unspecified'.
has_known_borough = noise_incident_data['Borough'] != 'Unspecified'
noise_incident_data = noise_incident_data[has_known_borough]

noise_incident_data