# Requirements
- install python3.9
- install mysql server on Mac via homebrew `brew install mysql`
- start the mysql server `brew services start mysql`
- install the mysql python client and pandas `pip3 install pymysql pandas`
- run this notebook (e.g., with VSCode's Jupyter extension)

# Links
- installing python3.9: https://www.python.org/downloads/
- installing homebrew: https://brew.sh/
- installing VSCode and the jupyter extension: https://marketplace.visualstudio.com/items?itemName=ms-toolsai.jupyter

# Imports

In [None]:
import pymysql
import pandas as pd

# Open Connection

In [None]:

# Connect to the MySQL server on this machine as root with an empty password;
# DictCursor is requested as the cursor class for this connection.
connection = pymysql.connect(
    host="127.0.0.1",
    user="root",
    password="",
    cursorclass=pymysql.cursors.DictCursor,
)
cursor = connection.cursor()

# Make sure the working database exists, then select it on this connection so
# that subsequent statements run against it.
cursor.execute("CREATE DATABASE IF NOT EXISTS mydatabase")
connection.select_db("mydatabase")

# Follow-up query used as a manual sanity check of the statement above.
cursor.execute("SHOW DATABASES")
# print(cursor.fetchall())  # uncomment to verify that mydatabase appears in the list

# Extraction of Data from APIs and CSVs

In [None]:
# EXTRACTION

# Columns to keep from the collision dataset.
features = [
    "collision_id",
    "crash_time",
    "borough",
    "zip_code",
    "contributing_factor_vehicle_1",
    "contributing_factor_vehicle_2",
    "number_of_persons_injured",
    "number_of_persons_killed",
    "crash_date",
]

# Read the CSV served by the NYC open-data endpoint and keep only those columns.
collision_data = pd.read_csv(
    "https://data.cityofnewyork.us/resource/h9gi-nx95.csv"
)[features]
collision_data

In [None]:
# Columns to keep from the Uber CSV.
features = [
    "pickups",
    "spd",
    "vsb",
    "temp",
    "pcp01",
    "pcp06",
    "pcp24",
    "sd",
    "borough",
    "hday",
    "pickup_dt",
]

# Read the CSV from the working directory and keep only those columns.
uber_data = pd.read_csv("uber_nyc_enriched.csv")[features]

# Prepend a surrogate key column that numbers the rows 0..n-1.
uber_data.insert(0, "uber_id", list(range(len(uber_data))))
uber_data

In [None]:
# Columns to keep from the noise-incident CSV.
features = [
    "Created Date",
    "Incident Zip",
    "Borough",
    "Location Type",
]

# Read the CSV from the working directory and keep only those columns.
noise_incident_data = pd.read_csv("party_in_nyc.csv")[features]

# Prepend a surrogate key column that numbers the rows 0..n-1.
noise_incident_data.insert(
    0, "incident_id", list(range(len(noise_incident_data)))
)
noise_incident_data

## Creating the fact dataframe

### Compute the size of the smallest table so that the fact table can be limited to rows present in every source table

In [None]:
# Row count of the shortest of the three source tables.
fact_table_size = min(
    len(table) for table in (collision_data, uber_data, noise_incident_data)
)

### create table

In [None]:
# Build the fact table: one row per position, holding the id of the matching
# row in each source table.
#
# Each id column is cut to its first `fact_table_size` rows *by position*
# before the columns are combined. Slicing with `.loc[:fact_table_size]`
# instead is label-based and includes the end label, which keeps one row too
# many; that extra row has no id in the shortest table (NaN), and the NaN
# padding also turns the integer id columns into floats.
id_columns = [
    uber_data["uber_id"],
    collision_data["collision_id"],
    noise_incident_data["incident_id"],
]

# axis=1 places the Series side by side, using each Series' name as the column
# name and aligning rows on the index (as the original column inserts did).
facts = pd.concat(
    [column.iloc[:fact_table_size] for column in id_columns],
    axis=1,
)
facts