### Import Packages

In [1]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
%matplotlib inline

from weather import forecast

## Seattle Collision Data

https://data-seattlecitygis.opendata.arcgis.com/datasets/collisions/data?geometry=-122.526%2C47.676%2C-122.198%2C47.717&page=4

01/01/2014 - 03/29/2018

data with weather info available 01/01/2014 - 01/30/2018

(lat, lon) values have 6 decimal places (roughly 0.1 m, i.e. precise enough to locate an individual person)

### read in the original dataset

In [3]:
# Load the raw Seattle collision records from the relative data directory.
df_collision = pd.read_csv('../data/Collisions.csv')

### Check Missing Values in the Dataset

In [10]:
# Tally null (True) vs. non-null (False) entries in the "speeding" column.
df_collision['speeding'].isna().value_counts()

True     187386
False      8730
Name: speeding, dtype: int64

In [11]:
# Tally null (True) vs. non-null (False) entries in the "X" column.
df_collision['X'].isna().value_counts()

False    191019
True       5097
Name: X, dtype: int64

In [12]:
# Tally null (True) vs. non-null (False) entries in the "Y" column.
df_collision['Y'].isna().value_counts()

False    191019
True       5097
Name: Y, dtype: int64

In [13]:
# Tally null (True) vs. non-null (False) entries in the "location" column.
df_collision['location'].isna().value_counts()

False    191810
True       4306
Name: location, dtype: int64

In [14]:
# Tally null (True) vs. non-null (False) entries in the "weather" column.
df_collision['weather'].isna().value_counts()

False    173400
True      22716
Name: weather, dtype: int64

In [14]:
# Rows that lack coordinates but still carry a "location" description; these are
# the candidates for geocoding. The column list is spelled out here because
# `columns` is only defined much later in the notebook, so referencing it at
# this point fails on a fresh top-to-bottom run. Only the row count matters.
# NOTE(review): the original four-column list is not preserved in the notebook;
# the four columns below are a stand-in that keeps the displayed shape.
df_collision.loc[
    df_collision['X'].isna() & df_collision['location'].notna(),
    ['X', 'Y', 'location', 'weather']
].shape

(791, 4)

###### To-Do: 
1. drop "speeding" variable
2. drop rows where "X" or "Y" is null; for the 791 of those rows that still have a "location" value, impute (X, Y) from "location" with Google's Geocoding API?
3. impute "weather" value with other data sources?

### Feature Engineering on Incident Time Value

#### convert incident time field from string to datetime

In [15]:
# Parse the raw incident timestamp strings, then split the parsed value into
# the calendar parts that are used further down as merge keys.
df_collision['time'] = pd.DatetimeIndex(df_collision['incdttm'])

for part in ('year', 'month', 'day', 'hour'):
    df_collision[part] = getattr(df_collision['time'].dt, part)

#### Function to round incdttm field to its nearest hour value

source: https://stackoverflow.com/questions/48107644/rounding-datetime-to-the-nearest-hour?rq=1

In [17]:
def round_to_hour(dt):
    """Round a datetime to the nearest whole hour.

    Times at or past the half-hour mark (minute 30, second 0) go up to the
    next hour; everything earlier goes down to the start of the current hour.

    :param dt: datetime-like object supporting ``replace`` and comparison
    :return: the rounded datetime, with minute, second and microsecond zeroed
    """
    hour_floor = dt.replace(minute=0, second=0, microsecond=0)
    half_past = dt.replace(minute=30, second=0, microsecond=0)

    if dt >= half_past:
        # Adding a timedelta lets the date roll over correctly at 23:30 and later.
        return hour_floor + datetime.timedelta(hours=1)
    return hour_floor

In [271]:
# Snap every incident timestamp to its nearest whole hour, element by element.
df_collision['nearest_hour'] = df_collision['time'].map(round_to_hour)

#### Convert Seattle time to epoch time

In [20]:
from pytz import timezone

In [21]:
# pytz zone object used below to localize naive timestamps as US/Pacific time.
seattle_timezone = timezone('US/Pacific')

In [22]:
def convert_pacific_time_to_epoch(dt):
    """Convert a naive US/Pacific wall-clock datetime to integer epoch seconds.

    :param dt: naive datetime, localized here with ``seattle_timezone``
    :return: the localized datetime's POSIX timestamp, truncated to ``int``
    """
    localized = seattle_timezone.localize(dt)
    return int(localized.timestamp())

In [23]:
# Convert the rounded incident timestamp, not the integer 'hour' column (which
# cannot be localized), so collisions that round to the same hour share one epoch key.
df_collision['epoch'] = df_collision['nearest_hour'].apply(convert_pacific_time_to_epoch)

### Retrieve Hourly Weather data with DarkSky API

#### Focus on accidents that occurred after 2014/1/1

In [82]:
# Keep only collisions where both coordinates are present.
mask = df_collision['X'].notna() & df_collision['Y'].notna()

# One row per distinct (X, Y, epoch) combination, so each one is looked up only once.
unique_location_hour = (
    df_collision.loc[mask, ['X', 'Y', 'epoch']]
    .drop_duplicates()
)

In [238]:
# Build the cutoff in Pacific time, exactly like the 'epoch' column it is compared
# with. A naive datetime's .timestamp() is interpreted in the local timezone of
# whatever machine runs the notebook, which would shift the cutoff by several
# hours on a machine that is not on Pacific time.
start_date_epoch = convert_pacific_time_to_epoch(datetime.datetime(2014, 1, 1, 0, 0))

# Strictly later than the cutoff, matching the original filter.
unique_location_hour_recent = unique_location_hour[unique_location_hour['epoch'] > start_date_epoch]

In [88]:
# Column labels of the weather table, in the order the row values are assembled.
COLUMNS = [
    'Latitude',
    'Longitude',
    'EpochTime',
    'Summary',
    'DegreesFahrenheit',
    'Humidity',
    'Visibility',
    'WindBearing',
    'WindGust',
    'WindSpeed',
    'PrecipitationIntensity',
    'PrecipitationType',
]

def get_weather_json_darksky(latitude, longitude, epoch_time):
    """Request weather for one location and moment through ``forecast``.

    :param latitude: Float value representing latitude
    :param longitude: Float value representing longitude
    :param epoch_time: Numeric epoch timestamp; cast to ``int`` before the call
    :return: whatever ``forecast`` returns (presumably a DarkSky forecast
        object; confirm against the ``weather`` module)

    NOTE(review): relies on a module-level ``API_KEY`` that must be defined
    before this function is called.
    """
    return forecast(API_KEY, latitude, longitude, int(epoch_time))

def get_weather_series_from_response(latitude, longitude, epoch_time, response):
    """Flatten one weather data point into a row labelled by ``COLUMNS``.

    :param latitude: latitude stored in the row as-is
    :param longitude: longitude stored in the row as-is
    :param epoch_time: epoch timestamp stored in the row as-is
    :param response: object exposing a dict-like ``_data`` attribute and
        item access by field name
    :return: ``pd.Series`` indexed by ``COLUMNS``; any weather field missing
        from the response is filled with ``None``
    """
    available = response._data.keys()

    # Response field names, in the same order as the weather entries of COLUMNS.
    weather_fields = (
        'summary', 'temperature', 'humidity', 'visibility',
        'windBearing', 'windGust', 'windSpeed',
        'precipIntensity', 'precipType',
    )

    values = [latitude, longitude, epoch_time]
    for field in weather_fields:
        values.append(response[field] if field in available else None)

    return pd.Series(values, index=COLUMNS)

In [217]:
# Query the weather API once per unique (X, Y, epoch) combination. Rows are
# collected in a list and the frame is built once at the end: DataFrame.append
# was removed in pandas 2.0, and growing a frame row by row re-copies it on
# every iteration.
weather_rows = []

for location_hour in unique_location_hour_recent[['X', 'Y', 'epoch']].itertuples(index=False):
    # Y is passed as latitude and X as longitude.
    latitude = location_hour.Y
    longitude = location_hour.X
    epoch_time = int(location_hour.epoch)

    # Only the `.currently` data point of the response is kept.
    response = get_weather_json_darksky(latitude, longitude, epoch_time).currently
    weather_rows.append(
        get_weather_series_from_response(latitude, longitude, epoch_time, response)
    )

# columns=COLUMNS keeps the expected schema even when there is nothing to fetch.
df_weather_darksky = pd.DataFrame(weather_rows, columns=COLUMNS)

### Save the resulting dataframes to csv files

#### join collision data with darksky weather data

In [344]:
# Collision attributes carried into each of the merged output files.
columns = [
    'X', 'Y', 'addrtype', 'collisiontype', 'fatalities', 'injuries',
    'lightcond', 'roadcond', 'junctiontype', 'location',
    'pedcount', 'pedcylcount', 'personcount', 'sdot_coldesc', 'severitydesc',
    'speeding', 'weather', 'time', 'epoch', 'year', 'month', 'day', 'hour',
]

# Preview a single row of the selected columns.
df_collision[columns].head(1)

Unnamed: 0,X,Y,addrtype,collisiontype,fatalities,injuries,lightcond,roadcond,junctiontype,location,...,sdot_coldesc,severitydesc,speeding,weather,time,epoch,year,month,day,hour
0,-122.342323,47.627572,Block,Head On,0,0,Daylight,Dry,Mid-Block (not related to intersection),DEXTER AVE N BETWEEN ALOHA ST AND WARD ST,...,"MOTOR VEHICLE STRUCK MOTOR VEHICLE, FRONT END ...",Property Damage Only Collision,,Clear or Partly Cloudy,2016-11-04 09:55:00,1478278800,2016,11,4,9


In [345]:
# Left join: every collision row is kept, and weather values are attached where
# the (latitude, longitude, epoch) key matches a fetched weather row.
collision_weather_darksky = df_collision[columns].merge(
    df_weather_darksky,
    how='left',
    left_on=['Y', 'X', 'epoch'],
    right_on=['Latitude', 'Longitude', 'EpochTime'],
)

In [346]:
# Persist the collision + DarkSky join; with default arguments the row index is
# written as the first (unnamed) CSV column.
collision_weather_darksky.to_csv('../data/Collisions_weather_darksky_fei.csv')

#### join collision data with iem weather data

In [251]:
# Load the IEM weather tables (one hourly, one daily) from CSV.
# NOTE(review): these paths point at ../../project_datasets, unlike the ../data
# directory used for the other files in this notebook. Confirm the location.
df_weather_iem_hour = pd.read_csv('../../project_datasets/weather_iem_hour.csv')
df_weather_iem_day = pd.read_csv('../../project_datasets/weather_iem_day.csv')

In [258]:
# Remove the leftover "Unnamed: 0" column (presumably a saved index) and rename
# the remaining columns so the date parts match the collision merge keys.
# Dropping without inplace and with errors="ignore" keeps this cell re-runnable:
# a second execution no longer raises KeyError for the already-removed column.
df_weather_iem_hour = df_weather_iem_hour.drop(columns=["Unnamed: 0"], errors="ignore")
df_weather_iem_hour.columns = ['year', 'month', 'day', 'hour', 'temp_hourly', 'precipitation_hourly', 'wind_speed_hourly']

In [356]:
# Remove the leftover "Unnamed: 0" column (presumably a saved index) and the
# "CountOfObs" column, then rename the remaining columns so the date parts match
# the collision merge keys. Dropping without inplace and with errors="ignore"
# keeps this cell re-runnable: a second execution no longer raises KeyError for
# the already-removed columns.
df_weather_iem_day = df_weather_iem_day.drop(columns=["Unnamed: 0", "CountOfObs"], errors="ignore")
df_weather_iem_day.columns = ['year', 'month', 'day', 'temp_mean_daily', 'temp_high_daily', 'temp_low_daily', 'precipitation_daily', 'wind_speed_daily']

In [368]:
# Attach the hourly IEM readings to every collision by calendar date and hour.
# The key names are identical on both sides, so a single `on=` list is enough.
collision_weather_iem_hour = df_collision[columns].merge(
    df_weather_iem_hour,
    how='left',
    on=['year', 'month', 'day', 'hour'],
)
collision_weather_iem_hour.to_csv('../data/Collisions_weather_iem_hour_fei.csv')

In [369]:
# Attach the daily IEM summary to every collision by calendar date.
# The key names are identical on both sides, so a single `on=` list is enough.
collision_weather_iem_day = df_collision[columns].merge(
    df_weather_iem_day,
    how='left',
    on=['year', 'month', 'day'],
)
collision_weather_iem_day.to_csv('../data/Collisions_weather_iem_day_fei.csv')