# Accident Analysis in Metro Nashville: 2013-2017 (Part 2)

### This notebook will:
* read in the cleaned data for accidents in Metro Nashville for 2013-2017,
* create data points for non-crash data,
* create a dataframe containing bootstrapped samples of crash/non-crash data,
* store that dataframe in a CSV file to be used by the forecast model in Part 3.


In [3]:
import glob
import os
import random
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline 
import datetime
import time as tm
import numpy as np
from time import gmtime, strftime, localtime

pd.options.mode.chained_assignment = None


In [50]:
# Execute the Part 1 cleanup notebook inside this kernel so that its definitions are available here.
# NOTE(review): presumably this is what provides extract_time_features / extract_address_features,
# which final_data_prep calls further down but which are not defined in this notebook — confirm.
%run ./'Nashville Accident Analysis -- Cleanup'.ipynb

CPU times: user 1.01 s, sys: 136 ms, total: 1.15 s
Wall time: 1.16 s


In [4]:
# Load the cleaned crash records. The CSV was saved together with its index, which comes
# back as an 'Unnamed: 0' column; drop it via a chained call instead of mutating in place.
# NOTE(review): the warning printed under this cell looks like a pandas DtypeWarning
# (mixed-type columns) — consider passing explicit dtypes to read_csv; confirm the columns.
non_empty_crash_data_df = (
    pd.read_csv('data/Clean_Crash_Data.csv')
    .drop(columns='Unnamed: 0')
)

# Preview only: rendering the whole frame bloats the notebook without adding information.
non_empty_crash_data_df.head(10)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Accident Number,Date and Time,Number of Motor Vehicles,Number of Injuries,Number of Fatalities,Property Damage,Hit and Run,Reporting Officer,Collision Type,Collision Type Description,...,Month,Hour,Year,Week,Weekend,Rush Hour Morning,Rush Hour Afternoon,Ramp,Intersection,Interstate
0,20130000050,2013-01-01 00:15:00,2.0,3.0,0.0,N,N,414722,4.0,ANGLE,...,1,0,2013,1,0,0,0,0,1,0
1,20130000270,2013-01-01 00:30:00,2.0,0.0,0.0,N,Y,887608,5.0,SIDESWIPE - SAME DIRECTION,...,1,0,2013,1,0,0,0,0,1,1
2,20130000128,2013-01-01 00:43:00,2.0,0.0,0.0,N,Y,716877,5.0,SIDESWIPE - SAME DIRECTION,...,1,0,2013,1,0,0,0,0,1,0
3,20130000123,2013-01-01 00:45:00,2.0,0.0,0.0,N,N,834804,1.0,REAR END,...,1,0,2013,1,0,0,0,0,0,0
4,20130000160,2013-01-01 00:45:00,2.0,1.0,0.0,N,N,717708,1.0,REAR END,...,1,0,2013,1,0,0,0,0,0,0
5,20130000142,2013-01-01 01:00:00,2.0,0.0,0.0,N,N,217537,1.0,REAR END,...,1,1,2013,1,0,0,0,0,0,0
6,20130000138,2013-01-01 01:05:00,1.0,1.0,0.0,N,N,468160,0.0,NOT COLLISION W/MOTOR VEHICLE-TRANSPORT,...,1,1,2013,1,0,0,0,0,1,0
7,20130000206,2013-01-01 01:25:00,2.0,0.0,0.0,N,N,728059,1.0,REAR END,...,1,1,2013,1,0,0,0,0,0,0
8,20130000306,2013-01-01 01:25:00,2.0,0.0,0.0,N,N,VUPD531,1.0,REAR END,...,1,1,2013,1,0,0,0,0,1,0
9,20130000393,2013-01-01 01:26:00,1.0,0.0,0.0,N,Y,834804,0.0,NOT COLLISION W/MOTOR VEHICLE-TRANSPORT,...,1,1,2013,1,0,0,0,0,1,0


In [32]:
# Number of (location, timestamp) pairs to draw when building the crash / non-crash sample.
sample_size = 100000

# Seed the stdlib RNG so every random.choices draw below is reproducible on Restart & Run All.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

In [33]:
# Draw sample_size locations with replacement; every crash row is one entry in the pool,
# so locations with more recorded crashes are proportionally more likely to be picked.
location_pool = list(non_empty_crash_data_df['Location'])
random_locs = random.choices(location_pool, k=sample_size)

In [34]:
# Draw sample_size rounded timestamps with replacement, independently of the locations above;
# every crash row is one entry in the pool, so busier hours are picked more often.
timestamp_pool = list(non_empty_crash_data_df['Rounded Date and Time'])
random_times = random.choices(timestamp_pool, k=sample_size)

#### For each location/timestamp pair, filter the crashes dataframe. If any crashes match, add them to a random crashes dataframe; otherwise, add a row to a random non-crashes dataframe containing the time, the location (lat/long), and best guesses (the most common recorded values) for the street address, weather, precinct, zip code, and city.

In [35]:
%%time

random_crashes_df = pd.DataFrame()
random_non_crashes_df = pd.DataFrame()

for idx in list(range(sample_size)):
    temp_df = non_empty_crash_data_df[(non_empty_crash_data_df['Rounded Date and Time'] == random_times[idx]) & 
                            (non_empty_crash_data_df['Location'] == random_locs[idx]) 
                           ]
    if temp_df.shape[0] > 0:
        random_crashes_df = random_crashes_df.append(temp_df.loc[:,['Rounded Date and Time',
                                                                    'Location',
                                                                    'Street Address',
                                                                    'Weather Description',
                                                                    'Precinct',
                                                                    'Zip',
                                                                    'City'
                                                                   ]])
    else:
        weather_description = non_empty_crash_data_df[(non_empty_crash_data_df['Rounded Date and Time'] == random_times[idx])]['Weather Description'].mode()[0]
        precinct = non_empty_crash_data_df[(non_empty_crash_data_df['Location'] == random_locs[idx])]['Precinct'].mode()[0]
        zip_code = non_empty_crash_data_df[(non_empty_crash_data_df['Location'] == random_locs[idx])]['Zip'].mode()[0]
        city = non_empty_crash_data_df[(non_empty_crash_data_df['Location'] == random_locs[idx])]['City'].mode()[0]
        street_address = non_empty_crash_data_df[(non_empty_crash_data_df['Location'] == random_locs[idx])]['Street Address'].mode()[0]
        random_non_crashes_df = random_non_crashes_df.append({'Rounded Date and Time': random_times[idx], 
                                                                          'Location': random_locs[idx],
                                                                          'Street Address': street_address,
                                                                          'Weather Description': weather_description,
                                                                            'Precinct': precinct,
                                                                            'Zip': zip_code,
                                                                            'City': city
                                                                          }, ignore_index=True)



CPU times: user 2h 12min 1s, sys: 36.4 s, total: 2h 12min 38s
Wall time: 2h 13min 6s


In [36]:
# Preview the sampled crash rows; the index labels are those of the source crash frame.
random_crashes_df.head(10)

Unnamed: 0,Rounded Date and Time,Location,Street Address,Weather Description,Precinct,Zip,City
27063,2014-02-14 08:00:00,"(36.1580067122697, -86.80013346443769)",CHARLOTTE AVE & DR D B TODD JR BLVD,NO ADVERSE CONDITIONS,MIDTOW,37203.0,NASHVILLE
75562,2015-11-03 09:00:00,"(36.11595, -86.79512)",MM 3 7 I 440,CLEAR,MIDTOW,37215.0,NASHVILLE
96598,2016-06-28 17:00:00,"(36.09036, -86.62635999999999)",BELL RD & SMITH SPRINGS RD,CLEAR,HERMIT,37217.0,NASHVILLE
95028,2016-06-11 12:00:00,"(36.080220000000004, -86.71795)",RECOVERY RD & HARDING PL,CLEAR,SOUTH,37211.0,NASHVILLE
48101,2014-11-26 14:00:00,"(36.1773383091271, -86.7570083841827)",100 N 9TH ST,UNKNOWN,EAST,37206.0,NASHVILLE
29816,2014-03-31 07:00:00,"(36.292873781058205, -86.88556676955271)",CLARKSVILLE PIKE & OLD CLARKSVILLE PIKE,NO ADVERSE CONDITIONS,NORTH,37189.0,WHITES CREEK
43892,2014-10-20 17:00:00,"(36.07447, -86.92090999999999)",STATE HWY 251 & US HWY 70S,CLEAR,WEST,37221.0,NASHVILLE
57181,2015-04-06 14:00:00,"(36.0859, -86.64913)",UNA ANTIOCH PIKE & RANSOM CT,RAIN,SOUTH,37013.0,ANTIOCH
9167,2013-05-19 14:00:00,"(36.04522, -86.6591)",I 24 E & BELL RD,NO ADVERSE CONDITIONS,SOUTH,37013.0,ANTIOCH
46638,2014-11-23 13:00:00,"(36.17282, -86.76536999999999)",US HWY 31 & US HWY 31W,RAIN,EAST,37206.0,NASHVILLE


#### For the two new dataframes, add a column for Crash Recorded and add a 1 for all crashes and a 0 for all non-crashes.

In [37]:
# Label both samples for the classifier: 1 = a crash was recorded, 0 = no crash.
random_crashes_df = random_crashes_df.assign(**{'Crash Recorded': 1})
random_non_crashes_df = random_non_crashes_df.assign(**{'Crash Recorded': 0})

In [38]:
# (rows, columns) of the sampled crash rows — shows how few sampled pairs hit a real crash.
random_crashes_df.shape

(109, 8)

In [39]:
# (rows, columns) of the synthetic non-crash rows, for comparison with the crash sample above.
random_non_crashes_df.shape

(99893, 8)

In [40]:
#random_crashes_df = non_empty_crash_data_df.loc[:,['Rounded Date and Time','Rounded Location','Street Address']]

#### Because there are so many more non-crashes than crashes, upsample the crashes (sampling with replacement) so that the two classes are balanced.

In [56]:
#this will upsample:
random_crashes_upsample_df = pd.DataFrame(random.choices(random_crashes_df.values, k=sample_size), columns=list(random_crashes_df.columns))
random_crashes_upsample_df

Unnamed: 0,Rounded Date and Time,Location,Street Address,Weather Description,Precinct,Zip,City,Crash Recorded
0,2017-03-13 16:00:00,"(36.1675995233, -86.778669375)",LEBANON PKE & BIG HORN DR,RAIN,HERMIT,37076.0,HERMITAGE,1
1,2017-02-22 06:00:00,"(36.29596, -86.81123000000001)",MM 39 1 I 24,RAIN,NORTH,37189.0,WHITES CREEK,1
2,2017-02-03 00:00:00,"(36.193459999999995, -86.83089)",CLARKSVILLE PKE & BUENA VISTA PKE,CLEAR,NORTH,37218.0,NASHVILLE,1
3,2016-07-18 08:00:00,"(36.08658, -86.77197)",FRANKLIN PKE & HARDING PL,CLEAR,MIDTOW,37204.0,NASHVILLE,1
4,2016-08-31 08:00:00,"(36.11221, -86.72318)",MM 53 5 I 24,CLEAR,SOUTH,37210.0,NASHVILLE,1
5,2017-08-30 17:00:00,"(36.13323, -86.80131)",PORTLAND AV & MAGNOLIA BLVD,RAIN,MIDTOW,37212.0,NASHVILLE,1
6,2014-11-07 17:00:00,"(36.1189, -86.72644)",MM 53 0 I 24,CLEAR,SOUTH,37210.0,NASHVILLE,1
7,2015-10-31 16:00:00,"(36.11091, -86.92201)",I40 E ENT RAMP & I 40,RAIN,WEST,37205.0,NASHVILLE,1
8,2017-12-18 08:00:00,"(36.21891, -86.8464)",BELL RD & COLLINS PARK DR,CLEAR,SOUTH,37013.0,ANTIOCH,1
9,2016-02-04 09:00:00,"(36.08365, -86.64721999999999)",MURFREESBORO PKE & BROOKSBORO PL,CLOUDY,SOUTH,37013.0,ANTIOCH,1


#### Merge the two dataframes into one, and then extract features.

In [77]:
def final_data_prep(df1, df2, one_hot=True):
    """Stack two labelled samples and build the model-ready feature table.

    This notebook used to define ``final_data_prep`` twice in the same cell, so the
    second definition silently replaced the first. Both variants now live here and are
    selected with ``one_hot``; the default reproduces the one-hot encoded table shown
    in the saved output of this notebook.

    Parameters
    ----------
    df1, df2 : DataFrame
        Samples to stack. Each needs 'Rounded Date and Time', 'Crash Recorded' and
        whatever columns the feature extractors read; with ``one_hot=True`` also
        'Weather Description', 'Precinct', 'Zip' and 'City'.
    one_hot : bool, default True
        True  -> one-hot encode the categorical columns and append the binary flags
                 and the 'Crash Recorded' label.
        False -> return the raw calendar columns, binary flags and label, unencoded.

    Returns
    -------
    DataFrame
        The feature table; the row index is inherited from ``df1`` followed by ``df2``.
    """
    # sort=False: column order is irrelevant because columns are selected by name below,
    # and it silences the pandas FutureWarning about sorting the non-concatenation axis.
    df_out = pd.concat([df1, df2], sort=False)
    df_out['Rounded Date and Time'] = pd.to_datetime(df_out['Rounded Date and Time'])

    # The extractors' return values are unused and their columns are selected right after,
    # so they are expected to add their feature columns to df_out in place.
    # NOTE(review): both helpers are defined outside this notebook — confirm their contract.
    extract_time_features(df_out)
    extract_address_features(df_out)

    if not one_hot:
        return df_out.loc[:, ['Day Of Week', 'Day Of Month', 'Month', 'Hour',
                              'Week', 'Weekend', 'Rush Hour Morning', 'Rush Hour Afternoon',
                              'Intersection', 'Interstate', 'Ramp', 'Crash Recorded']]

    categorical_cols = ['Weather Description', 'Precinct', 'Zip', 'City',
                        'Day Of Week', 'Day Of Month', 'Month', 'Hour', 'Week']
    passthrough_cols = ['Weekend', 'Rush Hour Morning', 'Rush Hour Afternoon', 'Ramp',
                        'Intersection', 'Interstate', 'Crash Recorded']

    # Casting to 'category' first makes get_dummies encode numeric columns such as
    # 'Zip' or 'Hour' as well, instead of passing them through as numbers.
    encoded = pd.get_dummies(df_out.loc[:, categorical_cols].astype('category'))
    return pd.concat([encoded, df_out.loc[:, passthrough_cols]], axis=1)

In [78]:
# Stack the non-crash rows and the upsampled crash rows, then derive the model features.
model_data_df = final_data_prep(
    df1=random_non_crashes_df,
    df2=random_crashes_upsample_df,
)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [79]:
# Persist the feature table for the forecasting notebook. The row index is written too,
# so it shows up as an 'Unnamed: 0' column when the file is read back with read_csv.
model_data_path = 'data/Random Model Data.csv'
model_data_df.to_csv(model_data_path)

In [85]:
# Preview the final feature table instead of rendering the whole (very wide) frame.
model_data_df.head(10)

Unnamed: 0,Weather Description_BLOWING SAND/SOIL/DIRT,Weather Description_BLOWING SNOW,Weather Description_CLEAR,Weather Description_CLOUDY,Weather Description_FOG,Weather Description_NO ADVERSE CONDITIONS,Weather Description_OTHER (NARRATIVE),Weather Description_RAIN,Weather Description_RAIN AND FOG,Weather Description_SEVERE CROSSWIND,...,Week_51,Week_52,Week_53,Weekend,Rush Hour Morning,Rush Hour Afternoon,Ramp,Intersection,Interstate,Crash Recorded
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
5,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
6,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
7,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
8,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
9,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
