# Accident Analysis in Metro Nashville: 2013-2017 (Part 2)

### This notebook will:
* read in the cleaned data for accidents in Metro Nashville for 2013-2017,
* create data points for non-crash data,
* create a dataframe containing bootstrapped samples of crash/non-crash data,
* store that dataframe in a csv to be used for forecast model in Part 3.


In [29]:
import glob
import os
import random
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline 
import datetime
import time as tm
import numpy as np
from time import gmtime, strftime, localtime

pd.options.mode.chained_assignment = None


In [33]:
non_empty_crash_data_df = pd.read_csv('data/Clean_Crash_Data.csv')
non_empty_crash_data_df.drop('Unnamed: 0',axis=1,inplace=True)
non_empty_crash_data_df

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Accident Number,Date and Time,Number of Motor Vehicles,Number of Injuries,Number of Fatalities,Property Damage,Hit and Run,Reporting Officer,Collision Type,Collision Type Description,...,Month,Hour,Year,Week,Weekend,Rush Hour Morning,Rush Hour Afternoon,Ramp,Intersection,Interstate
0,20130000050,2013-01-01 00:15:00,2.0,3.0,0.0,N,N,414722,4.0,ANGLE,...,1,0,2013,1,0,0,0,0,1,0
1,20130000270,2013-01-01 00:30:00,2.0,0.0,0.0,N,Y,887608,5.0,SIDESWIPE - SAME DIRECTION,...,1,0,2013,1,0,0,0,0,1,1
2,20130000128,2013-01-01 00:43:00,2.0,0.0,0.0,N,Y,716877,5.0,SIDESWIPE - SAME DIRECTION,...,1,0,2013,1,0,0,0,0,1,0
3,20130000123,2013-01-01 00:45:00,2.0,0.0,0.0,N,N,834804,1.0,REAR END,...,1,0,2013,1,0,0,0,0,0,0
4,20130000160,2013-01-01 00:45:00,2.0,1.0,0.0,N,N,717708,1.0,REAR END,...,1,0,2013,1,0,0,0,0,0,0
5,20130000142,2013-01-01 01:00:00,2.0,0.0,0.0,N,N,217537,1.0,REAR END,...,1,1,2013,1,0,0,0,0,0,0
6,20130000138,2013-01-01 01:05:00,1.0,1.0,0.0,N,N,468160,0.0,NOT COLLISION W/MOTOR VEHICLE-TRANSPORT,...,1,1,2013,1,0,0,0,0,1,0
7,20130000206,2013-01-01 01:25:00,2.0,0.0,0.0,N,N,728059,1.0,REAR END,...,1,1,2013,1,0,0,0,0,0,0
8,20130000306,2013-01-01 01:25:00,2.0,0.0,0.0,N,N,VUPD531,1.0,REAR END,...,1,1,2013,1,0,0,0,0,1,0
9,20130000393,2013-01-01 01:26:00,1.0,0.0,0.0,N,Y,834804,0.0,NOT COLLISION W/MOTOR VEHICLE-TRANSPORT,...,1,1,2013,1,0,0,0,0,1,0


In [16]:
sample_size = 100000

In [17]:
random_locs = random.choices(list(non_empty_crash_data_df['Location']), k=sample_size)

In [18]:
random_times = random.choices(list(non_empty_crash_data_df['Rounded Date and Time']), k=sample_size)

#### For each location/timestamp pair, filter the crashes dataframe and if there are crashes, add them to a random crashes dataframe; otherwise, add a line to a random non-crashes dataframe that contains the time, lat/long, and best guess of street address for that location.

In [19]:
%%time

random_crashes_df = pd.DataFrame()
random_non_crashes_df = pd.DataFrame()

for idx in list(range(sample_size)):
    temp_df = non_empty_crash_data_df[(non_empty_crash_data_df['Rounded Date and Time'] == random_times[idx]) & 
                            (non_empty_crash_data_df['Location'] == random_locs[idx]) 
                           ]
    if temp_df.shape[0] > 0:
        random_crashes_df = random_crashes_df.append(temp_df.loc[:,['Rounded Date and Time','Location','Street Address']])
    else:
        #weather_description = non_empty_crash_data_df[(non_empty_crash_data_df['Rounded Date and Time'] == random_times[idx])]['Weather Description'].mode()[0]
        street_address = non_empty_crash_data_df[(non_empty_crash_data_df['Location'] == random_locs[idx])]['Street Address'].mode()[0]
        random_non_crashes_df = random_non_crashes_df.append({'Rounded Date and Time': random_times[idx], 
                                                                          'Location': random_locs[idx],
                                                                          'Street Address': street_address
                                                                          #'Weather Description': weather_description,
                                                                          }, ignore_index=True)



CPU times: user 1h 6min 36s, sys: 11.4 s, total: 1h 6min 47s
Wall time: 6h 27min 40s


#### For the two new dataframes, add a column for Crash Recorded and add a 1 for all crashes and a 0 for all non-crashes.

In [20]:
random_crashes_df['Crash Recorded'] = 1
random_non_crashes_df['Crash Recorded'] = 0

In [21]:
random_crashes_df.shape

(112, 4)

In [22]:
random_non_crashes_df.shape

(99888, 4)

In [23]:
#random_crashes_df = non_empty_crash_data_df.loc[:,['Rounded Date and Time','Rounded Location','Street Address']]

#### Because there are so many more non-crashes than crashes, upsample the crashes to maintain class balance.

In [24]:
#this will upsample:
random_crashes_upsample_df = pd.DataFrame(random.choices(random_crashes_df.values, k=sample_size), columns=['Rounded Date and Time','Location','Street Address','Crash Recorded'])
random_crashes_upsample_df

Unnamed: 0,Rounded Date and Time,Location,Street Address,Crash Recorded
0,2015-10-01 06:00:00,"(36.13293, -86.72674)",MM 52 0 I 24,1
1,2014-09-03 12:00:00,"(36.045609999999996, -86.65888000000001)",I 24 W & BELL RD,1
2,2013-07-19 11:00:00,"(36.1775604401412, -86.7499528664536)",CLEARVIEW AVE & WOODLAND ST,1
3,2014-06-27 15:00:00,"(36.151340000000005, -86.76836)",RAMP & I 40,1
4,2017-01-06 10:00:00,"(36.1835, -86.61076)",OLD HICKORY BLVD & DODSON CHAPEL RD,1
5,2017-04-03 16:00:00,"(36.15565, -86.78971999999999)",I40 E ENT RAMP & BROADWAY,1
6,2014-09-03 12:00:00,"(36.045609999999996, -86.65888000000001)",I 24 W & BELL RD,1
7,2017-06-28 18:00:00,"(36.26168, -86.71238000000001)",GALLATIN PKES & MAPLE ST,1
8,2017-01-25 17:00:00,"(36.22542, -86.86381999999999)",BRILEY PKWYW & BRILEY PKWYE,1
9,2016-09-20 15:00:00,"(36.23775, -86.78721)",I24 E EXT RAMP & I 24,1


#### Merge the two dataframes into one, and then extract features.

In [25]:
def final_data_prep(df1, df2):
    df_out = pd.concat([df1, df2])
    extract_time_features(df_out)
    extract_address_features(df_out)
    df_out = df_out.loc[:,['Day Of Week', 'Day Of Month', 'Month', 'Hour',
       'Week', 'Weekend', 'Rush Hour Morning', 'Rush Hour Afternoon', 
                                     #'Bad Weather',
                                     'Intersection',
                                     'Interstate',
                                     #'Street Address',
                                     'Ramp',
                                     'Crash Recorded']]
    return df_out

In [26]:
model_data_df = final_data_prep(random_non_crashes_df,random_crashes_upsample_df)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [28]:
model_data_df.to_csv('data/Random Model Data.csv')

In [35]:
#### testing out random forest model
#%%time

#from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
labels = np.array(model_data_df['Crash Recorded'])
model_data_df_no_labels = model_data_df.drop('Crash Recorded', axis = 1)
feature_list = list(model_data_df.columns)
model_data_no_labels_array = np.array(model_data_df_no_labels)

X_train, X_test, y_train, y_test = train_test_split(model_data_no_labels_array,
                                                   labels,
                                                   test_size = 0.33,
                                                   random_state = 42)

#rfc = RandomForestRegressor(n_estimators=1000, max_depth=11, random_state=2)
rfc = RandomForestClassifier(n_estimators=1000, max_depth=11, random_state=2)
rfc.fit(X_train, y_train)

y_pred = rfc.predict(X_test)



NameError: name 'train_test_split' is not defined