[Data Cleaning](#data_clean)<br>
[Data Loading](#data_loading)

# Predicting Location Data With Latitude and Longitude Data Cleaning

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

In [2]:
# Load data from csv
accidents_df = pd.read_csv('./data/US_Accidents_Dec20.csv')

## Data Cleaning <a id='data_clean'><a>

In [20]:
# Replacing Null values with "accident value" from Data Dictionary
accidents_df['TMC'].fillna(201.0, inplace=True)
accidents_df.drop(columns=['End_Lat', 'End_Lng', 'Timezone', 'Airport_Code', 'Wind_Chill(F)'], inplace=True)

KeyError: "['End_Lat' 'End_Lng' 'Timezone' 'Airport_Code' 'Wind_Chill(F)'] not found in axis"

In [21]:
# Creating Copy of dataframe to create two seperate cv's later
accidents_with_location_data_df = accidents_df.copy()
accidents_with_location_data_df.columns

Index(['ID', 'Source', 'TMC', 'Severity', 'Start_Time', 'End_Time',
       'Start_Lat', 'Start_Lng', 'Distance(mi)', 'Description', 'Side',
       'Weather_Timestamp', 'Temperature(F)', 'Humidity(%)', 'Pressure(in)',
       'Visibility(mi)', 'Wind_Direction', 'Wind_Speed(mph)',
       'Precipitation(in)', 'Weather_Condition', 'Amenity', 'Bump', 'Crossing',
       'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station',
       'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop',
       'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
       'Astronomical_Twilight', 'Start_Hour', 'Month'],
      dtype='object')

In [5]:
# Getting rid of information data from the information predicting set
accidents_df = accidents_df.drop(columns=['Number', 'Street', 'County', 'State', 'Zipcode', 'Country', 'City'])

In [22]:
# Replace Null files in Temperate, Humidity, Pressure, Nautical Twilight, Visibility, Wind_Direction, Wind_Speed, 
# Data should be large enough that dropping rows is not too important
accidents_with_location_data_df = accidents_with_location_data_df.dropna(how = 'any', subset = ['Temperature(F)', 'Humidity(%)', 'Pressure(in)', 'Nautical_Twilight', 'Visibility(mi)', 'Wind_Direction', 'Wind_Speed(mph)'])

accidents_df = accidents_df.dropna(how = 'any', subset = ['Temperature(F)', 'Humidity(%)', 'Pressure(in)', 'Nautical_Twilight', 'Visibility(mi)', 'Wind_Direction', 'Wind_Speed(mph)'])

In [23]:
# Replacing Weather_Conditions and Precipitation with most common values
accidents_with_location_data_df['Weather_Condition'].fillna('Fair', inplace=True)
accidents_with_location_data_df['Precipitation(in)'].fillna(0.0, inplace=True)

accidents_df['Weather_Condition'].fillna('Fair', inplace=True)
accidents_df['Precipitation(in)'].fillna(0.0, inplace=True)

In [25]:
#Changing to Datetime and Data Engineering the Month and Day columns to dummy later
accidents_with_location_data_df['Start_Time'] = pd.to_datetime(accidents_with_location_data_df['Start_Time'])
accidents_with_location_data_df['End_Time'] = pd.to_datetime(accidents_with_location_data_df['End_Time'])

accidents_df['Start_Time'] = pd.to_datetime(accidents_df['Start_Time'])
accidents_df['End_Time'] = pd.to_datetime(accidents_df['End_Time'])

In [10]:
accidents_with_location_data_df['Start_Hour'] = accidents_with_location_data_df['Start_Time'].dt.hour
accidents_with_location_data_df['Month'] = accidents_with_location_data_df['Start_Time'].dt.month

accidents_df['Start_Hour'] = accidents_df['Start_Time'].dt.hour
accidents_df['Month'] = accidents_df['Start_Time'].dt.month

In [11]:
accidents_with_location_data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3717817 entries, 2 to 4229393
Data columns (total 46 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   ID                     object        
 1   Source                 object        
 2   TMC                    float64       
 3   Severity               int64         
 4   Start_Time             datetime64[ns]
 5   End_Time               datetime64[ns]
 6   Start_Lat              float64       
 7   Start_Lng              float64       
 8   Distance(mi)           float64       
 9   Description            object        
 10  Number                 float64       
 11  Street                 object        
 12  Side                   object        
 13  City                   object        
 14  County                 object        
 15  State                  object        
 16  Zipcode                object        
 17  Country                object        
 18  Weather_Timestamp     

In [12]:
accidents_df['Start_Lat'].describe()

count    3.717817e+06
mean     3.640554e+01
std      5.024404e+00
min      2.455527e+01
25%      3.346821e+01
50%      3.585106e+01
75%      4.030347e+01
max      4.900220e+01
Name: Start_Lat, dtype: float64

In [13]:
accidents_df['Start_Lng'].describe()

count    3.717817e+06
mean    -9.526923e+01
std      1.720508e+01
min     -1.246238e+02
25%     -1.172515e+02
50%     -8.961905e+01
75%     -8.087476e+01
max     -6.711317e+01
Name: Start_Lng, dtype: float64

In [14]:
accidents_df.dtypes

ID                               object
Source                           object
TMC                             float64
Severity                          int64
Start_Time               datetime64[ns]
End_Time                 datetime64[ns]
Start_Lat                       float64
Start_Lng                       float64
Distance(mi)                    float64
Description                      object
Side                             object
Weather_Timestamp                object
Temperature(F)                  float64
Humidity(%)                     float64
Pressure(in)                    float64
Visibility(mi)                  float64
Wind_Direction                   object
Wind_Speed(mph)                 float64
Precipitation(in)               float64
Weather_Condition                object
Amenity                            bool
Bump                               bool
Crossing                           bool
Give_Way                           bool
Junction                           bool


In [26]:
#Cutting the Data down so it can be used later
accidents_df = accidents_df.sample(frac = .2)
accidents_with_location_data_df = accidents_with_location_data_df.sample(frac = .2)

In [16]:
accidents_df

Unnamed: 0,ID,Source,TMC,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,Distance(mi),Description,...,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight,Start_Hour,Month
2722223,A-2722346,Bing,201.0,2,2016-03-29 11:29:18,2016-03-29 17:29:18,32.871380,-117.196250,0.732,At La Jolla Village Dr/Miramar Rd - Accident. ...,...,False,False,False,False,Day,Day,Day,Day,11,3
3745758,A-3755766,Bing,201.0,2,2019-12-28 12:43:00,2019-12-28 14:17:17,44.859740,-93.362420,0.000,At US-169/Exit 10 - Accident.,...,False,False,False,False,Day,Day,Day,Day,12,12
2783964,A-2784087,Bing,201.0,2,2016-10-03 09:00:56,2016-10-03 15:00:56,41.806200,-72.660460,1.498,Between CT-159/Exit 34 and Jennings Rd/Exit 33...,...,False,False,False,False,Day,Day,Day,Day,9,10
3894473,A-3904484,Bing,201.0,2,2019-02-06 06:59:15,2019-02-06 07:29:11,36.881260,-76.211460,0.257,At VA-165/Military Hwy/Exit 281 - Accident. La...,...,False,False,False,False,Night,Day,Day,Day,6,2
924250,A-924342,MapQuest,201.0,2,2020-01-30 07:09:27,2020-01-30 08:27:02,42.311642,-87.904938,0.000,Accident on I-94 Tri-State Tollway Eastbound n...,...,False,False,False,False,Day,Day,Day,Day,7,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
479954,A-479967,MapQuest,201.0,2,2017-06-02 12:54:12,2017-06-02 13:39:01,39.097374,-77.513481,0.000,Accident on Fort Evans Rd at River Creek Pkwy.,...,False,False,True,False,Day,Day,Day,Day,12,6
4166889,A-4176902,Bing,201.0,3,2019-04-26 23:32:40,2019-04-27 00:01:00,30.010847,-90.013180,0.527,At Downman Rd/Exit 240 - Accident. Left lane b...,...,False,False,False,False,Night,Night,Night,Night,23,4
3853544,A-3863555,Bing,201.0,2,2019-09-27 12:31:00,2019-09-27 13:06:45,38.575643,-121.568254,0.000,At Capitol Ave/Enterprise Blvd - Accident.,...,False,False,False,False,Day,Day,Day,Day,12,9
3683857,A-3693854,Bing,201.0,3,2020-01-19 12:24:57,2020-01-19 12:54:42,39.841790,-105.038910,0.000,At Sheridan Blvd - Accident.,...,False,False,False,False,Day,Day,Day,Day,12,1


## Creating new csv's for later loading with models <a id = 'data_loading'><a>

In [17]:
accidents_df.to_csv('./Data/smaller_csv.csv', index = None)
accidents_with_location_data_df.to_csv('./Data/location_data.csv')