In [1]:
# Imports
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np

In [2]:
# Train data needs Data_Raw, train_data, X_train, y_train
# Test data needs Data_Raw, test_data, X_test, y_test
def data_input(folder1, folder2, file_x, file_y):
    X = pd.read_csv(f"../Accident_Data_Preprocessing/{folder1}/{folder2}/{file_x}.csv")
    y = pd.read_csv(f"../Accident_Data_Preprocessing/{folder1}/{folder2}/{file_y}.csv")
    df = pd.concat([X, y], axis=1)
    return df

In [3]:
# TEST INPUT
df = data_input("Data_Raw", "test_data", "X_test", "y_test")
df.head()

Unnamed: 0,ID,Source,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),Description,...,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight,Severity
0,A-237917,Source2,2016-10-23 08:04:38,2016-10-23 09:04:14,39.01189,-76.904198,,,0.01,Accident on I-95 Capital Beltway Southbound at...,...,False,False,False,False,False,Day,Day,Day,Day,3
1,A-5361038,Source1,2022-03-10 17:06:30.000000000,2022-03-10 17:28:30.000000000,38.599807,-75.913079,38.606758,-75.902799,0.734,Stationary traffic on MD-392 from MD-14/E New ...,...,False,False,False,False,False,Day,Day,Day,Day,2
2,A-4110445,Source1,2022-09-19 04:45:21,2022-09-19 06:01:06,38.930465,-76.013555,38.931595,-76.019579,0.333,Incident on MD-404 WB near DULIN RD Drive with...,...,False,False,False,False,False,Night,Night,Night,Night,2
3,A-1228048,Source2,2020-12-15 08:03:36,2020-12-15 09:18:22,39.56218,-76.361221,,,0.0,Accident on Kahoe Rd at Bynum Ridge Rd.,...,False,False,False,False,False,Day,Day,Day,Day,2
4,A-1450465,Source2,2020-05-28 08:21:25,2020-05-28 09:05:57,39.277962,-77.323425,,,0.0,Accident on I-270 Northbound before Exit 22 MD...,...,False,False,False,False,False,Day,Day,Day,Day,2


In [4]:
# TRAIN INPUT
#df = data_input("Data_Raw", "train_data", "X_train", "y_train")
#df.head()

In [5]:
# View data, remove duplicate rows
df.head()
df = df.drop_duplicates()

In [6]:
# Check all unique values for all columns
for col in df.columns:
    print(df[col].value_counts(sort=True))

ID
A-5887205    1
A-237917     1
A-5361038    1
A-4110445    1
A-1228048    1
            ..
A-6985967    1
A-4223854    1
A-1495843    1
A-5332319    1
A-1337540    1
Name: count, Length: 28084, dtype: int64
Source
Source1    19701
Source2     8250
Source3      133
Name: count, dtype: int64
Start_Time
2021-05-28 07:31:48              8
2020-11-16 15:24:21              5
2021-10-17 19:13:00              5
2021-01-27 16:33:28              5
2020-12-17 06:26:18              5
                                ..
2022-11-08 17:49:00.000000000    1
2021-04-24 04:49:00              1
2022-11-17 17:07:10              1
2017-06-17 14:38:33              1
2022-06-08 14:51:27              1
Name: count, Length: 26867, dtype: int64
End_Time
2021-07-28 19:18:21              4
2022-08-24 21:05:53              4
2022-07-24 16:05:56              3
2018-02-17 00:44:51              3
2020-10-29 16:05:16              3
                                ..
2022-02-24 09:14:51              1
2022-10-08 08:27

In [7]:
# Worth keeping weather time stamp? Examine value types
print(df["Weather_Timestamp"].value_counts(sort=True))

Weather_Timestamp
2021-05-28 07:58:00    9
2022-03-13 01:55:00    9
2022-05-31 07:39:00    8
2020-06-04 07:54:00    8
2022-01-03 10:47:00    7
                      ..
2022-08-11 21:55:00    1
2022-08-23 09:52:00    1
2021-04-24 04:56:00    1
2018-03-13 07:54:00    1
2022-03-21 16:54:00    1
Name: count, Length: 21261, dtype: int64


In [8]:
# Check all values of all remaining cols
for col in df.columns:
    print(df[col].value_counts(sort=True))

ID
A-5887205    1
A-237917     1
A-5361038    1
A-4110445    1
A-1228048    1
            ..
A-6985967    1
A-4223854    1
A-1495843    1
A-5332319    1
A-1337540    1
Name: count, Length: 28084, dtype: int64
Source
Source1    19701
Source2     8250
Source3      133
Name: count, dtype: int64
Start_Time
2021-05-28 07:31:48              8
2020-11-16 15:24:21              5
2021-10-17 19:13:00              5
2021-01-27 16:33:28              5
2020-12-17 06:26:18              5
                                ..
2022-11-08 17:49:00.000000000    1
2021-04-24 04:49:00              1
2022-11-17 17:07:10              1
2017-06-17 14:38:33              1
2022-06-08 14:51:27              1
Name: count, Length: 26867, dtype: int64
End_Time
2021-07-28 19:18:21              4
2022-08-24 21:05:53              4
2022-07-24 16:05:56              3
2018-02-17 00:44:51              3
2020-10-29 16:05:16              3
                                ..
2022-02-24 09:14:51              1
2022-10-08 08:27

In [9]:
# Date/time conversion of start and end time
df['Start_Time'] = pd.to_datetime(df['Start_Time'],format='mixed')
df['End_Time'] = pd.to_datetime(df['End_Time'], format = 'mixed')
df

Unnamed: 0,ID,Source,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),Description,...,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight,Severity
0,A-237917,Source2,2016-10-23 08:04:38,2016-10-23 09:04:14,39.011890,-76.904198,,,0.010,Accident on I-95 Capital Beltway Southbound at...,...,False,False,False,False,False,Day,Day,Day,Day,3
1,A-5361038,Source1,2022-03-10 17:06:30,2022-03-10 17:28:30,38.599807,-75.913079,38.606758,-75.902799,0.734,Stationary traffic on MD-392 from MD-14/E New ...,...,False,False,False,False,False,Day,Day,Day,Day,2
2,A-4110445,Source1,2022-09-19 04:45:21,2022-09-19 06:01:06,38.930465,-76.013555,38.931595,-76.019579,0.333,Incident on MD-404 WB near DULIN RD Drive with...,...,False,False,False,False,False,Night,Night,Night,Night,2
3,A-1228048,Source2,2020-12-15 08:03:36,2020-12-15 09:18:22,39.562180,-76.361221,,,0.000,Accident on Kahoe Rd at Bynum Ridge Rd.,...,False,False,False,False,False,Day,Day,Day,Day,2
4,A-1450465,Source2,2020-05-28 08:21:25,2020-05-28 09:05:57,39.277962,-77.323425,,,0.000,Accident on I-270 Northbound before Exit 22 MD...,...,False,False,False,False,False,Day,Day,Day,Day,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28079,A-7005722,Source1,2020-05-01 15:02:13,2020-05-01 16:22:11,39.378680,-76.718500,39.378680,-76.718500,0.000,At E Sudbrook Ln - Accident.,...,False,False,False,True,False,Day,Day,Day,Day,2
28080,A-3468597,Source1,2016-11-08 17:55:16,2016-11-08 23:55:16,39.352120,-76.745910,39.345290,-76.745300,0.473,At MD-26/Exit 18 - Accident.,...,False,False,False,False,False,Night,Night,Day,Day,3
28081,A-6590777,Source1,2020-10-25 05:01:51,2020-10-25 07:21:33,39.158078,-77.204488,39.157831,-77.204819,0.025,Incident on MD-124 SB near LOST KNIFE RD Road ...,...,False,False,False,True,False,Night,Night,Night,Night,4
28082,A-6480985,Source1,2021-02-04 14:41:00,2021-02-04 16:56:10,39.097107,-77.177733,39.051836,-77.152782,3.402,Incident on I-270 SB near I-370 Road closed. T...,...,False,False,False,False,False,Day,Day,Day,Day,2


In [10]:
# Check unique values for all cols again
for col in df.columns:
    print(df[col].value_counts(sort=True))

ID
A-5887205    1
A-237917     1
A-5361038    1
A-4110445    1
A-1228048    1
            ..
A-6985967    1
A-4223854    1
A-1495843    1
A-5332319    1
A-1337540    1
Name: count, Length: 28084, dtype: int64
Source
Source1    19701
Source2     8250
Source3      133
Name: count, dtype: int64
Start_Time
2021-05-28 07:31:48    9
2022-05-31 07:48:30    7
2021-08-31 15:40:00    6
2022-02-25 15:23:30    6
2021-12-24 15:56:30    5
                      ..
2021-04-04 07:24:10    1
2023-01-23 06:50:16    1
2017-08-23 16:48:45    1
2018-04-04 18:57:55    1
2023-01-06 04:54:42    1
Name: count, Length: 26446, dtype: int64
End_Time
2022-08-24 21:05:53    5
2022-03-12 16:54:44    4
2022-04-14 16:01:28    4
2021-11-22 18:45:58    4
2021-07-28 19:18:21    4
                      ..
2021-04-04 10:51:38    1
2023-01-23 08:09:27    1
2017-08-23 22:48:45    1
2018-04-05 00:57:55    1
2023-01-06 06:14:17    1
Name: count, Length: 27214, dtype: int64
Start_Lat
38.971164    45
39.442020    40
39.379276    

In [11]:
# Create month column out of date time
df['Month'] = df['Start_Time'].dt.month
df

Unnamed: 0,ID,Source,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),Description,...,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight,Severity,Month
0,A-237917,Source2,2016-10-23 08:04:38,2016-10-23 09:04:14,39.011890,-76.904198,,,0.010,Accident on I-95 Capital Beltway Southbound at...,...,False,False,False,False,Day,Day,Day,Day,3,10
1,A-5361038,Source1,2022-03-10 17:06:30,2022-03-10 17:28:30,38.599807,-75.913079,38.606758,-75.902799,0.734,Stationary traffic on MD-392 from MD-14/E New ...,...,False,False,False,False,Day,Day,Day,Day,2,3
2,A-4110445,Source1,2022-09-19 04:45:21,2022-09-19 06:01:06,38.930465,-76.013555,38.931595,-76.019579,0.333,Incident on MD-404 WB near DULIN RD Drive with...,...,False,False,False,False,Night,Night,Night,Night,2,9
3,A-1228048,Source2,2020-12-15 08:03:36,2020-12-15 09:18:22,39.562180,-76.361221,,,0.000,Accident on Kahoe Rd at Bynum Ridge Rd.,...,False,False,False,False,Day,Day,Day,Day,2,12
4,A-1450465,Source2,2020-05-28 08:21:25,2020-05-28 09:05:57,39.277962,-77.323425,,,0.000,Accident on I-270 Northbound before Exit 22 MD...,...,False,False,False,False,Day,Day,Day,Day,2,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28079,A-7005722,Source1,2020-05-01 15:02:13,2020-05-01 16:22:11,39.378680,-76.718500,39.378680,-76.718500,0.000,At E Sudbrook Ln - Accident.,...,False,False,True,False,Day,Day,Day,Day,2,5
28080,A-3468597,Source1,2016-11-08 17:55:16,2016-11-08 23:55:16,39.352120,-76.745910,39.345290,-76.745300,0.473,At MD-26/Exit 18 - Accident.,...,False,False,False,False,Night,Night,Day,Day,3,11
28081,A-6590777,Source1,2020-10-25 05:01:51,2020-10-25 07:21:33,39.158078,-77.204488,39.157831,-77.204819,0.025,Incident on MD-124 SB near LOST KNIFE RD Road ...,...,False,False,True,False,Night,Night,Night,Night,4,10
28082,A-6480985,Source1,2021-02-04 14:41:00,2021-02-04 16:56:10,39.097107,-77.177733,39.051836,-77.152782,3.402,Incident on I-270 SB near I-370 Road closed. T...,...,False,False,False,False,Day,Day,Day,Day,2,2


In [12]:
# Use month column to create season column:
# Dec/Jan/Feb: Winter
# Mar/Apr/May: Spring
# Jun/Jul/Aug: Summer
# Sep/Oct/Nov: Fall
def season(x):
    tomap = {1:"Winter",2:"Winter",3:"Spring",4:"Spring",5:"Spring",6:"Summer",7:"Summer",8:"Summer",9:"Fall",10:"Fall",11:"Fall",12:"Winter"}
    if x in tomap:
        return tomap[x]
    return x

In [13]:
# Pandas giving data issues, confirmation that month is created and datetime
df['Start_Time'] = pd.to_datetime(df['Start_Time'],format='mixed')
df['Month'] = df['Start_Time'].dt.month

# Apply season function based on month col
df["Season"] = df.Month.apply(season)
df['Season'].value_counts()

Season
Winter    8068
Fall      7576
Summer    6254
Spring    6186
Name: count, dtype: int64

In [14]:
# Convert street names to strings
df['Street']=df['Street'].astype(str)

In [15]:
# Added misc to handle any names that did not fit into the below categories, typos, misspellings, unsure what street
def street_type(x):
    if "Tunl" in x or "Tunnel" in x:
        return "Tunnel"#nsew(x,"Tunnel")
    if "I-" in x:
        return "Interstate"
    if " Dr " in x or " Ave " in x or " Rd " in x or " Xing" in x or " Way " in x:
        return "local"
    if "Beltway" in x:
        return "Beltway"
    if "US-" in x[:4] or "MD-" in x[:4] or "Expy" in x:
        return "Highway"
    if "Hwy" in x or "Fwy" in x or "Highway" in x or "Route" in x or "Connector" in x or "Byp" in x or "Bypass" in x or "Gtwy" in x or "Brg" in x or "Trwy" in x:
        return "Highway"
    if x[-2:] in ["St","Pl","Dr","Sq"]:
        return "local"
    if "Ln" in x or "Rd" in x or "Ct" in x:
        return "local"
    if x[-3:] in ["Way","Cir","Ter","Trl","Plz","Aly","Grn","way","Ext"]:
        return "local"
    if "Ave" in x[-5:] or " Blvd" in x or " Pike" in x or "Pkwy" in x or "Crse" in x or "Alameda" in x:
        return "large_local"
    if "Garth" in x or "Spell" in x or "Psge" in x or "Chase" in x or "Cutoff" in x or "Spire" in x:
        return "local"
    if "Overlook" in x or "Battlefield" in x or "Cemetery" in x or "Park" in x or "Base" in x or "Concourse" in x or "Memorial" in x or "Booth's" in x:
        return "special"
    if "Loop" in x or "Greenway" in x or "West" in x or "ville" in x or "wood" in x:
        return "local"
    else:
        return "misc"
    return x

In [16]:
# Apply street categories to street name col
df["Street"] = df.Street.apply(street_type)

In [17]:
# Remove rows with "special" or "misc" street names
df=df.loc[~df.Street.str.contains("special") & ~df.Street.str.contains("misc")]


In [18]:
df.Street.value_counts()

Street
Interstate     7808
local          6812
large_local    4987
Highway        4952
Beltway        2832
Tunnel          410
Name: count, dtype: int64

In [19]:
# Proper formatting of county names (I took an educated guess that \"Baltimore\" referred to Balt. County and not City, this
# may not be correct)
# Also apostrophes needed to be removed to work with Weka for binning after cleaning data

def replace_misspelled(text):
    corrections = {"St Mary's": "St. Marys", "Prince George's": "Prince Georges", "Queen Anne's": "Queen Annes",
                   "Baltimore (City)":"Baltimore City","Saint Mary's":"St Marys", "Baltimore County":"Baltimore",
                   "St. Mary's":"St Marys", "Saint Marys":"St Marys"}
    for word, correction in corrections.items():
        text = text.replace(word, correction)
    return text

# Apply spelling function to data
df.loc[:,'County'] = df['County'].apply(replace_misspelled)
df["County"].value_counts(sort=True)

County
Prince Georges          5380
Baltimore               4451
Montgomery              4437
Frederick               2630
Anne Arundel            2361
Baltimore City          2059
Harford                 1342
Howard                  1165
Washington               948
Queen Annes              504
Cecil                    482
Charles                  387
Carroll                  261
Talbot                   234
Worcester                229
Caroline                 172
Allegany                 152
St Marys                 134
Wicomico                 127
Dorchester               116
Kent                      70
Garrett                   69
Somerset                  45
Calvert                   45
District Of Columbia       1
Name: count, dtype: int64

In [20]:
# One hot function (Could not find the one line function Dr. Olsen referred to)
def onehotinator(df, string):
    one_hot = pd.get_dummies(df[string], prefix=string)
    one_hot = one_hot.astype(int)
    df = df.join(one_hot)
    df = df.drop(columns=string)
    return df

In [21]:
# Boolean column feature for if snowy OR icy conditions were present
# Tried to fix the warning about setting value to a DF slice, was not able to fix
lst = ['Snow', 'Freezing', 'Ice', 'Wintry', 'Hail', 'Sleet']
df['Snow_Ice']=np.where(df.loc[:, 'Weather_Condition'].str.contains('|'.join(lst), na=False), True, False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Snow_Ice']=np.where(df.loc[:, 'Weather_Condition'].str.contains('|'.join(lst), na=False), True, False)


In [22]:
# Boolean column feature for if mist OR fog OR haze was present, OR if visibility in miles was less than 0.001 mi
lst = ['Fog', 'Mist', 'Haze', 'Heavy']
df['Low_Vis_Weather']=np.where((df['Weather_Condition'].str.contains('|'.join(lst), na=False) | df["Visibility(mi)"] < 0.001), True, False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Low_Vis_Weather']=np.where((df['Weather_Condition'].str.contains('|'.join(lst), na=False) | df["Visibility(mi)"] < 0.001), True, False)


In [23]:
# Boolean column feature for if road conditions were wet (weather was rain/storm related, OR there was more than 0 inches
# of precipitation at time of crash)
lst = ['Rain', 'Drizzle', 'Thunderstorm', 'T-Storm', 'Precipitation']
df['Rainy']=np.where((df['Weather_Condition'].str.contains('|'.join(lst), na=False) | df['Precipitation(in)'] >= 0.01), True, False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Rainy']=np.where((df['Weather_Condition'].str.contains('|'.join(lst), na=False) | df['Precipitation(in)'] >= 0.01), True, False)


In [24]:
# Check which columns to delete
df


Unnamed: 0,ID,Source,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),Description,...,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight,Severity,Month,Season,Snow_Ice,Low_Vis_Weather,Rainy
0,A-237917,Source2,2016-10-23 08:04:38,2016-10-23 09:04:14,39.011890,-76.904198,,,0.010,Accident on I-95 Capital Beltway Southbound at...,...,Day,Day,Day,Day,3,10,Fall,False,False,False
1,A-5361038,Source1,2022-03-10 17:06:30,2022-03-10 17:28:30,38.599807,-75.913079,38.606758,-75.902799,0.734,Stationary traffic on MD-392 from MD-14/E New ...,...,Day,Day,Day,Day,2,3,Spring,False,False,False
2,A-4110445,Source1,2022-09-19 04:45:21,2022-09-19 06:01:06,38.930465,-76.013555,38.931595,-76.019579,0.333,Incident on MD-404 WB near DULIN RD Drive with...,...,Night,Night,Night,Night,2,9,Fall,False,False,False
3,A-1228048,Source2,2020-12-15 08:03:36,2020-12-15 09:18:22,39.562180,-76.361221,,,0.000,Accident on Kahoe Rd at Bynum Ridge Rd.,...,Day,Day,Day,Day,2,12,Winter,False,False,False
4,A-1450465,Source2,2020-05-28 08:21:25,2020-05-28 09:05:57,39.277962,-77.323425,,,0.000,Accident on I-270 Northbound before Exit 22 MD...,...,Day,Day,Day,Day,2,5,Spring,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28079,A-7005722,Source1,2020-05-01 15:02:13,2020-05-01 16:22:11,39.378680,-76.718500,39.378680,-76.718500,0.000,At E Sudbrook Ln - Accident.,...,Day,Day,Day,Day,2,5,Spring,False,True,False
28080,A-3468597,Source1,2016-11-08 17:55:16,2016-11-08 23:55:16,39.352120,-76.745910,39.345290,-76.745300,0.473,At MD-26/Exit 18 - Accident.,...,Night,Night,Day,Day,3,11,Fall,False,True,False
28081,A-6590777,Source1,2020-10-25 05:01:51,2020-10-25 07:21:33,39.158078,-77.204488,39.157831,-77.204819,0.025,Incident on MD-124 SB near LOST KNIFE RD Road ...,...,Night,Night,Night,Night,4,10,Fall,False,False,False
28082,A-6480985,Source1,2021-02-04 14:41:00,2021-02-04 16:56:10,39.097107,-77.177733,39.051836,-77.152782,3.402,Incident on I-270 SB near I-370 Road closed. T...,...,Day,Day,Day,Day,2,2,Winter,False,False,False


In [25]:
# Remove all but one day/night column

df = df.drop(columns=['Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight', 'Weather_Condition'])


df['Sunrise_Sunset'] = df['Sunrise_Sunset'].astype("string")

#df = df.dropna(subset = ['Sunrise_Sunset'], axis = 0)
df['Sunrise_Sunset'].value_counts()

Sunrise_Sunset
Day      18538
Night     9209
Name: count, dtype: Int64

In [26]:
# Reference for data conversion
df.dtypes

ID                           object
Source                       object
Start_Time           datetime64[ns]
End_Time             datetime64[ns]
Start_Lat                   float64
Start_Lng                   float64
End_Lat                     float64
End_Lng                     float64
Distance(mi)                float64
Description                  object
Street                       object
City                         object
County                       object
State                        object
Zipcode                      object
Country                      object
Timezone                     object
Airport_Code                 object
Weather_Timestamp            object
Temperature(F)              float64
Wind_Chill(F)               float64
Humidity(%)                 float64
Pressure(in)                float64
Visibility(mi)              float64
Wind_Direction               object
Wind_Speed(mph)             float64
Precipitation(in)           float64
Amenity                     

In [27]:
# Convert all boolean columns to 1/0 integer columns and day/night column to 1/0

bools = ['Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 
         'Traffic_Signal', 'Turning_Loop', 'Low_Vis_Weather', 'Snow_Ice', 'Rainy']

for i in bools:
    df = df.astype({i:int})
df['Sunrise_Sunset'] = df["Sunrise_Sunset"].map({'Day':1,'Night':0})
df.dtypes

ID                           object
Source                       object
Start_Time           datetime64[ns]
End_Time             datetime64[ns]
Start_Lat                   float64
Start_Lng                   float64
End_Lat                     float64
End_Lng                     float64
Distance(mi)                float64
Description                  object
Street                       object
City                         object
County                       object
State                        object
Zipcode                      object
Country                      object
Timezone                     object
Airport_Code                 object
Weather_Timestamp            object
Temperature(F)              float64
Wind_Chill(F)               float64
Humidity(%)                 float64
Pressure(in)                float64
Visibility(mi)              float64
Wind_Direction               object
Wind_Speed(mph)             float64
Precipitation(in)           float64
Amenity                     

In [None]:
# Function to tally number of traffic speed changing elements present at location of accident
def slowdown_level(row):
    return sum([row['Bump'],row["Crossing"], row["Give_Way"], row["Junction"], row["Railway"], row['Roundabout'], \
                row["Stop"], row["Traffic_Calming"], row["Traffic_Signal"], row["Turning_Loop"]])

df['Qty_Slowing_Elements'] = df.apply(slowdown_level, axis=1)

df['Qty_Slowing_Elements'].value_counts()

Qty_Slowing_Elements
0    20241
1     6239
2     1231
3       90
Name: count, dtype: int64

In [29]:
df = df.drop(columns = 'Turning_Loop')

In [30]:
df.head()

Unnamed: 0,ID,Source,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),Description,...,Traffic_Calming,Traffic_Signal,Sunrise_Sunset,Severity,Month,Season,Snow_Ice,Low_Vis_Weather,Rainy,Qty_Slowing_Elements
0,A-237917,Source2,2016-10-23 08:04:38,2016-10-23 09:04:14,39.01189,-76.904198,,,0.01,Accident on I-95 Capital Beltway Southbound at...,...,0,0,1.0,3,10,Fall,0,0,0,0
1,A-5361038,Source1,2022-03-10 17:06:30,2022-03-10 17:28:30,38.599807,-75.913079,38.606758,-75.902799,0.734,Stationary traffic on MD-392 from MD-14/E New ...,...,0,0,1.0,2,3,Spring,0,0,0,2
2,A-4110445,Source1,2022-09-19 04:45:21,2022-09-19 06:01:06,38.930465,-76.013555,38.931595,-76.019579,0.333,Incident on MD-404 WB near DULIN RD Drive with...,...,0,0,0.0,2,9,Fall,0,0,0,0
3,A-1228048,Source2,2020-12-15 08:03:36,2020-12-15 09:18:22,39.56218,-76.361221,,,0.0,Accident on Kahoe Rd at Bynum Ridge Rd.,...,0,0,1.0,2,12,Winter,0,0,0,0
4,A-1450465,Source2,2020-05-28 08:21:25,2020-05-28 09:05:57,39.277962,-77.323425,,,0.0,Accident on I-270 Northbound before Exit 22 MD...,...,0,0,1.0,2,5,Spring,0,0,1,1


In [31]:
# Reference for what to do next
df.isnull().sum()

ID                         0
Source                     0
Start_Time                 0
End_Time                   0
Start_Lat                  0
Start_Lng                  0
End_Lat                 8369
End_Lng                 8369
Distance(mi)               0
Description                0
Street                     0
City                       1
County                     0
State                      0
Zipcode                   51
Country                    0
Timezone                  51
Airport_Code              65
Weather_Timestamp       2698
Temperature(F)          3122
Wind_Chill(F)           9517
Humidity(%)             3148
Pressure(in)            2901
Visibility(mi)          5483
Wind_Direction          4015
Wind_Speed(mph)         6859
Precipitation(in)       9836
Amenity                    0
Bump                       0
Crossing                   0
Give_Way                   0
Junction                   0
No_Exit                    0
Railway                    0
Roundabout    

In [32]:
# Column for potentially freezing conditions if temp at accident time was below freezing OR windchill was below freezing
# OR if the previous column checking for snow and ice was true regarding snow/ice present

def freezing(row):
   if (row["Temperature(F)"]<=32) | (row["Wind_Chill(F)"]<=32) | (row["Snow_Ice"] == 1):
      return 1
   else:
      return 0
df['Freezing'] = df.apply(freezing, axis=1)
df["Freezing"].value_counts()

Freezing
0    24019
1     3782
Name: count, dtype: int64

In [33]:
# Due to a technical ctrl+z error, I lost all instances of removing columns as I went, this is a quick fix (I didn't remember
# which columns were removed at which points)
df = df.drop(columns=['ID','Source','Start_Time','End_Time','Start_Lat','Start_Lng','End_Lat','End_Lng','Weather_Timestamp',
                      'Temperature(F)','Wind_Chill(F)','Humidity(%)','Pressure(in)','Visibility(mi)','Wind_Direction',
                      'Wind_Speed(mph)','Precipitation(in)','Description','Street','City','State','Zipcode','Country','Timezone',
                      'Airport_Code','Month'])

# Dropping remaining null values left

df.isnull().sum()
df


Unnamed: 0,Distance(mi),County,Amenity,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,...,Traffic_Calming,Traffic_Signal,Sunrise_Sunset,Severity,Season,Snow_Ice,Low_Vis_Weather,Rainy,Qty_Slowing_Elements,Freezing
0,0.010,Prince Georges,0,0,0,0,0,0,0,0,...,0,0,1.0,3,Fall,0,0,0,0,0
1,0.734,Dorchester,0,0,1,0,0,0,1,0,...,0,0,1.0,2,Spring,0,0,0,2,0
2,0.333,Talbot,0,0,0,0,0,0,0,0,...,0,0,0.0,2,Fall,0,0,0,0,0
3,0.000,Harford,0,0,0,0,0,0,0,0,...,0,0,1.0,2,Winter,0,0,0,0,1
4,0.000,Montgomery,0,0,0,0,1,0,0,0,...,0,0,1.0,2,Spring,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28079,0.000,Baltimore,0,0,0,0,0,0,0,0,...,0,1,1.0,2,Spring,0,1,0,1,0
28080,0.473,Baltimore,0,0,0,0,0,0,0,0,...,0,0,0.0,3,Fall,0,1,0,0,0
28081,0.025,Montgomery,0,0,0,0,0,0,0,0,...,0,1,0.0,4,Fall,0,0,0,1,0
28082,3.402,Montgomery,0,0,0,0,0,0,0,0,...,0,0,1.0,2,Winter,0,0,0,0,0


In [34]:
df = df.dropna(how='any')

In [35]:
df.dtypes

# Somehow sun up/down col was a float, quick fix to be int
df.Sunrise_Sunset = df.Sunrise_Sunset.astype('int64')
df.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.Sunrise_Sunset = df.Sunrise_Sunset.astype('int64')


Distance(mi)            float64
County                   object
Amenity                   int64
Bump                      int64
Crossing                  int64
Give_Way                  int64
Junction                  int64
No_Exit                   int64
Railway                   int64
Roundabout                int64
Station                   int64
Stop                      int64
Traffic_Calming           int64
Traffic_Signal            int64
Sunrise_Sunset            int64
Severity                  int64
Season                   object
Snow_Ice                  int64
Low_Vis_Weather           int64
Rainy                     int64
Qty_Slowing_Elements      int64
Freezing                  int64
dtype: object

In [36]:
df=df.drop_duplicates()

In [37]:
# Reference for thinking about CFD based division (We have 4 classes, and CFD only takes 2 at a time)
df['Severity'].value_counts()

Severity
2    15445
4     2041
3     1970
1       73
Name: count, dtype: int64

In [38]:
df.columns

Index(['Distance(mi)', 'County', 'Amenity', 'Bump', 'Crossing', 'Give_Way',
       'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station', 'Stop',
       'Traffic_Calming', 'Traffic_Signal', 'Sunrise_Sunset', 'Severity',
       'Season', 'Snow_Ice', 'Low_Vis_Weather', 'Rainy',
       'Qty_Slowing_Elements', 'Freezing'],
      dtype='object')

In [41]:
df.shape

(19529, 22)

In [42]:
# Function to re-separate cleaned class column from the cleaned rest of the dataset, and export both separately into the same folder
def data_output(df, folder1, folder2, file_x, file_y):
    col_class = ['Class']
    col_data = ['Distance(mi)', 'County', 'Amenity', 'Bump', 'Crossing', 'Give_Way',
       'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station', 'Stop',
       'Traffic_Calming', 'Traffic_Signal', 'Sunrise_Sunset',
       'Season', 'Snow_Ice', 'Low_Vis_Weather', 'Rainy',
       'Qty_Slowing_Elements', 'Freezing']
    
    df = df.rename(columns = {'Severity' : 'Class'})
    df = df.drop_duplicates()
    
    X = df.loc[:, df.columns != 'Class']
    y = df.Class
    
    X.to_csv(f'../Accident_Data_Preprocessing/{folder1}/{folder2}/{file_x}.csv', columns = col_data, index = False)
    y.to_csv(f'../Accident_Data_Preprocessing/{folder1}/{folder2}/{file_y}.csv', columns = col_class, index = False)

In [43]:
# TRAIN OUTPUT
#data_output(df, "Data_Cleaned", "train_data", "X_train_feat_eng", "y_train_feat_eng")

In [44]:
# TEST OUTPUT
data_output(df, "Data_Cleaned", "test_data", "X_test_feat_eng", "y_test_feat_eng")