In [None]:
# Imports
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np

In [None]:
# Read the raw accident records from disk
# NOTE(review): this is an absolute, machine-specific path; the cell only runs where that file exists
RAW_CSV_PATH = "C:\\Users\\bkb3\\Documents\\US_Accidents_March23.csv"
data = pd.read_csv(RAW_CSV_PATH)

In [None]:
# Keep only the rows whose State column equals 'MD'
is_maryland = data["State"].eq('MD')
df = data[is_maryland]

In [None]:
# Remove duplicate rows, then preview the de-duplicated frame.
# The preview has to be the last expression of the cell: a notebook only renders
# the final expression, so a bare df.head() placed before another statement
# produces no output.
df = df.drop_duplicates()
df.head()

In [None]:
# Print the frequency table of every column (most frequent values first)
for column_name in df.columns:
    frequency_table = df[column_name].value_counts(sort=True)
    print(frequency_table)

In [None]:
# Is the weather timestamp worth keeping? Inspect the values it takes
weather_ts_counts = df["Weather_Timestamp"].value_counts(sort=True)
print(weather_ts_counts)

In [None]:
# Print the value frequencies of every column currently in the frame
for current_col in df.columns:
    col_counts = df[current_col].value_counts(sort=True)
    print(col_counts)

In [None]:
# Parse the accident start and end columns into datetimes. format='mixed' makes
# pandas infer the format of each value individually.
# NOTE(review): the holdup duration (in hours) mentioned in the original note is
# not computed in this cell.
for time_col in ('Start_Time', 'End_Time'):
    df[time_col] = pd.to_datetime(df[time_col], format='mixed')
df

In [None]:
# Print the value frequencies of every column once more
for checked_col in df.columns:
    checked_counts = df[checked_col].value_counts(sort=True)
    print(checked_counts)

In [None]:
# Debugging aid for a recurring datetime parsing error (one row had its date and
# time recorded with higher granularity): show the row at position 46456
df.iloc[46456]

In [None]:
# Split the start timestamp into separate year, month and day columns
start_parts = df['Start_Time'].dt
df['Year'] = start_parts.year
df['Month'] = start_parts.month
df['Day'] = start_parts.day
df

In [None]:
def season(x):
    """Translate a month number (1-12) into its season name.

    Dec-Feb -> "Winter", Mar-May -> "Spring", Jun-Aug -> "Summer",
    Sep-Nov -> "Fall". Any value that is not a known month number is
    returned unchanged.
    """
    months_by_season = {
        "Winter": (12, 1, 2),
        "Spring": (3, 4, 5),
        "Summer": (6, 7, 8),
        "Fall": (9, 10, 11),
    }
    # Invert the table so each month number points at its season
    season_of_month = {
        month: name
        for name, months in months_by_season.items()
        for month in months
    }
    return season_of_month.get(x, x)

In [None]:
# Make sure the start time is a datetime, refresh the month column from it and
# label every row with the season of that month
start_times = pd.to_datetime(df['Start_Time'], format='mixed')
df['Start_Time'] = start_times
df['Month'] = start_times.dt.month
df["Season"] = df["Month"].apply(season)

In [None]:
# Convert street names to strings and keep the result: astype() returns a new
# Series, so without the assignment the conversion is thrown away
df['Street'] = df['Street'].astype(str)
df['Street']

In [None]:
def street_type(x):
    """Collapse a street name into a coarse road-category label.

    The rules are checked top to bottom and the first match wins, so their
    order matters. Possible labels are "Tunnel", "Interstate", "Beltway",
    "Highway", "local", "large_local" and "special". A name that matches no
    rule is returned unchanged.

    x must be a string (substring tests and slicing are applied to it).
    """
    def has_any(*tokens):
        # True when at least one token occurs anywhere in the street name
        return any(token in x for token in tokens)

    if has_any("Tunl", "Tunnel"):
        return "Tunnel"
    if has_any("I-"):
        return "Interstate"
    # Suffix followed by more text (e.g. a direction), hence the surrounding spaces
    if has_any(" Dr ", " Ave ", " Rd ", " Xing", " Way "):
        return "local"
    if has_any("Beltway"):
        return "Beltway"

    # Route prefixes are only looked for in the first four characters
    prefix = x[:4]
    if "US-" in prefix or "MD-" in prefix or has_any("Expy"):
        return "Highway"
    if has_any("Hwy", "Fwy", "Highway", "Route", "Connector",
               "Byp", "Bypass", "Gtwy", "Brg", "Trwy"):
        return "Highway"

    if x[-2:] in ("St", "Pl", "Dr", "Sq"):
        return "local"
    if has_any("Ln", "Rd", "Ct"):
        return "local"
    if x[-3:] in ("Way", "Cir", "Ter", "Trl", "Plz", "Aly", "Grn", "way", "Ext"):
        return "local"

    if "Ave" in x[-5:] or has_any(" Blvd", " Pike", "Pkwy", "Crse", "Alameda"):
        return "large_local"
    if has_any("Garth", "Spell", "Psge", "Chase", "Cutoff", "Spire"):
        return "local"
    if has_any("Overlook", "Battlefield", "Cemetery", "Park",
               "Base", "Concourse", "Memorial"):
        return "special"
    if has_any("Loop", "Greenway", "West", "ville", "wood"):
        return "local"

    return x

In [None]:
# Cast the street names to str, then replace each name with its street_type() label
street_names = df["Street"].astype(str)
df["Street"] = street_names.apply(street_type)

In [None]:
# Drop the rows whose street label contains "special"
is_special = df["Street"].str.contains("special")
df = df.loc[~is_special]

In [None]:
# Show how many rows carry each value of the Street column
df.Street.value_counts()

In [None]:
# Normalize the spelling of county names (I took an educated guess that "Baltimore" refers to Baltimore County and not
# Baltimore City; this may not be correct)

def replace_misspelled(text):
    """Return a county name with known spelling variants normalized.

    Every key of the corrections table that occurs in ``text`` is replaced by
    its value, in table order. Both spellings of St. Mary's County
    ("St Mary's" and "Saint Mary's") produce the same output so the county is
    not split into two categories.

    Parameters:
        text: the county name. Non-string values (e.g. NaN or None) are
            returned unchanged instead of raising AttributeError.

    Returns:
        The normalized name, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text

    # NOTE(review): "St. Marys" is used as the single canonical form because it
    # is what "St Mary's" already mapped to; confirm it matches downstream data.
    corrections = {"St Mary's": "St. Marys", "Prince George's": "Prince Georges", "Queen Anne's": "Queen Annes",
                   "Baltimore (City)": "Baltimore City", "Saint Mary's": "St. Marys", "Baltimore County": "Baltimore"}
    for word, correction in corrections.items():
        text = text.replace(word, correction)
    return text

# Apply the spelling corrections to every county name, then list the resulting counts
county_fixed = df['County'].apply(replace_misspelled)
df['County'] = county_fixed
df["County"].value_counts(sort=True)

In [None]:
# One-hot encoding helper
def onehotinator(df, string):
    """Replace column ``string`` of ``df`` with 0/1 indicator columns.

    One integer column named ``<string>_<value>`` is produced per distinct
    value and joined onto the frame by index; the source column is removed.
    A new DataFrame is returned and the input frame is left untouched.
    """
    indicators = pd.get_dummies(df[string], prefix=string).astype(int)
    return df.join(indicators).drop(columns=string)

In [None]:
# Boolean feature: True when the weather description mentions snowy or icy
# conditions (missing descriptions count as False)
snow_ice_terms = ['Snow', 'Freezing', 'Ice', 'Wintry', 'Hail', 'Sleet']
snow_ice_pattern = '|'.join(snow_ice_terms)
mentions_snow_ice = df['Weather_Condition'].str.contains(snow_ice_pattern, na=False)
df['Snow_Ice'] = np.where(mentions_snow_ice, True, False)

In [None]:
# Boolean feature: True when the weather description mentions fog, mist, haze or
# "Heavy", OR when the recorded visibility is below 0.001 mi.
# The two conditions are built separately because `|` binds tighter than `<`:
# written inline without parentheses, `text_match | visibility < 0.001` is
# evaluated as `(text_match | visibility) < 0.001`, which is not the intended test.
lst = ['Fog', 'Mist', 'Haze', 'Heavy']
low_vis_by_text = df['Weather_Condition'].str.contains('|'.join(lst), na=False)
low_vis_by_distance = df["Visibility(mi)"] < 0.001  # a missing visibility compares as False
df['Low_Vis_Weather'] = np.where(low_vis_by_text | low_vis_by_distance, True, False)

In [None]:
# Boolean feature: True when the road was likely wet, i.e. the weather description
# is rain/storm related OR at least 0.01 in of precipitation was recorded at the
# time of the crash.
# The two conditions are built separately because `|` binds tighter than `>=`:
# written inline without parentheses, `text_match | precipitation >= 0.01` is
# evaluated as `(text_match | precipitation) >= 0.01`.
lst = ['Rain', 'Drizzle', 'Thunderstorm', 'T-Storm', 'Precipitation']
rainy_by_text = df['Weather_Condition'].str.contains('|'.join(lst), na=False)
rainy_by_amount = df['Precipitation(in)'] >= 0.01  # a missing amount compares as False
df['Rainy'] = np.where(rainy_by_text | rainy_by_amount, True, False)

In [None]:
# Check which columns to delete.
# Only df.columns is rendered here: a notebook cell shows just its last
# expression, so the df.head() result on the line before it is not displayed.
df.head()
df.columns

In [None]:
# Keep a single day/night column. The available measures were:
# Sun up/down based on the time of day
# Sun up/down based on street lights being needed or not
# Sun up/down based on being able to see the horizon at sea
# Sun up/down based on being able to see stars
# The three twilight columns are removed here, together with the raw
# Weather_Condition text column.
redundant_columns = ['Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight', 'Weather_Condition']
df = df.drop(columns=redundant_columns)
df.columns

In [None]:
# Reference for data conversion: show the current dtype of every column
df.dtypes

In [None]:
# Convert all boolean columns to 1/0 integer columns, and the remaining day/night
# column ('Day'/'Night') to 1/0 as well

bools = ['Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 
         'Traffic_Signal', 'Turning_Loop', 'Low_Vis_Weather', 'Snow_Ice', 'Rainy']

# Cast every boolean flag in a single astype call
df = df.astype({flag: int for flag in bools})

# Map the day/night labels exactly once. Series.map turns every value that is not
# a key of the mapping into NaN, so repeating the 'Day'/'Night' mapping (as happens
# when it sits inside the loop over `bools`) wipes the column on the second pass.
# The 1 -> 1 and 0 -> 0 entries keep the cell safe to re-run.
df['Sunrise_Sunset'] = df['Sunrise_Sunset'].map({'Day': 1, 'Night': 0, 1: 1, 0: 0})
df.dtypes

In [None]:
def slowdown_level(row):
    """Add up the traffic-slowing feature values of one row.

    ``row`` must support lookup by column name; the values stored under the
    ten feature names below are summed and the total is returned.
    """
    slowing_features = (
        "Bump", "Crossing", "Give_Way", "Junction", "Railway",
        "Roundabout", "Stop", "Traffic_Calming", "Traffic_Signal", "Turning_Loop",
    )
    return sum(row[feature] for feature in slowing_features)
# Store the per-row total returned by slowdown_level in a new column
# (axis=1 passes each row to the function), then show how often each total occurs
df['Qty_Slowing_Elements'] = df.apply(slowdown_level, axis=1)
df['Qty_Slowing_Elements'].value_counts()

In [None]:
# Reference for what to do next: number of missing values in each column
df.isnull().sum()

In [None]:
# Reference for feature engineering ideas: temperature range as (max, min).
# Series.max()/Series.min() skip missing values, whereas the Python built-ins
# max()/min() compare element by element and return an order-dependent result
# when the column contains NaN.
df['Temperature(F)'].max(), df['Temperature(F)'].min()

In [None]:
# Reference for feature engineering ideas: frequency of each wind chill value
df['Wind_Chill(F)'].value_counts(sort=True)

In [None]:
# Flag potentially freezing conditions: the temperature at accident time was at or
# below 32 F, OR the wind chill was at or below 32 F, OR the snow/ice flag is set

def freezing(row):
    """Return 1 when the row indicates potentially freezing conditions, else 0.

    ``row`` must support lookup by the keys "Temperature(F)", "Wind_Chill(F)"
    and "Snow_Ice". A NaN temperature or wind chill compares as not freezing.
    """
    cold_air = row["Temperature(F)"] <= 32
    cold_wind = row["Wind_Chill(F)"] <= 32
    snow_or_ice = row["Snow_Ice"] == 1
    return 1 if (cold_air | cold_wind | snow_or_ice) else 0
# Evaluate freezing() on every row (axis=1) and store the 1/0 result,
# then show how many rows fall into each class
df['Freezing'] = df.apply(freezing, axis=1)
df["Freezing"].value_counts()

In [None]:
# Due to a ctrl+z mishap, the individual column removals done along the way were
# lost; this cell removes them all at once as a quick fix (which column was removed
# at which point is no longer known)

# Report the number of missing values per column before dropping anything
for col_name in df.columns:
    print(col_name, df[col_name].isnull().sum())

# NOTE(review): 'Street' is among the dropped columns - confirm that is intended
obsolete_columns = ['ID', 'Source', 'Start_Time', 'End_Time', 'Start_Lat', 'Start_Lng', 'End_Lat', 'End_Lng',
                    'Weather_Timestamp', 'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)',
                    'Visibility(mi)', 'Wind_Direction', 'Wind_Speed(mph)', 'Precipitation(in)', 'Description',
                    'Street', 'City', 'State', 'Zipcode', 'Country', 'Timezone', 'Airport_Code']
df = df.drop(columns=obsolete_columns)

# Drop every row that still contains at least one missing value
df = df.dropna(how='any')
df.isnull().sum()


In [None]:
# Print the remaining number of missing values for each column
for col_name in df.columns:
    missing = df[col_name].isnull().sum()
    print(col_name, missing)

# Just to be safe, drop duplicate rows again
df = df.drop_duplicates()

In [None]:
# The sun up/down column ended up as a float; quick fix to make it an int
df["Sunrise_Sunset"] = df["Sunrise_Sunset"].astype(int)

# Print the min and max accident distance to decide whether to bin it by hand
distance_mi = df['Distance(mi)']
print(distance_mi.min(), distance_mi.max())

In [None]:
# Write the engineered feature table to CSV without the row index
OUTPUT_CSV = 'fenrir_feat_eng.csv'
df.to_csv(OUTPUT_CSV, index=False)

In [None]:
# Reference for thinking about CFD based division (We have 4 classes, and CFD only takes 2 at a time):
# number of rows in each Severity class
df['Severity'].value_counts()

In [None]:
# List the columns currently in the frame
df.columns

In [None]:
def feat_eng_class_merge(df, c1, c2, c3, c4):
    """Write one-vs-rest CSV splits of ``df`` for each of four Severity classes.

    For every class in turn, the Severity column of a copy of ``df`` is
    relabeled (that class -> 0, the three other classes -> 1) and two files
    are written to the current working directory:

    * ``feat_eng_class<target>.csv`` - rows whose relabeled Severity is not 1
    * ``feat_eng_class<a>_<b>_<c>_merge.csv`` - rows whose relabeled Severity is not 0

    Each split starts from the original Severity values. Relabeling the same
    column repeatedly does not work: after the first pass only 0/1 remain, so
    later passes find no rows of the next class and repeat the first split.

    Parameters:
        df: DataFrame with a 'Severity' column; it is not modified.
        c1, c2, c3, c4: the four Severity class labels.

    Returns:
        None. The result is the eight CSV files.
    """
    severity = df['Severity']

    # (target class, the other classes in the order used in the merge file name)
    splits = [
        (c1, (c2, c3, c4)),
        (c2, (c1, c3, c4)),
        (c3, (c2, c1, c4)),
        (c4, (c2, c3, c1)),
    ]

    for target, rest in splits:
        # Both masks come from the original labels, so the 0/1 codes written
        # below can never be mistaken for class labels
        is_target = severity == target
        is_rest = severity.isin(rest)

        relabeled = df.copy()
        relabeled['Severity'] = severity.mask(is_rest, 1).mask(is_target, 0)

        target_rows = relabeled[relabeled['Severity'] != 1]
        rest_rows = relabeled[relabeled['Severity'] != 0]

        target_rows.to_csv(f"feat_eng_class{target}.csv", index=False)
        rest_rows.to_csv(f"feat_eng_class{rest[0]}_{rest[1]}_{rest[2]}_merge.csv", index=False)

In [None]:
#feat_eng_class_merge(df, 1, 2, 3, 4)