In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

In [None]:
cars = pd.read_csv('./data/US_Accidents_Dec20.csv')

In [None]:
pd.set_option('display.max_columns', 150)

## Data

In [None]:
cars.columns = cars.columns.str.lower()

In [None]:
cars.drop(columns = ['id','source','end_lat','end_lng','description','number','street','city',
                     'county','state','country','timezone','weather_timestamp'], inplace = True)

In [None]:
cars.isnull().sum()

In [None]:
cars.head()

In [None]:
cars['start_time'] = pd.to_datetime(cars['start_time'])

In [None]:
cars['start_hour'] = cars['start_time'].dt.hour

In [None]:
cars['month'] = cars['start_time'].dt.month

In [None]:
cars['end_time'] = pd.to_datetime(cars['end_time'])

In [None]:
cars['end_hour'] = cars['end_time'].dt.hour

In [None]:
cars.head()

In [None]:
def datetime_to_minutes(df):
    total_duration = []
    
    for x in range(len(df)-1):
        total_duration.append(((df['end_time'][x] - df['start_time'][x]).total_seconds())/60)
        
    return total_duration

In [None]:
total_duration = datetime_to_minutes(cars)

In [None]:
total_duration.append(((cars['end_time'][4229393] - cars['start_time'][4229393]).total_seconds())/60)

In [None]:
cars['total_duration'] = total_duration

In [None]:
def bool_to_int(bool):
    """Return 1 when the value compares equal to ``True``, otherwise 0.

    The parameter name shadows the built-in ``bool``; it is kept as-is so
    callers passing it by keyword keep working.
    """
    return 1 if bool == True else 0

In [None]:
def bool_to_int_columns(df, columns):
    """Convert each listed column of ``df`` to 0/1 via ``bool_to_int``.

    The frame is modified in place and also returned, so the call can be
    used either as a statement or in an assignment.
    """
    for flag_col in columns:
        encoded = df[flag_col].apply(bool_to_int)
        df[flag_col] = encoded

    return df

In [None]:
# Road-feature indicator columns to turn into 0/1 integers.
road_flag_columns = [
    'amenity', 'bump', 'crossing', 'give_way', 'junction', 'no_exit', 'railway',
    'roundabout', 'station', 'stop', 'traffic_calming', 'traffic_signal',
    'turning_loop',
]
cars = bool_to_int_columns(cars, road_flag_columns)

In [None]:
def d_n_to_int(string):
    """Return 1 for the exact label ``'Day'`` and 0 for anything else.

    "Anything else" includes ``'Night'`` as well as missing values, which
    therefore end up encoded as 0.
    """
    is_day = (string == 'Day')
    return 1 if is_day else 0

In [None]:
def d_n_to_int_columns(df, columns):
    """Encode each listed day/night column of ``df`` as 0/1 via ``d_n_to_int``.

    The frame is modified in place and also returned.
    """
    for light_col in columns:
        encoded = df[light_col].apply(d_n_to_int)
        df[light_col] = encoded

    return df

In [None]:
# Day/Night label columns to turn into 1 (Day) / 0 (anything else).
daylight_columns = [
    'sunrise_sunset', 'civil_twilight',
    'nautical_twilight', 'astronomical_twilight',
]
cars = d_n_to_int_columns(cars, daylight_columns)

In [None]:
cars['side'] = cars['side'].map({'R': 1,'L': 0})

In [None]:
cars['wind_direction'] = cars['wind_direction'].fillna(cars['wind_direction'].mode()[0])

In [None]:
cars['weather_condition'] = cars['weather_condition'].fillna(cars['weather_condition'].mode()[0])

In [None]:
cars = pd.get_dummies(cars, columns = ['weather_condition','wind_direction'])

In [None]:
cars['precipitation(in)'] = cars['precipitation(in)'].fillna(cars['precipitation(in)'].mode()[0])

In [None]:
cars['wind_speed(mph)'] = cars['wind_speed(mph)'].fillna(cars['wind_speed(mph)'].mean())

In [None]:
cars['wind_direction'] = cars['wind_direction'].fillna(cars['wind_direction'].mode()[0])

In [None]:
cars['visibility(mi)'] = cars['visibility(mi)'].fillna(cars['visibility(mi)'].mean())

In [None]:
cars['pressure(in)'] = cars['pressure(in)'].fillna(cars['pressure(in)'].mean())

In [None]:
cars['humidity(%)'] = cars['humidity(%)'].fillna(cars['humidity(%)'].mean())

In [None]:
cars['wind_chill(f)'] = cars['wind_chill(f)'].fillna(cars['wind_chill(f)'].mean())

In [None]:
cars['temperature(f)'] = cars['temperature(f)'].fillna(cars['temperature(f)'].mean())

In [None]:
cars['tmc'] = cars['tmc'].fillna(cars['tmc'].mode()[0])

In [None]:
cars.head()

## Logistic Regression

In [None]:
# Predictors for the severity model, grouped by theme and concatenated in a fixed order.
extent_features = ['distance(mi)', 'total_duration', 'precipitation(in)']
road_flag_features = [
    'amenity', 'bump', 'crossing', 'give_way', 'junction',
    'no_exit', 'railway', 'roundabout', 'station', 'stop', 'traffic_calming',
    'traffic_signal', 'turning_loop',
]
features = extent_features + road_flag_features + ['sunrise_sunset']

# Candidate predictors not used yet: 'temperature(f)', 'wind_chill(f)', 'humidity(%)',
# 'pressure(in)', 'visibility(mi)', 'wind_speed(mph)'. 'wind_direction' and
# 'weather_condition' only exist as one-hot indicator columns after encoding.
#
# 'side' is also left out: adding it makes the model fail with an infinity/NaN
# input error. NOTE(review): the 'R'/'L' mapping yields NaN for any other raw
# value, which is presumably the cause -- confirm with cars['side'].isna().sum().
X = cars.loc[:, features]
y = cars.loc[:, 'severity']

In [None]:
logr_pipe = Pipeline([
    ('sc', StandardScaler()),
    ('logr', LogisticRegression())
])

In [None]:
# set parameters
logr_params = {
    'logr__solver': ['lbfgs','liblinear'],
}

In [None]:
# set up GridSearch
logr = GridSearchCV(
    estimator = logr_pipe,
    param_grid = logr_params,
    cv = 3,
    verbose = 1,
    n_jobs = 6
)

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state = 42, stratify = y)

In [None]:
sc = StandardScaler()
Z_train = sc.fit_transform(X_train)
Z_test = sc.transform(X_test)

In [None]:
logr2 = LogisticRegression()

In [None]:
logr2.fit(Z_train,y_train)

In [None]:
logr2.score(Z_train,y_train)

In [None]:
logr2.score(Z_test,y_test)

In [None]:
# fit GridSearch
logr.fit(X,y)

In [None]:
# look at best parameters
logr.best_params_

In [None]:
# look at best scores
logr.best_score_

In [None]:
# turn results into a dataframe
logr_results = pd.DataFrame(logr.cv_results_)

In [None]:
# look at GridSearch results
logr_results

## Support Vector Regression (SVR)

In [None]:
# Standardise the features, then fit a support-vector regressor.
svr_steps = [('sc', StandardScaler()), ('svr', SVR())]
svr_pipe = Pipeline(steps=svr_steps)

In [None]:
# set parameters
# Keys need the 'svr__' prefix so GridSearchCV routes them to the 'svr' step
# of the pipeline; bare names such as 'C' are rejected as invalid parameters.
svr_params = {
    # five evenly spaced values from 0.1 to 0.5 (the third argument is `num`)
    'svr__C': np.linspace(.1, .5, 5),
    # scikit-learn spells the polynomial kernel 'poly'
    'svr__kernel': ['linear', 'rbf', 'poly'],
    # degree only affects the 'poly' kernel; the other kernels ignore it
    'svr__degree': [1, 2, 3, 9]
}

In [None]:
# 5-fold cross-validated grid search over the SVR pipeline, using 6 parallel jobs.
svr = GridSearchCV(svr_pipe, svr_params, cv=5, n_jobs=6, verbose=1)

In [None]:
# fit GridSearch
svr.fit(X,y)

In [None]:
# look at best parameters
svr.best_params_

In [None]:
# look at best scores
svr.best_score_

In [None]:
# turn results into a dataframe
svr_results = pd.DataFrame(svr.cv_results_)

In [None]:
# look at GridSearch results
svr_results