In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

In [None]:
# Load the raw CSV into a DataFrame; the path is relative to the notebook's
# working directory.
cars = pd.read_csv('./data/US_Accidents_Dec20.csv')

In [None]:
pd.set_option('display.max_columns', 100) # show up to 100 columns when a wide frame is displayed

## Data

In [None]:
# Lower-case every column name; all later cells refer to columns in lower case.
cars.columns = cars.columns.str.lower()

In [None]:
# Drop the columns that are not referenced by the later cells in this notebook.
# The result is reassigned instead of using `inplace=True`, so the statement
# reads as "new frame from old frame" and can be chained with further calls.
cars = cars.drop(columns = ['id','source','end_lat','end_lng','description','number','street','city',
                            'county','state','country','timezone','weather_timestamp'])

In [None]:
# Count the missing values in each column.
cars.isnull().sum()

tmc                         0
severity                    0
start_time                  0
end_time                    0
start_lat                   0
start_lng                   0
distance(mi)                0
side                        1
zipcode                  1291
airport_code             8961
temperature(f)              0
wind_chill(f)               0
humidity(%)                 0
pressure(in)                0
visibility(mi)              0
wind_direction              0
wind_speed(mph)             0
precipitation(in)           0
weather_condition           0
amenity                     0
bump                        0
crossing                    0
give_way                    0
junction                    0
no_exit                     0
railway                     0
roundabout                  0
station                     0
stop                        0
traffic_calming             0
traffic_signal              0
turning_loop                0
sunrise_sunset              0
civil_twil

In [None]:
cars.head()

In [None]:
FMT = '%H:%M:%S'


def clock_time_delta(s1, s2, fmt=FMT):
    """Return the time elapsed from clock string `s1` to clock string `s2`.

    The original cell referenced `datetime`, `s1` and `s2`, none of which are
    defined in this notebook (the module is imported as `dt`), so it raised
    NameError on a fresh kernel. Wrapping the expression in a function keeps
    the logic available without depending on hidden kernel state.

    Parameters
    ----------
    s1, s2 : str
        Clock times formatted according to `fmt`.
    fmt : str, optional
        `strptime` format used to parse both strings (default `FMT`).

    Returns
    -------
    datetime.timedelta
        `s2 - s1`. Both strings are parsed without a date, so the result is
        negative when `s2` is an earlier clock time than `s1`.

    Raises
    ------
    ValueError
        If either string does not match `fmt`.
    """
    return dt.datetime.strptime(s2, fmt) - dt.datetime.strptime(s1, fmt)


In [None]:
# Parse `start_time` into datetimes so the `.dt` accessor can be used on it.
cars['start_time'] = pd.to_datetime(cars['start_time'])

In [None]:
# Hour-of-day component of `start_time`.
cars['start_hour'] = cars['start_time'].dt.hour

In [None]:
# Month component of `start_time`.
cars['month'] = cars['start_time'].dt.month

In [None]:
# Parse `end_time` the same way as `start_time`.
cars['end_time'] = pd.to_datetime(cars['end_time'])

In [None]:
# Hour-of-day component of `end_time`.
cars['end_hour'] = cars['end_time'].dt.hour

In [None]:
range(len(cars))

range(0, 4229394)

In [None]:
# Print the first start timestamp. The previous version iterated over every
# row index only to act on index 0; the emptiness guard keeps the old
# behaviour of printing nothing when the frame has no rows.
if len(cars) > 0:
    print(cars['start_time'][0])

2016-02-08 05:46:00


In [None]:
# Index range of the `end_time` column. `len('end_time')` measured the
# eight-character string literal rather than the column itself.
range(len(cars['end_time']))

In [None]:
cars.head()

Unnamed: 0,tmc,severity,start_time,end_time,start_lat,start_lng,distance(mi),side,zipcode,airport_code,temperature(f),wind_chill(f),humidity(%),pressure(in),visibility(mi),wind_direction,wind_speed(mph),precipitation(in),weather_condition,amenity,bump,crossing,give_way,junction,no_exit,railway,roundabout,station,stop,traffic_calming,traffic_signal,turning_loop,sunrise_sunset,civil_twilight,nautical_twilight,astronomical_twilight,start_hour,month,end_hour,total_duration
0,201.0,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,0.01,1.0,45424,KFFO,36.9,54.894139,91.0,29.68,10.0,0.0,7.904503,0.02,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,2,11,314.0
1,201.0,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,0.01,0.0,43068-3402,KCMH,37.9,54.894139,100.0,29.65,10.0,0.0,7.904503,0.0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,6,2,6,30.0
2,201.0,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,0.01,1.0,45176,KI69,36.0,33.3,100.0,29.67,10.0,0.0,3.5,0.0,4,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,6,2,7,30.0
3,201.0,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,0.01,1.0,45417,KDAY,35.1,31.0,96.0,29.64,9.0,0.0,4.6,0.0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,7,2,7,30.0
4,201.0,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,0.01,1.0,45459,KMGY,36.0,33.3,89.0,29.65,6.0,0.0,3.5,0.0,2,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,1,7,2,8,30.0


In [None]:
def datetime_to_minutes(df):
    """Return each row's duration in minutes between `start_time` and `end_time`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with `start_time` and `end_time` columns holding datetimes (or
        values that `pd.to_datetime` can parse).

    Returns
    -------
    list of float
        `(end_time - start_time)` expressed in minutes, one entry per row in
        row order. A row with a missing timestamp yields NaN. An empty frame
        yields an empty list.
    """
    if len(df) == 0:
        return []

    # One vectorised subtraction replaces the per-row Python loop. It also no
    # longer looks rows up by the labels 0..n-1, so frames whose index was
    # filtered, shuffled or otherwise changed are handled correctly.
    elapsed = pd.to_datetime(df['end_time']) - pd.to_datetime(df['start_time'])
    return (elapsed.dt.total_seconds() / 60).tolist()

In [None]:
# Per-row duration in minutes (end_time - start_time), returned as a list.
total_duration = datetime_to_minutes(cars)

In [None]:
# Attach the durations as a new column; the list has one entry per row of `cars`.
cars['total_duration'] = total_duration

In [None]:
def bool_to_int(bool):
    """Encode a flag as an integer: 1 when it compares equal to True, else 0.

    Because the check is an equality test, the integer 1 is also encoded as 1;
    strings, None and NaN are encoded as 0.

    Note: the parameter name shadows the builtin `bool`; it is kept unchanged
    so existing keyword callers continue to work.
    """
    return 1 if bool == True else 0

In [None]:
def bool_to_int_columns(df, columns):
    """Replace each listed column of `df` with its 0/1 encoding.

    Every value is passed through `bool_to_int`. The frame is modified in
    place and also returned, so the call can be used in an assignment.
    """
    for column_name in columns:
        encoded = df[column_name].apply(bool_to_int)
        df[column_name] = encoded

    return df

In [None]:
# Encode the listed flag columns as 0/1 integers (via `bool_to_int`).
cars = bool_to_int_columns(cars,['amenity','bump','crossing','give_way','junction','no_exit','railway',
                          'roundabout','station','stop','traffic_calming','traffic_signal',
                          'turning_loop'])

In [None]:
def d_n_to_int(string):
    """Encode a day/night label: 1 for exactly 'Day', 0 for anything else.

    The comparison is case-sensitive, and every other value (including
    missing ones) is encoded as 0.
    """
    is_day = string == 'Day'
    return 1 if is_day else 0

In [None]:
def d_n_to_int_columns(df, columns):
    """Replace each listed column of `df` with its day/night encoding.

    Every value is passed through `d_n_to_int`. The frame is modified in
    place and also returned, so the call can be used in an assignment.
    """
    for column_name in columns:
        encoded = df[column_name].apply(d_n_to_int)
        df[column_name] = encoded

    return df

In [None]:
# Encode the four day/night columns as 1 ('Day') / 0 (anything else).
# NOTE: not idempotent. Running this cell a second time turns every value
# into 0, because the already-encoded integers no longer equal 'Day'.
cars = d_n_to_int_columns(cars,['sunrise_sunset','civil_twilight',
                                'nautical_twilight','astronomical_twilight'])

In [None]:
# Encode the side of the road as 1 ('R') / 0 ('L'). `Series.map` yields NaN for
# every value that is not a key of the mapping (missing entries or any other
# label), and NaNs make scikit-learn estimators reject the column. Fill them
# with the most frequent side, as is done for the other categorical columns,
# so `side` becomes a clean integer feature.
cars['side'] = cars['side'].map({'R': 1,'L': 0})
cars['side'] = cars['side'].fillna(cars['side'].mode()[0]).astype(int)

In [None]:
cars = pd.get_dummies(cars, columns = ['weather_condition','wind_direction'])

In [None]:
cars['weather_condition'] = cars['weather_condition'].fillna(cars['weather_condition'].mode()[0])

In [None]:
cars['precipitation(in)'] = cars['precipitation(in)'].fillna(cars['precipitation(in)'].mode()[0])

In [None]:
cars['wind_speed(mph)'] = cars['wind_speed(mph)'].fillna(cars['wind_speed(mph)'].mean())

In [None]:
cars['wind_direction'] = cars['wind_direction'].fillna(cars['wind_direction'].mode()[0])

In [None]:
cars['visibility(mi)'] = cars['visibility(mi)'].fillna(cars['visibility(mi)'].mean())

In [None]:
cars['pressure(in)'] = cars['pressure(in)'].fillna(cars['pressure(in)'].mean())

In [None]:
cars['humidity(%)'] = cars['humidity(%)'].fillna(cars['humidity(%)'].mean())

In [None]:
cars['wind_chill(f)'] = cars['wind_chill(f)'].fillna(cars['wind_chill(f)'].mean())

In [None]:
cars['temperature(f)'] = cars['temperature(f)'].fillna(cars['temperature(f)'].mean())

In [None]:
cars['tmc'] = cars['tmc'].fillna(cars['tmc'].mode()[0])

## Logistic Regression

In [None]:
# Columns used as model inputs.
features = ['distance(mi)','total_duration','precipitation(in)',
            'amenity','bump','crossing','give_way','junction',
            'no_exit','railway','roundabout','station','stop','traffic_calming',
            'traffic_signal','turning_loop','sunrise_sunset']

# Candidate features that are currently left out of the list above:
#             'temperature(f)','wind_chill(f)','humidity(%)','pressure(in)',
#             'visibility(mi)','wind_direction','wind_speed(mph)','precipitation(in)',
#             'weather_condition','amenity','bump','crossing','give_way','junction',
#             'no_exit','railway','roundabout','station','stop','traffic_calming',
#             'traffic_signal','turning_loop','sunrise_sunset']

# `side` is excluded because adding it makes the model fail with an
# "infinity or NaN" input error. Likely cause (TODO confirm): the
# `.map({'R': 1, 'L': 0})` encoding produces NaN for any value that is missing
# or not 'R'/'L', so the column must be imputed before it can be used.
X = cars[features]
y = cars['severity']

In [None]:
# Standardise the features, then fit a logistic regression.
# `max_iter` is raised from the default of 100: with the default, lbfgs stops
# at the iteration limit on these features before converging, so the fitted
# coefficients are not the optimum.
logr_pipe = Pipeline([
    ('sc', StandardScaler()),
    ('logr', LogisticRegression(max_iter = 1000))
])

In [None]:
# set parameters
logr_params = {
    'logr__solver': ['lbfgs','liblinear'],
}

In [None]:
# set up GridSearch
logr = GridSearchCV(
    estimator = logr_pipe,
    param_grid = logr_params,
    cv = 3,
    verbose = 1,
    n_jobs = 6
)

In [None]:
# fit GridSearch
logr.fit(X,y)

Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   2 out of   6 | elapsed:  7.4min remaining: 14.8min
[Parallel(n_jobs=6)]: Done   6 out of   6 | elapsed: 18.6min finished


In [None]:
# look at best parameters
logr.best_params_

In [None]:
# look at best scores
logr.best_score_

In [None]:
# turn results into a dataframe
logr_results = pd.DataFrame(logr.cv_results_)

In [None]:
# look at GridSearch results
logr_results

In [None]:
# Hold out a test set using train_test_split's default split size. The split is
# stratified on `y` so both parts keep the same class proportions, and the
# seed is fixed so the split is reproducible.
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state = 42, stratify = y)

In [None]:
# Learn the scaling statistics from the training split only, then apply that
# same scaling to both splits so no test-set information leaks into training.
sc = StandardScaler().fit(X_train)
Z_train, Z_test = sc.transform(X_train), sc.transform(X_test)

In [None]:
# The default `max_iter=100` is not enough here: lbfgs reports
# "TOTAL NO. of ITERATIONS REACHED LIMIT" when fitted on the scaled training
# data, so the iteration budget is raised to let the solver converge.
logr2 = LogisticRegression(max_iter = 1000)

In [None]:
logr2.fit(Z_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [None]:
logr2.score(Z_train,y_train)

0.7118760925522809

In [None]:
logr2.score(Z_test,y_test)

0.7119399554924628

## Support Vector Regression (SVR)

In [None]:
# Standardise the features, then fit a support vector regressor.
# NOTE(review): SVR predicts a continuous value, whereas the logistic-regression
# section treats `severity` as a class label. Confirm that regression on
# severity is intended here.
svr_pipe = Pipeline([
    ('sc', StandardScaler()),
    ('svr', SVR())
])

In [None]:
# set parameters
svr_params = {
    'C' : np.linspace(.1,.5,1,5),
    'kernel':['linear', 'rbf', 'polynomial'],
    'degree':[1,2,3,9]
}

In [None]:
# set up GridSearch
svr = GridSearchCV(
    estimator = svr_pipe,
    param_grid = svr_params,
    cv = 5,
    verbose = 1,
    n_jobs = 6
)

In [None]:
# fit GridSearch
svr.fit(X,y)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.


ValueError: Invalid parameter C for estimator Pipeline(steps=[('sc', StandardScaler()), ('svr', SVR())]). Check the list of available parameters with `estimator.get_params().keys()`.

In [None]:
# look at best parameters
svr.best_params_

In [None]:
# look at best scores
svr.best_score_

In [None]:
# turn results into a dataframe
svr_results = pd.DataFrame(svr.cv_results_)

In [None]:
# look at GridSearch results
svr_results