<h1>Feature Engineering and Predictions</h1>

<h3>Focus on feature selection, processing speed, model iteration</h3>
<ul>
    <li>Will create different training sets w/ feature mixes</li>
    <li>PyTorch, Numba, Parallelization, and Dask for processing speed</li>
    <li>Processing tutorials: 
        <a href = 'https://towardsdatascience.com/speed-up-your-algorithms-part-3-parallelization-4d95c0888748'>speeding up your algorithms</a>, 
        <a href = 'https://towardsdatascience.com/improving-random-forest-in-python-part-1-893916666cd'>improving random forest</a>
    </li>
    <li>Split train into 3 train/test splits, run models and compare results before running on final test</li>
</ul>

In [1]:
from importlib import reload #for changes in helpers
import time

import pandas as pd
import numpy as np
import scipy.stats

pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import matplotlib.pyplot as plt
import seaborn as sns


from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn import linear_model 
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

import helpers
reload(helpers)

sns.set()

In [2]:
# Read the raw train and test CSVs into pandas; the cells below add engineered
# columns to both frames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../geotab-data/train.csv', '../geotab-data/test.csv')
)

<h1>Target PCA</h1>

<ul>
    <li>Light summary eda</li>
    <li>Min max scaler and target PCA</li>
    <li>Not sure if this is necessary for purpose of this notebook</li>
    </ul>

In [3]:
# Positional slice of train (columns 12 up to, but not including, 27) from
# which the target columns are picked by name below.
targets = train.iloc[:, 12:27]

# Targets to predict: three percentiles each of stop time and distance to first stop.
total_time = targets.loc[:, ['TotalTimeStopped_p20', 'TotalTimeStopped_p50', 'TotalTimeStopped_p80']]
distance_to_first = targets.loc[:, ['DistanceToFirstStop_p20', 'DistanceToFirstStop_p50', 'DistanceToFirstStop_p80']]
target_cols = [*total_time.columns, *distance_to_first.columns]
p_targets = targets.loc[:, target_cols]

# Optional targets; selected here but not included in target_cols.
time_from_first = targets.loc[:, ['TimeFromFirstStop_p20', 'TimeFromFirstStop_p50', 'TimeFromFirstStop_p80']]

# Summary table of the six prediction targets, produced by the project helper.
print('Target Summaries:')
display(helpers.summarize(p_targets, True))

Target Summaries:


Unnamed: 0,name,dtypes,missing,unique,first_val,last_val,max,mean,median,stdev,entropy
0,TotalTimeStopped_p20,int64,0,172,0.0,0.0,273.0,1.731272,0.0,7.080017,0.92
1,TotalTimeStopped_p50,int64,0,264,0.0,0.0,343.0,7.681874,0.0,15.553418,2.7
2,TotalTimeStopped_p80,int64,0,403,0.0,0.0,689.0,22.948071,16.0,28.118134,5.06
3,DistanceToFirstStop_p20,float64,0,3479,0.0,0.0,1902.7,6.56445,0.0,28.003261,1.35
4,DistanceToFirstStop_p50,float64,0,7483,0.0,0.0,3099.5,28.255852,0.0,71.72009,4.16
5,DistanceToFirstStop_p80,float64,0,13267,0.0,0.0,4064.3,81.922639,60.4,152.68276,8.1


In [4]:
# Min-max scale every target to [0, 1] and store it on train as "<target>_minmax".
for target_name in target_cols:
    scaled_name = target_name + "_minmax"
    train[scaled_name] = preprocessing.minmax_scale(train[target_name], feature_range=(0, 1))

# Names of the scaled columns created by the loop above.
min_max_cols = [
    'TotalTimeStopped_p20_minmax',
    'TotalTimeStopped_p50_minmax',
    'TotalTimeStopped_p80_minmax',
    'DistanceToFirstStop_p20_minmax',
    'DistanceToFirstStop_p50_minmax',
    'DistanceToFirstStop_p80_minmax',
]

# Project the six scaled targets onto three principal components.
pca = PCA(n_components=3, random_state=5)
principalComponents = pca.fit_transform(train[min_max_cols])
principalDf = pd.DataFrame(principalComponents)

# Last expression of the cell: share of variance captured by each component.
pca.explained_variance_ratio_

array([0.66396904, 0.17536384, 0.07856878])

<h1>Feature Engineering</h1>

Time and day features

In [5]:
# Cyclical encoding of the hour via the project helper.
# NOTE(review): presumably this is what adds the Hour_sin / Hour_cos columns
# listed in final_features - confirm in helpers.date_cyc_enc.
train = helpers.date_cyc_enc(train, 'Hour', 24)
test = helpers.date_cyc_enc(test, 'Hour', 24)

# Time-of-day flags and their weekend interactions, built from each frame's
# OWN columns. The previous version combined test's time-of-day flags with
# train['Weekend']; train and test have different lengths, so the boolean mask
# was index-aligned against the wrong frame and the test interaction features
# were wrong.
for frame in (train, test):
    hour = frame['Hour']
    on_weekend = frame['Weekend'] == 1

    # Strict bounds, same as the original lambdas (for integer hours:
    # day = 8..17, morning = 7..9, "night" = 18..19).
    frame['is_day'] = ((hour > 7) & (hour < 18)).astype(int)
    frame['is_morning'] = ((hour > 6) & (hour < 10)).astype(int)
    frame['is_night'] = ((hour > 17) & (hour < 20)).astype(int)

    # Interaction of each time-of-day flag with the weekend indicator.
    frame['is_day_weekend'] = np.where((frame['is_day'] == 1) & on_weekend, 1, 0)
    frame['is_mor_weekend'] = np.where((frame['is_morning'] == 1) & on_weekend, 1, 0)
    frame['is_nig_weekend'] = np.where((frame['is_night'] == 1) & on_weekend, 1, 0)

Location and direction features

In [6]:
# Key each intersection by its id concatenated with the city name.
for frame in (train, test):
    frame["Intersec"] = frame["IntersectionId"].astype(str) + frame["City"]

# Integer-encode the key. The encoder is fitted on the distinct keys of train
# and test together, so transform() knows every key in either frame.
le = LabelEncoder()
all_intersections = pd.concat([train["Intersec"], test["Intersec"]]).drop_duplicates()
le.fit(all_intersections.values)

for frame in (train, test):
    frame["Intersec"] = le.transform(frame["Intersec"])

    # Road-type code derived from the street names by the project helper.
    frame['EntryType'] = frame['EntryStreetName'].apply(helpers.road_encode)
    frame['ExitType'] = frame['ExitStreetName'].apply(helpers.road_encode)

    # Replace the heading labels with the values in helpers.directions.
    frame['EntryHeading'] = frame['EntryHeading'].map(helpers.directions)
    frame['ExitHeading'] = frame['ExitHeading'].map(helpers.directions)

    # Raw (signed, unwrapped) difference of the mapped headings. The original
    # note reads this as the turn angle (0 = straight, 180 = U-turn);
    # NOTE(review): that depends on the values in helpers.directions - confirm.
    frame['diffHeading'] = frame['EntryHeading'].sub(frame['ExitHeading'])

    # 1 when entry and exit street names are equal, else 0.
    frame["same_str"] = frame["EntryStreetName"].eq(frame["ExitStreetName"]).astype(int)

Secondary features, monthly rainfall by city 

In [7]:
# Attach the average monthly rainfall of each row's city.
for frame in (train, test):
    # Lookup key: city name followed by the month number as text.
    frame['city_month'] = frame["City"] + frame["Month"].astype(str)
    # Map the key to its rainfall value from the project lookup; keys missing
    # from the lookup become NaN (standard Series.map behaviour).
    frame["average_rainfall"] = frame['city_month'].map(helpers.monthly_rainfall)

Replace city w/ dummy variables, can't run cells above after this

In [8]:
# One-hot encode City (get_dummies removes the original City column).
# Each frame is guarded separately so that re-running the cell is a no-op for
# a frame that is already encoded. This replaces a blanket
# `except KeyError: pass`, which hid unrelated KeyErrors and could leave train
# encoded while test silently was not.
if 'City' in train.columns:
    train = pd.get_dummies(train, columns=['City'], prefix=['City'], drop_first=False)
if 'City' in test.columns:
    test = pd.get_dummies(test, columns=['City'], prefix=['City'], drop_first=False)

Scale lat and longitude

In [9]:
# Standardise the coordinates. The scaler is fitted on train only and the same
# fitted parameters are then applied to test.
# Open question from the original author: would min-max scaling behave differently?
scaler = preprocessing.StandardScaler()
for coord in ('Latitude', 'Longitude'):
    # sklearn expects a 2-D (n_samples, 1) array for a single feature.
    train_coord = train[coord].values.reshape(-1, 1)
    test_coord = test[coord].values.reshape(-1, 1)
    train[coord] = scaler.fit(train_coord).transform(train_coord)
    test[coord] = scaler.transform(test_coord)

In [10]:
# Remove the row id and the free-text path / street-name columns from both
# frames in place. Re-running this cell raises KeyError because the columns
# are already gone.
dropped_cols = ['RowId', 'Path', 'EntryStreetName', 'ExitStreetName']
for frame in (train, test):
    frame.drop(columns=dropped_cols, inplace=True)

In [11]:
# Model inputs, grouped by origin. The order is unchanged from the original list.
final_features = (
    # columns used as loaded (headings mapped, lat/long scaled in earlier cells)
    ['IntersectionId', 'Latitude', 'Longitude', 'EntryHeading',
     'ExitHeading', 'Hour', 'Weekend', 'Month']
    # time-of-day flags and weekend interactions
    + ['is_morning', 'is_night', 'is_day_weekend', 'is_mor_weekend', 'is_nig_weekend']
    # cyclical hour encoding (the raw 'Hour' is kept as well, see above)
    + ['Hour_sin', 'Hour_cos']
    # location / direction / rainfall features
    + ['same_str', 'Intersec', 'EntryType', 'ExitType',
       'diffHeading', 'average_rainfall', 'is_day']
    # city one-hot columns
    + ['City_Boston', 'City_Chicago', 'City_Philadelphia', 'City_Atlanta']
)

<h1>Model building</h1>
Useful Variables:
<ul>
    <li>final_features - list final set of features for prediction</li>
    <li>target_cols - list of targets to predict</li>
    <li>train - full train data set with derived features</li>
    <li>test - full test data set with derived features</li>
</ul>

In [12]:
# Report the (rows, columns) shape of both engineered frames.
print("Train dataset shape: ", train.shape, sep="")
print("Test dataset shape:  ", test.shape, sep="")

Train dataset shape: (857409, 48)
Test dataset shape:  (1920335, 27)


In [13]:
# Echo the feature and target column names used for modelling.
# (Nothing is built here; this cell only prints the two lists.)
print('Final Features: \n', final_features, end=' ')
print('\n\nTargets: \n', target_cols)

Final Features: 
 ['IntersectionId', 'Latitude', 'Longitude', 'EntryHeading', 'ExitHeading', 'Hour', 'Weekend', 'Month', 'is_morning', 'is_night', 'is_day_weekend', 'is_mor_weekend', 'is_nig_weekend', 'Hour_sin', 'Hour_cos', 'same_str', 'Intersec', 'EntryType', 'ExitType', 'diffHeading', 'average_rainfall', 'is_day', 'City_Boston', 'City_Chicago', 'City_Philadelphia', 'City_Atlanta'] 

Targets: 
 ['TotalTimeStopped_p20', 'TotalTimeStopped_p50', 'TotalTimeStopped_p80', 'DistanceToFirstStop_p20', 'DistanceToFirstStop_p50', 'DistanceToFirstStop_p80']


<h2>Modeling w/ H2O</h2>
<ul>
    <li>train test split simple with H2O</li>
</ul>

In [14]:
# H2O is imported here rather than in the import cell at the top of the notebook.
# h2o.init() attaches this session to a local H2O cluster; in the recorded
# output it connected to an instance already running at http://localhost:54321.
import h2o
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O cluster uptime:,5 hours 11 mins
H2O cluster timezone:,America/Chicago
H2O data parsing timezone:,UTC
H2O cluster version:,3.26.0.10
H2O cluster version age:,1 month and 4 days
H2O cluster name:,H2O_from_python_devonnavon_2eb76m
H2O cluster total nodes:,1
H2O cluster free memory:,6.791 Gb
H2O cluster total cores:,12
H2O cluster allowed cores:,12


<h2>Splitting Data</h2>
<ul>
    <li>og_train</li>
    <li>og_test</li>
    <li>final_features</li>
    <li>target_cols</li>
</ul>

In [15]:
# Upload both engineered pandas frames to the H2O cluster as H2OFrames.
h2_train, h2_test = (h2o.H2OFrame(frame) for frame in (train, test))

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [16]:
# Seeded split of the H2O training frame: about 75% of rows go to og_train and
# the remaining ~25% to og_test (ratios=[0.75]; the recorded shapes are
# 643207 / 214202 rows).
og_split = h2_train.split_frame(ratios=[0.75], seed=232)
og_train, og_test = og_split
print(og_train.shape, og_test.shape)

(643207, 48) (214202, 48)


<h3>H2O</h3>

<h3>Random Forest Model (first try)</h3>

In [17]:
from h2o.estimators.random_forest import H2ORandomForestEstimator

# Hyper-parameters of the random forest shared by all six target models.
# Early stopping (stopping_rounds=2) was tried and is currently disabled.
rf_params = dict(
    model_id="rf_covType_v1",
    ntrees=120,
    score_each_iteration=True,
    seed=12,
    nfolds=5,
    min_split_improvement=.0001,
)
rfe = H2ORandomForestEstimator(**rf_params)

In [18]:
# Fit the shared random-forest estimator on TotalTimeStopped_p20, then score the test frame.
# NOTE(review): training uses the full h2_train, not og_train, so og_test is
# not an unseen hold-out for this model.
rfe.train(x = final_features, y = 'TotalTimeStopped_p20', training_frame = h2_train, max_runtime_secs=0)
pred1 = rfe.predict(h2_test)

drf Model Build progress: |███████████████████████████████████████████████| 100%
drf prediction progress: |████████████████████████████████████████████████| 100%


In [19]:
# Re-fit the same estimator object on TotalTimeStopped_p50, then score the test frame.
# NOTE(review): the recorded run of this cell was cancelled by the user, so the
# saved notebook has no pred2 (nor outputs for the cells after it).
rfe.train(x = final_features, y = 'TotalTimeStopped_p50', training_frame = h2_train, max_runtime_secs=0)
pred2 = rfe.predict(h2_test)

drf Model Build progress: |███████████████████████████████████████████████ (cancelled) 100%


H2OJobCancelled: Job<$03017f00000132d4ffffffff$_ba3600ed135752151d63d23ee3874814> was cancelled by the user.

In [None]:
# Re-fit the same estimator object on TotalTimeStopped_p80 (full h2_train), then score the test frame.
rfe.train(x = final_features, y = 'TotalTimeStopped_p80', training_frame = h2_train, max_runtime_secs=0)
pred3 = rfe.predict(h2_test)

In [None]:
# Re-fit the same estimator object on DistanceToFirstStop_p20 (full h2_train), then score the test frame.
rfe.train(x = final_features, y = 'DistanceToFirstStop_p20', training_frame = h2_train, max_runtime_secs=0)
pred4 = rfe.predict(h2_test)

In [None]:
# Re-fit the same estimator object on DistanceToFirstStop_p50 (full h2_train), then score the test frame.
rfe.train(x = final_features, y = 'DistanceToFirstStop_p50', training_frame = h2_train, max_runtime_secs=0)
pred5 = rfe.predict(h2_test)

In [None]:
# Re-fit the same estimator object on DistanceToFirstStop_p80 (full h2_train), then score the test frame.
rfe.train(x = final_features, y = 'DistanceToFirstStop_p80', training_frame = h2_train, max_runtime_secs=0)
pred6 = rfe.predict(h2_test)

In [None]:
# Pull the six H2O prediction frames back into pandas DataFrames, rebinding
# the same pred1..pred6 names.
pred1, pred2, pred3, pred4, pred5, pred6 = (
    h2o.as_list(h2o_pred, use_pandas=True)
    for h2o_pred in (pred1, pred2, pred3, pred4, pred5, pred6)
)

In [None]:
# Persist each per-target prediction frame to its own CSV (index included,
# as with the default to_csv settings).
pred_outputs = (
    (pred1, "../geotab-data/pred1.csv"),
    (pred2, "../geotab-data/pred2.csv"),
    (pred3, "../geotab-data/pred3.csv"),
    (pred4, "../geotab-data/pred4.csv"),
    (pred5, "../geotab-data/pred5.csv"),
    (pred6, "../geotab-data/pred6.csv"),
)
for pred_frame, out_path in pred_outputs:
    pred_frame.to_csv(out_path)

In [None]:
# Relabel the column of each prediction frame with its 0-based position among
# the six targets (pred1 -> 0, ..., pred6 -> 5). The assignment assumes each
# frame has exactly one column.
for target_idx, pred_frame in enumerate((pred1, pred2, pred3, pred4, pred5, pred6)):
    pred_frame.columns = [target_idx]

In [None]:
# Interleave the six per-target predictions row by row:
#   [row0_pred1, row0_pred2, ..., row0_pred6, row1_pred1, ...]
# i.e. the same order the original nested loop produced.
#
# The earlier version rebuilt all six NumPy arrays on every row iteration
# (loop-invariant work repeated once per test row) and collected length-1
# arrays instead of numbers. Here each frame is converted once, stacked as the
# columns of an (n_rows, 6) matrix, and flattened in row-major order into a
# list of plain floats.
prediction_matrix = np.column_stack(
    [np.asarray(pred_frame).ravel() for pred_frame in (pred1, pred2, pred3, pred4, pred5, pred6)]
)
predictions = prediction_matrix.ravel().tolist()

In [None]:
# Fill the sample submission's Target column with the interleaved predictions
# and write the submission file without the index.
# NOTE(review): assumes sample_submission.csv has one row per (test row, target)
# in the same order as `predictions` - confirm against the competition format.
submission = pd.read_csv('../geotab-data/sample_submission.csv').assign(Target=predictions)
submission.to_csv("../geotab-data/h2opredictions.csv", index=False)