## Forecasting congestion in October with Random Forest

In [1]:
## Importing libraries
import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.exceptions import DataConversionWarning
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV

import rf_modeling as rf_m

pd.set_option('display.max_columns', 50)
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

In [2]:
## Reading the data sets
## The data directory is defined once. Set the TRAFFIC_DATA_DIR environment
## variable to read the csv files from another location without editing the
## notebook; when it is unset the original location is used.
DATA_DIR = Path(os.environ.get(
    'TRAFFIC_DATA_DIR',
    '/Users/EvanCallaghan/Documents/Data_Science_Group/Analytics_Data_Science/Forecasting_Traffic_Flow/Data',
))
train = pd.read_csv(DATA_DIR / 'train.csv')
test = pd.read_csv(DATA_DIR / 'test.csv')

## Getting the 'time' variable in the proper format
## (a single format string shared by both data sets)
TIME_FORMAT = '%Y-%m-%d %H:%M:%S'
train['time'] = pd.to_datetime(train['time'], format = TIME_FORMAT)
test['time'] = pd.to_datetime(test['time'], format = TIME_FORMAT)

## Printing the first five observations
train.head()

Unnamed: 0,row_id,time,x,y,direction,congestion
0,0,1991-04-01,0,0,EB,70
1,1,1991-04-01,0,0,NB,49
2,2,1991-04-01,0,0,SB,24
3,3,1991-04-01,0,1,EB,18
4,4,1991-04-01,0,1,NB,60


### Variable Engineering:

In [3]:
## Extracting day, hour and minute, and changing direction to dummies.
## Both data sets go through the same helper so they always end up with the
## same set of columns, and re-running this cell does not duplicate the
## dummy columns.
def add_features(data, directions):
    """Return a new frame with calendar features and direction dummies added.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding a datetime 'time' column and a 'direction' column.
    directions : list
        Direction labels to encode. Exactly one dummy column per label is
        produced, in this order; a label that does not occur in `data`
        yields an all-zero column.

    Returns
    -------
    pandas.DataFrame
        `data` with 'day' (day of week as returned by `.dt.dayofweek`),
        'hour' and 'minute' columns plus the dummy columns appended.
    """
    ## Fixing the dummy columns to `directions` keeps train and test aligned
    ## even when one of them is missing a direction
    dummies = pd.get_dummies(data['direction']).reindex(columns = directions, fill_value = 0)

    ## Dropping dummy columns left over from a previous run of this cell
    ## before adding the time-based variables
    enriched = data.drop(columns = directions, errors = 'ignore').assign(
        day = data['time'].dt.dayofweek,
        hour = data['time'].dt.hour,
        minute = data['time'].dt.minute,
    )

    return pd.concat([enriched, dummies], axis = 1)


## The directions observed in the training data define the dummy columns
DIRECTIONS = sorted(train['direction'].dropna().unique())

train = add_features(train, DIRECTIONS)
test = add_features(test, DIRECTIONS)

### Modeling:

In [4]:
## Running the 'main_rf' routine of the rf_modeling script on both data sets;
## element 0 of what it returns is saved below as the validation results and
## element 1 as the test results
results = rf_m.main_rf(train, test)

## Writing each result set to its own csv file (row index left out)
for position, csv_name in enumerate(('results_validation.csv', 'results_test.csv')):
    results[position].to_csv(csv_name, index = False)

Working on direction: EB
Working on location: ( 0 , 0 )
Working on location: ( 0 , 1 )
Working on location: ( 0 , 2 )
Working on location: ( 0 , 3 )
Working on location: ( 1 , 0 )
Working on location: ( 1 , 1 )
Working on location: ( 1 , 2 )
Working on location: ( 1 , 3 )
Working on location: ( 2 , 0 )
Working on location: ( 2 , 1 )
Working on location: ( 2 , 2 )
Working on location: ( 2 , 3 )
Working on direction: NB
Working on location: ( 0 , 0 )
Working on location: ( 0 , 1 )
Working on location: ( 0 , 2 )
Working on location: ( 0 , 3 )
Working on location: ( 1 , 0 )
Working on location: ( 1 , 1 )
Working on location: ( 1 , 2 )
Working on location: ( 1 , 3 )
Working on location: ( 2 , 0 )
Working on location: ( 2 , 1 )
Working on location: ( 2 , 2 )
Working on location: ( 2 , 3 )
Working on direction: SB
Working on location: ( 0 , 0 )
Working on location: ( 0 , 1 )
Working on location: ( 0 , 2 )
Working on location: ( 0 , 3 )
Working on location: ( 1 , 0 )
Working on location: ( 1 ,

### Evaluation:

In [6]:
## Reading the newly created validation results data set
## (read back from disk, so this cell only needs 'results_validation.csv'
## to exist and the libraries from the import cell to be loaded)
validation = pd.read_csv('results_validation.csv')

## Computing the MAE of congestion predictions: observed 'congestion'
## against the predicted 'congestion_pred' column
validation_mae = mean_absolute_error(validation['congestion'], validation['congestion_pred'])
print('MAE of validation set predictions:', validation_mae)

MAE of validation set predictions: 6.725180149993974
