In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV, RandomizedSearchCV


## Importing train and test data
train = pd.read_csv('/Users/gabrielvictorgomesferreira/Desktop/Analytics_Data_Science/train.csv')
test = pd.read_csv('/Users/gabrielvictorgomesferreira/Desktop/Analytics_Data_Science/test.csv')

## Puting time in the right format 
train['time'] = pd.to_datetime(train['time'], format = '%Y-%m-%d %H:%M:%S')
test['time'] = pd.to_datetime(test['time'], format = '%Y-%m-%d %H:%M:%S')

train.head()

Unnamed: 0,row_id,time,x,y,direction,congestion
0,0,1991-04-01,0,0,EB,70
1,1,1991-04-01,0,0,NB,49
2,2,1991-04-01,0,0,SB,24
3,3,1991-04-01,0,1,EB,18
4,4,1991-04-01,0,1,NB,60


In [51]:
test.head()

Unnamed: 0,row_id,time,x,y,direction
0,848835,1991-09-30 12:00:00,0,0,EB
1,848836,1991-09-30 12:00:00,0,0,NB
2,848837,1991-09-30 12:00:00,0,0,SB
3,848838,1991-09-30 12:00:00,0,1,EB
4,848839,1991-09-30 12:00:00,0,1,NB


In [52]:
test.describe()

Unnamed: 0,row_id,x,y
count,2340.0,2340.0,2340.0
mean,850004.5,1.138462,1.630769
std,675.644137,0.801649,1.089611
min,848835.0,0.0,0.0
25%,849419.75,0.0,1.0
50%,850004.5,1.0,2.0
75%,850589.25,2.0,3.0
max,851174.0,2.0,3.0


In [53]:
train.describe()

Unnamed: 0,row_id,x,y,congestion
count,848835.0,848835.0,848835.0,848835.0
mean,424417.0,1.138462,1.630769,47.815305
std,245037.70221,0.801478,1.089379,16.799392
min,0.0,0.0,0.0,0.0
25%,212208.5,0.0,1.0,35.0
50%,424417.0,1.0,2.0,47.0
75%,636625.5,2.0,3.0,60.0
max,848834.0,2.0,3.0,100.0


In [54]:
train.shape

(848835, 6)

In [55]:
test.shape

(2340, 5)

In [56]:
train['direction'].value_counts()

NB    156708
EB    156708
SB    156708
WB    143649
NE     91413
SW     91413
SE     26118
NW     26118
Name: direction, dtype: int64

In [57]:
test['direction'].value_counts()

SB    432
EB    432
NB    432
WB    396
SW    252
NE    252
SE     72
NW     72
Name: direction, dtype: int64

In [58]:
## Extracting day, hour and minute
train['day'] = train['time'].dt.dayofweek
train['hour'] = train['time'].dt.hour
train['minute'] = train['time'].dt.minute

test['day'] = test['time'].dt.dayofweek
test['hour'] = test['time'].dt.hour
test['minute'] = test['time'].dt.minute

## Changing direction to dummies
train = pd.concat([train, pd.get_dummies(train['direction'])], axis = 1)
test = pd.concat([test, pd.get_dummies(test['direction'])], axis = 1)

In [59]:
train.describe()

Unnamed: 0,row_id,x,y,congestion,day,hour,minute,EB,NB,NE,NW,SB,SE,SW,WB
count,848835.0,848835.0,848835.0,848835.0,848835.0,848835.0,848835.0,848835.0,848835.0,848835.0,848835.0,848835.0,848835.0,848835.0,848835.0
mean,424417.0,1.138462,1.630769,47.815305,2.988897,11.479057,20.012252,0.184615,0.184615,0.107692,0.030769,0.184615,0.030769,0.107692,0.169231
std,245037.70221,0.801478,1.089379,16.799392,2.006717,6.926522,16.329937,0.387985,0.387985,0.309992,0.172692,0.387985,0.172692,0.309992,0.374956
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,212208.5,0.0,1.0,35.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,424417.0,1.0,2.0,47.0,3.0,11.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,636625.5,2.0,3.0,60.0,5.0,17.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,848834.0,2.0,3.0,100.0,6.0,23.0,40.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [63]:
train.head()

Unnamed: 0,row_id,time,x,y,direction,congestion,day,hour,minute,EB,NB,NE,NW,SB,SE,SW,WB
0,0,1991-04-01,0,0,EB,70,0,0,0,1,0,0,0,0,0,0,0
1,1,1991-04-01,0,0,NB,49,0,0,0,0,1,0,0,0,0,0,0
2,2,1991-04-01,0,0,SB,24,0,0,0,0,0,0,0,1,0,0,0
3,3,1991-04-01,0,1,EB,18,0,0,0,1,0,0,0,0,0,0,0
4,4,1991-04-01,0,1,NB,60,0,0,0,0,1,0,0,0,0,0,0


## Modeling

In [65]:
# Defining input and target variable
X = train.drop(['congestion', 'row_id', 'direction', 'time'], axis = 1)
Y = train['congestion']

# Splitting the data
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size = 0.2)
X_val, X_test, Y_val, Y_test = train_test_split(X_val, Y_val, test_size = 0.2)

# Scaling the data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)
X_val = scaler.fit_transform(X_val)

In [66]:
## Defining the hyper-parameters for svm
svm_param_grid = {'kernel': ['rbf', 'poly', 'sigmoid'], 
                  'C': [0.01, 0.1, 1, 10],
                  'gamma': [0.001, 0.01, 0.1, 1]}


svm_grid_search = GridSearchCV(SVR(), svm_param_grid, cv = 3, scoring = 'neg_mean_squared_error', n_jobs = -1).fit(X_train, Y_train)

# Extracting the best model
svm_md = svm_grid_search.best_estimator_

# Predicting on validation and test
svm_val_pred = svm_md.predict(X_val)
svm_test_pred = svm_md.predict(X_test)

# Computing the mse on validation and test
svm_val_mse = mean_squared_error(Y_val, svm_val_pred)
svm_test_mse = mean_squared_error(Y_test, svm_test_pred)
print("The mse of the model Support Vector Machine Regressior on the validation dataset is: ", round(svm_val_mse, 1))
print("The mse of the model Support Vector Machine Regressior on the test dataset is: ", round(svm_test_mse, 1))

KeyboardInterrupt: 