# Random Forest Regression - Metro Traffic 

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

### EDA

In [4]:
data=pd.read_csv('data/metro.csv')
data.head()

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume
0,,288.28,0.0,0.0,40,Clouds,scattered clouds,2012-10-02 09:00:00,5545
1,,289.36,0.0,0.0,75,Clouds,broken clouds,2012-10-02 10:00:00,4516
2,,289.58,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 11:00:00,4767
3,,290.13,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 12:00:00,5026
4,,291.14,0.0,0.0,75,Clouds,broken clouds,2012-10-02 13:00:00,4918


In [5]:
data = data.drop(['weather_description'],axis=1)

In [6]:
import datetime
data['date_time'] = data['date_time'].apply(lambda x : int(x.split()[1].split(":")[0]))
data.head()

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,date_time,traffic_volume
0,,288.28,0.0,0.0,40,Clouds,9,5545
1,,289.36,0.0,0.0,75,Clouds,10,4516
2,,289.58,0.0,0.0,90,Clouds,11,4767
3,,290.13,0.0,0.0,90,Clouds,12,5026
4,,291.14,0.0,0.0,75,Clouds,13,4918


In [7]:
data_bkup = data.copy()

In [8]:
data = pd.get_dummies(data,drop_first=True)

In [9]:
data.head()

Unnamed: 0,temp,rain_1h,snow_1h,clouds_all,date_time,traffic_volume,holiday_Columbus Day,holiday_Independence Day,holiday_Labor Day,holiday_Martin Luther King Jr Day,...,weather_main_Clouds,weather_main_Drizzle,weather_main_Fog,weather_main_Haze,weather_main_Mist,weather_main_Rain,weather_main_Smoke,weather_main_Snow,weather_main_Squall,weather_main_Thunderstorm
0,288.28,0.0,0.0,40,9,5545,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
1,289.36,0.0,0.0,75,10,4516,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
2,289.58,0.0,0.0,90,11,4767,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
3,290.13,0.0,0.0,90,12,5026,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
4,291.14,0.0,0.0,75,13,4918,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False


In [10]:
data.isna().sum()

temp                                 0
rain_1h                              0
snow_1h                              0
clouds_all                           0
date_time                            0
traffic_volume                       0
holiday_Columbus Day                 0
holiday_Independence Day             0
holiday_Labor Day                    0
holiday_Martin Luther King Jr Day    0
holiday_Memorial Day                 0
holiday_New Years Day                0
holiday_State Fair                   0
holiday_Thanksgiving Day             0
holiday_Veterans Day                 0
holiday_Washingtons Birthday         0
weather_main_Clouds                  0
weather_main_Drizzle                 0
weather_main_Fog                     0
weather_main_Haze                    0
weather_main_Mist                    0
weather_main_Rain                    0
weather_main_Smoke                   0
weather_main_Snow                    0
weather_main_Squall                  0
weather_main_Thunderstorm

### Splitting X and Y

In [12]:
X = data.drop("traffic_volume",axis=1)
y = data['traffic_volume']

### Splitting test and train data

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,random_state=42) #0.67 data will be for training.

### RFR

In [16]:
from sklearn.model_selection import GridSearchCV #GridSearchCV is for parameter tuning
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor()

In [17]:
# n_estimators=[75,100] #number of decision trees in the forest, default = 100
# criterion=['mse'] #criteria for choosing nodes default = 'mse'
# max_depth=[3,5,10] #maximum number of nodes in a tree default = None (it will go till all possible nodes)
# #max_features = ['sqrt']
# parameters={'n_estimators': n_estimators,'criterion':criterion,'max_depth':max_depth} #this will undergo 2*1*3 = 6 iterations

In [18]:
n_estimators = [75, 100]  # number of decision trees in the forest, default = 100
criterion = ['squared_error', 'friedman_mse']  # valid criteria for RandomForestRegressor
max_depth = [3, 5, 10]  # maximum depth of the tree, default = None

parameters = {
    'n_estimators': n_estimators,
    'criterion': criterion,
    'max_depth': max_depth
}

# Perform grid search with cross-validation
RFR_reg = GridSearchCV(regressor, parameters)
RFR_reg.fit(X_train, y_train)

In [19]:
RFR_reg.best_params_

{'criterion': 'squared_error', 'max_depth': 10, 'n_estimators': 100}

In [29]:
regressor = RandomForestRegressor(criterion = "squared_error",max_depth = 10 , n_estimators = 100)
regressor.fit(X_train,y_train)
ypred = regressor.predict(X_test)

In [30]:
from sklearn.metrics import mean_squared_error #calculating MSE
MSE=mean_squared_error(ypred,y_test)
print("MSE:",MSE)

MSE: 837287.4315223212


In [31]:
import math
math.sqrt(MSE)

915.0341149499953

In [24]:
# END