In [None]:
import numpy as np
import os
import pandas as pd
import glob
from ast import literal_eval
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV

from matplotlib.pylab import plt #load plot library
# indicate the output of plotting function is printed to the notebook
%matplotlib inline 

from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Load the modeling table from the local HDF5 file and preview its first rows
data = pd.read_hdf(path_or_buf='random_forest.h5')
data.head(n=5)

### Data cleaning

In [None]:
# Remove the 'zip5' column so it is not carried into the feature matrix
data = data.drop(columns=['zip5'])

In [None]:
# Keep only rows dated before 2019, then discard the columns that are not
# used downstream (the timestamp itself is dropped once the filter is applied).
is_before_2019 = data['datetime'].dt.year < 2019
unused_columns = ['x', 'y', 'any_impute_col', 'impute_row', 'datetime']
data = data.loc[is_before_2019].drop(columns=unused_columns)

### Training and Validation Data Split

In [None]:
# Split data into training and test sets by row position (first ~70% / rest).
# The split is ordered, not shuffled: earlier rows train, later rows test.

end_ind = int(0.7*data.shape[0])

# Slice by POSITION (.iloc), not by label (.loc). `end_ind` is derived from the
# row count, but rows were filtered out earlier without resetting the index, so
# index labels are not guaranteed to be 0..n-1; label-based slicing could then
# produce a mis-sized split or fail outright.
# `end_ind + 1` keeps the same boundary the inclusive `.loc[0:end_ind]` slice
# gave on a gap-free 0..n-1 index.
split_pos = end_ind + 1
trainData, testData = data.iloc[:split_pos, :], data.iloc[split_pos:, :]
print(trainData.shape, testData.shape)

# Features: every column except the target
X_train = trainData.drop(["impact_score"], axis = 1)
X_test = testData.drop(["impact_score"], axis = 1)

# Target: the impact score
Y_train = trainData["impact_score"]
Y_test = testData["impact_score"]
print('train data: ', X_train.shape, Y_train.shape, '\ntest data: ', X_test.shape, Y_test.shape)

### Multiple Linear Regression

In [None]:
# Fit an ordinary least-squares baseline on the training set
regr = linear_model.LinearRegression()
regr.fit(X_train.values, Y_train.values)

# Raw (unclipped) predictions for the test set
y_pred_MLR_ori = regr.predict(X_test.values)

# Clip predictions to the score range. The previous code only reset values
# below -1 to 0, which left predictions in [-1, 0) negative; np.clip applies
# both bounds consistently and returns a new array, so y_pred_MLR_ori is
# left untouched.
# NOTE(review): assumes impact_score lies in [0, 35] (inferred from the
# original clipping thresholds) -- confirm against the data.
y_pred_MLR = np.clip(y_pred_MLR_ori.flatten(), 0, 35)

# Mean squared error on the test set
print('MLR Error: %.2f'
      % mean_squared_error(Y_test.values, y_pred_MLR))
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'
      % r2_score(Y_test.values, y_pred_MLR))

### Random Forest

In [None]:
# Random Forest Regression modeling - Parameter Tuning: n_est
# Sweeps the number of trees while holding depth and leaf size fixed.
# NOTE(review): settings are compared on the same test set that is used for the
# final error report, so the chosen value is optimistically biased; consider a
# separate validation split or cross-validation.
max_d = 10 #Max depth of the tree
min_samples = 100 #The minimum number of samples required to be at a leaf node.

n_est_dict_y_e = {}    # n_estimators -> test-set predictions
n_est_dict_mse_e = {}  # n_estimators -> test-set MSE

for n_est in range(40,220,40):  # n_est = number of trees in the forest
    # A fixed random_state makes the sweep reproducible, so differences between
    # settings reflect the hyperparameter rather than seed-to-seed noise.
    m1 = RandomForestRegressor(n_estimators = n_est, max_depth = max_d,
                               min_samples_leaf = min_samples, random_state = 42)
    m1.fit(X_train,Y_train)
    y_pred_RF = m1.predict(X_test)
    mse = metrics.mean_squared_error(Y_test, y_pred_RF)  # computed once, stored and printed
    n_est_dict_mse_e[n_est] = mse
    n_est_dict_y_e[n_est] = y_pred_RF
    print("RF Error:", mse)



In [None]:
# Random Forest Regression modeling - Parameter Tuning: max_d
# Sweeps the maximum tree depth while holding tree count and leaf size fixed.
# NOTE(review): settings are compared on the same test set that is used for the
# final error report, so the chosen value is optimistically biased; consider a
# separate validation split or cross-validation.
n_est = 40 #The number of trees in the forest.
min_samples = 100 #The minimum number of samples required to be at a leaf node.

# The dictionaries below are keyed by max_depth (the "n_est" prefix in their
# names is historical).
n_est_dict_y_d = {}    # max_depth -> test-set predictions
n_est_dict_mse_d = {}  # max_depth -> test-set MSE

for max_d in range(10,22,2):  # max_d = maximum depth of each tree
    # A fixed random_state makes the sweep reproducible, so differences between
    # settings reflect the hyperparameter rather than seed-to-seed noise.
    m1 = RandomForestRegressor(n_estimators = n_est, max_depth = max_d,
                               min_samples_leaf = min_samples, random_state = 42)
    m1.fit(X_train,Y_train)
    y_pred_RF = m1.predict(X_test)
    mse = metrics.mean_squared_error(Y_test, y_pred_RF)  # computed once, stored and printed
    n_est_dict_mse_d[max_d] = mse
    n_est_dict_y_d[max_d] = y_pred_RF
    print("RF Error:", mse)



#### Random Forest Hyperparameter Tuning

Varying number of trees (n_estimators)

n_estimators|max_depth|MSE|
---|---|---|
40|10|43.69|
80|10|43.69|
120|10|43.79|
160|10|43.46|
200|10|43.75|


Varying maximum tree depth (max_depth)

n_estimators|max_depth|MSE|
---|---|---|
40|10|43.70|
40|12|43.83|
40|14|42.69|
40|16|41.94|
40|18|43.46|
40|20|42.43|

In [None]:
# Random Forest Regression modeling - Final modeling
# Refit a single forest with the selected hyperparameters and report its
# mean squared error on the test set.
n_est = 160 #The number of trees in the forest.
max_d = 16 #Max depth of the tree
min_samples = 100 #The minimum number of samples required to be at a leaf node.

# A fixed random_state makes the reported error and the predictions plotted
# later reproducible across kernel restarts.
m1 = RandomForestRegressor(n_estimators = n_est, max_depth = max_d,
                           min_samples_leaf = min_samples, random_state = 42)
m1.fit(X_train,Y_train)
y_pred_RF = m1.predict(X_test)
print("RF Error:", metrics.mean_squared_error(Y_test, y_pred_RF))

### Result Visualization

In [None]:
#Join testing data: X_test, Y_test and both models' predictions into a single dataframe for visualization
y_vals = X_test.copy()

# The target Series is named `Y_test` (capital Y); the previous lowercase
# `y_test` was never defined and raised NameError on a fresh kernel.
# `.values` assigns positionally, the same way the two prediction arrays are.
y_vals['true_IS'] = Y_test.values
y_vals['pred_RF'] = y_pred_RF
y_vals['pred_MLR'] = y_pred_MLR

# Move the original row labels into an 'index' column and renumber rows from 0
y_vals = y_vals.reset_index()

In [None]:
# Plot true vs. predicted impact score for a single location,
# selected by exact latitude/longitude match (here lat = 47, long = -122).
zip_lat, zip_lon = 47, -122
at_location = (y_vals['lat'] == zip_lat) & (y_vals['lng'] == zip_lon)
data_zip = y_vals[at_location]

print(data_zip.shape)

fig, ax = plt.subplots(1, 1, figsize=(15, 5))
# Draw the three series in a fixed order: truth first, then each model
series_to_plot = (
    ('true_IS', 'True Value'),
    ('pred_RF', 'Random Forest'),
    ('pred_MLR', 'Regression'),
)
for column, legend_label in series_to_plot:
    ax.plot(data_zip[column], label=legend_label)
ax.set(xlabel='Index', ylabel='Impact Score')
ax.legend()
figure_title = 'lat ' + str(zip_lat) + ' & long ' + str(zip_lon)
fig.suptitle(figure_title, fontsize=15)