# Random Forests

In [125]:
import csv
import pandas as pd
import numpy as np
import sklearn
from sklearn import *
from sklearn.metrics import mean_squared_error,mean_absolute_error
from math import sqrt
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE

## Read the data

In [126]:
pd.set_option('display.max_columns', 65)

df = pd.read_csv('../Part1-EDA/clean_data.csv', low_memory=False)

## Divide Data into Train and Test Sets

In [128]:
from sklearn.model_selection import train_test_split

array=df.values
Y=array[:,30]
Y

array([0.0953, 0.0198, 0.006, ..., 0.725458273984, 0.043652112878300016,
       -0.0141783454295], dtype=object)

In [129]:
x1=array[:,2:6]
x2=array[:,7:12]
x3=array[:,13:17]
x4=array[:,19:30]  
x5=array[:,35:62]

X=np.hstack((x1,x2))
X=np.hstack((X,x3))
X=np.hstack((X,x4))
X=np.hstack((X,x5))

X[1]

array([2.0, 6.015460983643267, 1.0, 777.0, 0.0, 1.0, 1.0, 434.0, 0,
       34.272866, -119.19891100000001, 7200.0, 0.0, 4.0, 1.0, 1990.0, 1.0,
       0, 143809.0, 239679.0, 2015.0, 95870.0, 2581.3, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], dtype=object)

In [130]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2)

* Attempt 1: Defaults
* Attempt 2: n_estimators = 20
* Attempt 3: n_estimators = 100
* Attempt 4: n_estimators = 100, max_features=20
* Attempt 5: n_estimators = 200, max_features=20
* Attempt 6: max_features=20
* Attempt 7: n_estimators=60, max_features=15
* Attempt 8: n_estimators=20, max_features=15
* Attempt 9: max_features=10, oob_score = True

## Attempt 1: Defaults

In [131]:
forest = RandomForestRegressor()
forest.fit(X_train, Y_train)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [132]:
forest.feature_importances_

array([  1.62895245e-02,   2.50614406e-02,   1.13828664e-02,
         9.29332762e-02,   3.94072412e-03,   8.78477693e-03,
         2.63366298e-03,   1.81916788e-02,   5.66497503e-04,
         1.03667445e-01,   1.03149671e-01,   9.59837018e-02,
         2.58476677e-03,   1.87586167e-02,   5.32510475e-03,
         6.60175535e-02,   4.46218567e-03,   1.21058495e-03,
         1.07625803e-01,   8.03761096e-02,   5.50735878e-03,
         8.65859026e-02,   9.71533636e-02,   5.63753612e-03,
         8.91331772e-04,   1.35292892e-03,   7.97849447e-04,
         4.62350218e-03,   6.58055665e-03,   2.73516862e-03,
         0.00000000e+00,   2.98228322e-03,   3.11993464e-04,
         1.64564307e-05,   1.03981441e-03,   5.71372538e-06,
         2.63627384e-03,   6.63832940e-04,   3.62334465e-03,
         7.63128904e-08,   6.79600829e-08,   0.00000000e+00,
         1.60100628e-05,   0.00000000e+00,   4.75186046e-05,
         3.05633904e-05,   1.72683786e-03,   2.96732954e-03,
         3.61871127e-04,

In [133]:
ptrain = forest.predict(X_train)
ptest = forest.predict(X_test)

In [135]:
sqrt(mean_squared_error(Y_train, ptrain))

0.07524734357793231

In [136]:
sqrt(mean_squared_error(Y_test, ptest))

0.1817701199045024

In [138]:
mean_absolute_error(Y_train, ptrain)

0.034471238153067435

In [140]:
mean_absolute_error(Y_test, ptest)

0.084781642606118174

Strategies for improving the Random Forest result:
* Increase the number of estimators
* Decrease the maximum number of features
* OOB?

### Attempt 2: Increase Estimators to 20

In [141]:
forest = RandomForestRegressor(n_estimators=20)
forest.fit(X_train, Y_train)
ptrain = forest.predict(X_train)
ptest = forest.predict(X_test)

In [142]:
sqrt(mean_squared_error(Y_train, ptrain))

0.06887901410211016

In [143]:
sqrt(mean_squared_error(Y_test, ptest))

0.17712311438801023

In [144]:
mean_absolute_error(Y_train, ptrain)

0.031854303315568357

In [145]:
mean_absolute_error(Y_test, ptest)

0.080630532052561085

### Attempt 3: Increase Estimators to 100

In [146]:
forest = RandomForestRegressor(n_estimators=100)
forest.fit(X_train, Y_train)
ptrain = forest.predict(X_train)
ptest = forest.predict(X_test)

In [147]:
sqrt(mean_squared_error(Y_train, ptrain))

0.0628744213078077

In [148]:
sqrt(mean_squared_error(Y_test, ptest))

0.17355070857924645

In [149]:
mean_absolute_error(Y_train, ptrain)

0.028374439049071602

In [150]:
mean_absolute_error(Y_test, ptest)

0.075758775554391006

### Attempt 4: Set Max Features to 20

In [151]:
forest = RandomForestRegressor(n_estimators=100, max_features=20)
forest.fit(X_train, Y_train)
ptrain = forest.predict(X_train)
ptest = forest.predict(X_test)

In [152]:
sqrt(mean_squared_error(Y_train, ptrain))

0.06231003368713297

In [153]:
sqrt(mean_squared_error(Y_test, ptest))

0.17233929996339914

In [154]:
mean_absolute_error(Y_train, ptrain)

0.028146890727624125

In [155]:
mean_absolute_error(Y_test, ptest)

0.075140856886367888

### Attempt 5: Increase Estimators to 200

In [156]:
forest = RandomForestRegressor(n_estimators=200, max_features=20)
forest.fit(X_train, Y_train)
ptrain = forest.predict(X_train)
ptest = forest.predict(X_test)

In [158]:
sqrt(mean_squared_error(Y_train, ptrain))

0.06153575903117457

In [159]:
sqrt(mean_squared_error(Y_test, ptest))

0.17222008542317163

In [160]:
mean_absolute_error(Y_train, ptrain)

0.027465799248871089

In [161]:
mean_absolute_error(Y_test, ptest)

0.074336304576334195

## Attempt 6: Use Recursive Feature Selection

In [162]:
forest = RandomForestRegressor(n_estimators=65, max_features=20)
rfe = RFE(forest, n_features_to_select=20)
rfe.fit(X_train,Y_train)
rfe.ranking_ 

array([ 1,  1,  1,  1, 10,  1,  5,  1, 19,  1,  1,  1, 11,  1,  1,  1,  6,
       13,  1,  1,  2,  1,  1,  1, 12, 14, 18,  1,  1,  7, 32,  4, 22, 25,
       16, 27,  9, 21,  1, 28, 30, 31, 26, 29, 23, 24, 15,  3, 20,  8, 17])

In [163]:
ptrain = rfe.predict(X_train)
ptest = rfe.predict(X_test)

In [164]:
sqrt(mean_squared_error(Y_train, ptrain))

0.06335949129212153

In [165]:
sqrt(mean_squared_error(Y_test, ptest))

0.17403697038983645

In [166]:
mean_absolute_error(Y_train, ptrain)

0.029016171393873077

In [167]:
mean_absolute_error(Y_test, ptest)

0.076600176035496584

## Attempt 7: Use only the Rank 1 Features from RFE

In [168]:
x1=array[:,2:6]
x2=array[:,8:9]
x3=array[:,10:11]
x4=array[:,13:16]  
x5=array[:,19:22]
x6=array[:,24:26]
x7=array[:,27:30]
x8=array[:,38:40]
x9=array[:,49:50]

X=np.hstack((x1,x2))
X=np.hstack((X,x3))
X=np.hstack((X,x4))
X=np.hstack((X,x5))
X=np.hstack((X,x6))
X=np.hstack((X,x7))
X=np.hstack((X,x8))
X=np.hstack((X,x9))


X[1]

array([2.0, 6.015460983643267, 1.0, 777.0, 1.0, 434.0, 34.272866,
       -119.19891100000001, 7200.0, 4.0, 1.0, 1990.0, 143809.0, 239679.0,
       95870.0, 2581.3, 0, 1, 0, 0], dtype=object)

In [169]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2)

In [170]:
forest = RandomForestRegressor(n_estimators=100)
forest.fit(X_train, Y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [172]:
ptrain = forest.predict(X_train)
ptest = forest.predict(X_test)

In [173]:
sqrt(mean_squared_error(Y_train, ptrain))

0.06321974503615733

In [174]:
sqrt(mean_squared_error(Y_test, ptest))

0.16730388161123105

In [175]:
mean_absolute_error(Y_train, ptrain)

0.0286633582459302

In [176]:
mean_absolute_error(Y_test, ptest)

0.074580782033266541