In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import RandomForestRegressor as RFR
import scipy.stats as sp
from sklearn.model_selection import GridSearchCV as GS

## Previously, I built SVR/SVC models using GridSearch.

Now, I will attempt to build Random Forest models using the GridSearch function to optimise the models being built.

In [2]:
msft = pd.DataFrame.from_csv('MSFT.csv')
msft = msft.interpolate()
#changing the index to go from 1986 to 2016
msft = msft.reindex(index=msft.index[::-1])
investor = pd.DataFrame.from_csv('investor.csv')
investor = investor.fillna(investor.mean())
#changing the index to go from 1987 to 2016
investor = investor.reindex(index=investor.index[::-1])

In [3]:
#quick reminder of what the data looks like
msft.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Ex-Dividend,Split Ratio,Adj. Open,Adj. High,Adj. Low,Adj. Close,Adj. Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1986-03-13,25.5,29.25,25.5,28.0,3582600.0,0.0,1.0,0.060989,0.069957,0.060989,0.066968,1031789000.0
1986-03-14,28.0,29.5,28.0,29.0,1070000.0,0.0,1.0,0.066968,0.070555,0.066968,0.06936,308160000.0
1986-03-17,29.0,29.75,29.0,29.5,462400.0,0.0,1.0,0.06936,0.071153,0.06936,0.070555,133171200.0
1986-03-18,29.5,29.75,28.5,28.75,235300.0,0.0,1.0,0.070555,0.071153,0.068164,0.068762,67766400.0
1986-03-19,28.75,29.0,28.0,28.25,166300.0,0.0,1.0,0.068762,0.06936,0.066968,0.067566,47894400.0


In [4]:
investor.head()

Unnamed: 0_level_0,Bullish,Neutral,Bearish,Total,Bullish 8-Week Mov Avg,Bull-Bear Spread,Bullish Average,Bullish Average + St. Dev,Bullish Average - St. Dev,S&P 500 Weekly High,S&P 500 Weekly Low,S&P 500 Weekly Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1987-06-26,0.384424,0.312702,0.302874,0.999999,0.384009,0.08155,0.386014,0.489178,0.28285,1038.23181,1011.831541,1026.840289
1987-07-17,0.384424,0.312702,0.302874,0.999999,0.384009,0.08155,0.386014,0.489178,0.28285,314.59,307.63,314.59
1987-07-24,0.36,0.5,0.14,1.0,0.384009,0.22,0.386014,0.489178,0.28285,311.39,307.81,309.27
1987-07-31,0.26,0.48,0.26,1.0,0.384009,0.0,0.386014,0.489178,0.28285,318.66,310.65,318.66
1987-08-07,0.56,0.15,0.29,1.0,0.384009,0.27,0.386014,0.489178,0.28285,323.0,316.23,323.0


In [14]:
def evaluateModel(prediction,actual):    
    prediction.plot(color='red', label='Prediction')
    actual.plot(color='blue', label='Actual')
    plt.legend(loc='best')
    prediction = np.array(prediction.tolist())
    actual = np.array(actual.tolist())
    print('Pearson Correlation Coeefiecient: ', sp.pearsonr(actual, prediction))
    error2 = [((x-y)/y)**2 for x, y in zip(prediction, actual)]
    print ("Normalised RMSE",np.sqrt(np.sum(error2)))

In [5]:
pre2006 = investor['1987':'2005']
trainData = pre2006[['Bullish','Neutral','Bearish']]
pre2006 = msft.loc[trainData.index]
pre2006 = pre2006.interpolate()
trainOutput = pre2006.Open

In [15]:
investor2006 = investor['2006']
testData = investor2006[['Bullish','Neutral','Bearish']]
msft2006 = msft.loc[testData.index].interpolate()
testOutput = msft2006.Open

In [8]:
params =[ {'criterion':[ 'mae', 'mse'],
          'max_features':['auto', 'sqrt', 'log2'],
          'min_samples_split':[2,3],
          'bootstrap':[False],
          'oob_score':[False],
          'min_samples_leaf':[1,2,3],
          'n_jobs':[6]},
         {'criterion':[ 'mae', 'mse'],
          'max_features':['auto', 'sqrt', 'log2'],
          'min_samples_split':[2,3],
          'bootstrap':[True],
          'oob_score':[True],
          'min_samples_leaf':[1,2,3],
          'n_jobs':[6]} 
        ]

In [9]:
randomForestRegression = RFR()
gridsearch = GS(randomForestRegression, param_grid=params)
gridsearch.fit(trainData,trainOutput)
gridsearch.best_params_

{'bootstrap': True,
 'criterion': 'mae',
 'max_features': 'sqrt',
 'min_samples_leaf': 3,
 'min_samples_split': 2,
 'n_jobs': 6,
 'oob_score': True}

In [12]:
randomForestRegression = RFR(
 n_estimators=10,
 bootstrap= True,
 criterion= 'mae',
 max_features= 'sqrt',
 min_samples_leaf= 3,
 min_samples_split= 2,
 n_jobs= 6,
 oob_score= True)

In [13]:
randomForestRegression.fit(trainData,trainOutput)
prediction = randomForestRegression.predict(testData)
prediction = pd.Series(prediction, testData.index)

RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=None,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=3,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=6, oob_score=True, random_state=None,
           verbose=0, warm_start=False)