# Random Forest Regressor

In [2]:
import pandas as pd
import numpy as np
import random

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.utils import shuffle
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, MinMaxScaler, PolynomialFeatures, PowerTransformer
from sklearn.model_selection import train_test_split

from datetime import datetime

from jupyterthemes import jtplot
jtplot.style(figsize=(15, 9))

In [7]:
df = pd.read_csv('../data/spy_2002-1-1_2022-12-31_d.csv', index_col=0)
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2001-12-31,77.603947,77.764298,76.321128,76.367897,14619500
2002-01-02,76.909097,77.336703,76.040518,77.189713,18651900
2002-01-03,77.269879,78.138452,77.196383,78.064957,15743000
2002-01-04,78.285449,78.826642,77.871208,78.586113,20140700
2002-01-07,78.639571,78.833331,77.877897,78.031570,13106500
...,...,...,...,...,...
2022-12-23,378.206578,381.603617,376.592742,381.454193,59857300
2022-12-27,381.334651,381.693267,378.206574,379.949921,51638200
2022-12-28,379.880163,381.932359,374.988858,375.227936,70911500
2022-12-29,378.186666,382.888722,377.638738,381.982178,66970900


In [124]:
close = df[['Close', 'Open', 'Volume']]

close['future_price'] = close['Close'].shift(-1)
close

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


Unnamed: 0_level_0,Close,Open,Volume,future_price
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2001-12-31,76.367897,77.603947,14619500,77.189713
2002-01-02,77.189713,76.909097,18651900,78.064957
2002-01-03,78.064957,77.269879,15743000,78.586113
2002-01-04,78.586113,78.285449,20140700,78.031570
2002-01-07,78.031570,78.639571,13106500,77.851166
...,...,...,...,...
2022-12-23,381.454193,378.206578,59857300,379.949921
2022-12-27,379.949921,381.334651,51638200,375.227936
2022-12-28,375.227936,379.880163,70911500,381.982178
2022-12-29,381.982178,378.186666,66970900,380.975983


In [125]:
close.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [126]:
close

Unnamed: 0_level_0,Close,Open,Volume,future_price
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2001-12-31,76.367897,77.603947,14619500,77.189713
2002-01-02,77.189713,76.909097,18651900,78.064957
2002-01-03,78.064957,77.269879,15743000,78.586113
2002-01-04,78.586113,78.285449,20140700,78.031570
2002-01-07,78.031570,78.639571,13106500,77.851166
...,...,...,...,...
2022-12-22,379.272491,381.593619,100120900,381.454193
2022-12-23,381.454193,378.206578,59857300,379.949921
2022-12-27,379.949921,381.334651,51638200,375.227936
2022-12-28,375.227936,379.880163,70911500,381.982178


In [158]:
close[
    ((close['Close']>=189.63814441) & (close['Close']<190.63814441))
    |
    ((close['Open']>=189.63814441) & (close['Open']<190.63814441))
]

Unnamed: 0_level_0,Close,Open,Volume,future_price
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-07-12,190.759445,190.386714,101275600,190.732864
2016-09-16,190.313736,190.41185,155236400,190.349457
2016-09-19,190.349457,190.991656,80250500,190.358398
2016-09-20,190.358398,191.241427,69665300,192.499008
2016-10-11,190.367294,192.356336,130367400,190.617096
2016-10-12,190.617096,190.510054,73866100,189.992691
2016-10-13,189.992691,189.234548,101357000,190.09079
2016-10-14,190.09079,191.00949,93346200,189.430786
2016-10-17,189.430786,190.064058,58275700,190.617096
2016-10-18,190.617096,191.089825,76869700,191.125443


In [127]:
training_size = int(len(close) * 0.70)
train_data = close.iloc[:training_size]
test_data = close.iloc[training_size:]

In [128]:
X_train = train_data[['Close', 'Open', 'Volume']]
y_train = train_data['future_price']

X_test = test_data[['Close', 'Open', 'Volume']]
y_test = test_data['future_price']

In [136]:
regr = RandomForestRegressor(
            n_estimators=10,
            max_depth=30,
            min_samples_split=50,
            min_samples_leaf=50,
            random_state=1,
#             verbose=2
)
regr.fit(X_train, y_train)

RandomForestRegressor(max_depth=30, min_samples_leaf=50, min_samples_split=50,
                      n_estimators=10, random_state=1)

In [137]:
prediction = regr.predict(X_test)

In [138]:
regr.score(X_test, y_test)

-2.1969010467920542

In [139]:
prediction

array([189.63814441, 189.63814441, 189.63814441, ..., 189.63814441,
       189.63814441, 189.63814441])

In [140]:
y_test

Date
2016-09-12    189.233017
2016-09-13    189.162033
2016-09-14    191.052353
2016-09-15    190.313736
2016-09-16    190.349457
                 ...    
2022-12-22    381.454193
2022-12-23    379.949921
2022-12-27    375.227936
2022-12-28    381.982178
2022-12-29    380.975983
Name: future_price, Length: 1587, dtype: float64

In [141]:
pipeline = Pipeline([
    ('MinMaxScaler', MinMaxScaler()),
    ('robust', RobustScaler()),
    ('poly', PolynomialFeatures()),
    ('rf', RandomForestRegressor(
        n_estimators=20,
        max_depth=20,
        min_samples_split=5000,
        min_samples_leaf=5000,
        random_state=1,
#         verbose=2
    ))
])

In [142]:
pipeline.fit(X_train, y_train)

prediction = pipeline.predict(X_test)

r2 = r2_score(y_test, prediction)
print(f'R2 score: {r2}')

R2 score: -6.400968759399159


In [143]:
prediction

array([107.0314954, 107.0314954, 107.0314954, ..., 107.0314954,
       107.0314954, 107.0314954])