In [1]:
from collections import defaultdict
from datetime import datetime
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sn
import yfinance as yf
from finta import TA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from tabulate import tabulate
from ta import add_all_ta_features
import xgboost as xgb

In [2]:
WINDOW = 8  # number of rows to look ahead to see what the price did
FETCH_INTERVAL = "60m"  # fetch data by interval (including intraday if period < 60 days)
# valid intervals: 1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo
# (optional, default is '1d')
INTERVAL = '2y'  # use "period" instead of start/end
# valid periods: 1d,5d,1mo,3mo,6mo,1y,2y,5y,10y,ytd,max
# (optional, default is '1mo')
symbol = 'AAPL'  # Symbol of the desired stock
ROWS_TO_PREDICT = 128
# one day 16 rows of data

In [3]:
data = pd.read_csv(
    'C:\\Users\\exomat\\Desktop\\repo\\magisterka_analiza\\data\\preprocess\\AAPL_16_21_04_2021 00_40_43_full.csv')

In [4]:
important_columns = ['open', 'high', 'low']

In [5]:

def calculate_diffs(diff_number, col_name):
    new_col_name = f'{col_name}_{diff_number}'
    data[new_col_name] = data[col_name].diff(diff_number)

In [6]:
for name in important_columns:
    for i in range(1,11):
        calculate_diffs(i,name)

data.head(10)

Unnamed: 0.1,Unnamed: 0,open,high,low,close,Adj Close,volume,close_pct,close_shift,class_column,...,low_1,low_2,low_3,low_4,low_5,low_6,low_7,low_8,low_9,low_10
0,0,56.22,56.22,55.5,55.6,55.6,0,,56.875,1,...,,,,,,,,,,
1,1,55.55,56.095,55.37,55.7,55.7,0,0.001799,58.4475,1,...,-0.13,,,,,,,,,
2,2,56.0325,56.4375,55.9375,56.2475,56.2475,0,0.009829,58.55,1,...,0.5675,0.4375,,,,,,,,
3,3,56.1325,56.75,56.0425,56.595,56.595,0,0.006178,59.2225,1,...,0.105,0.6725,0.5425,,,,,,,
4,4,56.55,58.1975,55.625,57.8125,57.8125,0,0.021513,58.3075,0,...,-0.4175,-0.3125,0.255,0.125,,,,,,
5,5,57.8375,59.525,56.8875,57.02,57.02,0,-0.013708,58.175,1,...,1.2625,0.845,0.95,1.5175,1.3875,,,,,
6,6,56.299999,56.75,54.82375,55.450001,55.450001,21473989,-0.027534,58.892525,1,...,-2.06375,-0.80125,-1.21875,-1.11375,-0.54625,-0.67625,,,,
7,7,55.465,55.889999,53.412498,53.767502,53.767502,11306818,-0.030343,60.434502,1,...,-1.411251,-3.475002,-2.212502,-2.630002,-2.525002,-1.957502,-2.087502,,,
8,8,53.762501,55.0975,53.1525,54.803925,54.803925,11355189,0.019276,60.172501,1,...,-0.259998,-1.671249,-3.735,-2.4725,-2.89,-2.785,-2.2175,-2.3475,,
9,9,54.823875,56.775002,54.4725,55.486252,55.486252,10671797,0.01245,60.645,1,...,1.32,1.060001,-0.35125,-2.415,-1.1525,-1.57,-1.465,-0.8975,-1.0275,


In [7]:
data = data.dropna()

In [8]:
# del (data['close'])
# del (data['open'])
# del (data['high'])
# del (data['volume'])
del (data['close_shift'])
data = data.dropna()
train_set = data.iloc[:-ROWS_TO_PREDICT]
train_set = train_set.iloc[:-WINDOW] # optional drop last n rows (avoid of data leak)
test_set =data.iloc[-ROWS_TO_PREDICT:]

In [10]:
data['class_column'].value_counts()

 1    1390
-1    1388
 0    1379
Name: class_column, dtype: int64

In [11]:
train_set

Unnamed: 0.1,Unnamed: 0,open,high,low,close,Adj Close,volume,close_pct,class_column,volume_adi,...,low_1,low_2,low_3,low_4,low_5,low_6,low_7,low_8,low_9,low_10
10,10,55.497501,56.161598,54.549999,55.264999,55.264999,7868884,-0.003988,1,-9.811428e+06,...,0.077499,1.397499,1.137501,-0.273750,-2.337501,-1.075001,-1.492501,-1.387501,-0.820001,-0.950001
11,11,55.247501,55.596424,54.255001,54.432499,54.432499,6766480,-0.015064,1,-1.478722e+07,...,-0.294998,-0.217499,1.102501,0.842503,-0.568748,-2.632499,-1.369999,-1.787499,-1.682499,-1.114999
12,12,54.435001,56.700001,54.365250,55.994999,55.994999,8603180,0.028705,1,-1.137967e+07,...,0.110249,-0.184750,-0.107250,1.212749,0.952751,-0.458500,-2.522250,-1.259750,-1.677250,-1.572250
13,13,56.092500,56.222500,55.269924,56.000000,56.000000,0,0.000089,1,-1.137967e+07,...,0.904674,1.014923,0.719925,0.797424,2.117424,1.857426,0.446174,-1.617576,-0.355076,-0.772576
14,14,55.912500,56.250000,55.800026,56.200000,56.200000,0,0.003571,1,-1.137967e+07,...,0.530102,1.434776,1.545025,1.250027,1.327526,2.647526,2.387528,0.976276,-1.087474,0.175026
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4026,4026,119.000000,121.559898,118.790001,121.317001,121.317001,39941189,0.018700,-1,7.316685e+08,...,0.260001,2.430001,0.110001,-0.029999,0.310001,0.680001,2.290001,9.512146,2.680001,2.580002
4027,4027,121.315002,121.370003,120.050003,120.427101,120.427101,16707654,-0.007335,0,7.245069e+08,...,1.260002,1.520003,3.690003,1.370003,1.230003,1.570003,1.940003,3.550003,10.772148,3.940003
4028,4028,120.430000,121.235001,120.199997,121.103401,121.103401,11116493,0.005616,-1,7.327965e+08,...,0.149994,1.409996,1.669997,3.839997,1.519997,1.379997,1.719997,2.089997,3.699997,10.922142
4029,4029,121.099998,121.690002,120.750000,121.565002,121.565002,11146638,0.003812,-1,7.409786e+08,...,0.550003,0.699997,1.959999,2.220000,4.390000,2.070000,1.930000,2.270000,2.640000,4.250000


In [12]:
y = data['class_column']
features = [x for x in data.columns if x not in ['class_column']]
x = data[features]
scaler = MinMaxScaler()
# x = pd.DataFrame(scaler.fit_transform(x.values), columns=x.columns, index=x.index)
x_train= x.iloc[:-ROWS_TO_PREDICT]
y_train= y.iloc[:-ROWS_TO_PREDICT]
x_test =x.iloc[-ROWS_TO_PREDICT:]
y_test=y.iloc[-ROWS_TO_PREDICT:]

In [16]:
for i in range(2,20):
    model = xgb.XGBRFClassifier(nthread =-1,max_depth=i,num_parallel_tree=100,eta =0.4)
    model.fit(x_train,y_train)
    predicted_train = model.predict(x_train)
    predicted_test = model.predict(x_test)
    print("------------")
    print(f'max_depth: {i}')
    print(accuracy_score(y_train.values, predicted_train))
    print(accuracy_score(y_test.values, predicted_test))
    print("------------")

# for i in range(0,100):

model = xgb.sklearn.XGBRFClassifier(n_jobs=-1,max_depth=12,n_estimators =100,eta=0.4)
model.fit(x_train,y_train)
predicted_train = model.predict(x_train)
predicted_test = model.predict(x_test)
print("------------")
print(f'eta: ')
print(accuracy_score(y_train.values, predicted_train))
print(accuracy_score(y_test.values, predicted_test))
print("------------")



------------
max_depth: 2
0.513030528667163
0.4453125
------------
------------
max_depth: 3
0.5656490444278978
0.4453125
------------
------------
max_depth: 4
0.653512037726483
0.4453125
------------
------------
max_depth: 5
0.7614792752544055
0.421875
------------
------------
max_depth: 6
0.8255150161330355
0.46875
------------
------------
max_depth: 7
0.876644328617523
0.4921875
------------
------------
max_depth: 8
0.919831223628692
0.5078125
------------
------------
max_depth: 9
0.9431620749565649
0.5078125
------------
------------
max_depth: 10
0.9610325142715314
0.5234375
------------
------------
max_depth: 11
0.9751799453958798
0.515625
------------
------------
max_depth: 12
0.9843633655994043
0.5234375
------------
------------
max_depth: 13
0.9895755770662695
0.5078125
------------
------------
max_depth: 14
0.9930503847108464
0.515625
------------
------------
max_depth: 15
0.9940431868950111
0.5078125
------------
------------
max_depth: 16
0.9967733929014644
0.5
-

In [18]:
predicted_test


array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0,  0,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1], dtype=int64)

In [19]:
y_train.values

array([ 1,  1,  1, ..., -1, -1, -1], dtype=int64)

In [10]:
accuracy_score(y_train.values, predicted_train)


0.9965466206216083

In [11]:
accuracy_score(y_test.values, predicted_test)

0.4921875

In [12]:
model.feature_importances_

array([0.00570821, 0.00381237, 0.00671404, 0.00632185, 0.00920099,
       0.00962175, 0.00414626, 0.00327022, 0.01308581, 0.01473213,
       0.01021574, 0.0073754 , 0.00863127, 0.01194806, 0.01144959,
       0.00496418, 0.01368823, 0.01187784, 0.00808209, 0.01010779,
       0.01332559, 0.01146628, 0.00927289, 0.00666881, 0.00474349,
       0.0058435 , 0.01131136, 0.01040499, 0.01207635, 0.0089634 ,
       0.0068779 , 0.00707698, 0.0058231 , 0.01800122, 0.02046858,
       0.01484249, 0.01270077, 0.00760605, 0.01038803, 0.01082772,
       0.01151109, 0.01021596, 0.01260001, 0.01380788, 0.01478382,
       0.01742299, 0.01117344, 0.00926107, 0.01008793, 0.00659338,
       0.00702907, 0.00750393, 0.01071193, 0.01008036, 0.00740577,
       0.00641479, 0.0117988 , 0.01252222, 0.00990448, 0.01581678,
       0.01933273, 0.02334621, 0.01971269, 0.01980872, 0.02068326,
       0.00739919, 0.00838184, 0.01333402, 0.01592654, 0.02283322,
       0.00250278, 0.0035254 , 0.01184978, 0.01096373, 0.00779

In [13]:
# model = xgb.XGBRFClassifier(nthread =-1,max_depth=14,n_estimators=1000,
#                           eta =0.2)
# model.fit(x_train,y_train)
predicted_train = model.predict(x_train)
predicted_test = model.predict(x_test)
print("------------")
print(f'max_depth: {14}')
print(accuracy_score(y_train.values, predicted_train))
print(accuracy_score(y_test.values, predicted_test))
print("------------")

------------
max_depth: 14
0.9965466206216083
0.4921875
------------
