In [35]:
from collections import defaultdict
from datetime import datetime
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sn
import yfinance as yf
from finta import TA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from tabulate import tabulate
from ta import add_all_ta_features
import xgboost as xgb

In [36]:
WINDOW = 8  # number of rows to look ahead to see what the price did
FETCH_INTERVAL = "60m"  # fetch data by interval (including intraday if period < 60 days)
# valid intervals: 1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo
# (optional, default is '1d')
INTERVAL = '2y'  # use "period" instead of start/end
# valid periods: 1d,5d,1mo,3mo,6mo,1y,2y,5y,10y,ytd,max
# (optional, default is '1mo')
symbol = 'AAPL'  # Symbol of the desired stock
ROWS_TO_PREDICT = 128
# one day 16 rows of data

In [37]:
data = pd.read_csv(
    'C:\\Users\\exomat\\Desktop\\repo\\magisterka_analiza\\data\\preprocess\\AAPL_16_21_04_2021 00_40_43_full.csv')

In [38]:
# del (data['close'])
# del (data['open'])
# del (data['high'])
# del (data['volume'])
del (data['close_shift'])
data = data.dropna()
train_set = data.iloc[:-ROWS_TO_PREDICT]
train_set = train_set.iloc[:-WINDOW] # optional drop last n rows (avoid of data leak)
test_set =data.iloc[-ROWS_TO_PREDICT:]

In [39]:
data['class_column'].value_counts()

 1    1398
 0    1396
-1    1388
Name: class_column, dtype: int64

In [40]:
train_set

Unnamed: 0.1,Unnamed: 0,open,high,low,close,Adj Close,volume,close_pct,class_column,volume_adi,...,momentum_wr,momentum_ao,momentum_kama,momentum_roc,momentum_ppo,momentum_ppo_signal,momentum_ppo_hist,others_dr,others_dlr,others_cr
1,1,55.550000,56.095000,55.370000,55.700000,55.700000,0,0.001799,1,-0.000000e+00,...,-61.176471,0.000000,55.641486,0.000000,0.000000,0.000000,0.000000,0.179856,0.179695,0.179856
2,2,56.032500,56.437500,55.937500,56.247500,56.247500,0,0.009829,1,0.000000e+00,...,-17.798595,0.000000,55.895109,0.000000,0.000000,0.000000,0.000000,0.982944,0.978145,1.164568
3,3,56.132500,56.750000,56.042500,56.595000,56.595000,0,0.006178,1,0.000000e+00,...,-11.231884,0.000000,56.185689,0.000000,0.000000,0.000000,0.000000,0.617805,0.615905,1.789568
4,4,56.550000,58.197500,55.625000,57.812500,57.812500,0,0.021513,0,0.000000e+00,...,-13.616269,0.000000,56.833695,0.000000,0.000000,0.000000,0.000000,2.151250,2.128437,3.979317
5,5,57.837500,59.525000,56.887500,57.020000,57.020000,0,-0.013708,1,0.000000e+00,...,-60.288809,0.137792,56.907320,0.000000,0.000000,0.000000,0.000000,-1.370811,-1.380293,2.553957
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4042,4042,121.849998,121.949997,119.794998,119.930000,119.930000,34515975,-0.016806,1,6.971709e+08,...,-54.542259,1.144553,120.287005,-1.011100,4.174922,-23.072724,27.247646,-1.680603,-1.694886,115.701439
4043,4043,119.919998,120.489998,119.470001,120.429298,120.429298,18588318,0.004163,1,7.135468e+08,...,-51.346916,1.091641,120.288723,-1.064449,14.697685,-15.518642,30.216326,0.416325,0.415460,116.599458
4044,4044,120.419998,120.500000,119.794998,119.861000,119.861000,11044540,-0.004719,1,7.045703e+08,...,-54.983837,1.043391,120.278308,-1.006774,15.112980,-9.392318,24.505297,-0.471894,-0.473011,115.577338
4045,4045,119.867500,120.410004,119.550003,120.327003,120.327003,13045822,0.003888,1,7.150979e+08,...,-52.001570,0.866795,120.279079,-0.597271,16.801027,-4.153649,20.954675,0.388787,0.388033,116.415474


In [41]:
y = data['class_column']
features = [x for x in data.columns if x not in ['class_column']]
x = data[features]
scaler = MinMaxScaler()
# x = pd.DataFrame(scaler.fit_transform(x.values), columns=x.columns, index=x.index)
x_train= x.iloc[:-ROWS_TO_PREDICT]
y_train= y.iloc[:-ROWS_TO_PREDICT]
x_test =x.iloc[-ROWS_TO_PREDICT:]
y_test=y.iloc[-ROWS_TO_PREDICT:]

In [42]:
classifiers = dict()

classifiers['DecisionTreeClassifier 1'] = DecisionTreeClassifier(max_depth=10, random_state=0,criterion='gini',splitter='best')
classifiers['DecisionTreeClassifier 2'] = DecisionTreeClassifier(max_depth=10, random_state=0,criterion='gini',splitter='random')
classifiers['DecisionTreeClassifier 3'] = DecisionTreeClassifier(max_depth=10, random_state=0,criterion='entropy',splitter='best')
classifiers['RandomForestClassifier 4'] = RandomForestClassifier(n_estimators=1000, max_depth=3, random_state=0,criterion='gini', n_jobs = -1)
classifiers['RandomForestClassifier 5'] = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0,criterion='entropy', n_jobs = -1)
classifiers['GradientBoostingClassifier 1'] = GradientBoostingClassifier(n_estimators=100,random_state=0,criterion='friedman_mse',max_depth=3, learning_rate=0.1)
classifiers['GradientBoostingClassifier 2'] = GradientBoostingClassifier(n_estimators=1000,random_state=0,criterion='friedman_mse',max_depth=3, learning_rate=0.1)

In [43]:
def print_conf_matrix(test_y, predict, name):
    matrix = confusion_matrix(test_y, predict,  labels=[-1, 0, 1])
    print(matrix)
    ax= plt.subplot()
    sn.heatmap(matrix, annot=True, ax = ax) #annot=True to annotate cells

    # labels, title and ticks
    ax.set_xlabel('Predicted labels', color='white')
    ax.set_ylabel('True labels', color='white')
    ax.set_title(f'Confusion Matrix for {name}' , color='white')
    ax.xaxis.set_ticklabels(['-1','0', '1'], color='white')
    ax.yaxis.set_ticklabels(['-1','0', '1'], color='white')
    plt.show()

def train_model(model,train_x, train_y):
    model.fit(train_x, train_y)

In [44]:
predictions= dict()
predictions_train= dict()
score = dict()
score_train = dict()
for k,v in classifiers.items():
    print("Calculate: ", k)
    train_model(v,x_train,y_train)
    predictions[k] = v.predict(x_test)
    predictions_train[k] = v.predict(x_train)
    score[k] = accuracy_score(y_test.values, predictions[k])
    score_train[k] = accuracy_score(y_train.values, predictions_train[k])
    print('Score train: ',  score_train[k] )
    print('Score: ',  score[k] )
    # print_conf_matrix(test_y, predictions[k], k)

headers = ["Classifier type", "Accuracy"]
score_df = pd.DataFrame(score_train.items(), columns=headers)
print("train")
print(tabulate(score_df, headers, tablefmt="psql"))
print("test")
score_df = pd.DataFrame(score.items(), columns=headers)
print(tabulate(score_df, headers, tablefmt="psql"))




Calculate:  DecisionTreeClassifier 1
Score train:  0.7318697582634435
Score:  0.390625
Calculate:  DecisionTreeClassifier 2
Score train:  0.7839171188949186
Score:  0.4375
Calculate:  DecisionTreeClassifier 3
Score train:  0.8125308337444499
Score:  0.3515625
Calculate:  RandomForestClassifier 4
Score train:  0.5096201282683769
Score:  0.421875
Calculate:  RandomForestClassifier 5
Score train:  0.4743463246176616
Score:  0.421875
Calculate:  GradientBoostingClassifier 1
Score train:  0.8665515540207203
Score:  0.390625
Calculate:  GradientBoostingClassifier 2
Score train:  1.0
Score:  0.4375
train
+----+------------------------------+------------+
|    | Classifier type              |   Accuracy |
|----+------------------------------+------------|
|  0 | DecisionTreeClassifier 1     |   0.73187  |
|  1 | DecisionTreeClassifier 2     |   0.783917 |
|  2 | DecisionTreeClassifier 3     |   0.812531 |
|  3 | RandomForestClassifier 4     |   0.50962  |
|  4 | RandomForestClassifier 5     | 

In [45]:
filename_to_export = f'../data/results/{symbol}_1_{WINDOW}_{ROWS_TO_PREDICT}_{ datetime.now().strftime("%d_%m_%Y %H_%M_%S")}.csv'
score_df.to_csv(filename_to_export, index=False)

In [46]:
rfe = RFE(classifiers['RandomForestClassifier 5'],10)
fited = rfe.fit(x_train, y_train)
rfe



RFE(estimator=RandomForestClassifier(criterion='entropy', max_depth=2,
                                     n_jobs=-1, random_state=0),
    n_features_to_select=10)

In [47]:
names = x.columns
columns=[]
for i in range(len(fited.support_)):
    if fited.support_[i]:
        columns.append(names[i])

print("Columns with predictive power:", columns )

Columns with predictive power: ['Unnamed: 0', 'high', 'volatility_bbh', 'volatility_kch', 'volatility_dch', 'trend_macd', 'trend_visual_ichimoku_a', 'trend_psar_down', 'momentum_ao', 'momentum_kama']


In [48]:
x_test_cropped = x_test[columns]
x_train_cropped = x_train[columns]
x_train_cropped

Unnamed: 0.1,Unnamed: 0,high,volatility_bbh,volatility_kch,volatility_dch,trend_macd,trend_visual_ichimoku_a,trend_psar_down,momentum_ao,momentum_kama
1,1,56.0950,55.750000,56.470000,56.22000,0.007977,106.371461,-1.000000,0.000000,55.641486
2,2,56.4375,56.418382,56.549167,56.43750,0.057811,106.371461,-1.000000,0.000000,55.895109
3,3,56.7500,56.848155,56.704375,56.75000,0.123917,106.371461,-1.000000,0.000000,56.185689
4,4,58.1975,57.987504,57.320333,58.19750,0.271420,106.371461,-1.000000,0.000000,56.833695
5,5,59.5250,58.026788,57.841667,59.52500,0.320672,106.371461,-1.000000,0.137792,56.907320
...,...,...,...,...,...,...,...,...,...,...
4050,4050,121.9103,123.191105,122.195959,128.45267,-0.296163,115.798393,124.696701,-0.717044,119.511745
4051,4051,120.2000,123.035114,121.869292,128.45267,-0.228849,115.798393,124.231534,-0.750632,119.523453
4052,4052,122.5900,123.241762,121.930626,128.45267,0.009973,115.798393,123.784973,-0.646386,119.599707
4053,4053,122.6000,123.441085,122.093983,124.92175,0.197769,117.379171,123.356275,-0.369091,119.659735


In [49]:
classifiers_boosted = dict()
classifiers_boosted['GradientBoostingClassifier 1'] = GradientBoostingClassifier(n_estimators=100,random_state=0,criterion='friedman_mse',max_depth=3, learning_rate=0.1)
classifiers_boosted['GradientBoostingClassifier 2'] = GradientBoostingClassifier(n_estimators=1000,random_state=0,criterion='friedman_mse',max_depth=3, learning_rate=0.3)
classifiers_boosted['GradientBoostingClassifier 3'] = GradientBoostingClassifier(n_estimators=1000,random_state=0,criterion='friedman_mse',max_depth=2, learning_rate=0.5)
classifiers_boosted['GradientBoostingClassifier 4'] = GradientBoostingClassifier(n_estimators=1000,random_state=0,criterion='friedman_mse',max_depth=2, learning_rate=0.8)

In [50]:
predictions= dict()
predictions_train= dict()
score = dict()
score_train = dict()
for k,v in classifiers_boosted.items():
    print("Calculate: ", k)
    train_model(v,x_train,y_train)
    predictions[k] = v.predict(x_test)
    predictions_train[k] = v.predict(x_train)
    score[k] = accuracy_score(y_test.values, predictions[k])
    score_train[k] = accuracy_score(y_train.values, predictions_train[k])
    print('Score train: ',  score_train[k] )
    print('Score: ',  score[k] )
    # print_conf_matrix(test_y, predictions[k], k)

headers = ["Classifier type", "Accuracy"]
score_df = pd.DataFrame(score_train.items(), columns=headers)
print("train")
print(tabulate(score_df, headers, tablefmt="psql"))
print("test")
score_df = pd.DataFrame(score.items(), columns=headers)
print(tabulate(score_df, headers, tablefmt="psql"))

Calculate:  GradientBoostingClassifier 1
Score train:  0.8665515540207203
Score:  0.390625
Calculate:  GradientBoostingClassifier 2
Score train:  1.0
Score:  0.453125
Calculate:  GradientBoostingClassifier 3
Score train:  1.0
Score:  0.4453125
Calculate:  GradientBoostingClassifier 4
Score train:  1.0
Score:  0.3515625
train
+----+------------------------------+------------+
|    | Classifier type              |   Accuracy |
|----+------------------------------+------------|
|  0 | GradientBoostingClassifier 1 |   0.866552 |
|  1 | GradientBoostingClassifier 2 |   1        |
|  2 | GradientBoostingClassifier 3 |   1        |
|  3 | GradientBoostingClassifier 4 |   1        |
+----+------------------------------+------------+
test
+----+------------------------------+------------+
|    | Classifier type              |   Accuracy |
|----+------------------------------+------------|
|  0 | GradientBoostingClassifier 1 |   0.390625 |
|  1 | GradientBoostingClassifier 2 |   0.453125 |
|  2 

In [51]:
# score_df.to_csv(filename_to_export,mode='a', index=False)