In [94]:
from collections import defaultdict
from datetime import datetime
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sn
import yfinance as yf
from finta import TA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from tabulate import tabulate
from ta import add_all_ta_features

In [95]:
WINDOW = 8  # number of rows to look ahead to see what the price did
FETCH_INTERVAL = "60m"  # fetch data by interval (including intraday if period < 60 days)
# valid intervals: 1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo
# (optional, default is '1d')
INTERVAL = '1y'  # use "period" instead of start/end
# valid periods: 1d,5d,1mo,3mo,6mo,1y,2y,5y,10y,ytd,max
# (optional, default is '1mo')
symbol = 'AAPL'  # Symbol of the desired stock
ROWS_TO_PREDICT = 128
# one day 16 rows of data

In [96]:
data = pd.read_csv(
    'C:\\Users\\exomat\\Desktop\\repo\\magisterka_analiza\\data\\preprocess\\AAPL_16_20_04_2021 23_06_09_full.csv')

In [97]:
# del (data['close'])
# del (data['open'])
# del (data['high'])
# del (data['volume'])
del (data['close_shift'])
data = data.dropna()
train_set = data.iloc[:-ROWS_TO_PREDICT]
train_set = train_set.iloc[:-WINDOW] # optional drop last n rows (avoid of data leak)
test_set =data.iloc[-ROWS_TO_PREDICT:]

In [98]:
data['class_column'].value_counts()

 1    1398
 0    1396
-1    1388
Name: class_column, dtype: int64

In [99]:
train_set

Unnamed: 0.1,Unnamed: 0,open,high,low,close,volume,close_pct,class_column,volume_adi,volume_obv,...,momentum_wr,momentum_ao,momentum_kama,momentum_roc,momentum_ppo,momentum_ppo_signal,momentum_ppo_hist,others_dr,others_dlr,others_cr
1,1,55.550000,56.095000,55.370000,55.700000,0,0.001799,1,-0.000000e+00,0,...,-61.176471,0.000000,55.641486,0.000000,0.000000,0.000000,0.000000,0.179856,0.179695,0.179856
2,2,56.032500,56.437500,55.937500,56.247500,0,0.009829,1,0.000000e+00,0,...,-17.798595,0.000000,55.895109,0.000000,0.000000,0.000000,0.000000,0.982944,0.978145,1.164568
3,3,56.132500,56.750000,56.042500,56.595000,0,0.006178,1,0.000000e+00,0,...,-11.231884,0.000000,56.185689,0.000000,0.000000,0.000000,0.000000,0.617805,0.615905,1.789568
4,4,56.550000,58.197500,55.625000,57.812500,0,0.021513,0,0.000000e+00,0,...,-13.616269,0.000000,56.833695,0.000000,0.000000,0.000000,0.000000,2.151250,2.128437,3.979317
5,5,57.837500,59.525000,56.887500,57.020000,0,-0.013708,1,0.000000e+00,0,...,-60.288809,0.137792,56.907320,0.000000,0.000000,0.000000,0.000000,-1.370811,-1.380293,2.553957
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4138,4138,123.630000,123.890000,123.250000,123.400000,0,-0.001860,-1,7.326173e+08,-207956508,...,-69.806164,-0.757845,123.690942,0.476324,-37.791113,-15.805844,-21.985269,-0.186039,-0.186212,121.942446
4139,4139,123.410000,125.071000,122.680000,123.010000,0,-0.003160,-1,7.326173e+08,-207956508,...,-80.873438,-0.921378,123.572178,0.219979,-43.150648,-21.274805,-21.875843,-0.316045,-0.316546,121.241007
4140,4140,123.040000,123.070000,122.680000,122.870000,0,-0.001138,-1,7.326173e+08,-207956508,...,-84.846306,-1.030995,123.455672,-0.813861,-48.048438,-26.629531,-21.418907,-0.113812,-0.113877,120.989209
4141,4141,122.879997,123.180000,121.589996,122.184998,28495155,-0.005575,-1,7.254487e+08,-236451663,...,-86.065235,-1.142760,123.216573,-2.314600,7.799588,-19.743707,27.543296,-0.557502,-0.559062,119.757190


In [100]:
y = data['class_column']
features = [x for x in data.columns if x not in ['class_column']]
x = data[features]
scaler = MinMaxScaler()
# x = pd.DataFrame(scaler.fit_transform(x.values), columns=x.columns, index=x.index)
x_train= x.iloc[:-ROWS_TO_PREDICT]
y_train= y.iloc[:-ROWS_TO_PREDICT]
x_test =x.iloc[-ROWS_TO_PREDICT:]
y_test=y.iloc[-ROWS_TO_PREDICT:]

In [101]:
classifiers = dict()

classifiers['DecisionTreeClassifier 1'] = DecisionTreeClassifier(max_depth=10, random_state=0,criterion='gini',splitter='best')
classifiers['DecisionTreeClassifier 2'] = DecisionTreeClassifier(max_depth=10, random_state=0,criterion='gini',splitter='random')
classifiers['DecisionTreeClassifier 3'] = DecisionTreeClassifier(max_depth=10, random_state=0,criterion='entropy',splitter='best')
classifiers['RandomForestClassifier 4'] = RandomForestClassifier(n_estimators=1000, max_depth=3, random_state=0,criterion='gini', n_jobs = -1)
classifiers['RandomForestClassifier 5'] = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0,criterion='entropy', n_jobs = -1)
classifiers['GradientBoostingClassifier 1'] = GradientBoostingClassifier(n_estimators=100,random_state=0,criterion='friedman_mse',max_depth=3, learning_rate=0.1)
classifiers['GradientBoostingClassifier 2'] = GradientBoostingClassifier(n_estimators=1000,random_state=0,criterion='friedman_mse',max_depth=3, learning_rate=0.1)

In [102]:
def print_conf_matrix(test_y, predict, name):
    matrix = confusion_matrix(test_y, predict,  labels=[-1, 0, 1])
    print(matrix)
    ax= plt.subplot()
    sn.heatmap(matrix, annot=True, ax = ax) #annot=True to annotate cells

    # labels, title and ticks
    ax.set_xlabel('Predicted labels', color='white')
    ax.set_ylabel('True labels', color='white')
    ax.set_title(f'Confusion Matrix for {name}' , color='white')
    ax.xaxis.set_ticklabels(['-1','0', '1'], color='white')
    ax.yaxis.set_ticklabels(['-1','0', '1'], color='white')
    plt.show()

def train_model(model,train_x, train_y):
    model.fit(train_x, train_y)

In [103]:
predictions= dict()
predictions_train= dict()
score = dict()
score_train = dict()
for k,v in classifiers.items():
    print("Calculate: ", k)
    train_model(v,x_train,y_train)
    predictions[k] = v.predict(x_test)
    predictions_train[k] = v.predict(x_train)
    score[k] = accuracy_score(y_test.values, predictions[k])
    score_train[k] = accuracy_score(y_train.values, predictions_train[k])
    print('Score train: ',  score_train[k] )
    print('Score: ',  score[k] )
    # print_conf_matrix(test_y, predictions[k], k)

headers = ["Classifier type", "Accuracy"]
score_df = pd.DataFrame(score_train.items(), columns=headers)
print("train")
print(tabulate(score_df, headers, tablefmt="psql"))
print("test")
score_df = pd.DataFrame(score.items(), columns=headers)
print(tabulate(score_df, headers, tablefmt="psql"))




Calculate:  DecisionTreeClassifier 1
Score train:  0.788433734939759
Score:  0.25
Calculate:  DecisionTreeClassifier 2
Score train:  0.8236144578313253
Score:  0.34375
Calculate:  DecisionTreeClassifier 3
Score train:  0.7701204819277109
Score:  0.28125
Calculate:  RandomForestClassifier 4
Score train:  0.5086746987951807
Score:  0.0625
Calculate:  RandomForestClassifier 5
Score train:  0.47734939759036144
Score:  0.0625
Calculate:  GradientBoostingClassifier 1
Score train:  0.8587951807228915
Score:  0.0625
Calculate:  GradientBoostingClassifier 2
Score train:  1.0
Score:  0.0625
train
+----+------------------------------+------------+
|    | Classifier type              |   Accuracy |
|----+------------------------------+------------|
|  0 | DecisionTreeClassifier 1     |   0.788434 |
|  1 | DecisionTreeClassifier 2     |   0.823614 |
|  2 | DecisionTreeClassifier 3     |   0.77012  |
|  3 | RandomForestClassifier 4     |   0.508675 |
|  4 | RandomForestClassifier 5     |   0.477349 

In [104]:
filename_to_export = f'../data/results/{symbol}_1_{WINDOW}_{ROWS_TO_PREDICT}_{ datetime.now().strftime("%d_%m_%Y %H_%M_%S")}.csv'
score_df.to_csv(filename_to_export, index=False)

In [105]:
rfe = RFE(classifiers['RandomForestClassifier 5'],10)
fited = rfe.fit(x_train, y_train)
rfe



RFE(estimator=RandomForestClassifier(criterion='entropy', max_depth=2,
                                     n_jobs=-1, random_state=0),
    n_features_to_select=10)

In [106]:
names = x.columns
columns=[]
for i in range(len(fited.support_)):
    if fited.support_[i]:
        columns.append(names[i])

print("Columns with predictive power:", columns )

Columns with predictive power: ['Unnamed: 0', 'high', 'volatility_bbh', 'volatility_dch', 'trend_macd', 'trend_ichimoku_b', 'trend_visual_ichimoku_a', 'trend_psar_down', 'momentum_ao', 'momentum_kama']


In [107]:
x_test_cropped = x_test[columns]
x_train_cropped = x_train[columns]
x_train_cropped

Unnamed: 0.1,Unnamed: 0,high,volatility_bbh,volatility_dch,trend_macd,trend_ichimoku_b,trend_visual_ichimoku_a,trend_psar_down,momentum_ao,momentum_kama
1,1,56.095000,55.750000,56.220000,0.007977,55.795000,106.371461,-1.000000,0.000000,55.641486
2,2,56.437500,56.418382,56.437500,0.057811,55.903750,106.371461,-1.000000,0.000000,55.895109
3,3,56.750000,56.848155,56.750000,0.123917,56.060000,106.371461,-1.000000,0.000000,56.185689
4,4,58.197500,57.987504,58.197500,0.271420,56.783750,106.371461,-1.000000,0.000000,56.833695
5,5,59.525000,58.026788,59.525000,0.320672,57.447500,106.371461,-1.000000,0.137792,56.907320
...,...,...,...,...,...,...,...,...,...,...
4146,4146,121.540001,125.738465,125.859901,-0.725519,121.508866,121.684325,125.254816,-1.896813,121.945077
4147,4147,121.080002,125.912982,125.859901,-0.825179,121.508866,121.684325,125.254816,-2.079954,121.403886
4148,4148,121.427040,125.955757,125.859901,-0.885082,121.508866,121.672975,125.254816,-2.193273,121.150552
4149,4149,120.800000,125.750434,125.410004,-0.913153,121.508866,121.672975,125.254816,-2.436961,121.041108


In [108]:
classifiers_boosted = dict()
classifiers_boosted['GradientBoostingClassifier 1'] = GradientBoostingClassifier(n_estimators=100,random_state=0,criterion='friedman_mse',max_depth=3, learning_rate=0.1)
classifiers_boosted['GradientBoostingClassifier 2'] = GradientBoostingClassifier(n_estimators=1000,random_state=0,criterion='friedman_mse',max_depth=3, learning_rate=0.3)
classifiers_boosted['GradientBoostingClassifier 3'] = GradientBoostingClassifier(n_estimators=1000,random_state=0,criterion='friedman_mse',max_depth=2, learning_rate=0.5)
classifiers_boosted['GradientBoostingClassifier 4'] = GradientBoostingClassifier(n_estimators=1000,random_state=0,criterion='friedman_mse',max_depth=2, learning_rate=0.8)

In [109]:
predictions= dict()
predictions_train= dict()
score = dict()
score_train = dict()
for k,v in classifiers_boosted.items():
    print("Calculate: ", k)
    train_model(v,x_train,y_train)
    predictions[k] = v.predict(x_test)
    predictions_train[k] = v.predict(x_train)
    score[k] = accuracy_score(y_test.values, predictions[k])
    score_train[k] = accuracy_score(y_train.values, predictions_train[k])
    print('Score train: ',  score_train[k] )
    print('Score: ',  score[k] )
    # print_conf_matrix(test_y, predictions[k], k)

headers = ["Classifier type", "Accuracy"]
score_df = pd.DataFrame(score_train.items(), columns=headers)
print("train")
print(tabulate(score_df, headers, tablefmt="psql"))
print("test")
score_df = pd.DataFrame(score.items(), columns=headers)
print(tabulate(score_df, headers, tablefmt="psql"))

Calculate:  GradientBoostingClassifier 1
Score train:  0.8587951807228915
Score:  0.0625
Calculate:  GradientBoostingClassifier 2
Score train:  1.0
Score:  0.15625
Calculate:  GradientBoostingClassifier 3
Score train:  1.0
Score:  0.21875
Calculate:  GradientBoostingClassifier 4
Score train:  1.0
Score:  0.09375
train
+----+------------------------------+------------+
|    | Classifier type              |   Accuracy |
|----+------------------------------+------------|
|  0 | GradientBoostingClassifier 1 |   0.858795 |
|  1 | GradientBoostingClassifier 2 |   1        |
|  2 | GradientBoostingClassifier 3 |   1        |
|  3 | GradientBoostingClassifier 4 |   1        |
+----+------------------------------+------------+
test
+----+------------------------------+------------+
|    | Classifier type              |   Accuracy |
|----+------------------------------+------------|
|  0 | GradientBoostingClassifier 1 |    0.0625  |
|  1 | GradientBoostingClassifier 2 |    0.15625 |
|  2 | Gradi

In [110]:
# score_df.to_csv(filename_to_export,mode='a', index=False)