In [21]:
from collections import defaultdict
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import jaccard_score
import seaborn as sn
from sklearn.base import clone
from tabulate import tabulate

In [22]:
#consts

filename= 'fb_27_01_2021 18_51_42.csv'

In [23]:
# functions

def calculate_jaccard_score(test_y, predict):
    return jaccard_score(test_y, predict, average="macro")

def print_conf_matrix(test_y, predict, name):
    matrix = confusion_matrix(test_y, predict,  labels=[-1, 0, 1])
    print(matrix)
    ax= plt.subplot()
    sn.heatmap(matrix, annot=True, ax = ax) #annot=True to annotate cells

    # labels, title and ticks
    ax.set_xlabel('Predicted labels', color='white')
    ax.set_ylabel('True labels', color='white')
    ax.set_title(f'Confusion Matrix for {name}' , color='white')
    ax.xaxis.set_ticklabels(['-1','0', '1'], color='white')
    ax.yaxis.set_ticklabels(['-1','0', '1'], color='white')
    plt.show()

def train_model(model,train_x, train_y):
    model.fit(train_x, train_y)

def leave_one_out_cross_val(model, train_x, train_y,):
    return np.mean(cross_val_score(model, train_x,train_y, scoring='accuracy', cv=cv, n_jobs=-1))

In [24]:
df = pd.read_csv(f'../data/preprocess/{filename}')

df.head(10)

Unnamed: 0,Date,Time,Open,High,Low,Close,Volume,Close_diff,h_l_diff,o_c_diff,...,o_c_diff_8,o_c_diff_9,o_c_diff_10,Open_daily_mean,High_daily_mean,Low_daily_mean,Close_daily_mean,h_l_diff_daily_mean,o_c_diff_daily_mean,class_column
0,2017-05-17,19:00:00,147.42,147.5,146.88,147.404,1447021,-0.016,0.62,0.016,...,0.456,-0.1895,-0.294,146.952525,147.05835,145.8754,146.311,1.18295,0.641525,0
1,2017-05-17,20:00:00,147.4001,147.42,146.58,146.91,1639010,-0.494,0.84,0.4901,...,0.1951,0.9301,0.2846,146.952525,147.05835,145.8754,146.311,1.18295,0.641525,-1
2,2017-05-17,21:00:00,146.91,146.9834,145.62,146.08,3303803,-0.83,1.3634,0.83,...,0.5107,0.535,1.27,146.952525,147.05835,145.8754,146.311,1.18295,0.641525,-1
3,2017-05-17,22:00:00,146.08,146.33,144.4216,144.85,7778376,-1.23,1.9084,1.23,...,1.525,0.9107,0.935,146.952525,147.05835,145.8754,146.311,1.18295,0.641525,-1
4,2017-05-18,16:00:00,144.72,146.23,144.51,146.1,4087815,1.25,1.72,-1.38,...,-1.34,-1.085,-1.6993,146.589286,147.457486,146.394286,146.992371,1.0632,-0.403086,1
5,2017-05-18,17:00:00,146.11,147.42,146.11,146.87,3926294,0.77,1.31,-0.76,...,-1.13,-0.72,-0.465,146.589286,147.457486,146.394286,146.992371,1.0632,-0.403086,1
6,2017-05-18,18:00:00,146.87,147.3724,146.34,146.43,2280229,-0.44,1.0324,0.44,...,-0.445,0.07,0.48,146.589286,147.457486,146.394286,146.992371,1.0632,-0.403086,-1
7,2017-05-18,19:00:00,146.44,147.17,146.2,146.73,1460849,0.3,0.97,-0.29,...,0.38,-1.175,-0.66,146.589286,147.457486,146.394286,146.992371,1.0632,-0.403086,1
8,2017-05-18,20:00:00,146.735,147.8,146.57,147.6066,1434632,0.8766,1.23,-0.8716,...,-0.8876,-0.2016,-1.7566,146.589286,147.457486,146.394286,146.992371,1.0632,-0.403086,1
9,2017-05-18,21:00:00,147.61,148.15,147.56,147.64,2415781,0.0334,0.59,-0.03,...,-0.5201,-0.046,0.64,146.589286,147.457486,146.394286,146.992371,1.0632,-0.403086,0


In [25]:
# sort
df = df.sort_values(by=['Date'])
df

Unnamed: 0,Date,Time,Open,High,Low,Close,Volume,Close_diff,h_l_diff,o_c_diff,...,o_c_diff_8,o_c_diff_9,o_c_diff_10,Open_daily_mean,High_daily_mean,Low_daily_mean,Close_daily_mean,h_l_diff_daily_mean,o_c_diff_daily_mean,class_column
0,2017-05-17,19:00:00,147.4200,147.5000,146.8800,147.404,1447021,-0.016,0.6200,0.0160,...,0.4560,-0.1895,-0.2940,146.952525,147.058350,145.875400,146.311000,1.182950,0.641525,0
1,2017-05-17,20:00:00,147.4001,147.4200,146.5800,146.910,1639010,-0.494,0.8400,0.4901,...,0.1951,0.9301,0.2846,146.952525,147.058350,145.875400,146.311000,1.182950,0.641525,-1
2,2017-05-17,21:00:00,146.9100,146.9834,145.6200,146.080,3303803,-0.830,1.3634,0.8300,...,0.5107,0.5350,1.2700,146.952525,147.058350,145.875400,146.311000,1.182950,0.641525,-1
3,2017-05-17,22:00:00,146.0800,146.3300,144.4216,144.850,7778376,-1.230,1.9084,1.2300,...,1.5250,0.9107,0.9350,146.952525,147.058350,145.875400,146.311000,1.182950,0.641525,-1
4,2017-05-18,16:00:00,144.7200,146.2300,144.5100,146.100,4087815,1.250,1.7200,-1.3800,...,-1.3400,-1.0850,-1.6993,146.589286,147.457486,146.394286,146.992371,1.063200,-0.403086,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
980,2017-12-06,17:00:00,174.6776,175.7500,174.1950,175.740,2830563,1.070,1.5550,-1.0624,...,1.9226,-2.1824,-2.6024,175.051086,175.990000,174.537857,175.564286,1.452143,-0.513200,1
981,2017-12-06,18:00:00,175.7100,176.0900,174.9500,175.500,1945926,-0.240,1.1400,0.2100,...,1.6508,3.1950,-0.9100,175.051086,175.990000,174.537857,175.564286,1.452143,-0.513200,-1
982,2017-12-06,19:00:00,175.4800,175.8200,174.4600,175.080,1330343,-0.420,1.3600,0.4000,...,-0.5784,1.8408,3.3850,175.051086,175.990000,174.537857,175.564286,1.452143,-0.513200,-1
983,2017-12-06,20:00:00,175.1100,175.8200,174.7000,175.700,1146662,0.620,1.1200,-0.5900,...,-0.8700,-1.5684,0.8508,175.051086,175.990000,174.537857,175.564286,1.452143,-0.513200,1


In [26]:
# get last date
# df.sort('Date')['Date'].value_counts()

train_set = df.copy()
train_set = train_set.drop(['Date','Time'],1)
train_x = train_set.drop(['class_column'],1)
train_y = train_set['class_column']

In [27]:
train_x

Unnamed: 0,Open,High,Low,Close,Volume,Close_diff,h_l_diff,o_c_diff,Open_1,Open_2,...,o_c_diff_7,o_c_diff_8,o_c_diff_9,o_c_diff_10,Open_daily_mean,High_daily_mean,Low_daily_mean,Close_daily_mean,h_l_diff_daily_mean,o_c_diff_daily_mean
0,147.4200,147.5000,146.8800,147.404,1447021,-0.016,0.6200,0.0160,0.6700,-0.2100,...,-0.2790,0.4560,-0.1895,-0.2940,146.952525,147.058350,145.875400,146.311000,1.182950,0.641525
1,147.4001,147.4200,146.5800,146.910,1639010,-0.494,0.8400,0.4901,-0.0199,0.6501,...,0.1708,0.1951,0.9301,0.2846,146.952525,147.058350,145.875400,146.311000,1.182950,0.641525
2,146.9100,146.9834,145.6200,146.080,3303803,-0.830,1.3634,0.8300,-0.4901,-0.5100,...,1.1250,0.5107,0.5350,1.2700,146.952525,147.058350,145.875400,146.311000,1.182950,0.641525
3,146.0800,146.3300,144.4216,144.850,7778376,-1.230,1.9084,1.2300,-0.8300,-1.3201,...,1.2700,1.5250,0.9107,0.9350,146.952525,147.058350,145.875400,146.311000,1.182950,0.641525
4,144.7200,146.2300,144.5100,146.100,4087815,1.250,1.7200,-1.3800,-1.3600,-2.1900,...,-1.7500,-1.3400,-1.0850,-1.6993,146.589286,147.457486,146.394286,146.992371,1.063200,-0.403086
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
980,174.6776,175.7500,174.1950,175.740,2830563,1.070,1.5550,-1.0624,2.1776,2.0276,...,0.3784,1.9226,-2.1824,-2.6024,175.051086,175.990000,174.537857,175.564286,1.452143,-0.513200
981,175.7100,176.0900,174.9500,175.500,1945926,-0.240,1.1400,0.2100,1.0324,3.2100,...,-0.7684,1.6508,3.1950,-0.9100,175.051086,175.990000,174.537857,175.564286,1.452143,-0.513200
982,175.4800,175.8200,174.4600,175.080,1330343,-0.420,1.3600,0.4000,-0.2300,0.8024,...,0.1200,-0.5784,1.8408,3.3850,175.051086,175.990000,174.537857,175.564286,1.452143,-0.513200
983,175.1100,175.8200,174.7000,175.700,1146662,0.620,1.1200,-0.5900,-0.3700,-0.6000,...,-0.7300,-0.8700,-1.5684,0.8508,175.051086,175.990000,174.537857,175.564286,1.452143,-0.513200


In [28]:
train_y

0      0
1     -1
2     -1
3     -1
4      1
      ..
980    1
981   -1
982   -1
983    1
985    0
Name: class_column, Length: 986, dtype: int64

In [29]:
classifiers = dict()
classifiers['RandomForestClassifier 1'] = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0,criterion='gini')
classifiers['RandomForestClassifier 2'] = RandomForestClassifier(n_estimators=1000, max_depth=2, random_state=0,criterion='gini')
classifiers['RandomForestClassifier 3'] = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=0,criterion='gini')
classifiers['RandomForestClassifier 4'] = RandomForestClassifier(n_estimators=1000, max_depth=3, random_state=0,criterion='gini')
# classifiers['RandomForestClassifier 5'] = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0,criterion='entropy')
# classifiers['RandomForestClassifier 6'] = RandomForestClassifier(n_estimators=1000, max_depth=2, random_state=0,criterion='entropy')
# classifiers['RandomForestClassifier 7'] = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=0,criterion='entropy')
# classifiers['RandomForestClassifier 8'] = RandomForestClassifier(n_estimators=1000, max_depth=3, random_state=0,criterion='entropy')

classifiers['DecisionTreeClassifier 1'] = DecisionTreeClassifier(max_depth=10, random_state=0,criterion='gini',splitter='best')
classifiers['DecisionTreeClassifier 2'] = DecisionTreeClassifier(max_depth=10, random_state=0,criterion='gini',splitter='random')
classifiers['DecisionTreeClassifier 3'] = DecisionTreeClassifier(max_depth=10, random_state=0,criterion='entropy',splitter='best')
classifiers['DecisionTreeClassifier 4'] = DecisionTreeClassifier(max_depth=10, random_state=0,criterion='entropy',splitter='random')
classifiers['DecisionTreeClassifier 5'] = DecisionTreeClassifier(random_state=0,criterion='gini',splitter='best')
classifiers['DecisionTreeClassifier 6'] = DecisionTreeClassifier(random_state=0,criterion='gini',splitter='random')
classifiers['DecisionTreeClassifier 7'] = DecisionTreeClassifier(random_state=0,criterion='entropy',splitter='best')
classifiers['DecisionTreeClassifier 8'] = DecisionTreeClassifier(random_state=0,criterion='entropy',splitter='random')

classifiers['GradientBoostingClassifier 1'] = GradientBoostingClassifier(n_estimators=100,random_state=0,criterion='friedman_mse',max_depth=3, learning_rate=0.1)
classifiers['GradientBoostingClassifier 2'] = GradientBoostingClassifier(n_estimators=100,random_state=0,criterion='friedman_mse',max_depth=3, learning_rate=0.3)
classifiers['GradientBoostingClassifier 3'] = GradientBoostingClassifier(n_estimators=100,random_state=0,criterion='friedman_mse',max_depth=3, learning_rate=0.5)
classifiers['GradientBoostingClassifier 4'] = GradientBoostingClassifier(n_estimators=100,random_state=0,criterion='friedman_mse',max_depth=3, learning_rate=1)



In [30]:
cv = LeaveOneOut()
predictions= dict()
score = dict()

for k,v in classifiers.items():
    score[k] = leave_one_out_cross_val(v,train_x,train_y)

In [34]:
headers = ["Classifier type", "Score"]
score_df = pd.DataFrame(score.items(), columns=headers)
print(tabulate(score_df, headers, tablefmt="psql"))




+----+------------------------------+----------+
|    | Classifier type              |    Score |
|----+------------------------------+----------|
|  0 | RandomForestClassifier 1     | 0.98783  |
|  1 | RandomForestClassifier 2     | 0.984787 |
|  2 | RandomForestClassifier 3     | 0.994929 |
|  3 | RandomForestClassifier 4     | 0.994929 |
|  4 | DecisionTreeClassifier 1     | 0.991886 |
|  5 | DecisionTreeClassifier 2     | 0.932049 |
|  6 | DecisionTreeClassifier 3     | 0.991886 |
|  7 | DecisionTreeClassifier 4     | 0.957404 |
|  8 | DecisionTreeClassifier 5     | 0.991886 |
|  9 | DecisionTreeClassifier 6     | 0.953347 |
| 10 | DecisionTreeClassifier 7     | 0.991886 |
| 11 | DecisionTreeClassifier 8     | 0.957404 |
| 12 | GradientBoostingClassifier 1 | 0.991886 |
| 13 | GradientBoostingClassifier 2 | 0.992901 |
| 14 | GradientBoostingClassifier 3 | 0.993915 |
| 15 | GradientBoostingClassifier 4 | 0.993915 |
+----+------------------------------+----------+


In [35]:
classifiers['RandomForestClassifier 1'].fit(train_x,tr)


# loo = LeaveOneOut()
# score = {}
# score = defaultdict(lambda: 0, score)
# # train_x_np = train_x.to_numpy()
# # train_y_np = train_y.to_numpy()
# for k,v in classifiers.items():
#     for idx, row in train_x.iterrows():
#         train_x_lou = train_x.copy()
#         test_x_lou = train_x_lou.iloc[[idx]]
#         train_x_lou = train_x_lou.drop(idx)
#         train_y_lou = train_y.copy()
#         test_y_lou = train_y_lou.iloc[[idx]].to_numpy()
#         train_y_lou = train_y_lou.drop(idx)
#         model = clone(v)
#         train_model(model,train_x_lou,train_y_lou)
#         predict = model.predict(test_x_lou)
#         if test_y_lou==model.predict(test_x_lou):
#             score[k]+=1
#     score[k] = score[k]/len(train_x)
#

NotFittedError: This RandomForestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [33]:
# headers = ["Classifier type", "Score"]
# score_df = pd.DataFrame(score.items(), columns=headers)
# print(tabulate(score_df, headers, tablefmt="psql"))