# Import Libraries

In [None]:
from datetime import datetime
import numpy as np
import pandas as pd
pd.set_option("display.max.columns", None)

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('whitegrid')
import plotly.graph_objects as go

from tapy import Indicators

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import (
    accuracy_score, 
    recall_score,
    precision_score,
    classification_report,
    confusion_matrix
)

import xgboost
from xgboost import XGBClassifier

from keras.models import Sequential
from keras import regularizers
from keras.callbacks import EarlyStopping
from keras.layers import Dense, Dropout, GRU
from keras.optimizers import Adam
from keras.preprocessing.sequence import TimeseriesGenerator
from keras.utils import to_categorical

# D1 Timeframe

In [None]:
# Load the pre-computed feature table for the D1 timeframe.
# NOTE(review): the previous comment said "2019 full year 1 hour timeframe", but the
# section heading and the file name both say D1 -- confirm which timeframe/period this is.
d1_raw = pd.read_csv('../data/df_d1_feature.csv', index_col=0)
d1_raw.index = pd.to_datetime(d1_raw.index)  # turn the index values into datetimes

# Keep only the rows that have no missing values in any column.
df_d1_feature = d1_raw.dropna()
df_d1_feature.head()

In [None]:
# (rows, columns) of the table after rows with missing values were dropped
df_d1_feature.shape

In [None]:
# Per-column dtype and non-null count summary
df_d1_feature.info()

In [None]:
# Columns excluded from the model inputs (price columns, the target, and the
# shift_*/sma_*/diff/gain/loss columns listed below).
# Fix: a comma was missing after 'shift_89', so Python concatenated the adjacent
# literals into 'shift_89sma_02'; neither 'shift_89' nor 'sma_02' was actually
# excluded and both ended up in `features`.
non_feature = ['open', 'high', 'low', 'close', 'results',
               'shift_01', 'shift_02', 'shift_03', 'shift_05',
               'shift_08', 'shift_13', 'shift_21', 'shift_34',
               'shift_55', 'shift_89',
               'sma_02', 'sma_03', 'sma_05', 'sma_08',
               'sma_13', 'sma_21', 'sma_34', 'sma_55', 'sma_89',
               'diff', 'gain', 'loss']

# Every remaining column is treated as a model input.
features = [col for col in df_d1_feature.columns if col not in non_feature]

X = df_d1_feature[features]    # model inputs
y = df_d1_feature['results']   # target column

## Train Test Split

In [None]:
# Hold out the last 25% of rows as the test set; shuffle=False keeps rows in their original order
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False )

In [None]:
# Peek at the first training rows
X_train.head()

In [None]:
# (rows, columns) of each split
print(X_train.shape)
print(X_test.shape)

In [None]:
# Relative frequency of each `results` value in the training split
y_train.value_counts(normalize=True)

In [None]:
# Relative frequency of each `results` value in the test split
y_test.value_counts(normalize=True)

## Standard Scaler

In [None]:
# Learn the scaling statistics from the training rows only, then apply that
# same fitted scaler to both splits.
ss = StandardScaler().fit(X_train)
X_train_ss = ss.transform(X_train)
X_test_ss = ss.transform(X_test)

## Logistic Regression

### un-randomized train set

In [None]:
%%time
# Gridsearch parameters
logr_params = {'C':np.logspace(-1,10),
               'class_weight': [None, 'balanced'],
               'penalty':['l1', 'l2'],}

# Score base on roc_auc
logr_gridsearch = RandomizedSearchCV(LogisticRegression(solver='liblinear'), 
                              logr_params,
                              cv=5,
                              verbose=1,                              
                              n_jobs=-1)

logr_gridsearch = logr_gridsearch.fit(X_train_ss, y_train)

In [None]:
# Search summary: CV mean/std and parameters of the best candidate, followed by
# the search object's score on the train split and on the held-out split
# (printed as "validation", computed on X_test_ss / y_test).
logr_best_std = logr_gridsearch.cv_results_['std_test_score'][logr_gridsearch.best_index_]
logr_report = [
    f"cv best score mean: {round(logr_gridsearch.best_score_, 4)}",
    f"cv best score std: {round(logr_best_std, 4)}",
    f"cv best param: {logr_gridsearch.best_params_}",
    "",
    f"train set score: {round(logr_gridsearch.score(X_train_ss, y_train), 4)}",
    f"validation set score: {round(logr_gridsearch.score(X_test_ss, y_test), 4)}",
]
print("\n".join(logr_report))

In [None]:
logr_pred = logr_gridsearch.predict(X_test_ss)
cm = confusion_matrix(y_test, logr_pred)
cm_df = pd.DataFrame(cm, columns=['pred 0', 'pred 1', 'pred 2'], index=['actual 0', 'actual 1', 'actual 2'])
cm_df

In [None]:
cm_df.loc['actual 1'][1]/(cm_df['pred 1'].sum())

In [None]:
cm_df.loc['actual 2'][2]/(cm_df['pred 2'].sum())

In [None]:
(cm_df.loc['actual 1'][1]+cm_df.loc['actual 2'][2])/((cm_df['pred 1'].sum())+(cm_df['pred 2'].sum()))

## Random Forest

### un-randomized train set

In [None]:
%%time
rf_params = {
    'n_estimators': list(range(400, 601)),
    'min_samples_split': list(range(2, 9)),
    'max_features' : list(range(1, X_train_ss.shape[1])),
    'max_depth': list(range(2, 10)),
    'class_weight': [None, 'balanced', 'balanced_subsample']
}

rf_gridsearch = RandomizedSearchCV(RandomForestClassifier(), 
                              rf_params,
                              cv=5, 
                              verbose=1,
                              n_iter=10,
                              n_jobs=-1)

rf_gridsearch = rf_gridsearch.fit(X_train_ss, y_train)

In [None]:
# Search summary: CV mean/std and parameters of the best candidate, followed by
# the search object's score on the train split and on the held-out split
# (printed as "validation", computed on X_test_ss / y_test).
rf_best_std = rf_gridsearch.cv_results_['std_test_score'][rf_gridsearch.best_index_]
rf_report = [
    f"cv best score mean: {round(rf_gridsearch.best_score_, 4)}",
    f"cv best score std: {round(rf_best_std, 4)}",
    f"cv best param: {rf_gridsearch.best_params_}",
    "",
    f"train set score: {round(rf_gridsearch.score(X_train_ss, y_train), 4)}",
    f"validation set score: {round(rf_gridsearch.score(X_test_ss, y_test), 4)}",
]
print("\n".join(rf_report))

In [None]:
rf_pred = rf_gridsearch.predict(X_test_ss)
cm = confusion_matrix(y_test, rf_pred)
cm_df = pd.DataFrame(cm, columns=['pred 0', 'pred 1', 'pred 2'], index=['actual 0', 'actual 1', 'actual 2'])
cm_df

In [None]:
cm_df.loc['actual 1'][1]/(cm_df['pred 1'].sum())

In [None]:
cm_df.loc['actual 2'][2]/(cm_df['pred 2'].sum())

In [None]:
(cm_df.loc['actual 1'][1]+cm_df.loc['actual 2'][2])/((cm_df['pred 1'].sum())+(cm_df['pred 2'].sum()))

## Extremely Randomized Trees

### un-randomized train set

In [None]:
%%time
et_params = {
    'n_estimators': list(range(400, 601)),
    'min_samples_split': list(range(2, 9)),    
    'max_features' : list(range(1, X_train_ss.shape[1])),
    'max_depth': list(range(2, 10)),
    'class_weight': [None, 'balanced', 'balanced_subsample']
}

et_gridsearch = RandomizedSearchCV(ExtraTreesClassifier(), 
                              et_params,
                              cv=5, 
                              verbose=1, 
                              n_jobs=-1)

et_gridsearch = et_gridsearch.fit(X_train_ss, y_train)

In [None]:
# Search summary: CV mean/std and parameters of the best candidate, followed by
# the search object's score on the train split and on the held-out split
# (printed as "validation", computed on X_test_ss / y_test).
et_best_std = et_gridsearch.cv_results_['std_test_score'][et_gridsearch.best_index_]
et_report = [
    f"cv best score mean: {round(et_gridsearch.best_score_, 4)}",
    f"cv best score std: {round(et_best_std, 4)}",
    f"cv best param: {et_gridsearch.best_params_}",
    "",
    f"train set score: {round(et_gridsearch.score(X_train_ss, y_train), 4)}",
    f"validation set score: {round(et_gridsearch.score(X_test_ss, y_test), 4)}",
]
print("\n".join(et_report))

In [None]:
et_pred = et_gridsearch.predict(X_test_ss)
cm = confusion_matrix(y_test, et_pred)
cm_df = pd.DataFrame(cm, columns=['pred 0', 'pred 1', 'pred 2'], index=['actual 0', 'actual 1', 'actual 2'])
cm_df

In [None]:
cm_df.loc['actual 1'][1]/(cm_df['pred 1'].sum())

In [None]:
cm_df.loc['actual 2'][2]/(cm_df['pred 2'].sum())

In [None]:
(cm_df.loc['actual 1'][1]+cm_df.loc['actual 2'][2])/((cm_df['pred 1'].sum())+(cm_df['pred 2'].sum()))

## <font color=green>XGBoost</font>

### un-randomized train set

In [None]:
%%time
xgc_param = [{'subsample' : [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1],
              'reg_lambda' : np.logspace(-5,5),
              'reg_alpha' : np.logspace(-5,5),
              'max_depth' : list(range(2, 10)),
              'learning_rate' : [0.0001],
              'gamma' : np.logspace(-5,5),
              'colsample_bytree' : np.logspace(-5,5), }]

xgc_gridsearch = RandomizedSearchCV(XGBClassifier(), 
                              xgc_param,
                              cv=5, 
                              verbose=1, 
                              n_jobs=-1)

xgc_gridsearch = xgc_gridsearch.fit(X_train_ss, y_train)

In [None]:
# Search summary: CV mean/std and parameters of the best candidate, followed by
# the search object's score on the train split and on the held-out split
# (printed as "validation", computed on X_test_ss / y_test).
xgc_best_std = xgc_gridsearch.cv_results_['std_test_score'][xgc_gridsearch.best_index_]
xgc_report = [
    f"cv best score mean: {round(xgc_gridsearch.best_score_, 4)}",
    f"cv best score std: {round(xgc_best_std, 4)}",
    f"cv best param: {xgc_gridsearch.best_params_}",
    "",
    f"train set score: {round(xgc_gridsearch.score(X_train_ss, y_train), 4)}",
    f"validation set score: {round(xgc_gridsearch.score(X_test_ss, y_test), 4)}",
]
print("\n".join(xgc_report))

In [None]:
xgc_pred = xgc_gridsearch.predict(X_test_ss)
cm = confusion_matrix(y_test, xgc_pred)
cm_df = pd.DataFrame(cm, columns=['pred 0', 'pred 1', 'pred 2'], index=['actual 0', 'actual 1', 'actual 2'])
cm_df

In [None]:
cm_df.loc['actual 1'][1]/(cm_df['pred 1'].sum())

In [None]:
cm_df.loc['actual 2'][2]/(cm_df['pred 2'].sum())

In [None]:
(cm_df.loc['actual 1'][1]+cm_df.loc['actual 2'][2])/((cm_df['pred 1'].sum())+(cm_df['pred 2'].sum()))

## FNN

### un-randomized train set

In [None]:
# One-hot encode the targets for the softmax network.
# num_classes is pinned to 3 (the width of the network's output layer): when left
# unset, the width is inferred from the largest label present in each split, so a
# split missing the highest class would produce a narrower matrix than the model expects.
y_train_cat = to_categorical(y_train, num_classes=3)
y_test_cat = to_categorical(y_test, num_classes=3)

In [None]:
# Feed-forward classifier: one L2-regularised ReLU hidden layer with as many
# units as there are input columns, followed by a 3-way softmax output.
n_inputs = X_train_ss.shape[1]

model = Sequential([
    Dense(n_inputs,
          input_shape=(n_inputs,),
          activation='relu',
          kernel_regularizer=regularizers.l2(0.01)),
    Dense(3, activation='softmax'),
])

In [None]:
# `learning_rate` replaces the deprecated `lr` alias, which newer Keras releases reject.
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(learning_rate=0.0005),
              metrics=['categorical_accuracy'])

# Stop training once val_loss has failed to improve for 5 consecutive epochs.
early_stop = EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=1, mode='auto')

In [None]:
%%time
history = model.fit(X_train_ss, y_train_cat, 
                    validation_data=(X_test_ss, y_test_cat), 
                    epochs=50, batch_size=256,verbose=1,
                    callbacks=[early_stop])

In [None]:
plt.plot(history.history['loss'], label='Train loss')
plt.plot(history.history['val_loss'], label='Val Loss')
plt.legend();

In [None]:
plt.plot(history.history['categorical_accuracy'], label='Train accuracy')
plt.plot(history.history['val_categorical_accuracy'], label='val_categorical_accuracy accuracy')
plt.legend();

In [None]:
fnn_pred = model.predict_classes(X_test_ss)
cm = confusion_matrix(y_test, fnn_pred)
cm_df = pd.DataFrame(cm, columns=['pred 0', 'pred 1', 'pred 2'], index=['actual 0', 'actual 1', 'actual 2'])
cm_df

In [None]:
cm_df.loc['actual 1'][1]/(cm_df['pred 1'].sum())

In [None]:
cm_df.loc['actual 2'][2]/(cm_df['pred 2'].sum())

In [None]:
(cm_df.loc['actual 1'][1]+cm_df.loc['actual 2'][2])/((cm_df['pred 1'].sum())+(cm_df['pred 2'].sum()))