# Library import

In [28]:
import pandas as pd
import numpy as np

import bs4 as bs
import pickle
import requests
import time
import lxml
from datetime import datetime, timedelta

import yfinance as yf
import stockstats

import unicodedata
import json
from textblob import TextBlob

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier, IsolationForest,VotingClassifier
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from sklearn.tree import export_graphviz
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from sklearn.metrics import classification_report, accuracy_score

# Read Data

In [29]:
stock_without_absolute = pd.read_pickle('./data/stock_without_absolute.pkl')
stock_with_absolute = pd.read_pickle('./data/stock_with_absolute.pkl')

label_abs_1d = pd.read_pickle('./data/label_abs_1d.pkl')
label_abs_7d = pd.read_pickle('./data/label_abs_7d.pkl')
label_abs_30d = pd.read_pickle('./data/label_abs_30d.pkl')

label_value_1d = pd.read_pickle('./data/label_value_1d.pkl')
label_value_7d = pd.read_pickle('./data/label_value_7d.pkl')
label_value_30d = pd.read_pickle('./data/label_value_30d.pkl')


# Result Evaluation

In [42]:
def result(grid_model, train_data, train_label, test_data, test_label, clf_name="Classifier"):
    predicted_test = grid_model.predict(test_data)
    predicted_train = grid_model.predict(train_data)
    decision_function = grid_model.predict_proba(test_data)
    
    print("Results for ", clf_name, ": ")
    acc_train = accuracy_score(train_label, predicted_train)
    acc_test = accuracy_score(test_label, predicted_test)
    print("The Train Accuracy  %0.3f" % (acc_train))
    print("The Test Accuracy   %0.3f" % (acc_test ))

    test_label_roc = np.zeros((len(test_label),2) )
    for i,v in enumerate(test_label):
        if v > 0.5:
            test_label_roc[i,1] = 1
        else:
            test_label_roc[i,0] = 1
                
    print("AUC ROC : %0.3f" %( roc_auc_score(test_label_roc, decision_function) ))
    # confusion matrix
    print("confusion matrix / precision recall scores")
    print ( confusion_matrix(test_label, predicted_test) )
    print ( classification_report(test_label, predicted_test) )
    
    pass# Result Evaluation

##  Feature Comparison

In [43]:
finacial_list = ['change', 'open_delta', 'close_delta', 'volume_delta',
'high_low_diff_ratio', 'open_close_diff_ratio', 'close_-1_r',
'close_-6_r', 'kdjk', 'kdjd', 'kdjj', 'macd', 'macds', 'macdh', 'rsi_6',
'rsi_12', 'wr_6', 'wr_12', 'cci', 'atr', 'dma', 'vr', 'boll_-1_d',
'boll_ub_-1_d', 'boll_lb_-1_d', 'boll_k_diff', 'high_low_diff',
'open_close_diff', 'open', 'high', 'low', 'close', 'adj close', 'volume']

news_list = ['news_des_score', 'sp500_open']

external_list = ['sp500_open',
       'sp500_high', 'sp500_low', 'sp500_close', 'sp500_adj close',
       'sp500_volume', 'sp500_high_low_diff', 'sp500_open_close_diff',
       'sp500_high_low_diff_ratio', 'sp500_open_close_diff_ratio',
       'sp500_change', 'sp500_close_delta', 'sp500_volume_delta',
       'sp500_close_-1_r', 'sp500_close_-6_r', 'gold_open', 'gold_high',
       'gold_low', 'gold_close', 'gold_adj close', 'gold_volume',
       'gold_high_low_diff', 'gold_open_close_diff',
       'gold_high_low_diff_ratio', 'gold_open_close_diff_ratio', 'gold_change',
       'gold_close_delta', 'gold_volume_delta', 'gold_close_-1_r',
       'gold_close_-6_r', 'y10bond_open', 'y10bond_high', 'y10bond_low',
       'y10bond_close', 'y10bond_adj close', 'y10bond_high_low_diff',
       'y10bond_open_close_diff', 'y10bond_high_low_diff_ratio',
       'y10bond_open_close_diff_ratio', 'y10bond_change',
       'y10bond_close_delta', 'y10bond_close_-1_r', 'y10bond_close_-6_r',
       'y5bond_open', 'y5bond_high', 'y5bond_low', 'y5bond_close',
       'y5bond_adj close', 'y5bond_high_low_diff', 'y5bond_open_close_diff',
       'y5bond_high_low_diff_ratio', 'y5bond_open_close_diff_ratio',
       'y5bond_change', 'y5bond_close_delta', 'y5bond_close_-1_r','y5bond_close_-6_r']

In [44]:
def create_train_test(train_data, label):
    X_train, X_test, y_train, y_test = train_test_split(train_data, label, test_size=0.1, random_state=42)
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test, y_train, y_test

In [45]:
best_param = {'gamma': 0.8, 'max_depth': 6, 'min_child_weight': 7, 'n_estimators': 500, 'n_jobs': -1}

### Train the model with the best parameters received earlier in XGBoost.ipynb

In [46]:
# Only one category of feature
train = stock_with_absolute[finacial_list]
X_train, X_test, y_train, y_test = create_train_test(train, label_abs_7d)
clfXGB = XGBClassifier(**best_param)
clfXGB = clfXGB.fit(X_train, y_train)
result(clfXGB, X_train, y_train, X_test, y_test, clf_name="Only Financial")
print()
train = stock_with_absolute[news_list]
X_train, X_test, y_train, y_test = create_train_test(train, label_abs_7d)
clfXGB = XGBClassifier(**best_param)
clfXGB = clfXGB.fit(X_train, y_train)
result(clfXGB, X_train, y_train, X_test, y_test, clf_name="Only Sentiment")
print()
train = stock_with_absolute[external_list]
X_train, X_test, y_train, y_test = create_train_test(train, label_abs_7d)
clfXGB = XGBClassifier(**best_param)
clfXGB = clfXGB.fit(X_train, y_train)
result(clfXGB, X_train, y_train, X_test, y_test, clf_name="Only External")

Results for  Only Financial : 
The Train Accuracy  0.999
The Test Accuracy   0.684
AUC ROC : 0.779
confusion matrix / precision recall scores
[[ 58  52]
 [ 26 111]]
              precision    recall  f1-score   support

           0       0.69      0.53      0.60       110
           1       0.68      0.81      0.74       137

    accuracy                           0.68       247
   macro avg       0.69      0.67      0.67       247
weighted avg       0.69      0.68      0.68       247


Results for  Only Sentiment : 
The Train Accuracy  0.729
The Test Accuracy   0.607
AUC ROC : 0.631
confusion matrix / precision recall scores
[[ 42  68]
 [ 29 108]]
              precision    recall  f1-score   support

           0       0.59      0.38      0.46       110
           1       0.61      0.79      0.69       137

    accuracy                           0.61       247
   macro avg       0.60      0.59      0.58       247
weighted avg       0.60      0.61      0.59       247


Results for  O

In [47]:
# Only two category of feature
train = stock_with_absolute[external_list+news_list]
X_train, X_test, y_train, y_test = create_train_test(train, label_abs_7d)
clfXGB = XGBClassifier(**best_param)
clfXGB = clfXGB.fit(X_train, y_train)
result(clfXGB, X_train, y_train, X_test, y_test, clf_name="Without Financial")
print()
train = stock_with_absolute[finacial_list+external_list]
X_train, X_test, y_train, y_test = create_train_test(train, label_abs_7d)
clfXGB = XGBClassifier(**best_param)
clfXGB = clfXGB.fit(X_train, y_train)
result(clfXGB, X_train, y_train, X_test, y_test, clf_name="Without Sentiment")
print()
train = stock_with_absolute[finacial_list+news_list]
X_train, X_test, y_train, y_test = create_train_test(train, label_abs_7d)
clfXGB = XGBClassifier(**best_param)
clfXGB = clfXGB.fit(X_train, y_train)
result(clfXGB, X_train, y_train, X_test, y_test, clf_name="Without External")

Results for  Without Financial : 
The Train Accuracy  0.999
The Test Accuracy   0.648
AUC ROC : 0.717
confusion matrix / precision recall scores
[[ 54  56]
 [ 31 106]]
              precision    recall  f1-score   support

           0       0.64      0.49      0.55       110
           1       0.65      0.77      0.71       137

    accuracy                           0.65       247
   macro avg       0.64      0.63      0.63       247
weighted avg       0.65      0.65      0.64       247


Results for  Without Sentiment : 
The Train Accuracy  1.000
The Test Accuracy   0.753
AUC ROC : 0.825
confusion matrix / precision recall scores
[[ 64  46]
 [ 15 122]]
              precision    recall  f1-score   support

           0       0.81      0.58      0.68       110
           1       0.73      0.89      0.80       137

    accuracy                           0.75       247
   macro avg       0.77      0.74      0.74       247
weighted avg       0.76      0.75      0.75       247


Results 

In [48]:
# all
train = stock_with_absolute
X_train, X_test, y_train, y_test = create_train_test(train, label_abs_7d)
clfXGB = XGBClassifier(**best_param)
clfXGB = clfXGB.fit(X_train, y_train)
result(clfXGB, X_train, y_train, X_test, y_test, clf_name="Without Financial")

Results for  Without Financial : 
The Train Accuracy  1.000
The Test Accuracy   0.745
AUC ROC : 0.820
confusion matrix / precision recall scores
[[ 66  44]
 [ 19 118]]
              precision    recall  f1-score   support

           0       0.78      0.60      0.68       110
           1       0.73      0.86      0.79       137

    accuracy                           0.74       247
   macro avg       0.75      0.73      0.73       247
weighted avg       0.75      0.74      0.74       247

