# Library import

In [1]:
import pandas as pd
import numpy as np

import bs4 as bs
import pickle
import requests
import time
import lxml
from datetime import datetime, timedelta

import yfinance as yf
import stockstats

import unicodedata
import json
from textblob import TextBlob

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier, IsolationForest,VotingClassifier
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from sklearn.tree import export_graphviz, DecisionTreeClassifier, tree
import graphviz 
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from sklearn.metrics import classification_report, accuracy_score



# Read Data

In [2]:
# Feature tables and labels are loaded from local pickle files.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.

# Two feature sets: one without and one with the "absolute" columns.
stock_without_absolute, stock_with_absolute = (
    pd.read_pickle('./data/stock_without_absolute.pkl'),
    pd.read_pickle('./data/stock_with_absolute.pkl'),
)

# "abs" labels for the 1-, 7- and 30-day horizons.
label_abs_1d, label_abs_7d, label_abs_30d = (
    pd.read_pickle('./data/label_abs_1d.pkl'),
    pd.read_pickle('./data/label_abs_7d.pkl'),
    pd.read_pickle('./data/label_abs_30d.pkl'),
)

# "value" labels for the same three horizons.
label_value_1d, label_value_7d, label_value_30d = (
    pd.read_pickle('./data/label_value_1d.pkl'),
    pd.read_pickle('./data/label_value_7d.pkl'),
    pd.read_pickle('./data/label_value_30d.pkl'),
)


In [3]:
# stock_without_absolute.corr().style.background_gradient(cmap='coolwarm').set_precision(2)

# Result Evaluation

In [4]:
def create_grid_model(classifier, param_grid, n_splits=5, test_size=0.1,
                      scoring='accuracy', random_state=None):
    """Build an unfitted GridSearchCV that tunes ``classifier`` over ``param_grid``.

    Cross-validation uses a StratifiedShuffleSplit; the search runs with
    ``n_jobs=-1`` and ``verbose=1``.

    Parameters
    ----------
    classifier : estimator
        The estimator handed to GridSearchCV.
    param_grid : dict or list of dict
        Search space forwarded to GridSearchCV as ``param_grid``.
    n_splits : int, default 5
        Number of shuffle-split iterations.
    test_size : float or int, default 0.1
        Size of the validation part of each split.
    scoring : str, default 'accuracy'
        Metric used to rank the candidates.
    random_state : int, RandomState instance or None, default None
        Seed for the shuffle splits. The default (None) keeps the previous
        behaviour of drawing different splits on every run; pass an integer
        to make the splits reproducible.

    Returns
    -------
    GridSearchCV
        The configured, not yet fitted, search object.
    """
    cv = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size,
                                random_state=random_state)
    grid_model = GridSearchCV(classifier, param_grid=param_grid, cv=cv,
                              n_jobs=-1, verbose=1, scoring=scoring)
    return grid_model

In [32]:
def _one_hot_binary(labels):
    """Return an (n, 2) float indicator matrix: column 1 marks labels > 0.5, column 0 the rest."""
    positive = np.asarray(labels).ravel() > 0.5
    return np.column_stack((~positive, positive)).astype(float)


def result(grid_model, train_data, predicted_test, test_label, predicted_train, train_label, file_name, decision_function, clf_name="Classifier"):
    """Print an evaluation report for a fitted grid search and save it to disk.

    The report (best parameters, train/validation/test accuracy, ROC AUC,
    confusion matrix, classification report and feature importances) is
    printed, written to ``file_name + '.txt'``, and the sorted feature
    importances are written to ``file_name + '.csv'``.

    Parameters
    ----------
    grid_model : fitted search object
        Must expose ``best_params_``, ``best_score_``, ``best_estimator_`` and
        ``cv_results_`` (with 'mean_fit_time' and 'mean_score_time').
    train_data : DataFrame-like
        Only its ``columns`` are used, to name the feature importances.
    predicted_test, test_label : array-like
        Predictions and true labels for the test split.
    predicted_train, train_label : array-like
        Predictions and true labels for the training split.
    file_name : str
        Output path without extension.
    decision_function : array-like
        Scores passed to ``roc_auc_score`` for the test split.
    clf_name : str, default "Classifier"
        Display name. Names starting with 'Random' or equal to 'DecisionTree'
        have their AUC computed against one-hot encoded test labels.

    Returns
    -------
    None
    """
    print("Results for ", clf_name, ": ")
    print()
    print("The best parameters are %s" % (grid_model.best_params_,))
    acc_train = accuracy_score(train_label, predicted_train)
    acc_test = accuracy_score(test_label, predicted_test)
    print("The Train Accuracy  %0.3f" % (acc_train))
    print("The Validation Accuracy   %0.3f" % (grid_model.best_score_))
    print("The Test Accuracy   %0.3f" % (acc_test))

    # For these classifier names the scores are scored against a two-column
    # one-hot encoding of the labels; everything else uses the raw labels.
    if clf_name[:6] == 'Random' or clf_name == 'DecisionTree':
        auc_target = _one_hot_binary(test_label)
    else:
        auc_target = test_label
    # Computed once and reused for both the console and the text report.
    auc = roc_auc_score(auc_target, decision_function)
    print("AUC ROC : %0.3f" % (auc))

    print("The mean training time of %f" % (np.mean(grid_model.cv_results_['mean_fit_time'], axis=0)))
    print("The mean test time of %f" % (np.mean(grid_model.cv_results_['mean_score_time'], axis=0)))

    # confusion matrix
    print("confusion matrix / precision recall scores")
    conf_matrix = confusion_matrix(test_label, predicted_test)
    report = classification_report(test_label, predicted_test)
    print(conf_matrix)
    print(report)

    # Feature importance: always read from the model that was passed in
    # (not from a notebook global), and tolerate estimators without
    # ``feature_importances_`` by producing an empty table.
    raw_importances = getattr(grid_model.best_estimator_, 'feature_importances_', None)
    if raw_importances is None:
        feats = {}
        importances = pd.DataFrame(columns=['Gini-importance'])
    else:
        feats = dict(zip(train_data.columns, raw_importances))
        importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-importance'})
        importances = importances.sort_values(by='Gini-importance', ascending=False)
    print(importances.head(7))  # print the top 7 features with greater importance

    # json cannot serialise every numpy scalar type (e.g. float32), so the
    # values are converted to built-in floats for the text report only.
    feats_json = json.dumps({name: float(value) for name, value in feats.items()})

    # The context manager closes the file even if a write fails.
    with open(file_name + '.txt', 'w') as f:
        f.write("The best parameters are %s\n" % (grid_model.best_params_,))
        f.write("The Train Accuracy %0.3f\n" % (acc_train))
        f.write("AUC ROC : %0.3f\n" % (auc))
        f.write("The Validation Accuracy %0.3f\n" % (grid_model.best_score_))
        f.write("The Test Accuracy %0.3f\n" % (acc_test))
        f.write(str(conf_matrix) + "\n")
        f.write(str(report) + "\n\n")
        f.write(feats_json)

    importances.to_csv(file_name + '.csv')

# DecisionTree Classifier

In [6]:
# Change these for each experiment: train_data, label, filename, clfname

In [33]:
# Decision tree | features: stock_with_absolute | target: 1 day trend
train_data, label = stock_with_absolute, label_abs_1d

# Hyper-parameter values explored by the grid search.
max_depth = range(1, 20, 2)
max_features = [2, 'sqrt', None]
min_samples_split = range(10, 50, 10)
param_grid = {
    'max_depth': max_depth,
    'max_features': max_features,
    'min_samples_split': min_samples_split,
}

# Hold out 10% of the rows for testing; the scaler is fitted on the training rows only.
X_train, X_test, y_train, y_test = train_test_split(
    train_data, label, random_state=42, test_size=0.1)
scaler = MinMaxScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Grid search over the decision tree.
rfc = DecisionTreeClassifier()
rfc_model = create_grid_model(rfc, param_grid)
rfc_model.fit(X_train, y_train)

predicted_train = rfc_model.predict(X_train)
predicted_test = rfc_model.predict(X_test)
decision_function = rfc_model.predict_proba(X_test)  # predict_proba output for the test rows

result(
    rfc_model, train_data, predicted_test, y_test, predicted_train, y_train,
    "./results/decision_with_ABS_pred_1_d", decision_function,
    clf_name="DecisionTree",
)

# dot_data = tree.export_graphviz(model.best_estimator_, 
#         out_file=None, 
#         feature_names=train_data.columns, # the feature names.
#         filled=True, # Whether to fill in the boxes with colours.
#         rounded=True, # Whether to round the corners of the boxes.
#         special_characters=True)
# graph = graphviz.Source(dot_data)
# graph.render('tree')
# graph

Fitting 5 folds for each of 120 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  58 tasks      | elapsed:    0.3s


Results for  DecisionTree : 

The best parameters are {'max_depth': 9, 'max_features': 2, 'min_samples_split': 20}
The Train Accuracy  0.623
The Validation Accuracy   0.550
The Test Accuracy   0.518
AUC ROC : 0.505
The mean training time of 0.133941
The mean test time of 0.001585
confusion matrix / precision recall scores
[[46 60]
 [59 82]]
              precision    recall  f1-score   support

           0       0.44      0.43      0.44       106
           1       0.58      0.58      0.58       141

    accuracy                           0.52       247
   macro avg       0.51      0.51      0.51       247
weighted avg       0.52      0.52      0.52       247

                             Gini-importance
boll_lb_-1_d                        0.061162
adj close                           0.058876
gold_open                           0.054430
kdjj                                0.048231
y5bond_close_-1_r                   0.047561
sp500_close_-6_r                    0.045850
sp500_open_clos

[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:   11.5s finished


In [34]:
# Decision tree | features: stock_without_absolute | target: 1 day trend
train_data, label = stock_without_absolute, label_abs_1d

# Hyper-parameter values explored by the grid search.
max_depth = range(1, 20, 2)
max_features = [2, 'sqrt', None]
min_samples_split = range(10, 50, 10)
param_grid = {
    'max_depth': max_depth,
    'max_features': max_features,
    'min_samples_split': min_samples_split,
}

# Hold out 10% of the rows for testing; the scaler is fitted on the training rows only.
X_train, X_test, y_train, y_test = train_test_split(
    train_data, label, random_state=42, test_size=0.1)
scaler = MinMaxScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Grid search over the decision tree.
rfc = DecisionTreeClassifier()
rfc_model = create_grid_model(rfc, param_grid)
rfc_model.fit(X_train, y_train)

predicted_train = rfc_model.predict(X_train)
predicted_test = rfc_model.predict(X_test)
decision_function = rfc_model.predict_proba(X_test)  # predict_proba output for the test rows

result(
    rfc_model, train_data, predicted_test, y_test, predicted_train, y_train,
    "./results/decision_without_ABS_pred_1_d", decision_function,
    clf_name="DecisionTree",
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 120 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.2s


Results for  DecisionTree : 

The best parameters are {'max_depth': 3, 'max_features': 2, 'min_samples_split': 40}
The Train Accuracy  0.556
The Validation Accuracy   0.555
The Test Accuracy   0.607
AUC ROC : 0.564
The mean training time of 0.113087
The mean test time of 0.001976
confusion matrix / precision recall scores
[[ 30  76]
 [ 21 120]]
              precision    recall  f1-score   support

           0       0.59      0.28      0.38       106
           1       0.61      0.85      0.71       141

    accuracy                           0.61       247
   macro avg       0.60      0.57      0.55       247
weighted avg       0.60      0.61      0.57       247

                     Gini-importance
wr_6                        0.238614
sp500_change                0.152456
volume_delta                0.149387
open_close_diff             0.135848
rsi_6                       0.117565
high_low_diff_ratio         0.114146
gold_close_-1_r             0.091984


[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:   10.0s finished


In [35]:
# Decision tree | features: stock_with_absolute | target: 7 day trend
train_data, label = stock_with_absolute, label_abs_7d

# Hyper-parameter values explored by the grid search.
max_depth = range(1, 20, 2)
max_features = [2, 'sqrt', None]
min_samples_split = range(10, 50, 10)
param_grid = {
    'max_depth': max_depth,
    'max_features': max_features,
    'min_samples_split': min_samples_split,
}

# Hold out 10% of the rows for testing; the scaler is fitted on the training rows only.
X_train, X_test, y_train, y_test = train_test_split(
    train_data, label, random_state=42, test_size=0.1)
scaler = MinMaxScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Grid search over the decision tree.
rfc = DecisionTreeClassifier()
rfc_model = create_grid_model(rfc, param_grid)
rfc_model.fit(X_train, y_train)

predicted_train = rfc_model.predict(X_train)
predicted_test = rfc_model.predict(X_test)
decision_function = rfc_model.predict_proba(X_test)  # predict_proba output for the test rows

result(
    rfc_model, train_data, predicted_test, y_test, predicted_train, y_train,
    "./results/decision_with_ABS_pred_7_d", decision_function,
    clf_name="DecisionTree",
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 120 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Done  58 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 544 tasks      | elapsed:    9.3s


Results for  DecisionTree : 

The best parameters are {'max_depth': 19, 'max_features': 'sqrt', 'min_samples_split': 40}
The Train Accuracy  0.830
The Validation Accuracy   0.696
The Test Accuracy   0.729
AUC ROC : 0.760
The mean training time of 0.136070
The mean test time of 0.001672
confusion matrix / precision recall scores
[[ 70  40]
 [ 27 110]]
              precision    recall  f1-score   support

           0       0.72      0.64      0.68       110
           1       0.73      0.80      0.77       137

    accuracy                           0.73       247
   macro avg       0.73      0.72      0.72       247
weighted avg       0.73      0.73      0.73       247

                  Gini-importance
gold_close               0.075517
close                    0.056381
sp500_high               0.052367
atr                      0.051161
y5bond_adj close         0.041469
macds                    0.039338
y5bond_low               0.032315


[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:   11.0s finished


In [36]:
# Decision tree | features: stock_without_absolute | target: 7 day trend
train_data, label = stock_without_absolute, label_abs_7d

# Hyper-parameter values explored by the grid search.
max_depth = range(1, 20, 2)
max_features = [2, 'sqrt', None]
min_samples_split = range(10, 50, 10)
param_grid = {
    'max_depth': max_depth,
    'max_features': max_features,
    'min_samples_split': min_samples_split,
}

# Hold out 10% of the rows for testing; the scaler is fitted on the training rows only.
X_train, X_test, y_train, y_test = train_test_split(
    train_data, label, random_state=42, test_size=0.1)
scaler = MinMaxScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Grid search over the decision tree.
rfc = DecisionTreeClassifier()
rfc_model = create_grid_model(rfc, param_grid)
rfc_model.fit(X_train, y_train)

predicted_train = rfc_model.predict(X_train)
predicted_test = rfc_model.predict(X_test)
decision_function = rfc_model.predict_proba(X_test)  # predict_proba output for the test rows

result(
    rfc_model, train_data, predicted_test, y_test, predicted_train, y_train,
    "./results/decision_without_ABS_pred_7_d", decision_function,
    clf_name="DecisionTree",
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 120 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    8.9s finished


Results for  DecisionTree : 

The best parameters are {'max_depth': 15, 'max_features': None, 'min_samples_split': 10}
The Train Accuracy  0.931
The Validation Accuracy   0.659
The Test Accuracy   0.559
AUC ROC : 0.574
The mean training time of 0.101688
The mean test time of 0.001774
confusion matrix / precision recall scores
[[50 60]
 [49 88]]
              precision    recall  f1-score   support

           0       0.51      0.45      0.48       110
           1       0.59      0.64      0.62       137

    accuracy                           0.56       247
   macro avg       0.55      0.55      0.55       247
weighted avg       0.55      0.56      0.56       247

                           Gini-importance
atr                               0.110420
boll_-1_d                         0.055412
vr                                0.041831
sp500_high_low_diff_ratio         0.039833
macds                             0.037373
macd                              0.036212
macdh                    

In [37]:
# Decision tree | features: stock_with_absolute | target: 30 day trend
train_data, label = stock_with_absolute, label_abs_30d

# Hyper-parameter values explored by the grid search.
max_depth = range(1, 20, 2)
max_features = [2, 'sqrt', None]
min_samples_split = range(10, 50, 10)
param_grid = {
    'max_depth': max_depth,
    'max_features': max_features,
    'min_samples_split': min_samples_split,
}

# Hold out 10% of the rows for testing; the scaler is fitted on the training rows only.
X_train, X_test, y_train, y_test = train_test_split(
    train_data, label, random_state=42, test_size=0.1)
scaler = MinMaxScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Grid search over the decision tree.
rfc = DecisionTreeClassifier()
rfc_model = create_grid_model(rfc, param_grid)
rfc_model.fit(X_train, y_train)

predicted_train = rfc_model.predict(X_train)
predicted_test = rfc_model.predict(X_test)
decision_function = rfc_model.predict_proba(X_test)  # predict_proba output for the test rows

result(
    rfc_model, train_data, predicted_test, y_test, predicted_train, y_train,
    "./results/decision_with_ABS_pred_30_d", decision_function,
    clf_name="DecisionTree",
)

Fitting 5 folds for each of 120 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:   10.0s finished


Results for  DecisionTree : 

The best parameters are {'max_depth': 11, 'max_features': None, 'min_samples_split': 10}
The Train Accuracy  0.961
The Validation Accuracy   0.856
The Test Accuracy   0.822
AUC ROC : 0.856
The mean training time of 0.123855
The mean test time of 0.001694
confusion matrix / precision recall scores
[[ 57  24]
 [ 20 146]]
              precision    recall  f1-score   support

           0       0.74      0.70      0.72        81
           1       0.86      0.88      0.87       166

    accuracy                           0.82       247
   macro avg       0.80      0.79      0.80       247
weighted avg       0.82      0.82      0.82       247

             Gini-importance
gold_low            0.083499
atr                 0.082974
low                 0.060818
gold_open           0.052275
y10bond_low         0.050467
dma                 0.049719
sp500_open          0.042161


In [38]:
# Decision tree | features: stock_without_absolute | target: 30 day trend
train_data, label = stock_without_absolute, label_abs_30d

# Hyper-parameter values explored by the grid search.
max_depth = range(1, 20, 2)
max_features = [2, 'sqrt', None]
min_samples_split = range(10, 50, 10)
param_grid = {
    'max_depth': max_depth,
    'max_features': max_features,
    'min_samples_split': min_samples_split,
}

# Hold out 10% of the rows for testing; the scaler is fitted on the training rows only.
X_train, X_test, y_train, y_test = train_test_split(
    train_data, label, random_state=42, test_size=0.1)
scaler = MinMaxScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Grid search over the decision tree.
rfc = DecisionTreeClassifier()
rfc_model = create_grid_model(rfc, param_grid)
rfc_model.fit(X_train, y_train)

predicted_train = rfc_model.predict(X_train)
predicted_test = rfc_model.predict(X_test)
decision_function = rfc_model.predict_proba(X_test)  # predict_proba output for the test rows

result(
    rfc_model, train_data, predicted_test, y_test, predicted_train, y_train,
    "./results/decision_without_ABS_pred_30_d", decision_function,
    clf_name="DecisionTree",
)

Fitting 5 folds for each of 120 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.3s


Results for  DecisionTree : 

The best parameters are {'max_depth': 5, 'max_features': None, 'min_samples_split': 20}
The Train Accuracy  0.755
The Validation Accuracy   0.709
The Test Accuracy   0.733
AUC ROC : 0.705
The mean training time of 0.093047
The mean test time of 0.001544
confusion matrix / precision recall scores
[[ 26  55]
 [ 11 155]]
              precision    recall  f1-score   support

           0       0.70      0.32      0.44        81
           1       0.74      0.93      0.82       166

    accuracy                           0.73       247
   macro avg       0.72      0.63      0.63       247
weighted avg       0.73      0.73      0.70       247

                    Gini-importance
atr                        0.327704
macds                      0.189032
dma                        0.151851
vr                         0.069939
y10bond_close_-6_r         0.054096
close_-6_r                 0.032922
rsi_12                     0.031547


[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    8.1s finished
