# Library import

In [213]:
import pandas as pd
import numpy as np

import bs4 as bs
import pickle
import requests
import time
import lxml
from datetime import datetime, timedelta

import yfinance as yf
import stockstats

import unicodedata
import json
from textblob import TextBlob

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier, IsolationForest,VotingClassifier
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from sklearn.tree import export_graphviz
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from sklearn.metrics import classification_report, accuracy_score

# Read Data

In [214]:
# Load the pre-computed feature frames and label sets from ./data.
# pd.read_pickle unpickles arbitrary objects, so only point it at files
# produced by this project.

# Feature frames: the experiment cells below assign one of these to train_data.
stock_without_absolute, stock_with_absolute = (
    pd.read_pickle(pkl_path)
    for pkl_path in (
        './data/stock_without_absolute.pkl',
        './data/stock_with_absolute.pkl',
    )
)

# "abs" labels for the 1-, 7- and 30-day targets (used as `label` below).
label_abs_1d, label_abs_7d, label_abs_30d = (
    pd.read_pickle(pkl_path)
    for pkl_path in (
        './data/label_abs_1d.pkl',
        './data/label_abs_7d.pkl',
        './data/label_abs_30d.pkl',
    )
)

# "value" labels for the same three horizons.
label_value_1d, label_value_7d, label_value_30d = (
    pd.read_pickle(pkl_path)
    for pkl_path in (
        './data/label_value_1d.pkl',
        './data/label_value_7d.pkl',
        './data/label_value_30d.pkl',
    )
)


# Result Evaluation

In [215]:
def create_grid_model(classifier, param_grid, random_state=None):
    """Wrap ``classifier`` in an accuracy-scored grid search.

    Cross-validation uses 5 stratified shuffle splits, each holding out 10%
    of the rows passed to ``fit``. The search runs on all available cores
    (``n_jobs=-1``) and reports progress (``verbose=1``).

    Parameters
    ----------
    classifier : estimator
        Unfitted estimator to tune.
    param_grid : dict
        Mapping of parameter name to the list of values to try.
    random_state : int or None, optional
        Seed forwarded to ``StratifiedShuffleSplit``. The default ``None``
        keeps the previous behaviour (a different shuffle on every run);
        pass an integer to make the validation folds reproducible.

    Returns
    -------
    GridSearchCV
        The configured, not yet fitted, search object.
    """
    cv = StratifiedShuffleSplit(n_splits=5, test_size=0.1, random_state=random_state)
    grid_model = GridSearchCV(classifier, param_grid=param_grid, cv=cv, n_jobs=-1, verbose=1, scoring='accuracy')
    return grid_model

In [216]:
def result(grid_model, train_data, predicted_test, test_label, predicted_train, train_label, file_name, decision_function, clf_name="Classifier"):
    """Print an evaluation report for a fitted grid search and save it to disk.

    The report contains the best parameters, train / validation / test
    accuracy, ROC AUC, mean fit and score times, the confusion matrix, the
    classification report and (when the estimator exposes them) the feature
    importances. The text report is written to ``file_name + '.txt'`` and the
    sorted importances to ``file_name + '.csv'``.

    Parameters
    ----------
    grid_model : fitted GridSearchCV
        Provides ``best_params_``, ``best_score_``, ``cv_results_`` and
        ``best_estimator_``.
    train_data : DataFrame
        Only ``train_data.columns`` is read, to name the feature importances.
    predicted_test, predicted_train : array-like
        Predicted labels for the test and training rows.
    test_label, train_label : array-like
        True labels for the test and training rows.
    file_name : str
        Output path without extension.
    decision_function : array-like
        Scores handed to ``roc_auc_score``. When ``clf_name`` starts with
        ``'Random'`` or equals ``'DecisionTree'`` they are compared against a
        two-column one-hot encoding of ``test_label`` (column 1 = label > 0.5),
        which matches the two-column ``predict_proba`` output the notebook
        passes in. Otherwise they are compared against ``test_label`` directly.
    clf_name : str, optional
        Display name; also selects the ROC AUC branch described above.

    Returns
    -------
    None
    """
    # Always inspect the model that was passed in, never a notebook global.
    best_estimator = grid_model.best_estimator_

    acc_train = accuracy_score(train_label, predicted_train)
    acc_test = accuracy_score(test_label, predicted_test)

    if clf_name[:6] == 'Random' or clf_name == 'DecisionTree':
        # One-hot encode the labels so they line up with per-class scores.
        positive = np.asarray(test_label).ravel() > 0.5
        test_label_roc = np.column_stack((~positive, positive)).astype(float)
        auc = roc_auc_score(test_label_roc, decision_function)
    else:
        auc = roc_auc_score(test_label, decision_function)

    # Computed once and reused for both the console and the text file.
    conf_matrix = confusion_matrix(test_label, predicted_test)
    report = classification_report(test_label, predicted_test)

    print("Results for ", clf_name, ": ")
    print()
    print("The best parameters are %s" % (grid_model.best_params_,))
    print("The Train Accuracy  %0.3f" % (acc_train))
    print("The Validation Accuracy   %0.3f" % (grid_model.best_score_))
    print("The Test Accuracy   %0.3f" % (acc_test))
    print("AUC ROC : %0.3f" % (auc))
    print("The mean training time of %f" % (np.mean(grid_model.cv_results_['mean_fit_time'], axis=0)))
    print("The mean test time of %f" % (np.mean(grid_model.cv_results_['mean_score_time'], axis=0)))
    print("confusion matrix / precision recall scores")
    print(conf_matrix)
    print(report)

    # Feature importances: not every estimator has them, so look them up
    # defensively instead of failing before the report is written.
    raw_importances = getattr(best_estimator, 'feature_importances_', None)
    if raw_importances is None:
        feats = {}
        print("Feature importances are not available for this estimator")
    else:
        # float() keeps json.dumps working for non-float64 importances.
        feats = {
            feature: float(importance)
            for feature, importance in zip(train_data.columns, raw_importances)
        }
    importances = pd.DataFrame({'Gini-importance': pd.Series(feats, dtype=float)})
    importances = importances.sort_values(by='Gini-importance', ascending=False)
    if feats:
        print(importances.head(7))  # top 7 features by importance

    # The context manager closes the file even if one of the writes raises.
    with open(file_name + '.txt', 'w') as f:
        f.write("The best parameters are %s\n" % (grid_model.best_params_,))
        f.write("The Train Accuracy %0.3f\n" % (acc_train))
        f.write("AUC ROC : %0.3f\n" % (auc))
        f.write("The Validation Accuracy %0.3f\n" % (grid_model.best_score_))
        f.write("The Test Accuracy %0.3f\n" % (acc_test))
        f.write(str(conf_matrix) + "\n")
        f.write(str(report) + "\n\n")
        f.write(json.dumps(feats))

    importances.to_csv(file_name + '.csv')

# Random Forest Classifier

In [None]:
# For each experiment below, change: train_data, label, the output file name, and clf_name

In [219]:
# Experiment: stock_with_absolute features, 1-day trend label.
# The forest is seeded so a re-run grows the same trees for a given fold;
# this seed does not affect the shuffle split built by create_grid_model.
rfc = RandomForestClassifier(random_state=42)

# Grid explored by the search: 5 depths x 3 forest sizes x 2 feature settings.
max_depth = [2, 4, 8, 10, 20]
n_estimators = [100, 500, 1000]
max_features = [2, 'sqrt']
param_grid = dict(max_depth=max_depth, n_estimators=n_estimators, max_features=max_features)

# Features and label for this experiment.
train_data = stock_with_absolute
label = label_abs_1d

# Shuffled 90/10 hold-out split.
# NOTE(review): if the rows are consecutive trading days, a shuffled split can
# leak information between neighbouring rows -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(train_data, label, test_size=0.1, random_state=42)

rfc_model = create_grid_model(rfc, param_grid)
rfc_model.fit(X_train, y_train)

predicted_test = rfc_model.predict(X_test)
predicted_train = rfc_model.predict(X_train)
# Per-class probabilities on the hold-out rows; result() passes them to roc_auc_score.
decision_function = rfc_model.predict_proba(X_test)

result(rfc_model, train_data, predicted_test, y_test, predicted_train, y_train,
       "./results/rfc_with_ABS_pred_1_d", decision_function, clf_name="RandomForest")


Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.9s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  1.6min finished


Results for  RandomForest : 

The best parameters are {'max_depth': 4, 'max_features': 2, 'n_estimators': 100}
The Train Accuracy  0.673
The Validation Accuracy   0.531
The Test Accuracy   0.538
AUC ROC : 0.480
The mean training time of 4.416464
The mean test time of 0.090529
confusion matrix / precision recall scores
[[ 20  86]
 [ 28 113]]
              precision    recall  f1-score   support

           0       0.42      0.19      0.26       106
           1       0.57      0.80      0.66       141

    accuracy                           0.54       247
   macro avg       0.49      0.50      0.46       247
weighted avg       0.50      0.54      0.49       247

                        Gini-importance
y5bond_high_low_diff           0.019197
y5bond_open_close_diff         0.018199
gold_high                      0.017718
y10bond_high                   0.017347
rsi_12                         0.017284
y10bond_close_-1_r             0.017247
open_close_diff                0.016964


In [220]:
# Experiment: stock_without_absolute features, 1-day trend label.
# The forest is seeded so a re-run grows the same trees for a given fold;
# this seed does not affect the shuffle split built by create_grid_model.
rfc = RandomForestClassifier(random_state=42)

# Grid explored by the search: 5 depths x 3 forest sizes x 2 feature settings.
max_depth = [2, 4, 8, 10, 20]
n_estimators = [100, 500, 1000]
max_features = [2, 'sqrt']
param_grid = dict(max_depth=max_depth, n_estimators=n_estimators, max_features=max_features)

# Features and label for this experiment.
train_data = stock_without_absolute
label = label_abs_1d

# Shuffled 90/10 hold-out split.
# NOTE(review): if the rows are consecutive trading days, a shuffled split can
# leak information between neighbouring rows -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(train_data, label, test_size=0.1, random_state=42)

rfc_model = create_grid_model(rfc, param_grid)
rfc_model.fit(X_train, y_train)

predicted_test = rfc_model.predict(X_test)
predicted_train = rfc_model.predict(X_train)
# Per-class probabilities on the hold-out rows; result() passes them to roc_auc_score.
decision_function = rfc_model.predict_proba(X_test)

result(rfc_model, train_data, predicted_test, y_test, predicted_train, y_train,
       "./results/rfc_without_ABS_pred_1_d", decision_function, clf_name="RandomForest")

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  1.4min finished


Results for  RandomForest : 

The best parameters are {'max_depth': 20, 'max_features': 'sqrt', 'n_estimators': 100}
The Train Accuracy  1.000
The Validation Accuracy   0.551
The Test Accuracy   0.555
AUC ROC : 0.562
The mean training time of 4.211553
The mean test time of 0.098626
confusion matrix / precision recall scores
[[51 55]
 [55 86]]
              precision    recall  f1-score   support

           0       0.48      0.48      0.48       106
           1       0.61      0.61      0.61       141

    accuracy                           0.55       247
   macro avg       0.55      0.55      0.55       247
weighted avg       0.55      0.55      0.55       247

                      Gini-importance
sp500_volume_delta           0.020237
boll_lb_-1_d                 0.019369
news_title_score             0.019357
vr                           0.018922
news_des_score               0.018594
y10bond_close_-6_r           0.018266
y5bond_high_low_diff         0.018199


In [222]:
# Experiment: stock_with_absolute features, 7-day trend label.
# The forest is seeded so a re-run grows the same trees for a given fold;
# this seed does not affect the shuffle split built by create_grid_model.
rfc = RandomForestClassifier(random_state=42)

# Grid explored by the search: 5 depths x 3 forest sizes x 2 feature settings.
max_depth = [2, 4, 8, 10, 20]
n_estimators = [100, 500, 1000]
max_features = [2, 'sqrt']
param_grid = dict(max_depth=max_depth, n_estimators=n_estimators, max_features=max_features)

# Features and label for this experiment.
train_data = stock_with_absolute
label = label_abs_7d

# Shuffled 90/10 hold-out split.
# NOTE(review): if the rows are consecutive trading days, a multi-day label
# horizon combined with a shuffled split can leak overlapping information
# between train and test rows -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(train_data, label, test_size=0.1, random_state=42)

rfc_model = create_grid_model(rfc, param_grid)
rfc_model.fit(X_train, y_train)

predicted_test = rfc_model.predict(X_test)
predicted_train = rfc_model.predict(X_train)
# Per-class probabilities on the hold-out rows; result() passes them to roc_auc_score.
decision_function = rfc_model.predict_proba(X_test)

result(rfc_model, train_data, predicted_test, y_test, predicted_train, y_train,
       "./results/rfc_with_ABS_pred_7_d", decision_function, clf_name="RandomForest")

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  1.5min finished


Results for  RandomForest : 

The best parameters are {'max_depth': 20, 'max_features': 'sqrt', 'n_estimators': 500}
The Train Accuracy  1.000
The Validation Accuracy   0.769
The Test Accuracy   0.781
AUC ROC : 0.890
The mean training time of 4.479552
The mean test time of 0.092636
confusion matrix / precision recall scores
[[ 67  43]
 [ 11 126]]
              precision    recall  f1-score   support

           0       0.86      0.61      0.71       110
           1       0.75      0.92      0.82       137

    accuracy                           0.78       247
   macro avg       0.80      0.76      0.77       247
weighted avg       0.80      0.78      0.77       247

                 Gini-importance
atr                     0.026383
dma                     0.018068
macds                   0.017120
sp500_open              0.016855
sp500_high              0.016242
sp500_adj close         0.016201
gold_low                0.016124


In [221]:
# Experiment: stock_without_absolute features, 7-day trend label.
# The forest is seeded so a re-run grows the same trees for a given fold;
# this seed does not affect the shuffle split built by create_grid_model.
rfc = RandomForestClassifier(random_state=42)

# Grid explored by the search: 5 depths x 3 forest sizes x 2 feature settings.
max_depth = [2, 4, 8, 10, 20]
n_estimators = [100, 500, 1000]
max_features = [2, 'sqrt']
param_grid = dict(max_depth=max_depth, n_estimators=n_estimators, max_features=max_features)

# Features and label for this experiment.
train_data = stock_without_absolute
label = label_abs_7d

# Shuffled 90/10 hold-out split.
# NOTE(review): if the rows are consecutive trading days, a multi-day label
# horizon combined with a shuffled split can leak overlapping information
# between train and test rows -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(train_data, label, test_size=0.1, random_state=42)

rfc_model = create_grid_model(rfc, param_grid)
rfc_model.fit(X_train, y_train)

predicted_test = rfc_model.predict(X_test)
predicted_train = rfc_model.predict(X_train)
# Per-class probabilities on the hold-out rows; result() passes them to roc_auc_score.
decision_function = rfc_model.predict_proba(X_test)

result(rfc_model, train_data, predicted_test, y_test, predicted_train, y_train,
       "./results/rfc_without_ABS_pred_7_d", decision_function, clf_name="RandomForest")

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  1.5min finished


Results for  RandomForest : 

The best parameters are {'max_depth': 20, 'max_features': 'sqrt', 'n_estimators': 1000}
The Train Accuracy  1.000
The Validation Accuracy   0.666
The Test Accuracy   0.640
AUC ROC : 0.725
The mean training time of 4.299433
The mean test time of 0.094797
confusion matrix / precision recall scores
[[ 33  77]
 [ 12 125]]
              precision    recall  f1-score   support

           0       0.73      0.30      0.43       110
           1       0.62      0.91      0.74       137

    accuracy                           0.64       247
   macro avg       0.68      0.61      0.58       247
weighted avg       0.67      0.64      0.60       247

                 Gini-importance
atr                     0.038052
vr                      0.023309
gold_close_-6_r         0.021191
macds                   0.020025
boll_lb_-1_d            0.019995
macdh                   0.019972
boll_-1_d               0.019800


In [223]:
# Experiment: stock_with_absolute features, 30-day trend label.
# The forest is seeded so a re-run grows the same trees for a given fold;
# this seed does not affect the shuffle split built by create_grid_model.
rfc = RandomForestClassifier(random_state=42)

# Grid explored by the search: 5 depths x 3 forest sizes x 2 feature settings.
max_depth = [2, 4, 8, 10, 20]
n_estimators = [100, 500, 1000]
max_features = [2, 'sqrt']
param_grid = dict(max_depth=max_depth, n_estimators=n_estimators, max_features=max_features)

# Features and label for this experiment.
train_data = stock_with_absolute
label = label_abs_30d

# Shuffled 90/10 hold-out split.
# NOTE(review): if the rows are consecutive trading days, a multi-day label
# horizon combined with a shuffled split can leak overlapping information
# between train and test rows -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(train_data, label, test_size=0.1, random_state=42)

rfc_model = create_grid_model(rfc, param_grid)
rfc_model.fit(X_train, y_train)

predicted_test = rfc_model.predict(X_test)
predicted_train = rfc_model.predict(X_train)
# Per-class probabilities on the hold-out rows; result() passes them to roc_auc_score.
decision_function = rfc_model.predict_proba(X_test)

result(rfc_model, train_data, predicted_test, y_test, predicted_train, y_train,
       "./results/rfc_with_ABS_pred_30_d", decision_function, clf_name="RandomForest")

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  1.4min finished


Results for  RandomForest : 

The best parameters are {'max_depth': 20, 'max_features': 'sqrt', 'n_estimators': 500}
The Train Accuracy  1.000
The Validation Accuracy   0.909
The Test Accuracy   0.931
AUC ROC : 0.976
The mean training time of 4.203321
The mean test time of 0.091152
confusion matrix / precision recall scores
[[ 69  12]
 [  5 161]]
              precision    recall  f1-score   support

           0       0.93      0.85      0.89        81
           1       0.93      0.97      0.95       166

    accuracy                           0.93       247
   macro avg       0.93      0.91      0.92       247
weighted avg       0.93      0.93      0.93       247

             Gini-importance
atr                 0.033963
sp500_high          0.030688
sp500_close         0.028396
sp500_open          0.028332
sp500_low           0.027637
gold_open           0.026618
gold_high           0.026243


In [224]:
# Experiment: stock_without_absolute features, 30-day trend label.
# The forest is seeded so a re-run grows the same trees for a given fold;
# this seed does not affect the shuffle split built by create_grid_model.
rfc = RandomForestClassifier(random_state=42)

# Grid explored by the search: 5 depths x 3 forest sizes x 2 feature settings.
max_depth = [2, 4, 8, 10, 20]
n_estimators = [100, 500, 1000]
max_features = [2, 'sqrt']
param_grid = dict(max_depth=max_depth, n_estimators=n_estimators, max_features=max_features)

# Features and label for this experiment.
train_data = stock_without_absolute
label = label_abs_30d

# Shuffled 90/10 hold-out split.
# NOTE(review): if the rows are consecutive trading days, a multi-day label
# horizon combined with a shuffled split can leak overlapping information
# between train and test rows -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(train_data, label, test_size=0.1, random_state=42)

rfc_model = create_grid_model(rfc, param_grid)
rfc_model.fit(X_train, y_train)

predicted_test = rfc_model.predict(X_test)
predicted_train = rfc_model.predict(X_train)
# Per-class probabilities on the hold-out rows; result() passes them to roc_auc_score.
decision_function = rfc_model.predict_proba(X_test)

result(rfc_model, train_data, predicted_test, y_test, predicted_train, y_train,
       "./results/rfc_without_ABS_pred_30_d", decision_function, clf_name="RandomForest")

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  1.3min finished


Results for  RandomForest : 

The best parameters are {'max_depth': 20, 'max_features': 'sqrt', 'n_estimators': 500}
The Train Accuracy  1.000
The Validation Accuracy   0.749
The Test Accuracy   0.781
AUC ROC : 0.860
The mean training time of 3.852295
The mean test time of 0.087666
confusion matrix / precision recall scores
[[ 35  46]
 [  8 158]]
              precision    recall  f1-score   support

           0       0.81      0.43      0.56        81
           1       0.77      0.95      0.85       166

    accuracy                           0.78       247
   macro avg       0.79      0.69      0.71       247
weighted avg       0.79      0.78      0.76       247

           Gini-importance
atr               0.062091
macds             0.038920
dma               0.034029
macd              0.030345
vr                0.030340
macdh             0.021150
boll_-1_d         0.019527
