# Library import

In [1]:
import pandas as pd
import numpy as np

import bs4 as bs
import pickle
import requests
import time
import lxml
from datetime import datetime, timedelta

import yfinance as yf
import stockstats

import unicodedata
import json
from textblob import TextBlob

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier, IsolationForest,VotingClassifier
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from sklearn.tree import export_graphviz, DecisionTreeClassifier, tree
import graphviz 
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from sklearn.metrics import classification_report, accuracy_score



In [2]:
target="MSFT"

# Read Data

In [3]:
stock_without_absolute = pd.read_pickle('./data/'+target+'/stock_without_absolute.pkl')
stock_with_absolute = pd.read_pickle('./data/'+target+'/stock_with_absolute.pkl')

label_abs_1d = pd.read_pickle('./data/'+target+'/label_abs_1d.pkl')
label_abs_7d = pd.read_pickle('./data/'+target+'/label_abs_7d.pkl')
label_abs_30d = pd.read_pickle('./data/'+target+'/label_abs_30d.pkl')

label_value_1d = pd.read_pickle('./data/'+target+'/label_value_1d.pkl')
label_value_7d = pd.read_pickle('./data/'+target+'/label_value_7d.pkl')
label_value_30d = pd.read_pickle('./data/'+target+'/label_value_30d.pkl')


In [4]:
# stock_without_absolute.corr().style.background_gradient(cmap='coolwarm').set_precision(2)

# Result Evaluation

In [5]:
def create_grid_model(classifier, param_grid):
    cv = StratifiedShuffleSplit(n_splits=5, test_size=0.1)
    grid_model = GridSearchCV(classifier, param_grid=param_grid, cv=cv, n_jobs=-1,verbose=1, scoring='accuracy')
    return grid_model

In [6]:
def result(grid_model, train_data, predicted_test, test_label, predicted_train, train_label, file_name, decision_function, clf_name="Classifier"):
    print("Results for ", clf_name, ": ")
    print()
    print("The best parameters are %s" % (grid_model.best_params_))
    acc_train = accuracy_score(train_label, predicted_train)
    acc_test = accuracy_score(test_label, predicted_test)
    print("The Train Accuracy  %0.3f" % (acc_train))
    print("The Validation Accuracy   %0.3f" % (grid_model.best_score_))
    print("The Test Accuracy   %0.3f" % (acc_test ))
    
    test_label_roc = np.zeros((len(test_label),2) )
    for i,v in enumerate(test_label):
        if v > 0.5:
            test_label_roc[i,1] = 1
        else:
            test_label_roc[i,0] = 1
    
    
    print("AUC ROC : %0.3f" %( roc_auc_score(test_label_roc, decision_function)))

    print("The mean training time of %f" % (np.mean(grid_model.cv_results_['mean_fit_time'], axis=0)) )
    print("The mean test time of %f" % (np.mean(grid_model.cv_results_['mean_score_time'], axis=0)) )
    # confusion matrix
    print("confusion matrix / precision recall scores")
    print ( confusion_matrix(test_label, predicted_test) )
    print ( classification_report(test_label, predicted_test))
    
    f = open(file_name+'.txt','w')
    f.write("The best parameters are %s\n"% (grid_model.best_params_))
    f.write("The Train Accuracy %0.3f\n" % (acc_train))
    f.write("AUC ROC : %0.3f\n" %( roc_auc_score(test_label_roc, decision_function) ))

    f.write("The Validation Accuracy %0.3f\n" % (grid_model.best_score_))
    f.write("The Test Accuracy %0.3f\n" % (acc_test ))
    f.write( str(confusion_matrix(test_label, predicted_test)) + "\n")
    f.write( str(classification_report(test_label, predicted_test)) + "\n\n")
    f.close()
    
    pass

In [7]:
best_1d_model_param = None
best_7d_model_param = None
best_30d_model_param = None

# DecisionTree Classifier

In [8]:
# 改這些地方： train_data, label, filename, clfname

In [9]:
# stock_with_absolute, predict 1 day trend 
rfc = LogisticRegression(n_jobs=-1) # classifier
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag','saga']

# param to grid search
param_grid = dict(solver=solver)

# train data and label
train_data = stock_with_absolute
label = label_abs_1d

X_train, X_test, y_train, y_test = train_test_split(train_data, label, test_size=0.1, shuffle=True)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

rfc_model = create_grid_model(rfc, param_grid)
rfc_model.fit(X_train, y_train)

predicted_test = rfc_model.predict(X_test)
predicted_train = rfc_model.predict(X_train)
decision_function = rfc_model.predict_proba(X_test)

result(rfc_model, train_data, predicted_test, y_test, predicted_train, y_train, 
       "./results/"+target+"/lgr_with_ABS_pred_1_d", decision_function, clf_name="Logistic Regression")

best_1d_model_param = rfc_model.best_params_

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    4.2s finished


Results for  Logistic Regression : 

The best parameters are {'solver': 'saga'}
The Train Accuracy  0.570
The Validation Accuracy   0.510
The Test Accuracy   0.490
AUC ROC : 0.520
The mean training time of 0.320416
The mean test time of 0.000888
confusion matrix / precision recall scores
[[90 44]
 [83 32]]
              precision    recall  f1-score   support

           0       0.52      0.67      0.59       134
           1       0.42      0.28      0.34       115

    accuracy                           0.49       249
   macro avg       0.47      0.47      0.46       249
weighted avg       0.47      0.49      0.47       249





In [10]:
# # stock_without_absolute, predict 1 day trend 
# rfc = LogisticRegression(n_jobs=-1) # classifier
# solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag','saga']

# # param to grid search
# param_grid = dict(solver=solver)

# # train data and label
# train_data = stock_without_absolute
# label = label_abs_1d

# X_train, X_test, y_train, y_test = train_test_split(train_data, label, test_size=0.1, random_state=42)
# scaler = MinMaxScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

# rfc_model = create_grid_model(rfc, param_grid)
# rfc_model.fit(X_train, y_train)

# predicted_test = rfc_model.predict(X_test)
# predicted_train = rfc_model.predict(X_train)
# decision_function = rfc_model.predict_proba(X_test)

# result(rfc_model, train_data, predicted_test, y_test, predicted_train, y_train, 
#        "./results/lgr_without_ABS_pred_1_d", decision_function, clf_name="Logistic Regression")

In [11]:
# stock_with_absolute, predict 7 day trend 
rfc = LogisticRegression(n_jobs=-1) # classifier
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag','saga']

# param to grid search
param_grid = dict(solver=solver)

# train data and label
train_data = stock_with_absolute
label = label_abs_7d

X_train, X_test, y_train, y_test = train_test_split(train_data, label, test_size=0.1, shuffle=True)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

rfc_model = create_grid_model(rfc, param_grid)
rfc_model.fit(X_train, y_train)

predicted_test = rfc_model.predict(X_test)
predicted_train = rfc_model.predict(X_train)
decision_function = rfc_model.predict_proba(X_test)

result(rfc_model, train_data, predicted_test, y_test, predicted_train, y_train, 
       "./results/"+target+"/lgr_with_ABS_pred_7_d", decision_function, clf_name="Logistic Regression")
best_7d_model_param = rfc_model.best_params_

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 5 candidates, totalling 25 fits
Results for  Logistic Regression : 

The best parameters are {'solver': 'lbfgs'}
The Train Accuracy  0.605
The Validation Accuracy   0.563
The Test Accuracy   0.598
AUC ROC : 0.619
The mean training time of 0.318198
The mean test time of 0.000745
confusion matrix / precision recall scores
[[114  23]
 [ 77  35]]
              precision    recall  f1-score   support

           0       0.60      0.83      0.70       137
           1       0.60      0.31      0.41       112

    accuracy                           0.60       249
   macro avg       0.60      0.57      0.55       249
weighted avg       0.60      0.60      0.57       249



[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    1.3s finished


In [12]:
# # stock_without_absolute, predict 7 day trend 
# rfc = LogisticRegression(n_jobs=-1) # classifier
# solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag','saga']

# # param to grid search
# param_grid = dict(solver=solver)

# # train data and label
# train_data = stock_without_absolute
# label = label_abs_7d

# X_train, X_test, y_train, y_test = train_test_split(train_data, label, test_size=0.1, random_state=42)
# scaler = MinMaxScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

# rfc_model = create_grid_model(rfc, param_grid)
# rfc_model.fit(X_train, y_train)

# predicted_test = rfc_model.predict(X_test)
# predicted_train = rfc_model.predict(X_train)
# decision_function = rfc_model.predict_proba(X_test)

# result(rfc_model, train_data, predicted_test, y_test, predicted_train, y_train, 
#        "./results/lgr_without_ABS_pred_7_d", decision_function, clf_name="Logistic Regression")

In [13]:
# stock_with_absolute, predict 30 day trend 
rfc = LogisticRegression(n_jobs=-1) # classifier
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag','saga']

# param to grid search
param_grid = dict(solver=solver)

# train data and label
train_data = stock_with_absolute
label = label_abs_30d

X_train, X_test, y_train, y_test = train_test_split(train_data, label, test_size=0.1, shuffle=True)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

rfc_model = create_grid_model(rfc, param_grid)
rfc_model.fit(X_train, y_train)

predicted_test = rfc_model.predict(X_test)
predicted_train = rfc_model.predict(X_train)
decision_function = rfc_model.predict_proba(X_test)

result(rfc_model, train_data, predicted_test, y_test, predicted_train, y_train, 
       "./results/"+target+"/lgr_with_ABS_pred_30_d", decision_function, clf_name="Logistic Regression")
best_30d_model_param = rfc_model.best_params_

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Results for  Logistic Regression : 

The best parameters are {'solver': 'newton-cg'}
The Train Accuracy  0.706
The Validation Accuracy   0.667
The Test Accuracy   0.647
AUC ROC : 0.647
The mean training time of 0.280239
The mean test time of 0.000756
confusion matrix / precision recall scores
[[143  22]
 [ 66  18]]
              precision    recall  f1-score   support

           0       0.68      0.87      0.76       165
           1       0.45      0.21      0.29        84

    accuracy                           0.65       249
   macro avg       0.57      0.54      0.53       249
weighted avg       0.61      0.65      0.60       249



[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    1.2s finished


In [14]:
# # stock_without_absolute, predict 30 day trend 
# rfc = LogisticRegression(n_jobs=-1) # classifier
# solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag','saga']

# # param to grid search
# param_grid = dict(solver=solver)

# # train data and label
# train_data = stock_without_absolute
# label = label_abs_30d

# X_train, X_test, y_train, y_test = train_test_split(train_data, label, test_size=0.1, random_state=42)
# scaler = MinMaxScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

# rfc_model = create_grid_model(rfc, param_grid)
# rfc_model.fit(X_train, y_train)

# predicted_test = rfc_model.predict(X_test)
# predicted_train = rfc_model.predict(X_train)
# decision_function = rfc_model.predict_proba(X_test)

# result(rfc_model, train_data, predicted_test, y_test, predicted_train, y_train, 
#        "./results/lgr_without_ABS_pred_30_d", decision_function, clf_name="Logistic Regression")

In [15]:
best_list = [best_1d_model_param, best_7d_model_param,  best_30d_model_param]
labels = [label_abs_1d, label_abs_7d, label_abs_30d]
file_name = ["LR_1d", "LR_7d", "LR_30d"]

for index, best_parameters in enumerate(best_list):
    print(best_parameters)
    rfc = LogisticRegression(**best_parameters)

    # train data and label
    train_data = stock_with_absolute
    label = labels[index]

    X_train, X_test, y_train, y_test = train_test_split(train_data, label, test_size=0.1, shuffle=True)
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    rfc.fit(X_train, y_train)

    predicted_test = rfc.predict(X_test)
    predicted_train = rfc.predict(X_train)

    acc_train = accuracy_score(y_train, predicted_train)
    acc_test = accuracy_score(y_test, predicted_test)
    print("The Train Accuracy  %0.3f" % (acc_train))
    print("The Test Accuracy   %0.3f" % (acc_test ))
    pickle.dump(rfc, open('./backend/'+target+'/LR/'+file_name[index]+'.pkl','wb'))

{'solver': 'saga'}




The Train Accuracy  0.562
The Test Accuracy   0.502
{'solver': 'lbfgs'}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


The Train Accuracy  0.616
The Test Accuracy   0.550
{'solver': 'newton-cg'}
The Train Accuracy  0.703
The Test Accuracy   0.675
