In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error, accuracy_score, confusion_matrix, precision_score, classification_report
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

# Colab only: mount Google Drive so the project folder is visible to this runtime.
from google.colab import drive

drive.mount("/content/gdrive")

# Project root on Drive; input CSVs are read from here and prediction CSVs are written here.
path = "/content/gdrive/MyDrive/Colab_Notebooks/FYP/"

In [None]:
# Load the cleaned training table from the project folder.
# index_col=False tells pandas not to use the first CSV column as the index.
full_training_data = pd.read_csv(
    path + 'data/full_training_data_cleaned.csv',
    index_col=False,
)

In [None]:
# Min-Max scaling of the continuous input columns.
# The scaler is fitted on the whole table and the scaled values overwrite the
# original columns in place.
min_max_column = [
    'Open',
    'High',
    'Low',
    'Volume',
    'wsentiments',
    'HSI_overnight_ret',
    'HSI_intraday_overnight_ret_diff',
    'CSI300_overnight_ret',
    'SSE50_overnight_ret',
    'HSI_stoch_k',
    'HSI_rsi',
    'HSI_daily_ret',
    'HSI_intraday_ret',
    'HSI_ret_range',
]

# Fit first, then transform the same columns (equivalent to a single fit_transform call).
mms_X = MinMaxScaler()
mms_X.fit(full_training_data[min_max_column])
full_training_data[min_max_column] = mms_X.transform(full_training_data[min_max_column])

In [None]:
# Data Transformation for buy/sell/neutral label
# Encode the string labels as integers: buy -> 1, neutral -> 0, sell -> -1.
# The dtype guard makes this cell safe to re-run: once the column is numeric,
# mapping it through the string dictionary again would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(full_training_data['HSI_OO_ter_0.005']):
  full_training_data['HSI_OO_ter_0.005'] = full_training_data['HSI_OO_ter_0.005'].map({'buy':1, 'sell':-1, 'neutral':0})

In [None]:
# Feature selection
# Market/technical features used by the feature-only model.
classification_feature_column = [
    'Open',
    'High',
    'Low',
    'Volume',
    'HSI_overnight_ret',
    'HSI_intraday_overnight_ret_diff',
    'CSI300_overnight_ret',
    'SSE50_overnight_ret',
    'HSI_stoch_k',
    'HSI_rsi',
    'HSI_daily_ret',
    'HSI_disc_macd_1',
    'HSI_intraday_ret',
    'HSI_ret_range',
]

# The NLP variant uses the same features, in the same order, plus the sentiment column.
classification_nlp_column = classification_feature_column + ['wsentiments']

# Column subsets passed to train() as the feature matrices.
classification_feature_df = full_training_data[classification_feature_column]
classification_nlp_df = full_training_data[classification_nlp_column]

In [None]:
# Prepare the dataset
def _to_ternary_labels(scores):
  """Bucket continuous regression scores into the -1 / 0 / 1 class labels.

  Scores below -0.5 become -1, scores in [-0.5, 0.5) become 0 and everything
  else becomes 1. Returns a plain Python list of ints.
  """
  scores = np.asarray(scores)
  return np.where(scores < -0.5, -1, np.where(scores < 0.5, 0, 1)).tolist()


def train(X, y, version):
  """Tune, fit and evaluate an XGBoost regressor used as a 3-class classifier.

  The data is split randomly into train/validation/test, hyperparameters are
  tuned with a randomized search on the training split, the best estimator is
  refitted with early stopping on the validation split, and its continuous
  predictions are thresholded into -1 / 0 / 1 labels.

  Parameters:
    X: feature DataFrame. Its index must line up with the module-level
      `full_training_data`, which is used to look up the 'Date' of each row.
    y: target labels aligned with X (encoded as -1 / 0 / 1).
    version: string inserted into the name of the CSV written under `path`.

  Side effects:
    Writes path + 'xgboost_stock_prediction_classification_{version}.csv'
    containing the date-sorted predictions of all three splits, and prints
    the best hyperparameters plus accuracy, per-class precision, confusion
    matrix and classification report for each split.

  Returns:
    (X_train, X_val, X_test, y_train, y_val, y_test,
     y_pred_train, y_pred_val, y_pred_test), where the y_pred_* entries are
    lists of thresholded labels.
  """

  # Split the data into training (70%), testing (15%), and validation (15%) sets.
  # 0.1765 of the remaining 85% is roughly 15% of the full data.
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
  X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1765, random_state=42)

  # Create the XGBoost model with a reduced number of estimators
  model = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=200, random_state=42)

  # Set up the hyperparameter distributions for tuning
  # (scipy's uniform(loc, scale) samples from [loc, loc + scale])
  param_dist = {
      'max_depth': sp_randint(1, 6),
      'min_child_weight': sp_randint(1, 10),
      'gamma': sp_uniform(0, 1),
      'subsample': sp_uniform(0.4, 0.6),
      'colsample_bytree': sp_uniform(0.4, 0.6),
      'reg_alpha': sp_uniform(0, 3),
      'reg_lambda': sp_uniform(0, 3),
      'learning_rate': sp_uniform(0.005, 0.05)
  }

  # Perform randomized search with a given number of iterations
  n_iter_search = 75
  random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=n_iter_search, cv=5,
                                    scoring='neg_mean_squared_error', verbose=1, n_jobs=-1, random_state=42)

  random_search.fit(X_train, y_train)

  # Print the best combination of hyperparameters
  print("Best hyperparameters found using randomized search:")
  print(random_search.best_params_)

  # Train the model with the best hyperparameters.
  # early_stopping_rounds is set on the estimator rather than passed to fit():
  # the fit() keyword is deprecated since XGBoost 1.6 and rejected by newer releases.
  # It is set only after the search because early stopping needs an eval_set,
  # which the cross-validated search does not provide.
  best_model = random_search.best_estimator_
  best_model.set_params(early_stopping_rounds=10)
  best_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

  # Make predictions and threshold the continuous scores into class labels
  y_pred_train = _to_ternary_labels(best_model.predict(X_train))
  y_pred_val = _to_ternary_labels(best_model.predict(X_val))
  y_pred_test = _to_ternary_labels(best_model.predict(X_test))

  # Attach the date of every row to its prediction (train, then validation, then test)
  # and sort the combined table chronologically.
  prediction_frames = [
      pd.DataFrame({'Date': full_training_data.loc[features.index, 'Date'], 'Predicted': predicted})
      for features, predicted in ((X_train, y_pred_train), (X_val, y_pred_val), (X_test, y_pred_test))
  ]
  result = pd.concat(prediction_frames)
  result = result.sort_values(by='Date').reset_index(drop=True)

  # output result
  result.to_csv(path+f'xgboost_stock_prediction_classification_{version}.csv', index=False)

  # Every metric is reported for the same three splits; driving the prints from
  # one list keeps the split name and the data it describes in sync.
  evaluation_sets = [
      ('training', y_train, y_pred_train),
      ('validation', y_val, y_pred_val),
      ('testing', y_test, y_pred_test),
  ]

  for split_name, y_true, y_hat in evaluation_sets:
    print(f"Accuracy of {split_name}: {accuracy_score(y_true, y_hat)}")

  # Per-class precision of the classifier (average=None returns one value per class)
  for split_name, y_true, y_hat in evaluation_sets:
    print(f"Precision Score of {split_name}: {precision_score(y_true, y_hat, average=None)}")

  # Confusion matrix: counts of true labels (rows) against predicted labels (columns)
  for split_name, y_true, y_hat in evaluation_sets:
    print(f"Confusion matrix of {split_name}: {confusion_matrix(y_true, y_hat)}")

  for split_name, y_true, y_hat in evaluation_sets:
    print(f"Classification report of {split_name}: {classification_report(y_true, y_hat, digits=3)}")

  return X_train, X_val, X_test, y_train, y_val, y_test, y_pred_train, y_pred_val, y_pred_test

**XGBoost Vanilla + Feature**

In [None]:
# Feature-only run: technical/market columns as inputs, encoded label column as target.
X = classification_feature_df
y = full_training_data['HSI_OO_ter_0.005']
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    y_pred_train, y_pred_val, y_pred_test,
) = train(X, y, version='feature')

Fitting 5 folds for each of 75 candidates, totalling 375 fits
Best hyperparameters found using randomized search:
{'colsample_bytree': 0.7316919861447446, 'gamma': 0.2965101436477985, 'learning_rate': 0.02598904282231383, 'max_depth': 2, 'min_child_weight': 9, 'reg_alpha': 1.8345411325970415, 'reg_lambda': 0.24478254120072107, 'subsample': 0.4031109176643921}
Accuracy of training: 0.6189555125725339
Accuracy of validation: 0.5357142857142857
Accuracy of testing: 0.5135135135135135
Precision Score of training: [0.81415929 0.42756184 0.88429752]
Precision Score of training: [0.81415929 0.42756184 0.88429752]
Precision Score of testing: [0.80952381 0.34722222 0.83333333]
Confusion matrix of training: [[ 92  85   2]
 [ 19 121  12]
 [  2  77 107]]
Confusion matrix of validation: [[14 26  0]
 [ 3 21  6]
 [ 1 16 25]]
Confusion matrix of testing: [[17 34  1]
 [ 4 25  2]
 [ 0 13 15]]
Classification report of training:               precision    recall  f1-score   support

          -1      0.81



**XGBoost Vanilla + Feature + NLP**

In [None]:
# Feature + NLP run: same inputs plus the 'wsentiments' column, same target.
X = classification_nlp_df
y = full_training_data['HSI_OO_ter_0.005']
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    y_pred_train, y_pred_val, y_pred_test,
) = train(X, y, version='nlp')

Fitting 5 folds for each of 75 candidates, totalling 375 fits
Best hyperparameters found using randomized search:
{'colsample_bytree': 0.7316919861447446, 'gamma': 0.2965101436477985, 'learning_rate': 0.02598904282231383, 'max_depth': 2, 'min_child_weight': 9, 'reg_alpha': 1.8345411325970415, 'reg_lambda': 0.24478254120072107, 'subsample': 0.4031109176643921}
Accuracy of training: 0.6150870406189555
Accuracy of validation: 0.5178571428571429
Accuracy of testing: 0.4864864864864865
Precision Score of training: [0.84210526 0.42413793 0.87610619]
Precision Score of training: [0.84210526 0.42413793 0.87610619]
Precision Score of testing: [0.76190476 0.32876712 0.82352941]
Confusion matrix of training: [[ 96  81   2]
 [ 17 123  12]
 [  1  86  99]]
Confusion matrix of validation: [[12 28  0]
 [ 3 21  6]
 [ 2 15 25]]
Confusion matrix of testing: [[16 35  1]
 [ 5 24  2]
 [ 0 14 14]]
Classification report of training:               precision    recall  f1-score   support

          -1      0.84

