<a href="https://colab.research.google.com/github/geexe/stock-price-classification-sell-buy/blob/main/DADS6003_Final_Project_Test_File_for_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Instruction

- Upload the 3 model files into Colab Folder (Folder icon on the left side)
  - model_tisco.joblib
  - model_erw.joblib
  - model_sprc.joblib
- Input desired start date, end date and interval for each respective ticker
- Run the code and obtain AUC Score

### Installations

In [1]:
!pip install pandas
!pip install numpy
!pip install requests
!pip install lxml
!pip install yfinance



In [None]:
# Resolve "cannot install ta-lib"
!wget http://prdownloads.sourceforge.net/ta-lib/ta-lib-0.4.0-src.tar.gz
!tar -xzvf ta-lib-0.4.0-src.tar.gz
%cd ta-lib
!./configure --prefix=/usr
!make
!make install
!pip install Ta-Lib

In [3]:
# Algorithms to be used
from sklearn.linear_model import *
from sklearn.neural_network import *
from sklearn.naive_bayes import *
from sklearn.neighbors import *
from sklearn.ensemble import *
from sklearn.tree import *
from sklearn.svm import *

# Preprocessing
from sklearn.preprocessing import StandardScaler

# Model Seclection
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import *
from sklearn.feature_selection import *
from sklearn import metrics
from sklearn.model_selection import cross_val_score

# import other required libraries
from pandas_datareader import data as pdr
import yfinance as yf
import talib as ta
import numpy as np
import pandas as pd
#from sklearn.externals import joblib
import joblib
from google.colab import files

# import other libraries for EDA
import seaborn as sns
import matplotlib.pyplot as plt

### Feature engineering and Functions

In [14]:
# Get Ticker Data at specified start date, end date and interval
def get_data(ticker,start_date,end_date,query_interval):
  yf.pdr_override()
  quote = yf.Ticker(ticker)
  interval = query_interval
  start = start_date
  end = end_date
  quote_price_df = quote.history(interval=interval,start=start,end=end)
  df = quote_price_df.iloc[:,:4]
  return df

# Add features to Data Frame
def add_features(df):
  df = df.reset_index()
  df = df.iloc[:,:5]
  df['S_10'] = df['Close'].rolling(window=10).mean()
  #df['Corr'] = df['Close'].rolling(window=10).corr(df['S_10'])
  df['RSI'] = ta.RSI(np.array(df['Close']), timeperiod =10)
  df['Open-Close'] = df['Open'] - df['Close'].shift(1)
  df['Open-Open'] = df['Open'] - df['Open'].shift(1)
  df['%K'], df['%D'] = ta.STOCH(df['High'], df['Low'], df['Close'])
  # df['%K'], df['%D'] = df['%K']/100, df['%D']/100
  df['UpperBand'], df['MiddleBand'], df['LowerBand'] = ta.BBANDS(df['Close'])
  # Replace upper and lower bolliger bands with the differences to the 'Close'
  df['UpperBand'] = df['UpperBand'] - df['Close']
  df['LowerBand'] = df['UpperBand'] - df['Close']
  df['Reverse_buy'] = np.where(df['High'].shift(2) < df['Close'], 1, 0)
  df['Reverse_sell'] = np.where(df['Low'].shift(2) > df['Close'], 1, 0)
  # Convert the 'Date' column to datetime
  if 'Datetime' in df.columns:
    # Since the column name changes according to the time frame given to the Yahoo Finance API:
    # "Date" if time frame is >1d, otherwise "Datetime" if <1d, we need to select the right column
    df['Date'] = pd.to_datetime(df['Datetime'])
  else:
    df['Date'] = pd.to_datetime(df['Date'])
  # Convert day of week to circular function (x,y) coordinates in a 2-dimension space using sin() and cos()
  df['Day_of_Week'] = df['Date'].dt.dayofweek
  df['Circular_Day_Sine'] = np.sin(2 * np.pi * df['Day_of_Week'] / 7)
  df['Circular_Day_Cosine'] = np.cos(2 * np.pi * df['Day_of_Week'] / 7)
  df['Is_Friday'] = (df['Day_of_Week'] == 4).astype(int)
  df['Is_Monday'] = (df['Day_of_Week'] == 0).astype(int)
  # Extract hour and minute from the datetime
  df['Hour'] = df['Date'].dt.hour
  df['Minute'] = df['Date'].dt.minute
  df['Is_10am'] = (df['Hour'] == 10).astype(int)
  df['Is_4pm'] = (df['Hour'] == 16).astype(int)
  # Convert time of day to circular values using sine and cosine
  df['Circular_Time_Sine'] = np.sin(2 * np.pi * (df['Hour'] * 60 + df['Minute']) / (24 * 60))
  df['Circular_Time_Cosine'] = np.cos(2 * np.pi * (df['Hour'] * 60 + df['Minute']) / (24 * 60))
  # Drop variables
  df = df.drop('Day_of_Week', axis = 1)
  df = df.drop('Hour', axis = 1)
  df = df.drop('Minute', axis = 1)
  df = df.drop('MiddleBand', axis = 1)
  if 'Datetime' in df.columns:
    df = df.drop('Datetime', axis = 1)
    df = df.drop('Date', axis = 1)
  else:
    df = df.drop('Date', axis = 1)

  df = df.dropna()
  return df

def define_var(data,split_proportion):
  # Define y
  # Shift -1 means data from the next period is put into the current row: means if the next period price is higher, we buy
  # Shift 1 means data from the previous period is put into the current row: means if the previous period price is higher, we sell
  X = data
  y = np.where(data['Close'].shift(-1) > data['Close'],1,0)

  buy = int(0)
  not_buy = int(0)
  for i in range(len(y)):
    if y[i] == 1:
      buy += 1
    else:
      not_buy += 1

  split = int(split_proportion * len(data))
  X_train, X_test, y_train, y_test = data[:split], data[split:], y[:split], y[split:]
  return X_train, X_test, y_train, y_test

# Define train or test data (No splitting)
def define_data(data):
# Define y
  # Shift -1 means data from the next period is put into the current row: means if the next period price is higher, we buy
  # Shift 1 means data from the previous period is put into the current row: means if the previous period price is higher, we sell
  X = data
  y = np.where(data['Close'].shift(-1) > data['Close'],1,-1)

  buy = int(0)
  not_buy = int(0)
  for i in range(len(y)):
    if y[i] == 1:
      buy += 1
    else:
      not_buy += 1

  return X, y

# Define train or test data (No splitting)
def define_data1(data):
# Define y
  # Shift -1 means data from the next period is put into the current row: means if the next period price is higher, we buy
  # Shift 1 means data from the previous period is put into the current row: means if the previous period price is higher, we sell
  X = data
  y = np.where(data['Close'].shift(-1) > data['Close'],1,-1)

  buy = int(0)
  not_buy = int(0)
  for i in range(len(y)):
    if y[i] == 1:
      buy += 1
    else:
      not_buy += 1

  return X, y, buy, not_buy

# GridSearch
def grid_search(modelType, X_train, y_train, gridParam, cv_fold,):
  grid = GridSearchCV(modelType,gridParam,cv=cv_fold)
  grid.fit(X_train, y_train)
  best_model = grid.best_estimator_
  return best_model

# Build model and output report (Input gridseach 'best_model' for modelType). However, this is not recommended as GridSearchCV uses the same training data
# Use nested_cv instead for GridSearch and model evaluation with 2-step CV
def train_model(modelType, X_train, X_test, y_train, y_test):

  model = modelType
  model.fit(X_train, y_train)

  y_pred = model.predict(X_test)
  y_prob = model.predict_proba(X_test)[:,1]

  conf_matrix = metrics.confusion_matrix(y_test, y_pred)

  #Classification Report
  report = metrics.classification_report(y_test, y_pred)

  # Model Accuracy
  accuracy = model.score(X_test,y_test)

  # Cross Validation
  cross_val = cross_val_score(modelType, X_train, y_train, scoring = 'roc_auc', cv = 5)

  # AUC_ROC
  auc_score = roc_auc_score(y_test, y_prob)

  return X_train, X_test, y_train, y_test, y_pred, model, conf_matrix, report, accuracy, cross_val, auc_score

# Nested Cross-Validation
def nested_cv(modelType, X, y, gridParam, outerCvFold, innerCvFold,state):
    # Outer cross-validation (for model evaluation)
  outer_cv = StratifiedKFold(n_splits=outerCvFold, shuffle=True, random_state=state)

  # Inner cross-validation (for hyperparameter tuning)
  inner_cv = StratifiedKFold(n_splits=innerCvFold, shuffle=True,random_state=state)

  # Nested cross-validation
  nested_scores = []

  for train_index, test_index in outer_cv.split(X, y):

    # X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:] # X is pandas DataFrame so need to use .iloc[Slice,Slice]
    X_train, X_test = X[train_index], X[test_index] # X is pandas DataFrame so need to use .iloc[Slice,Slice]
    y_train, y_test = y[train_index], y[test_index] # y is np array so can use index directly

    # GridSearchCV for hyperparameter tuning
    grid_search = GridSearchCV(estimator=modelType, param_grid=gridParam, scoring='roc_auc', cv=inner_cv)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    # Evaluate the model on the outer test set
    outer_score = grid_search.score(X_test, y_test)
    nested_scores.append(outer_score)
  return best_model, nested_scores

###Test on new data

TISCO

In [32]:
# Define ticker, start, end and interval
ticker = 'TISCO.BK'
start_time = '2023-12-29'
end_time = '2024-01-13'
interval = '15m'

# Load saved model
loaded_model = joblib.load('/content/model_tisco.joblib')

df_test = get_data(ticker,start_time, end_time, interval)
df_test = add_features(df_test)
X_test, y_test = define_data(df_test)
X_test = X_test.to_numpy()

y_prob = loaded_model.predict_proba(X_test)[:,1]
y_pred = loaded_model.predict(X_test)
aucScore = roc_auc_score(y_test,y_prob)

# Get AUC
print(f"AUC Score for {ticker}: {aucScore}")

AUC Score for TISCO.BK: 0.8215634989828537


SPRC

In [33]:
# Define ticker, start, end and interval
ticker = 'SPRC.BK'
start_time = '2023-12-29'
end_time = '2024-01-13'
interval = '15m'

# Load saved model
loaded_model = joblib.load('/content/model_tisco.joblib')

df_test = get_data(ticker,start_time, end_time, interval)
df_test = add_features(df_test)
X_test, y_test = define_data(df_test)
X_test = X_test.to_numpy()

y_prob = loaded_model.predict_proba(X_test)[:,1]
y_pred = loaded_model.predict(X_test)
aucScore = roc_auc_score(y_test,y_prob)

# Get AUC
print(f"AUC Score for {ticker}: {aucScore}")

AUC Score for SPRC.BK: 0.6424595141700405


ERW

In [35]:
# Define ticker, start, end and interval
ticker = 'ERW.BK'
start_time = '2023-12-29'
end_time = '2024-01-13'
interval = '15m'

# Load saved model
loaded_model = joblib.load('/content/model_erw.joblib')

df_test = get_data(ticker,start_time, end_time, interval)
df_test = add_features(df_test)
X_test, y_test = define_data(df_test)
X_test = X_test.to_numpy()

y_prob = loaded_model.predict_proba(X_test)[:,1]
y_pred = loaded_model.predict(X_test)
aucScore = roc_auc_score(y_test,y_prob)

# Get AUC
print(f"AUC Score for {ticker}: {aucScore}")

AUC Score for ERW.BK: 0.7449874686716792
