In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf
import xgboost as xgb
import glob
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error, accuracy_score, confusion_matrix, precision_score, classification_report
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates
from google.colab import drive

# Mount Google Drive at /content/gdrive so the project files below can be read and written.
drive.mount('/content/gdrive')

# Project root on the mounted drive. It ends with '/', and later cells build file
# names by plain string concatenation (path + 'data/...').
path = '/content/gdrive/MyDrive/Colab_Notebooks/FYP/'

In [None]:
# read data
full_training_data = pd.read_csv(path+'data/full_training_data_cleaned.csv',index_col=False)

# `path` already ends with '/', so no extra separator is needed here.
# glob returns matches in arbitrary order; sorting keeps the concatenation
# (and therefore the within-day text order below) reproducible between runs.
csv_files = sorted(glob.glob(path+'data/news/*.csv'))
if not csv_files:
    # pd.concat would otherwise fail with an unhelpful "No objects to concatenate".
    raise FileNotFoundError('No news CSV files found under ' + path + 'data/news/')
df_concat = pd.concat([pd.read_csv(f) for f in csv_files], ignore_index=True)

# data transformation
# Normalise timestamps to 'YYYY-MM-DD' strings; these sort chronologically as text.
df_concat['created_at'] = pd.to_datetime(df_concat['created_at'], dayfirst=True).dt.strftime('%Y-%m-%d')
# A stable sort keeps articles of the same day in their loaded order.
df_concat = df_concat.sort_values(by=['created_at'], kind='stable')

# A missing description or content would make the concatenation NaN, which a
# later str() cast turns into the literal text 'nan'. Treat missing parts as empty.
df_concat['combined_text'] = (
    df_concat['description'].fillna('').astype(str)
    + ' '
    + df_concat['content'].fillna('').astype(str)
)
# Collapse whitespace runs and drop the stray edge space left when one part was empty.
df_concat['combined_text'] = (
    df_concat['combined_text'].str.replace(r'\s+', ' ', regex=True).str.strip()
)

# One row per day: join all of that day's article texts with single spaces.
# Aggregating on the DataFrame groupby (not on a single selected column) keeps the
# dict form of .agg valid; passing a dict to a one-column selection is deprecated
# in recent pandas releases.
df_concat = (
    df_concat[['created_at', 'combined_text']]
    .groupby('created_at', as_index=False)
    .agg({'combined_text': ' '.join})
)

In [None]:
# Bring the date key into the same 'YYYY-MM-DD' string form used by df_concat.
full_training_data['created_at'] = (
    pd.to_datetime(full_training_data['created_at']).dt.strftime('%Y-%m-%d')
)

# Left join keeps every training row; days without news get NaN text.
# Only the date, the daily text and the target column are kept.
required_columns = ['created_at', 'combined_text', 'HSI_OO_ter_0.005']
full_training_data_combined = pd.merge(
    full_training_data,
    df_concat,
    how='left',
    on='created_at',
)[required_columns]

In [None]:
import torch
import numpy as np
from transformers import BertTokenizer

# Tokenizer for the 'bert-base-cased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Class id assigned to each label string of the target column.
labels = dict(buy=0, neutral=1, sell=2)

# create dataset
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenized text with an integer class id.

    All rows are tokenized once in the constructor, using the module-level
    `tokenizer` and `labels` objects.
    """

    def __init__(self, df):
        """Encode the 'HSI_OO_ter_0.005' and 'combined_text' columns of ``df``."""
        # Label strings -> class ids through the module-level `labels` mapping.
        self.labels = [labels[name] for name in df['HSI_OO_ter_0.005']]

        # Every text is cast to str, then padded/truncated to 512 tokens.
        encoded = []
        for raw_text in df['combined_text']:
            encoded.append(
                tokenizer(
                    str(raw_text),
                    padding='max_length',
                    max_length=512,
                    truncation=True,
                    return_tensors="pt",
                )
            )
        self.texts = encoded

    def classes(self):
        """Return the list of integer class ids, one per row."""
        return self.labels

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label(s) selected by ``idx`` wrapped in a NumPy array."""
        selected = self.labels[idx]
        return np.array(selected)

    def get_batch_texts(self, idx):
        """Return the stored tokenizer output(s) selected by ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return the ``(tokenized_text, label)`` pair for ``idx``."""
        return self.get_batch_texts(idx), self.get_batch_labels(idx)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
np.random.seed(112)

# Shuffle once with a fixed seed, then cut at 70% and 85% of the rows to get a
# 70/15/15 train/validation/test split.
# Positional slicing replaces np.split(DataFrame, ...), which relies on the
# deprecated DataFrame.swapaxes and emits a FutureWarning on recent pandas.
# NOTE(review): rows carry a 'created_at' date; a random shuffle mixes earlier and
# later days across the three splits. Confirm this is intended.
shuffled_data = full_training_data_combined.sample(frac=1, random_state=42)
train_end = int(.7*len(shuffled_data))
val_end = int(.85*len(shuffled_data))

df_train = shuffled_data.iloc[:train_end]
df_val = shuffled_data.iloc[train_end:val_end]
df_test = shuffled_data.iloc[val_end:]

In [None]:
from torch import nn
from transformers import BertModel

# BERT model
class BertClassifier(nn.Module):
    """'bert-base-cased' encoder followed by dropout, a 768 -> 3 linear layer and LeakyReLU."""

    def __init__(self, dropout=0.5):
        """Load the pretrained encoder and build the classification head.

        Args:
            dropout: Drop probability applied to the encoder's pooled output.
        """
        super().__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 3)
        # Despite the attribute name this is a LeakyReLU with negative slope 0.1.
        self.relu = nn.LeakyReLU(0.1)

    def forward(self, input_id, mask):
        """Return one score per class for each sequence in the batch.

        Args:
            input_id: Token ids passed to the encoder as ``input_ids``.
            mask: Attention mask passed to the encoder as ``attention_mask``.
        """
        # With return_dict=False the encoder returns a tuple; only its second
        # element (the pooled output) feeds the head.
        _sequence_output, pooled = self.bert(
            input_ids=input_id,
            attention_mask=mask,
            return_dict=False,
        )
        # NOTE(review): the activation is applied to the final scores that the
        # training loop feeds to CrossEntropyLoss. Confirm this is intended.
        return self.relu(self.linear(self.dropout(pooled)))

In [None]:
from torch.optim import Adam
from tqdm import tqdm

#model training
def train(model, train_data, val_data, learning_rate, epochs, batch_size=2):
    """Fine-tune ``model`` and print loss/accuracy for both splits after every epoch.

    Args:
        model: Classifier called as ``model(input_id, mask)`` that returns one
            score per class for every sequence in the batch.
        train_data: DataFrame wrapped in ``Dataset`` and used for optimisation.
        val_data: DataFrame wrapped in ``Dataset`` and used for per-epoch validation.
        learning_rate: Learning rate handed to Adam.
        epochs: Number of passes over ``train_data``.
        batch_size: Mini-batch size for both loaders (default 2, the value that
            was previously hard-coded).

    Returns:
        None. Metrics are printed and the model's weights are updated in place.
    """
    train_set, val_set = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
    # Validation metrics do not depend on sample order, so no shuffling is needed.
    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size=batch_size, shuffle=False)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Move the model before building the optimizer so it tracks the on-device parameters.
    model = model.to(device)
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = Adam(model.parameters(), lr=learning_rate)

    # Guard the per-sample averages against an empty split.
    n_train = max(len(train_data), 1)
    n_val = max(len(val_data), 1)

    for epoch_num in range(epochs):

        # Training mode: dropout is active.
        model.train()

        total_acc_train = 0
        total_loss_train = 0

        for train_input, train_label in tqdm(train_dataloader):
            train_label = train_label.to(device).long()
            # The tokenizer returns (1, seq_len) tensors per sample, so batching
            # adds a singleton dimension; drop it for both inputs.
            mask = train_input['attention_mask'].squeeze(1).to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)

            batch_loss = criterion(output, train_label)
            # CrossEntropyLoss averages over the batch; weight by the batch size
            # so the epoch total divided by the sample count is a true mean.
            total_loss_train += batch_loss.item() * train_label.size(0)
            total_acc_train += (output.argmax(dim=1) == train_label).sum().item()

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

        # Evaluation mode: dropout is disabled so validation is deterministic.
        model.eval()

        total_acc_val = 0
        total_loss_val = 0

        with torch.no_grad():

            for val_input, val_label in val_dataloader:

                val_label = val_label.to(device).long()
                mask = val_input['attention_mask'].squeeze(1).to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)

                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item() * val_label.size(0)
                total_acc_val += (output.argmax(dim=1) == val_label).sum().item()

        # Keeps the column layout of the log line used by earlier runs.
        gap = ' ' * 16
        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / n_train: .3f} '
            f'{gap}| Train Accuracy: {total_acc_train / n_train: .3f} '
            f'{gap}| Val Loss: {total_loss_val / n_val: .3f} '
            f'{gap}| Val Accuracy: {total_acc_val / n_val: .3f}')

# Fine-tuning hyper-parameters.
LR = 1e-6
EPOCHS = 20

# Build a fresh classifier and fine-tune it on the training split,
# reporting metrics on the validation split after each epoch.
model = BertClassifier()
train(model, df_train, df_val, learning_rate=LR, epochs=EPOCHS)

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

100%|██████████| 259/259 [00:58<00:00,  4.46it/s]


Epochs: 1 | Train Loss:  0.559                 | Train Accuracy:  0.340                 | Val Loss:  0.554                 | Val Accuracy:  0.387


100%|██████████| 259/259 [00:57<00:00,  4.53it/s]


Epochs: 2 | Train Loss:  0.552                 | Train Accuracy:  0.309                 | Val Loss:  0.558                 | Val Accuracy:  0.279


100%|██████████| 259/259 [00:59<00:00,  4.35it/s]


Epochs: 3 | Train Loss:  0.550                 | Train Accuracy:  0.346                 | Val Loss:  0.554                 | Val Accuracy:  0.360


100%|██████████| 259/259 [01:00<00:00,  4.27it/s]


Epochs: 4 | Train Loss:  0.552                 | Train Accuracy:  0.315                 | Val Loss:  0.567                 | Val Accuracy:  0.315


100%|██████████| 259/259 [01:01<00:00,  4.22it/s]


Epochs: 5 | Train Loss:  0.547                 | Train Accuracy:  0.376                 | Val Loss:  0.564                 | Val Accuracy:  0.306


100%|██████████| 259/259 [01:02<00:00,  4.16it/s]


Epochs: 6 | Train Loss:  0.536                 | Train Accuracy:  0.421                 | Val Loss:  0.559                 | Val Accuracy:  0.315


100%|██████████| 259/259 [01:02<00:00,  4.14it/s]


Epochs: 7 | Train Loss:  0.532                 | Train Accuracy:  0.411                 | Val Loss:  0.553                 | Val Accuracy:  0.342


100%|██████████| 259/259 [01:02<00:00,  4.12it/s]


Epochs: 8 | Train Loss:  0.505                 | Train Accuracy:  0.463                 | Val Loss:  0.562                 | Val Accuracy:  0.369


100%|██████████| 259/259 [01:02<00:00,  4.11it/s]


Epochs: 9 | Train Loss:  0.487                 | Train Accuracy:  0.517                 | Val Loss:  0.561                 | Val Accuracy:  0.378


100%|██████████| 259/259 [01:02<00:00,  4.13it/s]


Epochs: 10 | Train Loss:  0.457                 | Train Accuracy:  0.571                 | Val Loss:  0.541                 | Val Accuracy:  0.432


100%|██████████| 259/259 [01:03<00:00,  4.10it/s]


Epochs: 11 | Train Loss:  0.423                 | Train Accuracy:  0.635                 | Val Loss:  0.587                 | Val Accuracy:  0.405


100%|██████████| 259/259 [01:02<00:00,  4.12it/s]


Epochs: 12 | Train Loss:  0.393                 | Train Accuracy:  0.656                 | Val Loss:  0.577                 | Val Accuracy:  0.414


100%|██████████| 259/259 [01:03<00:00,  4.10it/s]


Epochs: 13 | Train Loss:  0.368                 | Train Accuracy:  0.710                 | Val Loss:  0.571                 | Val Accuracy:  0.423


100%|██████████| 259/259 [01:02<00:00,  4.12it/s]


Epochs: 14 | Train Loss:  0.342                 | Train Accuracy:  0.757                 | Val Loss:  0.660                 | Val Accuracy:  0.360


100%|██████████| 259/259 [01:03<00:00,  4.11it/s]


Epochs: 15 | Train Loss:  0.299                 | Train Accuracy:  0.807                 | Val Loss:  0.631                 | Val Accuracy:  0.333


100%|██████████| 259/259 [01:03<00:00,  4.11it/s]


Epochs: 16 | Train Loss:  0.273                 | Train Accuracy:  0.846                 | Val Loss:  0.693                 | Val Accuracy:  0.351


100%|██████████| 259/259 [01:02<00:00,  4.11it/s]


Epochs: 17 | Train Loss:  0.239                 | Train Accuracy:  0.888                 | Val Loss:  0.697                 | Val Accuracy:  0.360


100%|██████████| 259/259 [01:02<00:00,  4.12it/s]


Epochs: 18 | Train Loss:  0.203                 | Train Accuracy:  0.907                 | Val Loss:  0.704                 | Val Accuracy:  0.297


100%|██████████| 259/259 [01:02<00:00,  4.13it/s]


Epochs: 19 | Train Loss:  0.181                 | Train Accuracy:  0.931                 | Val Loss:  0.753                 | Val Accuracy:  0.297


100%|██████████| 259/259 [01:02<00:00,  4.12it/s]


Epochs: 20 | Train Loss:  0.160                 | Train Accuracy:  0.948                 | Val Loss:  0.789                 | Val Accuracy:  0.297


In [None]:
# model evaluation
def evaluate(model, test_data):
    """Run ``model`` over ``test_data``, print classification metrics and return predictions.

    Args:
        model: Trained classifier called as ``model(input_id, mask)``.
        test_data: DataFrame with 'created_at', 'combined_text' and
            'HSI_OO_ter_0.005' columns. Any split may be passed, not only the
            test split.

    Returns:
        DataFrame with a 'Date' column (the rows' 'created_at' values) and a
        'Predicted' column (predicted class id), indexed like ``test_data``.
    """
    test = Dataset(test_data)

    # batch_size=1 lets each prediction be read with .item(); no shuffling, so
    # predictions stay aligned with the rows of test_data.
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=1, shuffle=False)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    pred_list = []
    test_label_list = []

    # Switch dropout off for inference, then restore whatever mode the caller had.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():

            for test_input, test_label in test_dataloader:

                # The tokenizer returns (1, seq_len) tensors per sample, so
                # batching adds a singleton dimension; drop it for both inputs.
                mask = test_input['attention_mask'].squeeze(1).to(device)
                input_id = test_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                pred_list.append(output.argmax(dim=1).item())
                test_label_list.append(test_label.item())
    finally:
        model.train(was_training)

    total_acc = sum(int(p == t) for p, t in zip(pred_list, test_label_list))

    # The labels are split-neutral because this function is called for the
    # training, validation and test frames alike.
    print(f'Accuracy: {total_acc / max(len(test_data), 1): .3f}')
    print(f"Accuracy (full precision): {accuracy_score(test_label_list, pred_list)}")
    print(f"Precision Score: {precision_score(test_label_list, pred_list, average=None)}")
    print(f"Confusion matrix: {confusion_matrix(test_label_list, pred_list)}")
    print(f"Classification report: {classification_report(test_label_list, pred_list, digits=3)}")

    # Take the dates from the frame that was actually evaluated rather than from
    # a notebook-level global.
    pred_df = pd.DataFrame({'Date': test_data['created_at'], 'Predicted': pred_list})

    return pred_df

# Score each split with the fine-tuned model and keep the per-row predictions.
print('Training:')
train_pred_df = evaluate(model, test_data=df_train)

print('Validation:')
val_pred_df = evaluate(model, test_data=df_val)

print('Testing:')
test_pred_df = evaluate(model, test_data=df_test)

Training
tensor([[-0.0942, -0.0149,  4.0481]], device='cuda:0')
tensor([[ 2.7098, -0.0441, -0.0923]], device='cuda:0')
tensor([[-0.0852,  0.8911,  4.3208]], device='cuda:0')
tensor([[-0.0030,  2.3203, -0.0438]], device='cuda:0')
tensor([[ 0.5125,  2.4187, -0.0953]], device='cuda:0')
tensor([[-0.0498,  1.0124,  3.4436]], device='cuda:0')
tensor([[ 1.8044, -0.0153, -0.0719]], device='cuda:0')
tensor([[-0.1139,  0.6032,  3.5794]], device='cuda:0')
tensor([[ 1.8376,  0.5498, -0.2319]], device='cuda:0')
tensor([[ 0.6845,  2.2764, -0.0656]], device='cuda:0')
tensor([[ 1.1936,  0.0444, -0.1367]], device='cuda:0')
tensor([[1.4778, 2.0949, 0.0471]], device='cuda:0')
tensor([[ 0.7253,  0.0392, -0.1552]], device='cuda:0')
tensor([[-0.0213,  2.2809, -0.0250]], device='cuda:0')
tensor([[-0.0242,  1.1329,  5.0040]], device='cuda:0')
tensor([[-0.0166,  0.5194,  3.8418]], device='cuda:0')
tensor([[ 1.4466, -0.0627, -0.1466]], device='cuda:0')
tensor([[-0.0515,  0.4797,  2.8141]], device='cuda:0')
tens

In [None]:
# Stack the predictions of all three splits and order them by date.
result = (
    pd.concat([train_pred_df, val_pred_df, test_pred_df])
    .sort_values(by='Date')
    .reset_index(drop=True)
)

# output result
result.to_csv(path + 'bert_stock_prediction_classification.csv', index=False)

In [None]:
# data transformation after output
df = pd.read_csv(path+'bert_stock_prediction_classification.csv')

# Recode class ids into signed signals: 0 (buy) -> 1, 1 (neutral) -> 0, 2 (sell) -> -1.
my_dict = {'0':'1', '1':'0', '2':'-1'}

# Map whole values in one vectorised step. The earlier character-by-character
# mapping kept only the first character of each value, so anything other than a
# single digit 0/1/2 was silently mis-coded.
recoded = df['Predicted'].astype(str).map(my_dict)
if recoded.isna().any():
    bad_values = sorted(df.loc[recoded.isna(), 'Predicted'].astype(str).unique())
    raise ValueError(f'Unexpected class ids in Predicted column: {bad_values}')
df['Predicted'] = recoded.astype(int)

df.to_csv(path + 'bert_stock_prediction_classification_updated.csv', index=False)