# **IMPORT LIBRARIES AND LOAD DATA**

In [None]:
# install packages
# transformers together with its PyTorch extra
!pip install transformers[torch]
# upgrade accelerate to the newest available release
!pip install --upgrade accelerate
# Hugging Face datasets library
!pip install datasets

In [None]:
# import libraries
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from dataclasses import dataclass
from torch.utils.data import Dataset
from torch.nn.functional import softmax
from google.colab import files, runtime
from sklearn.metrics import f1_score
from transformers import AutoModel, AutoTokenizer, AutoConfig, Trainer, TrainingArguments
from transformers.modeling_outputs import SequenceClassifierOutput
from typing import List, Dict, Any

In [None]:
# mount google drive at /content/drive (the dataset and result paths used in this notebook live under it)
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# locations of the training and test csv files on the mounted drive
file_path_train = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Dataset/train_data.csv'
file_path_test = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Dataset/test_data.csv'

# read both splits into dataframes
train_df, test_df = pd.read_csv(file_path_train), pd.read_csv(file_path_test)

In [None]:
# combine train and test dataset for rolling window
final_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

# format to date first, so the sort below orders real dates rather than strings
final_df['start_date'] = pd.to_datetime(final_df['start_date'], format='%Y-%m-%d')

# sort by permco and date
final_df = final_df.sort_values(by=['permco', 'start_date'])

# filter out no change label due to very small occurrence;
# take an explicit copy so that columns added to final_df afterwards are written
# to an independent frame instead of a slice of the unfiltered one
final_df = final_df[final_df['price_direction'] != 'no change'].copy()

In [None]:
# encode price direction as a binary label: positive -> 1, negative -> 0
label_mapping = dict(positive=1, negative=0)
final_df['label'] = final_df['price_direction'].map(label_mapping)

In [None]:
# define function to join the text columns of a row, separated by the [SEP] marker
def concatenate_columns(row):
    """Return headline, situation and eventtype of *row* joined by ' [SEP] '."""
    text_columns = ('headline', 'situation', 'eventtype')
    return " [SEP] ".join(f"{row[column]}" for column in text_columns)

# apply function to concatenate columns headline, situation, and eventtype
# (axis=1 calls the function once per row; the result is stored in a new 'combined_text' column)
final_df['combined_text'] = final_df.apply(concatenate_columns, axis=1)

# **DEFINE FUNCTION**

In [None]:
# use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# custom Dataset that tokenises news text for training (article groups) or prediction (single article)
class NewsDataset(Dataset):
    """Wrap a list of ``(item, label)`` pairs and tokenise ``item['combined_text']``.

    In training mode ``item['combined_text']`` is treated as a list of
    articles: it is cut to ``max_articles`` entries and padded with empty
    strings up to that count. In prediction mode it is treated as a single
    text and the leading dimension of the tokenizer output is squeezed away.
    """

    def __init__(self, data, tokenizer, max_length=512, max_articles=5, for_prediction=False):
        self.data = data
        self.tokenizer = tokenizer
        self.for_prediction = for_prediction
        self.max_articles = max_articles
        self.max_length = max_length

    def __len__(self):
        # number of (item, label) pairs held by the dataset
        return len(self.data)

    def _select_articles(self, combined_text):
        # prediction mode: wrap the single text in a list for the tokenizer
        if self.for_prediction:
            return [combined_text]

        # training mode: keep at most max_articles, then pad with empty strings
        selected = combined_text[:self.max_articles]
        missing = self.max_articles - len(selected)
        if missing > 0:
            selected += [''] * missing
        return selected

    def __getitem__(self, idx):
        record, target = self.data[idx]

        # tokenise every selected article to a fixed length
        encoding = self.tokenizer(
            self._select_articles(record['combined_text']),
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        token_ids = encoding['input_ids']
        mask = encoding['attention_mask']

        # a single article does not need the leading article dimension
        if self.for_prediction:
            token_ids = token_ids.squeeze(0)
            mask = mask.squeeze(0)

        return {
            'input_ids': token_ids,
            'attention_mask': mask,
            'labels': torch.tensor(target, dtype=torch.long)
        }

In [None]:
# custom data collator that turns a list of dataset items into one batch
@dataclass
class CustomDataCollator:
    """Stack per-item tensors into batch tensors keyed like the dataset items."""

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        token_ids, masks, targets = [], [], []
        for feature in features:
            token_ids.append(feature['input_ids'])
            masks.append(feature['attention_mask'])
            targets.append(feature['labels'])

        return {
            # new leading batch dimension over the per-item tensors
            'input_ids': torch.stack(token_ids),
            'attention_mask': torch.stack(masks),
            # one long-typed label per item
            'labels': torch.tensor(targets, dtype=torch.long),
        }

In [None]:
# define a custom model class for stock prediction using a transformer model
class TransformerForStockPrediction(nn.Module):
    """Transformer encoder followed by a linear classification head.

    ``forward`` accepts either a 2D ``input_ids`` tensor (one article per
    sample) or a 3D one ``(batch, num_articles, seq_len)``. In the 3D case the
    articles are encoded independently and their pooled embeddings are
    max-pooled across the article dimension before classification.

    Args:
        transformer_model: encoder module; called as
            ``transformer_model(input_ids, attention_mask=...)``.
        config: object providing ``hidden_size`` and ``num_labels``.
    """

    def __init__(self, transformer_model, config):
        super().__init__()
        # initialise the transformer model
        self.transformer = transformer_model
        # linear classifier mapping the pooled embedding to the number of labels
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    @staticmethod
    def _pool(outputs):
        """Return one embedding per sequence from the encoder outputs."""
        # the attribute can be absent or present but None, so check the value
        # rather than only the attribute's existence
        pooled = getattr(outputs, 'pooler_output', None)
        if pooled is None:
            # fall back to the hidden state of the first token
            pooled = outputs.last_hidden_state[:, 0]
        return pooled

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Encode the input, classify it and optionally compute the loss.

        Returns:
            SequenceClassifierOutput with ``loss`` (``None`` when ``labels`` is
            not given), ``logits`` and the encoder's ``hidden_states`` and
            ``attentions``.
        """
        # anything that is not 2D is treated as (batch, num_articles, seq_len)
        multiple_articles = input_ids.dim() != 2

        if multiple_articles:
            # flatten the article dimension into the batch dimension
            batch_size, num_articles, seq_len = input_ids.size()
            input_ids = input_ids.reshape(-1, seq_len)
            if attention_mask is not None:
                attention_mask = attention_mask.reshape(-1, seq_len)

        # pass input through transformer and get one embedding per sequence
        outputs = self.transformer(input_ids, attention_mask=attention_mask)
        pooled_output = self._pool(outputs)

        if multiple_articles:
            # restore the article dimension and max pool across articles
            # NOTE(review): articles padded as empty strings by NewsDataset also
            # take part in this max; confirm that is intended
            pooled_output = pooled_output.reshape(batch_size, num_articles, -1)
            pooled_output = pooled_output.max(dim=1).values

        # pass pooled output through classifier
        logits = self.classifier(pooled_output)

        # calculate loss if labels are provided
        loss = None
        if labels is not None:
            loss = nn.CrossEntropyLoss()(logits, labels)

        # return sequenceclassifieroutput object with relevant information
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [None]:
# define function to calculate the weighted f1 score of a prediction object
def compute_f1_metrics(pred):
    """Return ``{"f1": score}`` for the given prediction object.

    ``pred.predictions`` holds one row of class scores per sample and
    ``pred.label_ids`` the true labels; the predicted class is the index of
    the highest score in each row.
    """
    predicted_classes = pred.predictions.argmax(-1)
    weighted_f1 = f1_score(pred.label_ids, predicted_classes, average='weighted')
    return {"f1": weighted_f1}

In [None]:
# define function to split the dataframe into training, validation, and prediction parts
def split_data(df, train_range, val_range, predict_range):
    """Return three slices of *df* selected by inclusive ``start_date`` ranges.

    Each range is a ``(first, last)`` pair; rows whose ``start_date`` lies
    between the two bounds (both included) belong to the slice.
    """
    dates = df['start_date']

    def _rows_within(bounds):
        lower, upper = bounds[0], bounds[1]
        return df[(dates >= lower) & (dates <= upper)]

    return _rows_within(train_range), _rows_within(val_range), _rows_within(predict_range)

In [None]:
# define function to group data by week for training
def group_data(dataframe, for_prediction=False):
    """Convert *dataframe* into a list of ``(item, label)`` pairs.

    With ``for_prediction=True`` every row becomes its own pair: the row as a
    dict plus its ``label`` value. Otherwise rows are grouped by
    ``permco`` and ``start_date``; each group yields a dict with those two
    keys and the list of its ``combined_text`` values, paired with the
    ``label`` of the group's first row.
    """
    # prediction mode: one sample per row
    if for_prediction:
        samples = []
        for _, record in dataframe.iterrows():
            samples.append((record.to_dict(), record['label']))
        return samples

    # training mode: one sample per (permco, start_date) group
    return [
        (
            {
                'permco': company,
                'start_date': week,
                'combined_text': members['combined_text'].tolist(),
            },
            members['label'].iloc[0],
        )
        for (company, week), members in dataframe.groupby(['permco', 'start_date'])
    ]

In [None]:
# define function to train and predict in rolling window with default learning hyperparameters
def train_and_predict(df, model_name, batch_size=8, num_epochs=3, learning_rate=2e-5, warmup_steps=1000, weight_decay=0.01):
    """Fine-tune one model per company and rolling window, then predict.

    For every ``permco`` in *df* and every hard-coded rolling window, the
    training period is split so that its last two years form the validation
    set, a fresh model is fine-tuned with the Hugging Face ``Trainer``, and
    each year of the window's prediction period is scored row by row.
    Windows without training or validation rows, and prediction years
    without rows, are skipped with a message.

    Args:
        df: dataframe with ``permco``, ``start_date``, ``combined_text`` and
            ``label`` columns.
        model_name: model identifier passed to the ``Auto*`` loaders.
        batch_size: per-device batch size for training and evaluation.
        num_epochs: number of training epochs.
        learning_rate: optimiser learning rate.
        warmup_steps: number of learning-rate warm-up steps.
        weight_decay: weight decay passed to the trainer.

    Returns:
        ``(df_accuracy, df_prediction)``: a dataframe with one accuracy row
        per company, training period and test year, and a dataframe with one
        row per predicted sample.
    """
    import gc
    import inspect

    # initialise lists to store results
    results_accuracy = []
    results_prediction = []

    # initialise tokeniser and config
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    config = AutoConfig.from_pretrained(model_name, num_labels=2)

    # mixed precision is only requested when a CUDA device is present, so the
    # function also runs in the CPU fallback case
    use_fp16 = torch.cuda.is_available()

    # recent transformers releases renamed `evaluation_strategy` to
    # `eval_strategy`; use whichever keyword the installed version accepts
    accepted_args = inspect.signature(TrainingArguments.__init__).parameters
    eval_strategy_key = 'eval_strategy' if 'eval_strategy' in accepted_args else 'evaluation_strategy'

    # get unique permco
    unique_permcos = df['permco'].unique()

    # define rolling windows period for training and prediction
    windows = [
        {'train': ('2005-01-01', '2015-12-31'), 'predict': [('2016-01-01', '2016-12-31'), ('2017-01-01', '2017-12-31')]},
        {'train': ('2007-01-01', '2017-12-31'), 'predict': [('2018-01-01', '2018-12-31'), ('2019-01-01', '2019-12-31')]},
        {'train': ('2009-01-01', '2019-12-31'), 'predict': [('2020-01-01', '2020-12-31'), ('2021-01-01', '2021-12-31')]},
        {'train': ('2011-01-01', '2021-12-31'), 'predict': [('2022-01-01', '2022-12-31'), ('2023-01-01', '2023-12-31')]},
    ]

    # iterate over each permco
    for permco in unique_permcos:
        print(f"Processing Permco {permco}")

        # filter dataframe for current company
        permco_df = df[df['permco'] == permco]

        # iterate over each time window
        for i, window in enumerate(windows):
            print(f"Processing Window {i + 1} for Permco {permco}")

            train_start, train_end = window['train']

            # split the data for training and validation:
            # the last two years of the training period are held out for validation
            train_df = permco_df[(permco_df['start_date'] >= train_start) & (permco_df['start_date'] <= train_end)]
            val_df = train_df[(train_df['start_date'] >= pd.to_datetime(train_end) - pd.DateOffset(years=2))]
            train_df = train_df[~train_df.index.isin(val_df.index)]

            # nothing to fit or validate on for this company in this window
            if train_df.empty or val_df.empty:
                print(f"Skipping Window {i + 1} for Permco {permco}: no training or validation data")
                continue

            # group the data by week
            train_data = group_data(train_df)
            val_data = group_data(val_df)

            # create dataset object
            train_dataset = NewsDataset(train_data, tokenizer, max_length=512, max_articles=5)
            val_dataset = NewsDataset(val_data, tokenizer, max_length=512, max_articles=5)

            # initialise the model for each company and window
            transformer_model = AutoModel.from_pretrained(model_name).to(device)
            model = TransformerForStockPrediction(transformer_model, config).to(device)

            # define training arguments
            training_args = TrainingArguments(
                output_dir=f'./results_permco_{permco}_window_{i+1}',
                num_train_epochs=num_epochs,
                per_device_train_batch_size=batch_size,
                per_device_eval_batch_size=batch_size,
                warmup_steps=warmup_steps,
                weight_decay=weight_decay,
                learning_rate=learning_rate,
                logging_dir=f'./logs_permco_{permco}_window_{i+1}',
                logging_steps=10,
                save_strategy="epoch",
                save_total_limit=1,
                load_best_model_at_end=True,
                fp16=use_fp16,
                gradient_accumulation_steps=1,
                eval_accumulation_steps=10,
                **{eval_strategy_key: "epoch"},
            )

            # initialise the trainer
            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=val_dataset,
                compute_metrics=compute_f1_metrics,
                data_collator=CustomDataCollator()
            )

            # train the model
            trainer.train()

            # predict for each year in the prediction window
            for year_start, year_end in window['predict']:
                predict_df = permco_df[(permco_df['start_date'] >= year_start) & (permco_df['start_date'] <= year_end)]

                # no rows to score for this year
                if predict_df.empty:
                    print(f"Skipping prediction {year_start[:4]} for Permco {permco}: no data")
                    continue

                predict_data = group_data(predict_df, for_prediction=True)
                predict_dataset = NewsDataset(predict_data, tokenizer, max_length=512, max_articles=1, for_prediction=True)

                # make predictions using the trained model
                predictions = trainer.predict(predict_dataset)
                # extract the raw logits from the predictions
                logits = predictions.predictions
                # convert logits to probabilities using softmax function
                probabilities = torch.softmax(torch.tensor(logits), dim=-1).cpu().numpy()
                # get the predicted class by taking the argmax of probabilities
                preds = np.argmax(probabilities, axis=1)
                # extract the actual labels from the predict_data
                actuals = np.array([item[1] for item in predict_data])

                # calculate accuracy
                accuracy = np.mean(preds == actuals)

                # store accuracy results
                results_accuracy.append({
                    'company': permco,
                    'train_period': f"{train_start[:4]}-{train_end[:4]}",
                    'test_period': f"{year_start[:4]}",
                    'test_accuracy': accuracy,
                })

                # store prediction results; predict_data was built row by row from
                # predict_df, so all four sequences share the same order
                for week_date, probs, pred, actual in zip(predict_df['start_date'], probabilities, preds, actuals):
                    results_prediction.append({
                        'company': permco,
                        'week_date': week_date,
                        'probability_neg': probs[0],
                        'probability_pos': probs[1],
                        'prediction': pred,
                        'actual': actual
                    })

            # release this window's model before clearing the GPU cache: the cache
            # can only hand back memory that is no longer referenced
            del trainer, model, transformer_model, train_dataset, val_dataset
            gc.collect()
            torch.cuda.empty_cache()

    # convert results to dataframes
    df_accuracy = pd.DataFrame(results_accuracy)
    df_prediction = pd.DataFrame(results_prediction)

    return df_accuracy, df_prediction

# **FINE-TUNING & PREDICTION**

## **BERT**

In [None]:
# fine-tuning and predict in rolling window
bert_accuracy, bert_prediction = train_and_predict(final_df, 'bert-base-uncased', batch_size=16, num_epochs=3, learning_rate=5e-05, warmup_steps=500, weight_decay=0.001)

In [None]:
# convert to dataframe
bert_accuracy_df = pd.DataFrame(bert_accuracy)
bert_prediction_df = pd.DataFrame(bert_prediction)

In [None]:
# define path to save results
path_bert1 = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/bert_rolling_finetuned_accuracy.csv'
path_bert2 = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/bert_rolling_finetuned_prediction.csv'
# save to csv
bert_accuracy_df.to_csv(path_bert1, index=False)
bert_prediction_df.to_csv(path_bert2, index=False)

In [None]:
# disconnect run time
runtime.unassign()

## **RoBERTa**

In [None]:
# fine-tuning and predict in rolling window
roberta_accuracy, roberta_prediction = train_and_predict(final_df, 'roberta-base', batch_size=8, num_epochs=3, learning_rate=2e-05, warmup_steps=500, weight_decay=0.0001)

In [None]:
# convert to dataframe
roberta_accuracy_df = pd.DataFrame(roberta_accuracy)
roberta_prediction_df = pd.DataFrame(roberta_prediction)

In [None]:
# define path to save results
path_roberta1 = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/roberta_rolling_finetuned_accuracy.csv'
path_roberta2 = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/roberta_rolling_finetuned_prediction.csv'
# save to csv
roberta_accuracy_df.to_csv(path_roberta1, index=False)
roberta_prediction_df.to_csv(path_roberta2, index=False)

In [None]:
# disconnect run time
runtime.unassign()

## **DistilBERT**

In [None]:
# fine-tuning and predict in rolling window
distilbert_accuracy, distilbert_prediction = train_and_predict(final_df, 'distilbert-base-uncased', batch_size=8, num_epochs=3, learning_rate=2e-05, warmup_steps=500, weight_decay=0.0001)

In [None]:
# convert to dataframe
distilbert_accuracy_df = pd.DataFrame(distilbert_accuracy)
distilbert_prediction_df = pd.DataFrame(distilbert_prediction)

In [None]:
# define path to save results
path_distilbert1 = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/distilbert_rolling_finetuned_accuracy.csv'
path_distilbert2 = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/distilbert_rolling_finetuned_prediction.csv'
# save to csv
distilbert_accuracy_df.to_csv(path_distilbert1, index=False)
distilbert_prediction_df.to_csv(path_distilbert2, index=False)

In [None]:
# disconnect run time
runtime.unassign()

## **DistilRoBERTa**

In [None]:
# fine-tuning and predict in rolling window
distilroberta_accuracy, distilroberta_prediction = train_and_predict(final_df, 'distilroberta-base', batch_size=8, num_epochs=3, learning_rate=2e-05, warmup_steps=500, weight_decay=0.0001)

In [None]:
# convert to dataframe
distilroberta_accuracy_df = pd.DataFrame(distilroberta_accuracy)
distilroberta_prediction_df = pd.DataFrame(distilroberta_prediction)

In [None]:
# define path to save results
path_distilroberta1 = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/distilroberta_rolling_finetuned_accuracy.csv'
path_distilroberta2 = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/distilroberta_rolling_finetuned_prediction.csv'
# save to csv
distilroberta_accuracy_df.to_csv(path_distilroberta1, index=False)
distilroberta_prediction_df.to_csv(path_distilroberta2, index=False)

In [None]:
# disconnect run time
from google.colab import runtime

## **FinBERT**

In [None]:
# fine-tuning and predict in rolling window
finbert_accuracy, finbert_prediction = train_and_predict(final_df, 'yiyanghkust/finbert-tone', batch_size=16, num_epochs=3, learning_rate=5e-05, warmup_steps=1000, weight_decay=0.0001)

In [None]:
# convert to dataframe
finbert_accuracy_df = pd.DataFrame(finbert_accuracy)
finbert_prediction_df = pd.DataFrame(finbert_prediction)

In [None]:
# define path to save results
path_finbert1 = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/finbert_rolling_finetuned_accuracy.csv'
path_finbert2 = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Results/Rolling Window Prediction/finbert_rolling_finetuned_prediction.csv'
# save to csv
finbert_accuracy_df.to_csv(path_finbert1, index=False)
finbert_prediction_df.to_csv(path_finbert2, index=False)

In [None]:
# disconnect run time
from google.colab import runtime