# **IMPORT LIBRARIES AND LOAD DATA**

In [None]:
# import libraries
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset
from google.colab import files, runtime
from transformers import AutoModel, AutoTokenizer

In [None]:
# mount Google Drive at /content/drive (the dataset and output paths used in this notebook live under that mount point)
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# locations of the training and test CSV files on the mounted Google Drive
file_path_train = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Dataset/train_data.csv'
file_path_test = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Dataset/test_data.csv'

# read both splits into dataframes
train_df = pd.read_csv(file_path_train)
test_df = pd.read_csv(file_path_test)

# **PREPARE DATASET**

In [None]:
# define function to concatenate headline, situation, and eventtype with a separator token between the columns
def concatenate_columns(row, sep_token='[SEP]'):
    """Join the headline, situation and eventtype fields of a row into one string.

    Args:
        row: mapping-like object (for example a dataframe row) that can be
            indexed by 'headline', 'situation' and 'eventtype'.
        sep_token: separator text placed between the fields. Defaults to
            '[SEP]'; pass the tokenizer's own separator (for example
            ``tokenizer.sep_token``) for models that use a different one.

    Returns:
        The three fields formatted as text and joined by ``" <sep_token> "``.
    """
    fields = (row['headline'], row['situation'], row['eventtype'])
    # format each field the same way an f-string would, so non-string values
    # (numbers, missing values) are still rendered as text
    return f" {sep_token} ".join(f"{field}" for field in fields)

# apply the function row by row (axis=1) to build the combined_text column for both splits
train_df['combined_text'] = train_df.apply(concatenate_columns, axis=1)
test_df['combined_text'] = test_df.apply(concatenate_columns, axis=1)

In [None]:
# filter out rows with the 'no change' label due to the very small number of occurrences
# .copy() makes each filtered frame independent of the frame it was sliced from,
# so the column assignments made later in this notebook modify it directly
# (avoids pandas' SettingWithCopyWarning)
train_df = train_df[train_df['price_direction'] != 'no change'].copy()
test_df = test_df[test_df['price_direction'] != 'no change'].copy()

In [None]:
# parse the start_date column into pandas datetime values for both splits
train_df['start_date'] = pd.to_datetime(train_df['start_date'])
test_df['start_date'] = pd.to_datetime(test_df['start_date'])

In [None]:
# encode price direction as a binary label (positive -> 1, negative -> 0)
# NOTE(review): a price_direction value that is not a key of label_mapping would
# presumably end up as NaN in 'label' - confirm no other values remain
label_mapping = {'positive': 1, 'negative': 0}
train_df['label'] = train_df['price_direction'].map(label_mapping)
test_df['label'] = test_df['price_direction'].map(label_mapping)

# **DEFINE FUNCTIONS**

In [None]:
# pick the compute device: the GPU when CUDA is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# define function to load the model and tokenizer
def load_model_and_tokenizer(model_name):
    """Load a pretrained model and its tokenizer by name.

    Args:
        model_name: identifier passed to ``from_pretrained`` for both the
            model and the tokenizer.

    Returns:
        A ``(model, tokenizer)`` tuple; the model has been moved to the
        module-level ``device``.
    """
    pretrained_model = AutoModel.from_pretrained(model_name)
    pretrained_tokenizer = AutoTokenizer.from_pretrained(model_name)

    # place the model's weights on the selected device before handing it back
    pretrained_model = pretrained_model.to(device)
    return pretrained_model, pretrained_tokenizer

# define function to generate embeddings
def get_embeddings(texts, model, tokenizer, max_length=512):
    """Embed each text as the mean of its token vectors from the model's last layer.

    Each text is tokenised and encoded on its own. Only positions flagged by
    the attention mask contribute to the mean, so padding tokens cannot
    dilute the embedding.

    Args:
        texts: iterable of strings to embed.
        model: encoder whose output exposes ``last_hidden_state``.
        tokenizer: callable that returns tensors including ``attention_mask``.
        max_length: maximum number of tokens kept per text; longer inputs
            are truncated.

    Returns:
        A list with one CPU tensor per text, in input order. Each tensor keeps
        the leading batch dimension of size one (shape presumably
        (1, hidden_size) - confirm against the model used).
    """
    # run the inputs on whatever device the model's weights live on
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    embeddings_list = []

    for text in texts:
        # tokenise one text with truncation; a single sequence needs no padding,
        # so padding to the longest sequence avoids encoding hundreds of pad tokens
        encodings = tokenizer(
            text,
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors='pt'
        )

        # move tokenised inputs to the model's device (GPU or CPU)
        encodings = {k: v.to(model_device) for k, v in encodings.items()}

        with torch.no_grad():
            # pass the tokenised inputs through the model to get outputs
            outputs = model(**encodings)
            hidden_states = outputs.last_hidden_state

            # masked mean pooling: zero out positions the attention mask marks
            # as padding and divide by the number of real tokens
            mask = encodings['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
            embeddings = ((hidden_states * mask).sum(dim=1) / token_counts).cpu()

        embeddings_list.append(embeddings)

    # release cached GPU memory once at the end; doing it after every text
    # forces a costly synchronisation on each iteration
    if model_device.type == 'cuda':
        torch.cuda.empty_cache()

    return embeddings_list

# define function to max pool the embeddings for each week
def max_pooling(embeddings_list):
    """Return the element-wise maximum over a list of equally-shaped tensors.

    Args:
        embeddings_list: non-empty list of tensors that all share one shape.

    Returns:
        A tensor with the same shape as a single input, holding at every
        position the largest value found across the list.
    """
    # stacking adds a new leading axis with one entry per embedding;
    # reducing over that axis keeps the per-position maximum
    return torch.stack(embeddings_list).max(dim=0).values

def aggregate_weekly_data(group):
    """Collapse all rows of one group into a single summary row.

    Args:
        group: dataframe holding the rows of one group. It must provide the
            columns 'permco', 'start_date', 'ticker', 'weekly_ret', 'adj_prc',
            'price_direction' and 'embeddings' (one tensor per row).

    Returns:
        A ``pd.Series`` with the representative metadata values followed by
        one ``embedding_<i>`` entry per dimension of the max-pooled embedding.

    Raises:
        KeyError: if the group has no 'permco' column.
        AssertionError: if the pooled embedding is not one-dimensional after
            squeezing.
    """
    # the company identifier must be available on the group
    if 'permco' not in group.columns:
        raise KeyError("permco is missing from the group")

    # one representative value per metadata column
    weekly_row = dict(
        start_date=group['start_date'].iloc[0],            # first start_date of the group
        ticker=group['ticker'].iloc[-1],                   # ticker from the last row
        weekly_ret=group['weekly_ret'].mean(),             # average weekly return
        adj_prc=group['adj_prc'].mean(),                   # average adjusted price
        price_direction=group['price_direction'].iloc[0],  # first price direction value
        permco=group['permco'].iloc[0],                    # company identifier
    )

    # element-wise maximum over every embedding in the group
    pooled = max_pooling(group['embeddings'].tolist())

    # drop singleton dimensions so the result is a flat vector
    if pooled.dim() > 1:
        pooled = pooled.squeeze()

    assert pooled.ndim == 1, f"Expected 1D tensor, got {pooled.ndim}D tensor."

    # spread the vector over one column per dimension
    weekly_row.update(
        (f'embedding_{idx}', component)
        for idx, component in enumerate(pooled.numpy())
    )

    return pd.Series(weekly_row)

# define function to generate embeddings for each row and aggregate weekly
def add_embeddings_to_dataframe(df, model_name):
    """Embed every row's combined_text and aggregate the rows per week and company.

    Args:
        df: dataframe with a 'combined_text' column plus the columns read by
            ``aggregate_weekly_data``. It is not modified; a copy is used.
        model_name: identifier of the pretrained model and tokenizer to load.

    Returns:
        A dataframe with one row per ('start_date', 'permco') group, in
        sorted group-key order, holding the aggregated metadata and one
        ``embedding_<i>`` column per embedding dimension.
    """
    # load the specified model and tokeniser
    model, tokenizer = load_model_and_tokenizer(model_name)

    # generate embeddings for the texts in the dataframe
    embeddings_list = get_embeddings(df['combined_text'].tolist(), model, tokenizer)

    # add embeddings to a copy so the caller's dataframe is left untouched
    df = df.copy()
    df['embeddings'] = embeddings_list

    # aggregate the rows on a weekly basis
    # iterate over the groups explicitly instead of using groupby.apply:
    # apply passing the grouping columns to the function is deprecated since
    # pandas 2.2, and without them aggregate_weekly_data cannot read
    # 'start_date' or 'permco'; iteration always yields the full sub-frame
    weekly_rows = [
        aggregate_weekly_data(group)
        for _, group in df.groupby(['start_date', 'permco'])
    ]
    weekly_aggregated_df = pd.DataFrame(weekly_rows).reset_index(drop=True)

    return weekly_aggregated_df


# **GENERATE EMBEDDINGS**

## **BERT**

In [None]:
# generate embeddings for train_df
# (loads 'bert-base-uncased', embeds every combined_text row, then pools the rows per start_date/permco group)
final_train_df = add_embeddings_to_dataframe(train_df, 'bert-base-uncased')

In [None]:
# generate embeddings for test_df
# (the model and tokenizer are loaded again inside this call)
final_test_df = add_embeddings_to_dataframe(test_df, 'bert-base-uncased')

In [None]:
# define the output file paths for the BERT embeddings
train_path = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Embedding/Pre-Trained/bert_train_pretrained.csv'
test_path = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Embedding/Pre-Trained/bert_test_pretrained.csv'

# save to csv without the dataframe index
final_train_df.to_csv(train_path, index=False)
final_test_df.to_csv(test_path, index=False)

In [None]:
# disconnect runtime
# NOTE(review): this presumably ends the Colab session and discards all in-memory
# state, so the setup cells would need re-running before another model section - confirm
runtime.unassign()

## **RoBERTa**

In [None]:
# generate embeddings for train_df
# (loads 'roberta-base', embeds every combined_text row, then pools the rows per start_date/permco group)
# NOTE(review): combined_text is built with the literal '[SEP]' marker; confirm
# that this is the separator the roberta-base tokenizer expects
final_train_df = add_embeddings_to_dataframe(train_df, 'roberta-base')

In [None]:
# generate embeddings for test_df
# (the model and tokenizer are loaded again inside this call)
final_test_df = add_embeddings_to_dataframe(test_df, 'roberta-base')

In [None]:
# define the output file paths for the RoBERTa embeddings
train_path = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Embedding/Pre-Trained/roberta_train_pretrained.csv'
test_path = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Embedding/Pre-Trained/roberta_test_pretrained.csv'

# save to csv without the dataframe index
final_train_df.to_csv(train_path, index=False)
final_test_df.to_csv(test_path, index=False)

In [None]:
# disconnect runtime
# NOTE(review): this presumably ends the Colab session and discards all in-memory
# state, so the setup cells would need re-running before another model section - confirm
runtime.unassign()

## **DistilBERT**

In [None]:
# generate embeddings for train_df
# (loads 'distilbert-base-uncased', embeds every combined_text row, then pools the rows per start_date/permco group)
final_train_df = add_embeddings_to_dataframe(train_df, 'distilbert-base-uncased')

In [None]:
# generate embeddings for test_df
# (the model and tokenizer are loaded again inside this call)
final_test_df = add_embeddings_to_dataframe(test_df, 'distilbert-base-uncased')

In [None]:
# define the output file paths for the DistilBERT embeddings
train_path = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Embedding/Pre-Trained/distilbert_train_pretrained.csv'
test_path = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Embedding/Pre-Trained/distilbert_test_pretrained.csv'

# save to csv without the dataframe index
final_train_df.to_csv(train_path, index=False)
final_test_df.to_csv(test_path, index=False)

In [None]:
# disconnect runtime
# NOTE(review): this presumably ends the Colab session and discards all in-memory
# state, so the setup cells would need re-running before another model section - confirm
runtime.unassign()

## **DistilRoBERTa**

In [None]:
# generate embeddings for train_df
# (loads 'distilroberta-base', embeds every combined_text row, then pools the rows per start_date/permco group)
# NOTE(review): combined_text is built with the literal '[SEP]' marker; confirm
# that this is the separator the distilroberta-base tokenizer expects
final_train_df = add_embeddings_to_dataframe(train_df, 'distilroberta-base')

In [None]:
# generate embeddings for test_df
# (the model and tokenizer are loaded again inside this call)
final_test_df = add_embeddings_to_dataframe(test_df, 'distilroberta-base')

In [None]:
# define the output file paths for the DistilRoBERTa embeddings
train_path = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Embedding/Pre-Trained/distilroberta_train_pretrained.csv'
test_path = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Embedding/Pre-Trained/distilroberta_test_pretrained.csv'

# save to csv without the dataframe index
final_train_df.to_csv(train_path, index=False)
final_test_df.to_csv(test_path, index=False)

In [None]:
# disconnect runtime
# NOTE(review): this presumably ends the Colab session and discards all in-memory
# state, so the setup cells would need re-running before another model section - confirm
runtime.unassign()

## **FinBERT**

In [None]:
# generate embeddings for train_df
# (loads 'yiyanghkust/finbert-tone', embeds every combined_text row, then pools the rows per start_date/permco group)
# NOTE(review): the checkpoint name suggests a tone-classification model; it is loaded
# through AutoModel here, presumably without its classification head - confirm this is intended
final_train_df = add_embeddings_to_dataframe(train_df, 'yiyanghkust/finbert-tone')

In [None]:
# generate embeddings for test_df
# (the model and tokenizer are loaded again inside this call)
final_test_df = add_embeddings_to_dataframe(test_df, 'yiyanghkust/finbert-tone')

In [None]:
# define the output file paths for the FinBERT embeddings
train_path = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Embedding/Pre-Trained/finbert_train_pretrained.csv'
test_path = '/content/drive/MyDrive/PostGrad/5. Extended Research Projects/Embedding/Pre-Trained/finbert_test_pretrained.csv'

# save to csv without the dataframe index
final_train_df.to_csv(train_path, index=False)
final_test_df.to_csv(test_path, index=False)

In [None]:
# disconnect runtime
# NOTE(review): this presumably ends the Colab session and discards all in-memory
# state - confirm
runtime.unassign()