# spaCy transformer model for sentiment analysis

In [None]:
# Upgrade pip and setuptools, then install the CUDA 12.x build of CuPy (cupy-cuda12x)
!python -m pip install -U setuptools pip
!pip install cupy-cuda12x

In [None]:
# Install or upgrade spaCy, pandas, tqdm and spacy-transformers (-qq keeps pip output quiet)
!pip install -Uqq spacy pandas tqdm
!pip install -Uqq spacy-transformers

In [None]:
# Download the pre-trained spaCy transformer pipeline (en_core_web_trf),
# which is loaded later in this notebook with spacy.load
!python -m spacy download en_core_web_trf

In [None]:
from datetime import datetime
import pandas as pd
import spacy
from spacy.tokens import DocBin  # to store docs in binary format
import spacy_transformers
from tqdm.notebook import tqdm

In [None]:
# Mount Google Drive at /content/gdrive (needs the Google Colab runtime)
from google.colab import drive
drive.mount("/content/gdrive")

In [None]:
# Customise the path after "My Drive/" as needed
# Relative paths used later (./datasets, ./base_config.cfg, ./output_updated) resolve against this directory
%cd /content/gdrive/My Drive/Colab Notebooks/stock-sentiment-analysis

In [None]:
# Get read-out of GPU specs
!nvidia-smi

In [None]:
# Get short read-out of free GPU memory (CSV format)
!nvidia-smi --query-gpu=memory.free --format=csv

## Initialise spaCy transformer model

In [None]:
# Load the pre-trained transformer pipeline; get_docs() below relies on this global `nlp`
# NOTE(review): spacy.prefer_gpu() is never called in this notebook, so this in-process
# pipeline presumably runs on CPU - confirm whether that is intended
nlp = spacy.load("en_core_web_trf")

## Load dataset(s)

For datasets that have already been binary encoded into train.spacy and valid.spacy files,  
continue on to the "Model configuration" section.

In [None]:
# SEntFiN financial news headlines (training set)
# Later cells read the "text" and "label" columns of this frame
train = pd.read_csv("./datasets/SEntFiN_train.csv", encoding="utf-8")
# Preview the first rows
train.head()

In [None]:
# SEntFiN financial news headlines (validation set)
# Later cells read the "text" and "label" columns of this frame
valid = pd.read_csv("./datasets/SEntFiN_valid.csv", encoding="utf-8")
# Preview the first rows
valid.head()

In [None]:
# (rows, columns) of the training set
train.shape

In [None]:
# (rows, columns) of the validation set
valid.shape

In [None]:
# Class balance: number of training rows per label value
train["label"].value_counts()

In [None]:
# Class balance: number of validation rows per label value
valid["label"].value_counts()

## Shuffle and prepare datasets

In [None]:
# Shuffle both datasets with the same fixed seed and rebuild a clean 0..n-1 index
train, valid = (
    frame.sample(frac=1, random_state=42).reset_index(drop=True)
    for frame in (train, valid)
)
train.shape, valid.shape

In [None]:
# The index was already reset (drop=True) when the data was shuffled, and the
# result of a bare reset_index() call would be discarded anyway, so only the
# shape is checked here
train.shape

In [None]:
# Preview the first rows of the shuffled training set
train.head()

In [None]:
# Create (text, label) tuples for dataset
# zip() pairs the two columns directly, which avoids a slow row-wise
# DataFrame.apply and a throwaway "tuples" column.
# NOTE: `train` and `valid` are rebound from DataFrames to lists here, so this
# cell cannot be re-run without reloading the CSVs first.
# For training set
train = list(zip(train["text"], train["label"]))
# For validation set
valid = list(zip(valid["text"], valid["label"]))
# Show the first (text, label) pair as a sanity check
train[0]

## One-hot encoding for spaCy pipeline

In [None]:
# User function for converting train and validation datasets into spaCy documents
def get_docs(data: list[tuple[str, int]]) -> list[spacy.tokens.Doc]:
    """
    Processes (text, sentiment) tuples via spaCy NLP pipeline,
    with one-hot encoding for sentiment classification

    Labels are encoded as: 1 -> "positive", 0 -> "negative",
    any other value -> "neutral". Unexpected label values are therefore
    silently treated as neutral.

    Parameters
    ----------
    data : list[tuple[str, int]]
        (text, label) tuples to be processed, as passed to
        ``nlp.pipe(..., as_tuples=True)``

    Returns
    -------
    docs_list : list[spacy.tokens.Doc]
        List of spaCy tokens.Doc objects converted from data, each with
        ``doc.cats`` holding the one-hot encoded sentiment
    """
    # Category names in the order they are written into doc.cats
    categories = ("positive", "negative", "neutral")
    # A progress-bar total is only available for sized inputs such as lists;
    # other iterables fall back to an open-ended bar
    total = len(data) if hasattr(data, "__len__") else None
    # Initialise list for documents
    docs_list = []
    # Iterate through dataset; as_tuples=True yields (doc, label) pairs
    for doc, label in tqdm(
        nlp.pipe(data, as_tuples=True),
        total=total,
        desc="Converting to spaCy Docs",
    ):
        if label == 1:
            active = "positive"
        elif label == 0:
            active = "negative"
        else:
            active = "neutral"
        # One-hot encode: 1 for the active category, 0 for the others
        for category in categories:
            doc.cats[category] = int(category == active)
        # Add doc to docs_list list
        docs_list.append(doc)

    return docs_list

## Convert train and validation sets to binary .spacy docs

In [None]:
# Time the conversion and serialisation of the training set
start_time = datetime.now()

# Get docs object for training set
train_docs = get_docs(train)
# Collect the docs in a spaCy DocBin container for binary serialisation
doc_bin = DocBin(docs = train_docs)
# Save binary document as train.spacy (relative to the current working directory)
doc_bin.to_disk("train.spacy")

end_time = datetime.now()
print(f"Duration: {end_time - start_time}")

In [None]:
# Time the conversion and serialisation of the validation set
start_time = datetime.now()

# Get docs object for validation set
valid_docs = get_docs(valid)
# Collect the docs in a spaCy DocBin container for binary serialisation
doc_bin = DocBin(docs = valid_docs)
# Save binary document as valid.spacy (relative to the current working directory)
doc_bin.to_disk("valid.spacy")

end_time = datetime.now()
print(f"Duration: {end_time - start_time}")

## Model configuration

In [None]:
# Convert base_config.cfg file to full config.cfg
# Reads ./base_config.cfg and writes ./config.cfg, which the training cell below consumes
!python -m spacy init fill-config ./base_config.cfg ./config.cfg

## Model training

In [None]:
# Time the whole training run
start_time = datetime.now()

# Enable GPU and set model output to folder output_updated
!python -m spacy train config.cfg --verbose --gpu-id 0 --output ./output_updated
# For CPU only
# !python -m spacy train config.cfg --verbose --output ./output_updated

end_time = datetime.now()
print(f"Duration: {end_time - start_time}")

## Model testing

### Load test dataset

In [None]:
# Load test data (explicit UTF-8, matching how the train/validation CSVs are read)
df = pd.read_csv("./datasets/SEntFiN_test.csv", encoding="utf-8")
# Number of test rows
len(df)

In [None]:
# Preview the first rows of the test set
df.head()

In [None]:
# Class balance: number of test rows per label value
df["label"].value_counts()

In [None]:
# Separate headline data (X, the "text" column) and ground truth labels (y, the "label" column)
X, y = df["text"], df["label"]
# Both shapes should report the same number of rows
X.shape, y.shape

In [None]:
# Convert both Series to plain Python lists for the prediction loop and the metrics
X = X.to_list()
y = y.to_list()

In [None]:
# Preview the first five headlines
X[:5]

In [None]:
# Preview the first five ground truth labels
y[:5]

### Load spaCy model to test

#### Model (XX)

Update "XX" with relevant model number to load and test

In [None]:
# Load the trained pipeline stored at ./model-best-XX - update "XX" with model number
nlp = spacy.load("./model-best-XX")
# Run every test headline through the pipeline, keeping only its category scores
preds_XX = [
    nlp(text).cats
    for text in tqdm(X, desc="Testing on model-best-XX")
]

In [None]:
# Inspect the category scores predicted for the first headline
# Update "XX" with model number
preds_XX[0]

In [None]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
)


def _as_category(label):
    """Map a ground truth label onto the category names stored in doc.cats."""
    # Already a category name (e.g. "positive"): nothing to translate
    if isinstance(label, str):
        return label
    # Numeric labels follow the encoding used by get_docs():
    # 1 -> positive, 0 -> negative, anything else -> neutral
    if label == 1:
        return "positive"
    if label == 0:
        return "negative"
    return "neutral"


# Minimum score the top category must exceed to count as a prediction
threshold = 0.5

# Ground truth expressed in the same label space as the predictions; comparing
# numeric labels against category-name strings would never match
y_true = [_as_category(label) for label in y]

predicted_labels = []
# Iterate through list of prediction dictionaries
for pred_dict in tqdm(preds_XX):
    # Get category with highest probability (None if the model returned no categories)
    max_prob_category = max(pred_dict, key=pred_dict.get, default=None)
    if max_prob_category is not None and pred_dict[max_prob_category] > threshold:
        predicted_labels.append(max_prob_category)
    else:
        # Handle cases where no category exceeds the threshold
        predicted_labels.append("unknown")

# Calculate metrics
accuracy_XX = accuracy_score(y_true, predicted_labels)
precision_XX = precision_score(y_true, predicted_labels, average="weighted")
recall_XX = recall_score(y_true, predicted_labels, average="weighted")

# Create confusion matrix; the label order is made explicit so rows/columns can be read
labels_XX = sorted(set(y_true) | set(predicted_labels))
confusion_matrix_XX = confusion_matrix(y_true, predicted_labels, labels=labels_XX)

print(f"Accuracy (thresholded): {accuracy_XX:.4f}")
print(f"Precision (weighted): {precision_XX:.4f}")
print(f"Recall (weighted): {recall_XX:.4f}")
print("Labels (row/column order):", labels_XX)
print("\nConfusion Matrix:\n", confusion_matrix_XX)