<div style="text-align: center; font-weight: bold; font-size: 40px; padding: 10px;">
    Text Mining Project: Stock Sentiment Analysis
</div>
<div style="text-align: center; font-weight: bold; font-size: 25px; padding: 10px;">
    Final model deployment
</div>
<div style="text-align: center; font-weight: bold; font-size: 20px; padding: 10px;">
    Group 21
</div>

In [None]:
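# Optional (Colab): uncomment to pin numpy to 1.26.4, the version the imports cell printed when this notebook was run.
# %pip install -q numpy==1.26.4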



In [2]:
import pandas as pd
import numpy as np
import re
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import transformers
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer, AutoModel
from google.colab import drive
drive.mount('/content/drive')
print(np.__version__)
import torch
import os
os.environ["WANDB_DISABLED"] = "true"
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.svm import SVC

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
1.26.4


In [3]:
# Single seed shared by every random number generator used in this notebook.
SEED = 42

# Seed NumPy's global RNG, then PyTorch (CPU generator and all CUDA devices).
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Request deterministic cuDNN kernels and turn off its benchmark auto-tuner.
torch.backends.cudnn.deterministic, torch.backends.cudnn.benchmark = True, False

In [4]:
# Read the training and test splits from the mounted Google Drive.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/drive/MyDrive/train.csv',
                     '/content/drive/MyDrive/test.csv')
)

## Preprocessing

We apply a simple preprocessing step that collapses extra whitespace, fixes common punctuation issues, removes newlines, and strips leading/trailing spaces.

In [5]:
def normalize_text(s, sep_token = " \n "):
    """Lightly clean one raw text before it is tokenized.

    Steps, in order: collapse every run of whitespace (newlines and tabs
    included) into a single space, delete any "." that is followed by
    optional whitespace and a ",", replace ".." and ". ." with a single "."
    (one left-to-right pass each), and strip leading/trailing spaces.

    Args:
        s: Text to clean. ``None`` and missing values (NaN / ``pd.NA``) are
            treated as empty text; any other non-string value is converted
            with ``str`` first.
        sep_token: Unused; kept so callers that pass it keep working.

    Returns:
        The cleaned string ("" for missing input).
    """
    if not isinstance(s, str):
        # A missing cell arrives as None/NaN and would make re.sub raise TypeError.
        if s is None or pd.isna(s):
            return ""
        s = str(s)

    s = re.sub(r'\s+', ' ', s).strip()  # extra whitespace (this also removes every newline)
    s = re.sub(r'\.\s*,', '', s)  # pattern ". ,"
    s = s.replace("..", ".")
    s = s.replace(". .", ".")

    # Deleting a ". ," pattern can expose a space at either end again.
    return s.strip()

# Run the same normalizer over the 'text' column of both splits.
X_train_processed, X_test_processed = (
    split['text'].apply(normalize_text) for split in (df_train, df_test)
)

## Embedding with encoder

In [6]:
class HF_Transformer_Embedding(BaseEstimator, TransformerMixin):
    """scikit-learn style transformer that embeds texts with a Hugging Face encoder.

    Each text is tokenized, passed through the pretrained model without
    gradient tracking, and represented by the last-hidden-state vector of its
    first token (the CLS position for BERT-style encoders).

    Args:
        model_name: Identifier handed to ``AutoTokenizer.from_pretrained`` and
            ``AutoModel.from_pretrained``.
        batch_size: Number of texts encoded per forward pass.
        max_length: Maximum number of tokens kept per text; longer texts are
            truncated. ``None`` lets the tokenizer apply its own default limit.
    """

    def __init__(self, model_name, batch_size=16, max_length=128):
        self.model_name = model_name
        self.batch_size = batch_size
        self.max_length = max_length

        # Use the GPU if one is available, otherwise fall back to the CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Tokenizer that matches the chosen checkpoint.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Bare encoder (no task head), moved to the device and set to eval mode.
        self.model = AutoModel.from_pretrained(model_name)
        self.model.to(self.device)
        self.model.eval()

    def fit(self, X, y=None):
        """No-op kept for the scikit-learn API; returns ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Embed every text in ``X``.

        Args:
            X: Texts to embed: a pandas Series, a NumPy array, or any iterable
                of strings.
            y: Ignored; present for scikit-learn API compatibility.

        Returns:
            A 2-D NumPy array with one row per input text, holding the
            first-token vector of the model's last hidden state. Empty input
            yields an array with zero rows.
        """
        # Work on a plain list so Series, arrays and lists are all sliced positionally.
        texts = X.tolist() if hasattr(X, "tolist") else list(X)

        if not texts:
            # np.vstack([]) raises, so return an explicit zero-row result instead.
            width = getattr(self.model.config, "hidden_size", 0)
            return np.empty((0, width), dtype=np.float32)

        all_embeddings = []  # one (batch, hidden) array per processed batch

        for start in range(0, len(texts), self.batch_size):
            batch_texts = texts[start:start + self.batch_size]

            # Convert the batch of sentences to token IDs. Truncation uses the
            # configured max_length: a limit derived from the longest text in X
            # would never truncate anything and could exceed what the model accepts.
            inputs = self.tokenizer(batch_texts,
                                    padding=True,  # pad to the longest text in the batch
                                    truncation=True,  # cut texts longer than max_length
                                    max_length=self.max_length,
                                    return_tensors="pt")

            # Move the tensors to the same device as the model.
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            with torch.no_grad():  # inference only, so no gradients are needed
                outputs = self.model(**inputs)  # pass tokens through the model to get output vectors

            # From the last hidden state, keep the vector at the first (CLS) position.
            cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            all_embeddings.append(cls_embeddings)

        return np.vstack(all_embeddings)

In [7]:
# Hugging Face Hub id of the pretrained checkpoint used as the text encoder.
model_name = "RashidNLP/Finance-Sentiment-Classification"

In [8]:
# Build the encoder once and reuse it for both splits. The checkpoint id comes
# from `model_name` (set in the previous cell) so it is written in one place only.
embedder = HF_Transformer_Embedding(
    model_name=model_name,
    batch_size=128)

X_train_embeddings = embedder.transform(X_train_processed)
X_test_embeddings = embedder.transform(X_test_processed)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


## Training with SVM

In [9]:
# Report (n_texts, embedding_dim) for each split; each label names the array it prints.
print(f"X_train_embeddings shape: {X_train_embeddings.shape}")
print(f"X_test_embeddings shape: {X_test_embeddings.shape}")

X_train_embeddings shape: (9543, 768)
X_val_embeddings shape: (2388, 768)


In [10]:
# RBF-kernel SVM trained on the encoder embeddings.
# random_state reuses the notebook-wide SEED instead of repeating the literal 42,
# so the two cannot drift apart. probability=True is kept so predict_proba stays
# available on the fitted model.
model = SVC(C=10, kernel='rbf', gamma='scale', probability=True, random_state=SEED)

model.fit(X_train_embeddings, df_train['label'])


## Inference

In [11]:
# Predict one class label per test embedding with the fitted SVM.
preds = model.predict(X_test_embeddings)

In [12]:
# Pair each test id with its predicted label, then preview the first rows.
preds_df = df_test[['id']].assign(predictions=preds)
preds_df.head()

Unnamed: 0,id,predictions
0,0,1
1,1,2
2,2,2
3,3,1
4,4,2


In [15]:
# Show (rows, columns) of the predictions frame as a quick size check before saving.
preds_df.shape

(2388, 2)

In [13]:
# Write the id/prediction pairs to Google Drive; index=False keeps the row index out of the CSV.
preds_df.to_csv('/content/drive/MyDrive/pred_21.csv', index=False)