# Prediction of the "classification" label 

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel, RobertaModel, RobertaTokenizer
import pickle 
import torch.nn as nn
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import TensorDataset, DataLoader, Dataset
from torch.nn import CrossEntropyLoss

In [None]:
# Load the combined dataset. Cells further down read its `text`,
# `classification` and `classification_by_editorial` columns.
df_combined = pd.read_csv('df_combined.csv')

## NLP prediction

Embedding model used below: https://huggingface.co/BAAI/bge-small-en-v1.5

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

In [None]:
# Number of texts embedded per forward pass.
size_limit: int = 60

# One tensor per batch; each holds the embeddings of up to `size_limit` texts.
final_text_embeddings = []

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-small-en-v1.5')
model = AutoModel.from_pretrained('BAAI/bge-small-en-v1.5').to(device)
model.eval()

# On-disk cache of the per-batch embeddings.
filename = 'text_embeddings.obj'

if not os.path.exists(filename):
    # Materialise the column once instead of rebuilding the list for every batch.
    all_texts = df_combined.text.to_list()

    for index in range(0, len(all_texts), size_limit):
        # Slicing clamps at the end of the list, so the last batch may be shorter.
        sentences = all_texts[index:index + size_limit]

        # Tokenize sentences and move them to the model's device.
        encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt').to(device)

        # Compute token embeddings without tracking gradients.
        with torch.no_grad():
            model_output = model(**encoded_input)
            # CLS pooling: keep the hidden state of the first token of every text.
            sentence_embeddings = model_output["last_hidden_state"][:, 0]

        # L2-normalise so that a dot product between two embeddings is a cosine similarity.
        sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
        final_text_embeddings.append(sentence_embeddings)
        print(index)

    # Write the cache once, after every batch succeeded. Dumping inside the loop
    # left a partial file behind when the run was interrupted, and the `else`
    # branch below would then load that partial file as if it were complete.
    with open(filename, 'wb') as filehandler:
        pickle.dump(final_text_embeddings, filehandler)
else:
    # NOTE: pickle.load can execute arbitrary code; only load cache files that
    # this notebook produced itself.
    with open(filename, 'rb') as filehandler:
        final_text_embeddings = pickle.load(filehandler)

final_text_embeddings

In [None]:
type(model)

In [None]:
#sentence_embeddings.shape

In [None]:
final_text_embeddings

In [None]:
final_text_embeddings[0].shape

In [None]:
# Dot product between the embeddings of the first two texts of the first batch.
# Both operands are 1-D, so `@` already yields a scalar; the former `.T` was a
# no-op on a 1-D tensor and is deprecated by PyTorch for non-2-D tensors.
scores = final_text_embeddings[0][0] @ final_text_embeddings[0][1]
scores

In [None]:
scores.shape

In [None]:
df_combined["classification_by_editorial"].value_counts()

In [None]:
df_combined["classification"].unique().__len__()

In [None]:
# TODO: TEMPORARY FIX, NEED TO RECOMPUTE EMBEDDINGS

#non_latin_rows = df_combined[(df_combined['text'].str.contains(r'[^\x00-\x7F]')) | (df_combined['title'].str.contains(r'[^\x00-\x7F]'))]
#df_combined = non_latin_rows
#df_combined.__len__()

In [None]:
# Concatenate the list of per-batch embedding tensors along dim 0 into a single
# torch tensor (one row per text). The result is a tensor, not a numpy array.
final_text_embeddings = torch.cat(final_text_embeddings, dim=0)
final_text_embeddings.shape

In [None]:
class Multiclass(nn.Module):
    """Feed-forward classifier: two hidden ReLU layers and a log-softmax output.

    Args:
        in_features: Width of each input vector. Defaults to 384, the value
            that was previously hard-coded.
        n_classes: Number of output classes. Defaults to 4, the value that was
            previously hard-coded.
    """

    def __init__(self, in_features: int = 384, n_classes: int = 4):
        super().__init__()
        # Attribute names are kept as-is so existing state_dict keys stay valid.
        self.hidden = nn.Linear(in_features, 256)
        self.act = nn.ReLU()
        self.hidden2 = nn.Linear(256, 64)
        self.act2 = nn.ReLU()
        self.output = nn.Linear(64, n_classes)

    def forward(self, x):
        """Return per-class log-probabilities for a 2-D batch `x` (rows are samples)."""
        x = self.act(self.hidden(x))
        x = self.act2(self.hidden2(x))
        # Log-probabilities over dim 1; pair with NLLLoss (CrossEntropyLoss also
        # works because applying log_softmax twice gives the same values).
        return torch.nn.functional.log_softmax(self.output(x), dim=1)
    
nn_model = Multiclass()

In [None]:
# Encode the editorial class names as integer ids (le.classes_ maps ids back to names).
labels = df_combined["classification_by_editorial"]
le = LabelEncoder()
labels_encoded = le.fit_transform(labels)

# Keep the targets on the same device as the classifier.
labels_encoded = torch.tensor(labels_encoded).to(device)

# Split the data into training and test sets (fixed seed for a reproducible split)
train_data, test_data, train_labels, test_labels = train_test_split(
    final_text_embeddings, labels_encoded, test_size=0.2, random_state=42
)

# Wrap the splits into datasets and mini-batch loaders.
train_dataset = TensorDataset(train_data, train_labels)
test_dataset = TensorDataset(test_data, test_labels)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Initialize the model
nn_model = Multiclass().to(device)

# Multiclass.forward returns log-probabilities; CrossEntropyLoss applies
# log_softmax again, which leaves log-probabilities unchanged, so the loss is
# the same as NLLLoss on the model output.
criterion = nn.CrossEntropyLoss()

# Optimise the classifier's own weights. This used to be `model.parameters()`,
# i.e. the embedding model, so the classifier was never updated by training.
optimizer = torch.optim.Adam(nn_model.parameters(), lr=0.001)

# Training loop
nn_model.train()
for epoch in range(20):  # Number of epochs
    # `batch_labels` avoids shadowing the `labels` Series defined above.
    for inputs, batch_labels in train_loader:
        optimizer.zero_grad()

        # Forward pass
        outputs = nn_model(inputs)
        loss = criterion(outputs, batch_labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

# Evaluation
nn_model.eval()
correct = 0
total = 0
predictions = []
true_labels = []
with torch.no_grad():
    for inputs, batch_labels in test_loader:
        outputs = nn_model(inputs)
        # Index of the highest log-probability is the predicted class id.
        _, predicted = torch.max(outputs, 1)
        total += batch_labels.size(0)
        correct += (predicted == batch_labels).sum().item()
        predictions.extend(predicted.tolist())
        true_labels.extend(batch_labels.tolist())

print('Accuracy of the model on the test set: %d %%' % (100 * correct / total))

# Generate the confusion matrix
cm = confusion_matrix(true_labels, predictions)

# Plot the confusion matrix
plt.figure(figsize=(10, 10))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix')
plt.show()

# Print the classification report
print(classification_report(true_labels, predictions))
le.classes_

In [None]:
df_combined["classification_by_editorial"].values

In [None]:
# Label encoding
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df_combined["classification_by_editorial"])

# Splitting the data with encoded labels
X_train, X_test, y_train, y_test = train_test_split(final_text_embeddings, encoded_labels, test_size=0.2, random_state=42)

# torch.as_tensor accepts tensors and numpy arrays alike; torch.tensor() on an
# existing tensor emits a copy-construct warning. detach() keeps the inputs out
# of any autograd graph, as torch.tensor() did.
X_train = torch.as_tensor(X_train, dtype=torch.float32).detach().to(device)
X_test = torch.as_tensor(X_test, dtype=torch.float32).detach().to(device)
y_train = torch.as_tensor(y_train, dtype=torch.long).to(device)
y_test = torch.as_tensor(y_test, dtype=torch.long).to(device)

# Initialize the model
nn_model = Multiclass().to(device)

# NLLLoss matches the log-probabilities returned by Multiclass.forward.
criterion = nn.NLLLoss()
optimizer = optim.Adam(nn_model.parameters(), lr=0.001)

# Training the model: one full-batch gradient step per epoch.
epochs = 100
train_losses = []
test_losses = []

for epoch in range(epochs):
    nn_model.train()
    optimizer.zero_grad()
    output = nn_model(X_train)
    loss = criterion(output, y_train)
    loss.backward()
    optimizer.step()
    train_losses.append(loss.item())

    # The test loss is only monitored, so skip gradient tracking for it.
    nn_model.eval()
    with torch.no_grad():
        test_output = nn_model(X_test)
        test_loss = criterion(test_output, y_test)
    test_losses.append(test_loss.item())

    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_losses[-1]}, Test Loss: {test_losses[-1]}")

# Plotting the training and testing losses
plt.plot(train_losses, label='Training loss')
plt.plot(test_losses, label='Testing loss')
plt.legend()
plt.show()

# Evaluation
with torch.no_grad():
    nn_model.eval()
    output = nn_model(X_test)
    # scikit-learn metrics need CPU data.
    y_test = y_test.cpu()
    _, preds = torch.max(output, 1)
    preds = preds.cpu()
    accuracy = accuracy_score(y_test, preds)
    print("Accuracy:", accuracy)
    print("Classification Report:")
    print(classification_report(y_test, preds))

    # Confusion Matrix
    cm = confusion_matrix(y_test, preds)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()

label_encoder.classes_

## Version 2 of the NLP

In [None]:
# Pick the GPU when PyTorch can see one, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Tokenizer (vocabulary) for the pre-trained roberta-base checkpoint.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Pre-trained weights, moved to the chosen device and switched to eval mode.
model = RobertaModel.from_pretrained('roberta-base').to(device)
model.eval()

# Define a custom classifier
class CustomClassifier(torch.nn.Module):
    """Wrap an encoder and put a 768 -> 4 linear layer on its first-token output."""

    def __init__(self, roberta_model):
        super().__init__()
        self.roberta = roberta_model
        # 768 is the hidden size of roberta-base; 4 is the number of classes.
        self.dense = torch.nn.Linear(768, 4)

    def forward(self, inputs):
        """Return logits for `inputs`, a mapping unpacked as keyword arguments to the encoder."""
        hidden_states = self.roberta(**inputs).last_hidden_state
        # Position 0 along dim 1 is used as the representation of the whole sequence.
        first_token = hidden_states[:, 0, :]
        return self.dense(first_token)

# Instantiate the custom classifier. CustomClassifier stores the `model` object
# passed in (it does not copy it), so `classifier.roberta` and `model` are the
# same module.
classifier = CustomClassifier(model).to(device)

In [None]:
# Number of texts encoded per forward pass.
size_limit: int = 80
# On-disk cache of the per-batch RoBERTa outputs.
filename = 'text_embeddings_roberta.obj'

final_text_embeddings = []

if not os.path.exists(filename):
    # Materialise the column once instead of rebuilding the list for every batch.
    all_texts = df_combined.text.to_list()

    for index in range(0, len(all_texts), size_limit):
        # Slicing clamps at the end of the list, so the last batch may be shorter.
        sentences = all_texts[index:index + size_limit]

        # Tokenize and move the inputs to the device the model lives on; without
        # `.to(device)` the forward pass fails when the model is on the GPU.
        encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt').to(device)

        # Compute token embeddings without tracking gradients.
        with torch.no_grad():
            model_output = model(**encoded_input).last_hidden_state

        # NOTE(review): the full last_hidden_state is stored for every token, and
        # padding=True pads each batch to its own longest sequence. Batches may
        # therefore differ in sequence length, which a later torch.cat along
        # dim 0 cannot handle. Confirm whether only the first-token vector
        # should be kept.
        final_text_embeddings.append(model_output)
        print(index)

    # Write the cache once, after every batch succeeded, so an interrupted run
    # cannot leave a partial file that would later be loaded as complete.
    with open(filename, 'wb') as filehandler:
        pickle.dump(final_text_embeddings, filehandler)
else:
    # NOTE: pickle.load can execute arbitrary code; only load cache files that
    # this notebook produced itself.
    with open(filename, 'rb') as filehandler:
        final_text_embeddings = pickle.load(filehandler)

final_text_embeddings

In [None]:
final_text_embeddings[0].shape

In [None]:
final_text_embeddings[0]

In [None]:
# Join the per-batch tensors along dim 0. torch.cat requires all other
# dimensions to match across the batches.
final_text_embeddings = torch.cat(final_text_embeddings, dim=0)
final_text_embeddings.shape

In [None]:
# Label encoding
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df_combined["classification_by_editorial"])

embeddings = final_text_embeddings

# Split the data into a training set and a test set
# NOTE(review): only the first 800 labels are used; presumably the cached
# embeddings cover exactly the first 800 rows of df_combined - confirm, since
# train_test_split needs both inputs to have the same length.
train_texts, test_texts, train_labels, test_labels = train_test_split(embeddings, labels[:800], test_size=0.2)

# Stack the per-sample rows back into single tensors and wrap the label arrays.
# NOTE(review): these tensors are not moved to `device`, while `classifier`
# was moved to `device` when it was created - confirm on a GPU machine.
train_texts = torch.stack(tuple(train_texts))
train_labels = torch.tensor(train_labels)
test_texts = torch.stack(tuple(test_texts))
test_labels = torch.tensor(test_labels)

# Train the custom model
classifier.train()
optimizer = torch.optim.Adam(classifier.parameters())
loss_fn = CrossEntropyLoss()

# Every epoch is a single optimisation step over the whole training split.
for epoch in range(10):  # Number of epochs
    optimizer.zero_grad()
    # NOTE(review): `train_texts` holds rows of the pre-computed RoBERTa hidden
    # states, yet it is passed as `input_ids`, which CustomClassifier.forward
    # unpacks straight into the RoBERTa model. That looks unintended - confirm
    # whether the dense head should be applied to the cached states instead.
    inputs = {'input_ids': train_texts}
    logits = classifier(inputs)
    loss = loss_fn(logits, train_labels)
    loss.backward()
    optimizer.step()

# Evaluate the model's performance using the test set
classifier.eval()
with torch.no_grad():
    # NOTE(review): the training step passes a dict, but here a bare tensor is
    # passed; CustomClassifier.forward does `**inputs`, which needs a mapping.
    logits = classifier(test_texts)
    predictions = torch.argmax(logits, dim=-1)
    correct_predictions = (predictions == test_labels).sum().item()
    total_predictions = test_labels.size(0)
    accuracy = correct_predictions / total_predictions

print(f'Test Accuracy: {accuracy * 100:.2f}%')


## Version 3

In [None]:
# Create a PyTorch dataset
class ClassifierDataset(Dataset):
    """Index-based dataset that tokenizes one text per item and pairs it with its label."""

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with the raw text, flattened token ids, attention mask and label."""
        sample_text = str(self.texts[idx])
        target = self.labels[idx]

        # Pad or truncate every sample to exactly `max_len` tokens.
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(sample_text, **tokenizer_options)

        item = {'text': sample_text}
        # flatten() turns the tokenizer's per-call tensors into 1-D tensors.
        for field in ('input_ids', 'attention_mask'):
            item[field] = encoded[field].flatten()
        item['labels'] = torch.tensor(target, dtype=torch.long)
        return item

# Create data loaders
def create_data_loader(texts, labels, tokenizer, max_len, batch_size):
    """Build a DataLoader over a ClassifierDataset.

    `texts` must provide `.to_numpy()`; `labels` is passed through unchanged.
    The loader uses four worker processes.
    """
    text_array = texts.to_numpy()
    dataset = ClassifierDataset(text_array, labels, tokenizer, max_len)
    return DataLoader(dataset, batch_size=batch_size, num_workers=4)

# NOTE(review): MAX_LEN and BATCH_SIZE are only assigned further down in this
# file, and create_data_loader calls `.to_numpy()` on the texts. Confirm that
# the cell defining the constants and the train/test split has run before
# these two lines are executed.
train_data_loader = create_data_loader(train_texts, train_labels, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(test_texts, test_labels, tokenizer, MAX_LEN, BATCH_SIZE)

class Classifier(nn.Module):
    """roberta-base encoder followed by dropout, a 128-unit ReLU layer and a linear output layer."""

    def __init__(self, n_classes):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.drop = nn.Dropout(p=0.3)
        encoder_width = self.roberta.config.hidden_size
        # 128 is the hidden layer size; change it here if a different width is wanted.
        self.hidden = nn.Linear(encoder_width, 128)
        self.out = nn.Linear(128, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return one logit per class for every sequence in the batch."""
        encoded = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at position 0 is used as the representation of the sequence.
        first_token = encoded.last_hidden_state[:, 0, :]
        features = self.hidden(self.drop(first_token))
        return self.out(torch.relu(features))

# Training loop
def train_epoch(model, data_loader, optimizer, device, scheduler, n_examples):
    """Train `model` for one pass over `data_loader`.

    Each batch must be a dict with "input_ids", "attention_mask" and "labels"
    tensors. The loss is computed with the module-level `loss_fn`. Gradients
    are clipped to a norm of 1.0, and the optimizer and scheduler are stepped
    once per batch.

    Args:
        model: Module called as `model(input_ids=..., attention_mask=...)`.
        data_loader: Iterable of batches.
        optimizer: Optimizer over the model's parameters.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped after every optimizer step.
        n_examples: Denominator used for the accuracy.

    Returns:
        Tuple `(accuracy, mean_loss)`: `accuracy` is a float64 CPU tensor
        (correct predictions / n_examples); `mean_loss` is the mean batch loss,
        or NaN when `data_loader` yields no batches.
    """
    model = model.train()
    losses = []
    # Counted as a plain int so an empty loader no longer fails on `.double()`.
    correct_predictions = 0
    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["labels"].to(device)

        # Clear gradients before the forward/backward pass so nothing left over
        # from an earlier (possibly interrupted) run leaks into this step.
        optimizer.zero_grad()
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, labels)
        correct_predictions += (preds == labels).sum().item()
        losses.append(loss.item())

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

    accuracy = torch.tensor(correct_predictions, dtype=torch.double) / n_examples
    mean_loss = np.mean(losses) if losses else float("nan")
    return accuracy, mean_loss

# Evaluation function
def eval_model(model, data_loader, device, n_examples):
    """Evaluate `model` on `data_loader` without tracking gradients.

    Each batch must be a dict with "input_ids", "attention_mask" and "labels"
    tensors. The loss is computed with the module-level `loss_fn`.

    Args:
        model: Module called as `model(input_ids=..., attention_mask=...)`.
        data_loader: Iterable of batches.
        device: Device the batch tensors are moved to.
        n_examples: Denominator used for the accuracy.

    Returns:
        Tuple `(accuracy, mean_loss)`: `accuracy` is a float64 CPU tensor
        (correct predictions / n_examples); `mean_loss` is the mean batch loss,
        or NaN when `data_loader` yields no batches.
    """
    model = model.eval()
    losses = []
    # Counted as a plain int so an empty loader no longer fails on `.double()`.
    correct_predictions = 0
    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["labels"].to(device)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, labels)
            correct_predictions += (preds == labels).sum().item()
            losses.append(loss.item())

    accuracy = torch.tensor(correct_predictions, dtype=torch.double) / n_examples
    mean_loss = np.mean(losses) if losses else float("nan")
    return accuracy, mean_loss

In [None]:
# Check if CUDA is available and set PyTorch to use GPU or CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load pretrained tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Encode labels
le = LabelEncoder()
encoded_labels = le.fit_transform(df_combined['classification_by_editorial'])

# Split the first 400 rows into training and test sets. The seed matches the
# other splits in this file so that reruns give the same partition.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df_combined['text'][:400], encoded_labels[:400], test_size=0.2, random_state=42
)

BATCH_SIZE = 8
MAX_LEN = 128

# Build the loaders here, from this cell's split, tokenizer and constants.
# They were otherwise only created where MAX_LEN and BATCH_SIZE were not yet
# assigned and the split above did not exist.
train_data_loader = create_data_loader(train_texts, train_labels, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(test_texts, test_labels, tokenizer, MAX_LEN, BATCH_SIZE)

# Initialize the classifier (one output per encoded class) and its optimizer
model = Classifier(len(le.classes_)).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

In [None]:
from transformers import get_linear_schedule_with_warmup
from torch.nn import CrossEntropyLoss

# Loss used by train_epoch and eval_model, which read the module-level `loss_fn`.
loss_fn = CrossEntropyLoss().to(device)

# Number of passes over the training loader.
EPOCHS = 10

# The scheduler is stepped once per batch, so it must span every batch of every
# epoch; the learning rate decays linearly with no warm-up.
total_steps = EPOCHS * len(train_data_loader)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(model, train_data_loader, optimizer, device, scheduler, len(train_labels))

    print(f'Train loss {train_loss} accuracy {train_acc}')
    print()

# Evaluation on the held-out split
test_acc, test_loss = eval_model(model, test_data_loader, device, len(test_labels))

print(f'Test Accuracy : {test_acc}')

## Multimodality