# Installs and Imports

In [None]:
!pip install pandas transformers datasets scikit-learn torch transformers[torch]



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    XLMRobertaForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    AutoTokenizer,
    TrainerCallback,
    EarlyStoppingCallback
)
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report, precision_recall_fscore_support
import torch
import gc
from tqdm import tqdm
from torch.utils.data import Dataset
from torch.cuda.amp import GradScaler, autocast

# Drive Mounting

In [None]:
# Mount Google Drive at /content/drive (Colab-only helper module).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Dataset Loading

In [None]:
# Source workbooks on the mounted Drive. Each one is read for its 'ND' and 'DN' sheets.
annotated_path = '/content/drive/My Drive/Research/SentimentAnalysisDivorce/Dataset/INITIAL_ANNOTATED_SAMPLE.xlsx'
pseudolabeled_path = '/content/drive/My Drive/Research/SentimentAnalysisDivorce/Dataset/PSEUDOLABELED_SAMPLE_WITH_UNCERTAINTY.xlsx'
cleaned_dataset_path = '/content/drive/My Drive/Research/SentimentAnalysisDivorce/Dataset/CLEANED_PREPROCESSED_DATA_03.xlsx'

# Passing a list of sheet names makes read_excel return {sheet_name: DataFrame},
# so every workbook is opened and parsed once instead of once per sheet.
SHEET_NAMES = ['ND', 'DN']

annotated_sheets = pd.read_excel(annotated_path, sheet_name=SHEET_NAMES)
df_annotated_nd = annotated_sheets['ND']
df_annotated_dn = annotated_sheets['DN']

pseudolabeled_sheets = pd.read_excel(pseudolabeled_path, sheet_name=SHEET_NAMES)
df_pseudolabeled_nd = pseudolabeled_sheets['ND']
df_pseudolabeled_dn = pseudolabeled_sheets['DN']

cleaned_sheets = pd.read_excel(cleaned_dataset_path, sheet_name=SHEET_NAMES)
df_cleaned_nd = cleaned_sheets['ND']
df_cleaned_dn = cleaned_sheets['DN']

In [None]:
# Give the pseudolabeled sheets the label column names that the rest of the
# notebook reads ('numericalLabel' / 'categoricalLabel').
corrected_to_standard = {
    'correctedLabel': 'numericalLabel',
    'correctedCategory': 'categoricalLabel',
}

for pseudolabeled_frame in (df_pseudolabeled_nd, df_pseudolabeled_dn):
    # Renamed in place so the existing DataFrame objects keep being used downstream.
    pseudolabeled_frame.rename(columns=corrected_to_standard, inplace=True)

In [None]:
def filter_data(df, labels_to_remove):
    """Drop unwanted categories and rows without a usable integer label.

    Args:
        df: DataFrame with a 'categoricalLabel' and a 'numericalLabel' column.
        labels_to_remove: iterable of 'categoricalLabel' values whose rows are dropped.

    Returns:
        A new DataFrame (the input is never modified) that keeps the original
        index labels of the surviving rows and whose 'numericalLabel' column
        has an integer dtype. Rows whose label is missing or cannot be parsed
        as a number are removed.
    """
    kept = df.loc[~df['categoricalLabel'].isin(labels_to_remove)]

    # errors='coerce' turns both missing and non-numeric labels into NaN, so a
    # single validity mask covers every row that has to go.
    numeric_labels = pd.to_numeric(kept['numericalLabel'], errors='coerce')
    has_label = numeric_labels.notna()

    # Work on an explicit copy: assigning into a boolean-indexed slice is the
    # chained-assignment pattern pandas warns about (SettingWithCopyWarning).
    filtered_df = kept.loc[has_label].copy()
    filtered_df['numericalLabel'] = numeric_labels[has_label].astype(int).to_numpy()
    return filtered_df

# Categories that carry no usable sentiment label and are excluded from training.
labels_to_remove = ['NON-ENGLISH/TAGALOG/TAGLISH', 'NOT HELPFUL']

In [None]:
# Apply the same label filter to both sheets of each labeled source.
df_annotated_nd_filtered, df_annotated_dn_filtered = (
    filter_data(annotated_frame, labels_to_remove)
    for annotated_frame in (df_annotated_nd, df_annotated_dn)
)

df_pseudolabeled_nd_filtered, df_pseudolabeled_dn_filtered = (
    filter_data(pseudolabeled_frame, labels_to_remove)
    for pseudolabeled_frame in (df_pseudolabeled_nd, df_pseudolabeled_dn)
)

In [None]:
# Stack the annotated rows and the pseudolabeled rows of each sheet into one
# frame; ignore_index=True renumbers the combined rows from 0.
nd_parts = [df_annotated_nd_filtered, df_pseudolabeled_nd_filtered]
dn_parts = [df_annotated_dn_filtered, df_pseudolabeled_dn_filtered]

df_combined_nd = pd.concat(nd_parts, ignore_index=True)
df_combined_dn = pd.concat(dn_parts, ignore_index=True)

In [None]:
# Re-validate the labels of the combined frames: coerce to numeric, drop rows
# whose label could not be parsed, then store the labels as integers.
df_combined_nd = (
    df_combined_nd
    .assign(numericalLabel=lambda frame: pd.to_numeric(frame['numericalLabel'], errors='coerce'))
    .dropna(subset=['numericalLabel'])
    .astype({'numericalLabel': int})
)

df_combined_dn = (
    df_combined_dn
    .assign(numericalLabel=lambda frame: pd.to_numeric(frame['numericalLabel'], errors='coerce'))
    .dropna(subset=['numericalLabel'])
    .astype({'numericalLabel': int})
)

In [None]:
# Persist the combined labeled data, one sheet per subset, without the index column.
combined_output_path = '/content/drive/My Drive/Research/SentimentAnalysisDivorce/Dataset/ANNOTATED_AND_PSEUDOLABELED_DATA_01.xlsx'

with pd.ExcelWriter(combined_output_path) as writer:
    for sheet_name, combined_frame in (('ND', df_combined_nd), ('DN', df_combined_dn)):
        combined_frame.to_excel(writer, sheet_name=sheet_name, index=False)

In [None]:
# Distinct texts that already carry a label, gathered per sheet for membership tests.
texts_to_remove_nd = {labeled_text for labeled_text in df_combined_nd['text']}
texts_to_remove_dn = {labeled_text for labeled_text in df_combined_dn['text']}

In [None]:
# Keep only the cleaned rows whose text is not already part of the labeled data.
already_labeled_nd = df_cleaned_nd['text'].isin(texts_to_remove_nd)
already_labeled_dn = df_cleaned_dn['text'].isin(texts_to_remove_dn)

df_cleaned_nd_filtered = df_cleaned_nd.loc[~already_labeled_nd]
df_cleaned_dn_filtered = df_cleaned_dn.loc[~already_labeled_dn]

In [None]:
# Save the remaining unlabeled rows, one sheet per subset, without the index column.
remaining_output_path = '/content/drive/My Drive/Research/SentimentAnalysisDivorce/Dataset/CLEANED_PREPROCESSED_DATA_05.xlsx'

with pd.ExcelWriter(remaining_output_path) as writer:
    for sheet_name, remaining_frame in (('ND', df_cleaned_nd_filtered), ('DN', df_cleaned_dn_filtered)):
        remaining_frame.to_excel(writer, sheet_name=sheet_name, index=False)

# Exploratory Data Analysis

# Secondary Training and Evaluation

In [None]:
def preprocess_data(df, tokenizer, max_length=512):
    """Tokenize the 'text' column and attach the integer labels.

    Args:
        df: DataFrame providing a 'text' and a 'numericalLabel' column.
        tokenizer: callable invoked as ``tokenizer(texts, truncation=True,
            padding=True, max_length=..., return_tensors="pt")``; its result
            must support item assignment.
        max_length: truncation limit forwarded to the tokenizer.

    Returns:
        The tokenizer's result with an added 'labels' entry holding the labels
        as a torch tensor.
    """
    label_tensor = torch.tensor(df['numericalLabel'].astype(int).tolist())
    encoded = tokenizer(
        df['text'].tolist(),
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors="pt",
    )
    encoded['labels'] = label_tensor
    return encoded

In [None]:
# Shared tokenizer, loaded from the "xlm-roberta-base" checkpoint.
tokenizer_xlm = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Encode both combined frames with the default max_length of preprocess_data.
inputs_nd, inputs_dn = (
    preprocess_data(combined_frame, tokenizer_xlm)
    for combined_frame in (df_combined_nd, df_combined_dn)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
class CustomDataset(Dataset):
    """Map-style dataset over a mapping of equally indexed fields.

    ``inputs`` is a mapping that contains at least 'input_ids'; every value
    must be indexable by sample position.
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __len__(self):
        # The number of samples is defined by the 'input_ids' field.
        input_ids = self.inputs['input_ids']
        return len(input_ids)

    def __getitem__(self, idx):
        # Build one sample by taking position ``idx`` from every field.
        sample = {}
        for field_name, field_values in self.inputs.items():
            sample[field_name] = field_values[idx]
        return sample

In [None]:
# Wrap the encoded tensors of each subset in an indexable dataset.
dataset_nd, dataset_dn = CustomDataset(inputs_nd), CustomDataset(inputs_dn)

In [None]:
class CustomTrainer(Trainer):
    """Trainer whose loss is a cross-entropy with optional per-class weights.

    Args:
        *args, **kwargs: forwarded unchanged to ``Trainer.__init__``.
        class_weights: optional tensor passed as ``weight`` to
            ``torch.nn.CrossEntropyLoss``; ``None`` gives the unweighted loss.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None, **kwargs):
        """Compute the (optionally class-weighted) cross-entropy for one batch.

        Args:
            model: the model to run the forward pass on.
            inputs: batch mapping; must contain "labels" and whatever the
                model's forward pass expects.
            return_outputs: when True, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: accepted (and ignored) so the override stays
                callable by Trainer versions that pass this argument; the loss
                here uses CrossEntropyLoss's default mean reduction.
            **kwargs: any further arguments a Trainer version may pass; ignored.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is True.
        """
        labels = inputs.get("labels")
        # Forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # The weight tensor must live on the same device as the logits, otherwise
        # the loss raises a device-mismatch error (e.g. CPU weights, GPU logits).
        weight = self.class_weights
        if weight is not None:
            weight = weight.to(logits.device)

        loss_fct = torch.nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [None]:
def compute_metrics(p):
    """Score predictions with accuracy and weighted precision/recall/F1.

    Args:
        p: object exposing ``predictions`` (per-class scores, reduced with an
            argmax over axis 1) and ``label_ids`` (the reference labels).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = p.label_ids
    y_pred = np.argmax(p.predictions, axis=1)

    # average='weighted' weights each class's score by its support.
    weighted_scores = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    precision, recall, f1 = weighted_scores[0], weighted_scores[1], weighted_scores[2]

    metrics = {'accuracy': accuracy_score(y_true, y_pred)}
    metrics['f1'] = f1
    metrics['precision'] = precision
    metrics['recall'] = recall
    return metrics

In [None]:
class BestModelSaverCallback(TrainerCallback):
    """Save the model whenever the evaluation loss improves and stop early otherwise.

    Args:
        trainer: object whose ``.model`` is saved with ``save_pretrained``.
        tokenizer: tokenizer saved next to the model.
        patience: number of consecutive evaluations without improvement after
            which training is asked to stop.
        min_delta: minimum decrease of ``eval_loss`` that counts as an improvement.
        output_dir: directory to save into; when falsy nothing is written and
            only the early-stopping logic runs.

    Attributes:
        best_loss: lowest ``eval_loss`` seen so far (``None`` before the first
            evaluation that reported a loss).
        best_accuracy: ``eval_accuracy`` reported together with ``best_loss``
            (``None`` when that metric was not reported).
        patience_counter: evaluations since the last improvement.
    """

    def __init__(self, trainer, tokenizer, patience=10, min_delta=0.0, output_dir=None):
        self.trainer = trainer
        self.tokenizer = tokenizer
        self.patience = patience
        self.min_delta = min_delta
        self.output_dir = output_dir
        self.best_loss = None
        self.best_accuracy = None
        self.patience_counter = 0

    def on_evaluate(self, args, state, control, **kwargs):
        """Track ``eval_loss`` after each evaluation; save on improvement, else count down."""
        metrics = kwargs.get("metrics") or {}
        eval_loss = metrics.get("eval_loss")
        if eval_loss is None:
            # Nothing to compare against; leave the state untouched.
            return

        # Only the loss drives saving and stopping. The accuracy is recorded for
        # reference, so a missing 'eval_accuracy' must not disable the callback.
        improved = self.best_loss is None or eval_loss < self.best_loss - self.min_delta
        if improved:
            self.best_loss = eval_loss
            self.best_accuracy = metrics.get("eval_accuracy")
            self.patience_counter = 0

            if self.output_dir:
                self.trainer.model.save_pretrained(self.output_dir)
                self.tokenizer.save_pretrained(self.output_dir)
            return

        self.patience_counter += 1
        # '>=' so training stops after exactly `patience` non-improving
        # evaluations; '>' would let one extra evaluation through.
        if self.patience_counter >= self.patience:
            control.should_training_stop = True

In [None]:
def train_and_save_model(train_dataset, eval_dataset, category_name, model_path, patience=3):
    """Fine-tune a 7-class XLM-RoBERTa classifier and keep its best checkpoint.

    The model is evaluated once per epoch. A BestModelSaverCallback writes the
    model and tokenizer to the output directory every time the evaluation loss
    improves, and requests early stopping according to ``patience``. The Trainer
    itself writes no checkpoints (``save_strategy='no'``).

    Args:
        train_dataset: dataset used for training.
        eval_dataset: dataset evaluated at the end of every epoch.
        category_name: name of the sub-directory under the Models folder that
            becomes the output directory.
        model_path: path or identifier the initial weights are loaded from.
        patience: patience forwarded to BestModelSaverCallback.

    Returns:
        None. The saved model directory is reported with a printed message.
    """
    training_args = TrainingArguments(
        output_dir=f'/content/drive/My Drive/Research/SentimentAnalysisDivorce/Models/{category_name}',
        evaluation_strategy='epoch',
        save_strategy='no',
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=10,
        weight_decay=0.01,
        load_best_model_at_end=False,
        metric_for_best_model='eval_loss',
        greater_is_better=False,
    )

    model = XLMRobertaForSequenceClassification.from_pretrained(model_path, num_labels=7)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer_xlm)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    best_model_saver_callback = BestModelSaverCallback(
        trainer=trainer,
        tokenizer=tokenizer_xlm,
        patience=patience,
        output_dir=training_args.output_dir
    )

    trainer.add_callback(best_model_saver_callback)

    trainer.train()

    # The callback has already written the lowest-eval-loss weights to output_dir.
    # Saving the in-memory model unconditionally here would replace them with the
    # final-epoch weights, so only fall back to that when nothing was saved.
    if best_model_saver_callback.best_loss is None:
        model.save_pretrained(training_args.output_dir)
        tokenizer_xlm.save_pretrained(training_args.output_dir)
    print(f"Fine-tuned model saved at {training_args.output_dir}")

In [None]:
# 80/20 train/eval split for each subset. A seeded generator makes the random
# assignment identical on every run, so results can be compared across runs.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.8

train_size_nd = int(TRAIN_FRACTION * len(dataset_nd))
eval_size_nd = len(dataset_nd) - train_size_nd
train_dataset_nd, eval_dataset_nd = torch.utils.data.random_split(
    dataset_nd,
    [train_size_nd, eval_size_nd],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

train_size_dn = int(TRAIN_FRACTION * len(dataset_dn))
eval_size_dn = len(dataset_dn) - train_size_dn
train_dataset_dn, eval_dataset_dn = torch.utils.data.random_split(
    dataset_dn,
    [train_size_dn, eval_size_dn],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [None]:
# Directories the initial weights are loaded from. NOTE: these are the same
# directories train_and_save_model uses as output_dir for the 'ND' and 'DN'
# categories, so a training run writes over the weights it started from.
model_path_nd = '/content/drive/My Drive/Research/SentimentAnalysisDivorce/Models/ND'
model_path_dn = '/content/drive/My Drive/Research/SentimentAnalysisDivorce/Models/DN'

In [None]:
# Fine-tune on the DN split, starting from model_path_dn; patience keeps its default.
train_and_save_model(train_dataset_dn, eval_dataset_dn, 'DN', model_path_dn)



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.539834,0.850446,0.851566,0.855904,0.850446
2,No log,0.625057,0.834821,0.835847,0.84472,0.834821
3,No log,0.633264,0.830357,0.83061,0.834718,0.830357
4,No log,0.786171,0.819196,0.824028,0.842691,0.819196
5,0.356500,0.77773,0.832589,0.833047,0.840161,0.832589


Fine-tuned model saved at /content/drive/My Drive/Research/SentimentAnalysisDivorce/Models/DN


In [None]:
# Fine-tune on the ND split, starting from model_path_nd; patience keeps its default.
train_and_save_model(train_dataset_nd, eval_dataset_nd, 'ND', model_path_nd)



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.171738,0.94881,0.94906,0.950106,0.94881
2,0.250800,0.236887,0.930786,0.930882,0.932353,0.930786
3,0.194100,0.286255,0.924297,0.925629,0.931257,0.924297
4,0.194100,0.240927,0.938717,0.938866,0.940261,0.938717
5,0.149800,0.265414,0.939438,0.939748,0.940768,0.939438


Fine-tuned model saved at /content/drive/My Drive/Research/SentimentAnalysisDivorce/Models/ND
