# Installs and Imports

In [None]:
!pip install pandas transformers datasets scikit-learn torch transformers[torch] optuna

In [None]:
import pandas as pd
import numpy as np
import optuna
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import (
    XLMRobertaForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    AutoTokenizer,
    TrainerCallback,
    EarlyStoppingCallback
)
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.model_selection import KFold
from tqdm import tqdm

# Drive Mounting

In [None]:
# Mount Google Drive at /content/drive; every dataset and model path used in
# this notebook is located under '/content/drive/My Drive/'.
from google.colab import drive
drive.mount('/content/drive')

# Dataset Loading

In [None]:
# Excel workbook whose 'ND' and 'DN' sheets are loaded with pd.read_excel.
annotated_and_pseudolabeled_path = '/content/drive/My Drive/Research/SentimentAnalysisDivorce/Dataset/ANNOTATED_AND_PSEUDOLABELED_DATA_01.xlsx'
# NOTE(review): not referenced in the visible part of this notebook — confirm it is still needed.
cleaned_preprocessed_path = '/content/drive/My Drive/Research/SentimentAnalysisDivorce/Dataset/CLEANED_PREPROCESSED_DATA_05.xlsx'

In [None]:
# Load both sheets with a single read_excel call so the workbook is opened and
# parsed once; passing a list of sheet names returns a dict keyed by sheet name.
_combined_sheets = pd.read_excel(annotated_and_pseudolabeled_path, sheet_name=['ND', 'DN'])
df_combined_nd = _combined_sheets['ND']
df_combined_dn = _combined_sheets['DN']

In [None]:
# Checkpoint directories handed to the tuning cells as `model_path`, i.e. the
# location XLMRobertaForSequenceClassification.from_pretrained loads from.
# NOTE(review): train_with_best_params writes to '.../Models/{category_name}',
# which is the same directory for 'ND' / 'DN' — confirm overwriting the source
# checkpoint is intended.
model_path_nd = '/content/drive/My Drive/Research/SentimentAnalysisDivorce/Models/ND'
model_path_dn = '/content/drive/My Drive/Research/SentimentAnalysisDivorce/Models/DN'

# Helper Functions

In [None]:
def preprocess_data(df, tokenizer, max_length=512):
    """Tokenize the ``text`` column of *df* and attach its integer labels.

    Args:
        df: DataFrame with a ``text`` column and a ``numericalLabel`` column
            whose values can be cast to ``int``.
        tokenizer: Callable invoked with the list of texts plus
            ``truncation=True``, ``padding=True``, ``max_length`` and
            ``return_tensors="pt"``.
        max_length: Forwarded unchanged to the tokenizer.

    Returns:
        The tokenizer's output, extended with a ``labels`` entry holding a
        ``torch.Tensor`` built from the integer labels.
    """
    sentences = df['text'].tolist()
    targets = df['numericalLabel'].astype(int).tolist()

    tokenizer_options = {
        'truncation': True,
        'padding': True,
        'max_length': max_length,
        'return_tensors': "pt",
    }
    encoded = tokenizer(sentences, **tokenizer_options)

    # Labels travel with the encodings so one mapping describes each example.
    encoded['labels'] = torch.tensor(targets)
    return encoded

In [None]:
class CustomDataset(Dataset):
    """Expose a mapping of column -> indexable values as a per-example dataset.

    ``inputs`` must contain an ``input_ids`` entry (its length defines the
    dataset size); every value must support integer indexing.
    """

    def __init__(self, inputs):
        self.inputs = inputs

    def __len__(self):
        # The number of examples is taken from the 'input_ids' column.
        input_ids = self.inputs['input_ids']
        return len(input_ids)

    def __getitem__(self, idx):
        # Pick row `idx` out of every column and return them under the same keys.
        example = {}
        for column_name, column_values in self.inputs.items():
            example[column_name] = column_values[idx]
        return example

In [None]:
def compute_metrics(p):
    """Compute accuracy and weighted precision/recall/F1 for a prediction object.

    Args:
        p: Object with ``predictions`` (per-class scores; the arg-max over
            axis 1 is taken as the predicted class) and ``label_ids`` (the
            reference labels).

    Returns:
        Dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    predicted_classes = np.argmax(p.predictions, axis=1)
    reference_classes = p.label_ids

    # The fourth element of the result (support) is not reported.
    weighted_scores = precision_recall_fscore_support(
        reference_classes, predicted_classes, average='weighted'
    )
    precision, recall, f1 = weighted_scores[:3]
    acc = accuracy_score(reference_classes, predicted_classes)

    metrics = {}
    metrics['accuracy'] = acc
    metrics['f1'] = f1
    metrics['precision'] = precision
    metrics['recall'] = recall
    return metrics

In [None]:
class BestModelSaverCallback(TrainerCallback):
    """Save the model on every ``eval_loss`` improvement and stop on stagnation.

    After each evaluation, ``eval_loss`` is compared with the best value seen
    so far. An improvement larger than ``min_delta`` resets the patience
    counter and, when ``output_dir`` is set, writes the model and tokenizer
    there with ``save_pretrained``. Any other evaluation increments the
    counter; training is asked to stop once ``patience`` consecutive
    evaluations have passed without improvement.

    Args:
        trainer: Object exposing the model to save as ``trainer.model``. May
            be ``None``, in which case the ``model`` keyword argument passed
            to ``on_evaluate`` is saved instead.
        tokenizer: Tokenizer saved next to the model (skipped when ``None``).
        patience: Number of consecutive non-improving evaluations after which
            training is stopped.
        min_delta: Minimum decrease of ``eval_loss`` that counts as an
            improvement.
        output_dir: Directory for ``save_pretrained``; nothing is written when
            it is falsy.
    """

    def __init__(self, trainer, tokenizer, patience=10, min_delta=0.0, output_dir=None):
        self.trainer = trainer
        self.tokenizer = tokenizer
        self.patience = patience
        self.min_delta = min_delta
        self.output_dir = output_dir
        self.best_loss = None
        self.best_accuracy = None
        self.patience_counter = 0

    def on_evaluate(self, args, state, control, **kwargs):
        """Update the best-loss bookkeeping from the ``metrics`` keyword argument."""
        metrics = kwargs.get("metrics") or {}
        eval_loss = metrics.get("eval_loss")
        eval_accuracy = metrics.get("eval_accuracy")

        # Both metrics must be present; otherwise this evaluation is ignored.
        if eval_loss is None or eval_accuracy is None:
            return

        improved = self.best_loss is None or eval_loss < self.best_loss - self.min_delta
        if not improved:
            self.patience_counter += 1
            # `>=` so that `patience` is the number of stagnant evaluations
            # tolerated (the previous `>` allowed one extra evaluation).
            if self.patience_counter >= self.patience:
                control.should_training_stop = True
            return

        self.best_loss = eval_loss
        self.best_accuracy = eval_accuracy
        self.patience_counter = 0

        if not self.output_dir:
            return

        # Prefer the explicitly supplied trainer; fall back to the model handed
        # to the callback event so the callback also works when it was created
        # before the trainer existed.
        model = getattr(self.trainer, "model", None)
        if model is None:
            model = kwargs.get("model")
        if model is not None:
            model.save_pretrained(self.output_dir)
        if self.tokenizer is not None:
            self.tokenizer.save_pretrained(self.output_dir)

In [None]:
# Tokenizer for the "xlm-roberta-base" checkpoint; the training functions also
# read this module-level name when building their DataCollatorWithPadding.
tokenizer_xlm = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Tokenize each sheet once, up front (preprocess_data pads and returns tensors
# together with a 'labels' entry).
inputs_nd = preprocess_data(df_combined_nd, tokenizer_xlm)
inputs_dn = preprocess_data(df_combined_dn, tokenizer_xlm)

# Wrap the encodings so they can be indexed one example at a time.
dataset_nd = CustomDataset(inputs_nd)
dataset_dn = CustomDataset(inputs_dn)

# Hyperparameter Tuning

In [None]:
def objective(trial, train_dataset, model_path, category_name):
    """Optuna objective: mean validation loss of a 5-fold cross-validation.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        train_dataset: Indexable dataset that is split into folds.
        model_path: Checkpoint passed to
            ``XLMRobertaForSequenceClassification.from_pretrained``; a fresh
            copy is loaded for every fold.
        category_name: Suffix of the output directory used for checkpoints.

    Returns:
        The mean ``eval_loss`` over all folds.
    """
    import gc  # local import: only needed to release per-fold objects below

    # Define hyperparameters to tune
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    num_train_epochs = trial.suggest_int("num_train_epochs", 2, 5)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32])
    gradient_accumulation_steps = trial.suggest_int("gradient_accumulation_steps", 1, 4)
    weight_decay = trial.suggest_float("weight_decay", 0, 0.3)
    warmup_steps = trial.suggest_int("warmup_steps", 0, 500)

    training_args = TrainingArguments(
        output_dir=f'/content/drive/My Drive/Research/SentimentAnalysisDivorce/Models/HyperparameterTuned/{category_name}',
        evaluation_strategy='epoch',
        save_strategy='epoch',
        learning_rate=learning_rate,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=16,
        num_train_epochs=num_train_epochs,
        weight_decay=weight_decay,
        warmup_steps=warmup_steps,
        gradient_accumulation_steps=gradient_accumulation_steps,
        load_best_model_at_end=True,
        metric_for_best_model='eval_loss',
        greater_is_better=False,
        fp16=True,  # Use mixed precision training
    )

    # Initialize K-fold cross-validator (fixed seed: every trial sees the same folds)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    # The collator does not depend on the fold, so build it once.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer_xlm)

    # Perform K-Fold cross-validation
    losses = []
    for fold, (train_index, eval_index) in enumerate(kf.split(train_dataset)):
        print(f"Starting fold {fold+1}...")
        train_subset = torch.utils.data.Subset(train_dataset, train_index)
        eval_subset = torch.utils.data.Subset(train_dataset, eval_index)

        model = XLMRobertaForSequenceClassification.from_pretrained(model_path, num_labels=7)

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_subset,
            eval_dataset=eval_subset,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
            callbacks=[
                EarlyStoppingCallback(early_stopping_patience=3),
            ]
        )

        trainer.train()
        # load_best_model_at_end=True, so this scores the best checkpoint of the fold.
        eval_results = trainer.evaluate()
        losses.append(eval_results['eval_loss'])

        # Drop the references to this fold's model and trainer *before*
        # clearing the cache; otherwise their tensors are still in use and
        # stay allocated while the next fold's model is created.
        del trainer, model
        gc.collect()
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

    # Return the average loss across all folds
    return np.mean(losses)

In [None]:
# One study per category. The objective returns a mean validation loss, hence
# direction='minimize'.
study_nd = optuna.create_study(direction='minimize')
study_dn = optuna.create_study(direction='minimize')

In [14]:
# Tune the ND model: 3 trials, each running objective() on dataset_nd.
# model_path / category_name are module-level names looked up by the lambda
# when a trial runs, so they must not be reassigned while optimize() is running.
model_path = model_path_nd
category_name = 'ND'
study_nd.optimize(lambda trial: objective(trial, dataset_nd, model_path, category_name), n_trials=3)



Starting fold 1...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.12338,0.968277,0.968163,0.968866,0.968277
2,No log,0.097493,0.974766,0.974819,0.975183,0.974766


Starting fold 2...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.086527,0.972603,0.972502,0.972518,0.972603
2,No log,0.056578,0.982696,0.982886,0.983664,0.982696


Starting fold 3...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.089354,0.974766,0.974896,0.975227,0.974766
2,No log,0.081145,0.97765,0.977739,0.978301,0.97765


Starting fold 4...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.052011,0.984848,0.984901,0.985184,0.984848
2,No log,0.026197,0.992785,0.992813,0.992935,0.992785


Starting fold 5...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.059898,0.984848,0.984968,0.985665,0.984848
2,No log,0.022515,0.992063,0.992085,0.992181,0.992063


[I 2024-07-09 07:24:34,961] Trial 0 finished with value: 0.056785577535629274 and parameters: {'learning_rate': 2.9912691559155184e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 1, 'weight_decay': 0.12046589013002564, 'warmup_steps': 131}. Best is trial 0 with value: 0.056785577535629274.


Starting fold 1...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3397,0.317128,0.927181,0.92817,0.932633,0.927181
2,0.3334,0.208237,0.94881,0.948869,0.949109,0.94881
3,0.0957,0.165383,0.963951,0.963979,0.96414,0.963951


Starting fold 2...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3879,0.367698,0.913482,0.913738,0.920301,0.913482
2,0.2964,0.209227,0.957462,0.957493,0.957972,0.957462
3,0.0833,0.171157,0.96323,0.963463,0.964533,0.96323


Starting fold 3...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3354,0.408465,0.903389,0.903542,0.908494,0.903389
2,0.3118,0.26281,0.946647,0.946749,0.947405,0.946647
3,0.1161,0.156784,0.967556,0.967673,0.96795,0.967556


Starting fold 4...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4353,0.144224,0.955988,0.956145,0.957725,0.955988
2,0.3432,0.100603,0.975469,0.975525,0.975756,0.975469
3,0.1129,0.096626,0.978355,0.97853,0.979144,0.978355


Starting fold 5...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3875,0.187177,0.94228,0.942433,0.94445,0.94228
2,0.3637,0.156376,0.965368,0.965477,0.966273,0.965368
3,0.116,0.102129,0.976912,0.976962,0.97714,0.976912


[I 2024-07-09 08:27:52,883] Trial 1 finished with value: 0.13841579854488373 and parameters: {'learning_rate': 4.1125002663135036e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 8, 'gradient_accumulation_steps': 1, 'weight_decay': 0.16900923416485533, 'warmup_steps': 158}. Best is trial 0 with value: 0.056785577535629274.


Starting fold 1...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,0.122757,0.972603,0.972714,0.973319,0.972603
1,No log,0.094673,0.973324,0.973366,0.973757,0.973324


Starting fold 2...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,0.071739,0.979813,0.979896,0.980266,0.979813
1,No log,0.089338,0.97765,0.977844,0.978671,0.97765


Starting fold 3...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,0.127962,0.972603,0.973065,0.974883,0.972603
1,No log,0.113232,0.974045,0.974014,0.974599,0.974045


Starting fold 4...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,0.043887,0.986291,0.986327,0.986846,0.986291
1,No log,0.042417,0.989177,0.989197,0.989258,0.989177


Starting fold 5...


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,0.028068,0.99062,0.990629,0.990652,0.99062
1,No log,0.034822,0.986291,0.986316,0.986396,0.986291


[I 2024-07-09 09:01:54,296] Trial 2 finished with value: 0.0700259231030941 and parameters: {'learning_rate': 3.275467105788013e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 8, 'gradient_accumulation_steps': 4, 'weight_decay': 0.29144429794914223, 'warmup_steps': 236}. Best is trial 0 with value: 0.056785577535629274.


In [15]:
# Tune the DN model: 3 trials, each running objective() on dataset_dn.
# Rebinds the module-level model_path / category_name used by the lambda.
model_path = model_path_dn
category_name = 'DN'
study_dn.optimize(lambda trial: objective(trial, dataset_dn, model_path, category_name), n_trials=3)



Starting fold 1...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,0.217241,0.939732,0.940631,0.944527,0.939732
1,No log,0.19921,0.939732,0.940654,0.944307,0.939732


Starting fold 2...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,0.262886,0.928571,0.928251,0.930227,0.928571
1,No log,0.214142,0.944196,0.944214,0.946434,0.944196


Starting fold 3...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,0.171107,0.955357,0.954945,0.955984,0.955357
1,No log,0.159083,0.959821,0.959687,0.960042,0.959821


Starting fold 4...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,0.249271,0.946429,0.946357,0.948128,0.946429
1,No log,0.201864,0.955357,0.955248,0.95579,0.955357


Starting fold 5...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,0.230694,0.941964,0.939983,0.943825,0.941964
1,No log,0.210335,0.946429,0.946131,0.946722,0.946429


[I 2024-07-09 09:14:13,300] Trial 0 finished with value: 0.19692691564559936 and parameters: {'learning_rate': 2.984189879078095e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 8, 'gradient_accumulation_steps': 3, 'weight_decay': 0.04787734758822324, 'warmup_steps': 63}. Best is trial 0 with value: 0.19692691564559936.


Starting fold 1...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.175293,0.964286,0.964426,0.965547,0.964286
2,No log,0.205358,0.953125,0.954029,0.957467,0.953125


Starting fold 2...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.267817,0.935268,0.935332,0.936293,0.935268
2,No log,0.254852,0.944196,0.943897,0.944788,0.944196


Starting fold 3...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.145861,0.959821,0.960017,0.960888,0.959821
2,No log,0.158338,0.959821,0.960042,0.961063,0.959821


Starting fold 4...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.22823,0.955357,0.955439,0.956278,0.955357
2,No log,0.212134,0.957589,0.957427,0.957987,0.957589


Starting fold 5...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.185859,0.953125,0.953098,0.953653,0.953125
2,No log,0.185557,0.957589,0.957326,0.957614,0.957589


[I 2024-07-09 09:25:35,938] Trial 1 finished with value: 0.19473922550678252 and parameters: {'learning_rate': 4.075920827603692e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 2, 'weight_decay': 0.18552122162370738, 'warmup_steps': 201}. Best is trial 1 with value: 0.19473922550678252.


Starting fold 1...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,0.178818,0.964286,0.964426,0.965547,0.964286
1,No log,0.177139,0.953125,0.953814,0.956462,0.953125
3,No log,0.232962,0.944196,0.945031,0.948447,0.944196


Starting fold 2...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,0.2739,0.939732,0.939833,0.941217,0.939732
1,No log,0.258681,0.941964,0.941614,0.942415,0.941964
3,No log,0.265777,0.941964,0.941846,0.943346,0.941964


Starting fold 3...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,0.143591,0.959821,0.959942,0.96049,0.959821
1,No log,0.151264,0.964286,0.964322,0.965176,0.964286
3,No log,0.153183,0.957589,0.957502,0.958603,0.957589


Starting fold 4...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,0.22113,0.957589,0.957749,0.958454,0.957589
1,No log,0.211238,0.957589,0.957384,0.958031,0.957589
3,No log,0.201132,0.959821,0.959766,0.960572,0.959821


Starting fold 5...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
0,No log,0.191163,0.955357,0.955279,0.955892,0.955357
1,No log,0.199166,0.950893,0.950897,0.95159,0.950893
3,No log,0.216001,0.948661,0.947704,0.948797,0.948661


[I 2024-07-09 09:49:36,360] Trial 2 finished with value: 0.19312008917331697 and parameters: {'learning_rate': 1.1738603684059503e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 8, 'gradient_accumulation_steps': 3, 'weight_decay': 0.12441406961535009, 'warmup_steps': 379}. Best is trial 2 with value: 0.19312008917331697.


In [17]:
# Trial with the lowest objective value (mean validation loss) in the ND study.
best_trial_nd = study_nd.best_trial

In [18]:
# Trial with the lowest objective value (mean validation loss) in the DN study.
best_trial_dn = study_dn.best_trial

In [19]:
# Report the objective value and sampled hyperparameters of each best trial.
print(f"Best trial ND: Value: {best_trial_nd.value}, Params: {best_trial_nd.params}")
print(f"Best trial DN: Value: {best_trial_dn.value}, Params: {best_trial_dn.params}")

Best trial ND: Value: 0.056785577535629274, Params: {'learning_rate': 2.9912691559155184e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 1, 'weight_decay': 0.12046589013002564, 'warmup_steps': 131}
Best trial DN: Value: 0.19312008917331697, Params: {'learning_rate': 1.1738603684059503e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 8, 'gradient_accumulation_steps': 3, 'weight_decay': 0.12441406961535009, 'warmup_steps': 379}


# Final Training

In [None]:
def train_with_best_params(best_params, category_name, model_path, train_dataset):
    """Train with the tuned hyperparameters over a 5-fold split.

    For every fold a fresh model is loaded from ``model_path`` and trained on
    the fold's training subset; ``BestModelSaverCallback`` writes the model
    and tokenizer to the output directory whenever the fold's validation loss
    improves and stops the fold after 3 evaluations without improvement.

    Args:
        best_params: Mapping of tuned hyperparameters. ``learning_rate``,
            ``per_device_train_batch_size``, ``num_train_epochs`` and
            ``gradient_accumulation_steps`` are required; ``weight_decay``
            and ``warmup_steps`` are used when present and otherwise fall
            back to ``0.01`` and ``0``.
        category_name: Suffix of the output directory.
        model_path: Checkpoint passed to
            ``XLMRobertaForSequenceClassification.from_pretrained``.
        train_dataset: Indexable dataset that is split into folds.
    """
    training_args = TrainingArguments(
        output_dir=f'/content/drive/My Drive/Research/SentimentAnalysisDivorce/Models/{category_name}',
        evaluation_strategy='epoch',
        save_strategy='no',
        learning_rate=best_params['learning_rate'],
        per_device_train_batch_size=best_params['per_device_train_batch_size'],
        per_device_eval_batch_size=16,
        num_train_epochs=best_params['num_train_epochs'],
        # Use the tuned values when the study provides them.
        weight_decay=best_params.get('weight_decay', 0.01),
        warmup_steps=best_params.get('warmup_steps', 0),
        gradient_accumulation_steps=best_params['gradient_accumulation_steps'],
        # Must stay False: with save_strategy='no' there are no checkpoints to
        # reload, and TrainingArguments rejects load_best_model_at_end=True
        # when the save and evaluation strategies differ. The best weights are
        # written to output_dir by BestModelSaverCallback instead.
        load_best_model_at_end=False,
        metric_for_best_model='eval_loss',
        greater_is_better=False,
        fp16=True,  # Use mixed precision training
    )

    # Initialize K-fold cross-validator
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    # The collator does not depend on the fold, so build it once.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer_xlm)

    # Perform K-Fold cross-validation and train final model
    # NOTE(review): every fold saves into the same output_dir, so a later fold
    # overwrites what an earlier one saved; if model_path equals output_dir,
    # later folds also start from the overwritten weights — confirm intended.
    for fold, (train_index, eval_index) in enumerate(kf.split(train_dataset)):
        print(f"Starting fold {fold+1}...")
        train_subset = torch.utils.data.Subset(train_dataset, train_index)
        eval_subset = torch.utils.data.Subset(train_dataset, eval_index)

        model = XLMRobertaForSequenceClassification.from_pretrained(model_path, num_labels=7)

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_subset,
            eval_dataset=eval_subset,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )
        # The callback needs the trainer, so it can only be registered after
        # the trainer exists (referencing `trainer` inside the Trainer(...)
        # call raised UnboundLocalError).
        trainer.add_callback(BestModelSaverCallback(
            trainer=trainer,
            tokenizer=tokenizer_xlm,
            patience=3,
            output_dir=training_args.output_dir
        ))

        # A single train() call already runs num_train_epochs epochs (with an
        # evaluation after each), so it must not be repeated per epoch.
        try:
            trainer.train()
        finally:
            # Clear cache, also when training was interrupted or failed
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()