<a href="https://colab.research.google.com/github/claudiarichardxx/Decoding-Personality-Types-from-Text-using-Myers-Briggs-Dimensions/blob/main/setup/modelTraining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installs and imports

In [None]:
%%capture
!pip install -U accelerate
!pip install -U transformers
!pip install iterative-stratification

In [None]:
import pandas as pd
import numpy as np
from huggingface_hub import notebook_login
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import TrainingArguments, Trainer
from sklearn.metrics import f1_score, roc_auc_score
from transformers import EvalPrediction
from datasets import load_dataset
from huggingface_hub import notebook_login
import torch

# Download the data

In [None]:
dataset = load_dataset("ClaudiaRichard/mbti_classification_v2")

# `labels`, `id2label` and `label2id` are read by later cells (encoding and
# model construction) but were never defined in this notebook, so a fresh
# "Restart & Run All" stopped with a NameError. Every column other than the
# raw text ("post") is one MBTI dimension label.
# NOTE(review): label order follows the dataset's column order -- confirm it
# matches the order used by any previously published checkpoint.
labels = [name for name in dataset["train"].column_names if name != "post"]
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
# Display the available splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['I/E', 'N/S', 'T/F', 'J/P', 'post'],
        num_rows: 95166
    })
    test: Dataset({
        features: ['I/E', 'N/S', 'T/F', 'J/P', 'post'],
        num_rows: 38067
    })
    validation: Dataset({
        features: ['I/E', 'N/S', 'T/F', 'J/P', 'post'],
        num_rows: 25377
    })
})

# Encoding the data

In [None]:
def preprocess_data(examples):
    """Tokenize a batch of posts and attach their multi-label targets.

    Relies on the notebook-level ``tokenizer`` and ``labels`` objects.

    Args:
        examples: batch mapping of column name -> list of values. It must
            contain "post" and one column for every entry of ``labels``.

    Returns:
        The tokenizer output with an added "labels" entry: one list of
        floats per post, with values ordered like ``labels``.
    """
    posts = examples["post"]
    # Every post is padded or truncated to exactly 128 tokens.
    encoded = tokenizer(posts, padding="max_length", truncation=True, max_length=128)

    # Target matrix of shape (batch_size, num_labels), filled column by column.
    targets = np.zeros((len(posts), len(labels)))
    for column, name in enumerate(labels):
        targets[:, column] = examples[name]

    encoded["labels"] = targets.tolist()
    return encoded

In [None]:
# Candidate checkpoints; index 2 is the same base model that is loaded for
# fine-tuning in the "Model Training" section.
mode = ['ClaudiaRichard/mbti-bert-nli-finetuned', 'ClaudiaRichard/bert-finetuned-sem_eval-english','sentence-transformers/bert-base-nli-mean-tokens']
# The tokenizer takes no device argument; the previous stray positional
# `device` (a name never defined in this notebook) raised a NameError.
tokenizer = AutoTokenizer.from_pretrained(mode[2])

In [None]:
# Encode every split in batches and drop the original columns, so only the
# fields returned by `preprocess_data` remain.
encoded_dataset = dataset.map(
    preprocess_data,
    batched=True,
    remove_columns=dataset['train'].column_names,
)

Map:   0%|          | 0/95166 [00:00<?, ? examples/s]

Map:   0%|          | 0/38067 [00:00<?, ? examples/s]

Map:   0%|          | 0/25377 [00:00<?, ? examples/s]

In [None]:
# Pull one encoded training example and list the fields it contains.
example = encoded_dataset['train'][50]
print(example.keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])


In [None]:
# Decode the token ids back to text to sanity-check tokenization and padding.
tokenizer.decode(example['input_ids'])

"[CLS] two months? i wouldn't be crazy about the idea. if you are really his best employee, then that's what may be cooking him. who wants their most reliable asset gone for that long? entj employer... [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]"

In [None]:
# Return PyTorch tensors when examples are read from the encoded dataset.
encoded_dataset.set_format("torch")
# Number of label columns; also used as `num_labels` when the model is built.
len(labels)

4

# Model Training

In [None]:
# Load the base checkpoint with a multi-label classification head: one output
# per entry of `labels`, with id <-> name mappings stored in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    "sentence-transformers/bert-base-nli-mean-tokens",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
    problem_type="multi_label_classification",
)

In [None]:
batch_size = 8
# Key of the metric (as returned by `compute_metrics`) used to pick the best
# checkpoint.
metric_name = "f1"

args = TrainingArguments(
    # Output directory for checkpoints (plain string: there is nothing to
    # interpolate, so no f-string is needed).
    "mbti-bert-nli-finetuned_v2",
    # `evaluation_strategy` is the deprecated spelling of this argument; the
    # notebook installs the latest transformers, which expects `eval_strategy`.
    eval_strategy="epoch",
    # Must match the evaluation schedule because load_best_model_at_end=True.
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=2,
    # Mixed precision needs a CUDA device; fall back to full precision on CPU
    # instead of failing at construction time.
    fp16=torch.cuda.is_available(),
    weight_decay=0.01,
    # Effective train batch per device = batch_size * 2.
    gradient_accumulation_steps=2,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

In [None]:
class MBTITrainer(Trainer):
    """Trainer that scores multi-label targets with BCE-with-logits loss."""

    def __init__(self, *args, class_weights = None, **kwargs):
        """Build the trainer and its loss function.

        Args:
            *args: positional arguments forwarded unchanged to ``Trainer``.
            class_weights: optional tensor passed as ``weight`` to
                ``torch.nn.BCEWithLogitsLoss``; it is moved to the training
                device first. ``None`` gives an unweighted loss.
            **kwargs: keyword arguments forwarded unchanged to ``Trainer``.
        """
        super().__init__(*args, **kwargs)
        if class_weights is not None:
            class_weights = class_weights.to(self.args.device)

        self.loss_fct = torch.nn.BCEWithLogitsLoss(weight=class_weights)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the BCE-with-logits loss for one batch.

        Args:
            model: the model being trained (possibly wrapped, e.g. for
                multi-GPU execution).
            inputs: batch dict; its "labels" entry is removed before the
                remaining entries are passed to the model.
            return_outputs: when true, return ``(loss, outputs)``.
            num_items_in_batch: accepted because newer ``Trainer`` versions
                pass it to ``compute_loss``; unused, since the loss is
                already averaged over the batch.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # Read the label count from the logits rather than `model.num_labels`:
        # that attribute is missing on wrapped models, and the old
        # `except AttributeError` fallback compared (N, C) logits against
        # flattened, non-float labels, which the loss rejects.
        num_labels = logits.shape[-1]
        loss = self.loss_fct(
            logits.view(-1, num_labels),
            labels.view(-1, num_labels).float(),
        )

        return (loss, outputs) if return_outputs else loss


In [None]:
def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute micro-averaged F1 and ROC AUC for multi-label predictions.

    Args:
        predictions: array-like of raw model logits, one column per label.
        labels: array-like of 0/1 ground-truth values, same shape as
            ``predictions``.
        threshold: probability at or above which a label counts as predicted.

    Returns:
        dict with keys ``'f1'`` and ``'roc_auc'``.
    """
    logits = np.asarray(predictions, dtype=np.float64)
    # The model outputs logits, so map them to probabilities before applying
    # a probability threshold (thresholding raw logits at 0.5 corresponds to
    # a probability cut-off of about 0.62). Numerically stable sigmoid:
    # sigmoid(x) = exp(-log(1 + exp(-x))).
    probabilities = np.exp(-np.logaddexp(0.0, -logits))
    y_pred = (probabilities >= threshold).astype(np.float32)

    # Compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    # ROC AUC ranks examples by score, so it needs the continuous
    # probabilities; hard 0/1 predictions collapse the curve to one point.
    roc_auc = roc_auc_score(y_true, probabilities, average='micro')

    # Return metrics as a dictionary
    metrics = {'f1': f1_micro_average, 'roc_auc': roc_auc}
    return metrics

def compute_metrics(p: EvalPrediction):
    """Adapt an ``EvalPrediction`` to ``multi_label_metrics``.

    When ``p.predictions`` is a tuple, only its first element is scored;
    otherwise the predictions are used as they are.

    Returns:
        The metrics dictionary produced by ``multi_label_metrics``.
    """
    raw_predictions = p.predictions
    if isinstance(raw_predictions, tuple):
        raw_predictions = raw_predictions[0]
    return multi_label_metrics(predictions=raw_predictions, labels=p.label_ids)

In [None]:
# Wire the model, training configuration, data splits and metric callback
# together. No `class_weights` are passed, so the loss is unweighted.
trainer = MBTITrainer(
    model=model,
    args=args,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
)


In [None]:
# Release cached, currently unused GPU memory before training starts.
torch.cuda.empty_cache()
# Fine-tune the model; evaluation and checkpointing follow the schedule in `args`.
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Roc Auc
1,0.5413,0.538476,0.572146,0.676641


Epoch,Training Loss,Validation Loss,F1,Roc Auc
1,0.5413,0.538476,0.572146,0.676641
2,0.5046,0.542942,0.523148,0.654622


TrainOutput(global_step=11896, training_loss=0.5312848633058775, metrics={'train_runtime': 2172.377, 'train_samples_per_second': 87.615, 'train_steps_per_second': 5.476, 'total_flos': 1.2519838164307968e+16, 'train_loss': 0.5312848633058775, 'epoch': 2.0})

In [None]:
# Score the trained model on the test split.
trainer.evaluate(eval_dataset=encoded_dataset['test'])

In [None]:
# Log in to the Hugging Face Hub from the notebook before uploading.
notebook_login()

In [None]:
# The first argument of `Trainer.push_to_hub` is the commit message, not a
# repository id, so the previous call used the dataset id as the commit text.
# The target repository comes from the TrainingArguments (`hub_model_id`, or
# the output directory name when that is unset).
trainer.push_to_hub(commit_message="Add MBTI classifier fine-tuned from bert-base-nli-mean-tokens")