Import the preprocessed ISOT dataset

In [33]:
from datasets import load_dataset

dataset = load_dataset("csv", data_files={'train': "preprocessed_isot/isot_train.csv", 'test': "preprocessed_isot/isot_test.csv", 'valid': "preprocessed_isot/isot_valid.csv"})
print(dataset)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating valid split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['statement', 'label'],
        num_rows: 31716
    })
    test: Dataset({
        features: ['statement', 'label'],
        num_rows: 4351
    })
    valid: Dataset({
        features: ['statement', 'label'],
        num_rows: 4353
    })
})


Pick the pre-trained model

In [34]:
model_name = "albert/albert-base-v2"
your_path = 'isot_new_results'

Look over the label distribution

In [35]:
from collections import Counter

train_label_distribution = Counter(dataset['train']['label'])
test_label_distribution = Counter(dataset['test']['label'])

print("Training Label Distribution:", train_label_distribution)
print("Test Label Distribution:", test_label_distribution)

Training Label Distribution: Counter({True: 16992, False: 14724})
Test Label Distribution: Counter({False: 2214, True: 2137})


Labels in their original form are strings (true/false). We need to convert them to numerical values so they can be processed by the model.

In [36]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

label_encoder.fit(dataset['train']['label'])

def encode_labels(example):
    return {'encoded_label': label_encoder.transform([example['label']])[0]}

for split in dataset:
    dataset[split] = dataset[split].map(encode_labels, batched=False)

Map:   0%|          | 0/31716 [00:00<?, ? examples/s]

Map:   0%|          | 0/4351 [00:00<?, ? examples/s]

Map:   0%|          | 0/4353 [00:00<?, ? examples/s]

The id2label and label2id mappings in AutoConfig are used to inform the model of the specific label-to-ID mappings so we can get the actual label names rather than the numerical reps when we do inference with the model.

In [37]:
from transformers import AutoConfig

unique_labels = sorted(list(set(dataset['train']['label'])))
id2label = {i: label for i, label in enumerate(unique_labels)}
label2id = {label: i for i, label in enumerate(unique_labels)}

config = AutoConfig.from_pretrained(model_name)
config.id2label = id2label
config.label2id = label2id


Verify the correct labels are being used

In [38]:
print("ID to Label Mapping:", config.id2label)
print("Label to ID Mapping:", config.label2id)

ID to Label Mapping: {0: False, 1: True}
Label to ID Mapping: {False: 0, True: 1}


Now, we need to tokenize the text data so it can be processed by the model. We will use the tokenizer that corresponds to the model we are using from Hugging Face's Transformers library.

In [39]:
from transformers import AlbertForSequenceClassification, AlbertTokenizer

tokenizer = AlbertTokenizer.from_pretrained(model_name)
model = AlbertForSequenceClassification.from_pretrained(model_name, config=config)


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


We need to filter out any examples that have invalid content (e.g. empty strings) before tokenizing the data. Then, we will encode the labels and tokenize the text data. Tokenizing means converting the text data into numerical representations that can be processed by the model.

In [40]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['statement', 'label', 'encoded_label'],
        num_rows: 31716
    })
    test: Dataset({
        features: ['statement', 'label', 'encoded_label'],
        num_rows: 4351
    })
    valid: Dataset({
        features: ['statement', 'label', 'encoded_label'],
        num_rows: 4353
    })
})


In [41]:
def filter_invalid_content(example):
    return isinstance(example['statement'], str)

dataset = dataset.filter(filter_invalid_content, batched=False)

def encode_data(batch):
    tokenized_inputs = tokenizer(batch["statement"], padding="max_length", truncation=True, max_length=256)
    tokenized_inputs["labels"] = batch["encoded_label"]
    return tokenized_inputs

dataset_encoded = dataset.map(encode_data, batched=True)

Filter:   0%|          | 0/31716 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4351 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4353 [00:00<?, ? examples/s]

Map:   0%|          | 0/31716 [00:00<?, ? examples/s]

Map:   0%|          | 0/4351 [00:00<?, ? examples/s]

Map:   0%|          | 0/4353 [00:00<?, ? examples/s]

Verify the data is tokenized correctly:

In [42]:
print(dataset_encoded)

DatasetDict({
    train: Dataset({
        features: ['statement', 'label', 'encoded_label', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 31716
    })
    test: Dataset({
        features: ['statement', 'label', 'encoded_label', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 4351
    })
    valid: Dataset({
        features: ['statement', 'label', 'encoded_label', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 4353
    })
})


In [43]:
dataset_encoded.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

The DataCollatorWithPadding ensures that all input sequences in a batch are padded to the same length, using the padding logic defined by the tokenizer. This is necessary because the model can only process inputs of the same length.

In [44]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer)

Next we'll set up LabelEncoder to encode labels and defines a function to compute per-label accuracy from a confusion matrix, providing label-specific accuracy metrics. I.e. when we train the model we want to see the accuracy metrics per label as well as the average metrics. This is more relevant if you have more than two labels, and one is underperforming. 

In [45]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import  confusion_matrix
import numpy as np

label_encoder = LabelEncoder()
label_encoder.fit(unique_labels)

def per_label_accuracy(y_true, y_pred, labels):
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    correct_predictions = cm.diagonal()
    label_totals = cm.sum(axis=1)
    per_label_acc = np.divide(correct_predictions, label_totals, out=np.zeros_like(correct_predictions, dtype=float), where=label_totals != 0)
    return dict(zip(labels, per_label_acc))

Compute the following metrics: accuracy, precision, recall, f1 score, and per-label accuracy.

In [46]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    decoded_labels = label_encoder.inverse_transform(labels)
    decoded_preds = label_encoder.inverse_transform(preds)

    precision = precision_score(decoded_labels, decoded_preds, average='weighted')
    recall = recall_score(decoded_labels, decoded_preds, average='weighted')
    f1 = f1_score(decoded_labels, decoded_preds, average='weighted')
    acc = accuracy_score(decoded_labels, decoded_preds)

    labels_list = list(label_encoder.classes_)
    per_label_acc = per_label_accuracy(decoded_labels, decoded_preds, labels_list)

    per_label_acc_metrics = {}
    for label, accuracy in per_label_acc.items():
        label_key = f"accuracy_label_{label}"
        per_label_acc_metrics[label_key] = accuracy

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        **per_label_acc_metrics
    }

Next, the training begins. Training loss and validation loss should decrease consistenly. If the training loss is decreasing but the validation loss is increasing, the model is overfitting. If both are increasing, the model is underfitting.

In [47]:
print(dataset_encoded['train'][0])


{'input_ids': tensor([    2,  1529,   136,    13,     5,   139, 21458,    18,     6,    13,
            8,    14,  2122,    26,  4397,    16,    40,  6559,    30,   841,
         4541,    19,  1529,   136,  1272,    27,  8885,    28,  3655,  2004,
         5863,    14,   358,    16,    14,   236,   840,   167,    20,    44,
         2863,  2550,    14, 19541,    16,  7355,  1374,     9,  2454,  2797,
          789, 11556,  1232,    58,    87,    65,    14,  2576,    41,    74,
         5863,    37,    14,  8435,    16,    40,   488,   353,    19,    14,
           71,  5093, 12254,   256,    16,    14,  1057,     9,    14,  2122,
           35,    89,  1374,    30,  7355,  1272,   238,   509,  1464,     9,
           19,   600,   203,  3680,   148,   440,    19,    14, 11131,     9,
          732,  6559,    15,    14,   127,  9389,    19,    21,  2782,    15,
           29,   557,    81,     8, 17124,    16,    14, 24064,    19,    14,
         1057,     9,  1201,    30,  1617,    15, 

In [48]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./output',
    num_train_epochs=1,
    warmup_steps=3000,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    logging_steps=10,
    eval_strategy='steps',
    eval_steps=100,
    learning_rate=2e-5,
    save_steps=1000,
    gradient_accumulation_steps=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_encoded['train'],
    eval_dataset=dataset_encoded['test'],
    compute_metrics=compute_metrics,
    processing_class=tokenizer,
    data_collator=data_collator,
)

trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Accuracy Label False,Accuracy Label True
100,0.7743,0.356207,0.988279,0.988279,0.988423,0.988279,0.980126,0.996724
200,0.1108,0.040462,0.998621,0.998621,0.998625,0.998621,1.0,0.997192
300,0.009,0.007979,0.998621,0.998621,0.998625,0.998621,1.0,0.997192
400,0.0302,0.002679,0.99954,0.99954,0.999541,0.99954,1.0,0.999064
500,0.0017,0.0013,0.99977,0.99977,0.99977,0.99977,1.0,0.999532
600,0.0061,0.001082,0.99977,0.99977,0.99977,0.99977,1.0,0.999532
700,0.0009,0.001634,0.99977,0.99977,0.99977,0.99977,1.0,0.999532
800,0.0003,0.001841,0.99954,0.99954,0.999541,0.99954,1.0,0.999064
900,0.0002,0.000549,0.99977,0.99977,0.99977,0.99977,1.0,0.999532


TrainOutput(global_step=991, training_loss=0.15483132867879346, metrics={'train_runtime': 1271.4204, 'train_samples_per_second': 24.945, 'train_steps_per_second': 0.779, 'total_flos': 378928301506560.0, 'train_loss': 0.15483132867879346, 'epoch': 0.9994957135653051})

Save the results, the model and the state of the model.

In [49]:
trainer.evaluate(dataset_encoded['valid'])
trainer.save_model(your_path)
trainer.save_state()