In [1]:
import evaluate
import numpy as np
import pandas as pd

from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding, TrainingArguments, AutoModelForSequenceClassification, Trainer

In [3]:
# Load the train/test splits from the prepared CSV files. The loaded splits
# expose `text` and `label` columns, so no label column has to be attached by
# hand (an earlier version of this cell read the raw text files and added the
# labels by position).
datasets = load_dataset(
    "csv",
    data_files=dict(
        train="../data/stack_overflow_16k/train/train.csv",
        test="../data/stack_overflow_16k/test/test.csv",
    ),
)
datasets

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 8000
    })
})

In [5]:
# Look at a single training record to confirm the `text` / `label` layout.
datasets["train"][1]

{'text': 'blank bins every 20 minutes i have a time field that goes from 07:00 to 21:00. i want to make bins of 20 minutes, is there something like this in blank:..07:00 - 07:20.07:20 - 07:40.07:40 - 08:00.08:00 - 08:20.08:20 - 08:40.08:40 - 09:00.09:00 - 09:20.09:20 - 09:40.09:40 - 10:00',
 'label': 0}

In [6]:
# Check which class an individual split is before calling split-level methods on it.
type(datasets["test"])

datasets.arrow_dataset.Dataset

In [7]:
# Split the loaded test set in half (shuffled, fixed seed for reproducibility).
# The result is keyed "train" / "test"; those halves are used as the validation
# and test sets respectively.
val_test_splits = datasets["test"].train_test_split(test_size=0.5, shuffle=True, seed=42)
val_test_splits

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 4000
    })
})

In [8]:
# Register the two halves under their final names: the "train" half becomes the
# validation set and the "test" half replaces the original test set.
# NOTE: this overwrites datasets["test"] in place, so re-running the
# train_test_split cell after this one would split the already-halved test set.
datasets["validation"] = val_test_splits["train"]
datasets["test"] = val_test_splits["test"]
datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 4000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 4000
    })
})

In [9]:
# Name of the pretrained checkpoint; the tokenizer is loaded from it here and
# the same name is reused when the model is instantiated.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [10]:
def tokenize_function(example):
    """Tokenize the ``text`` field of ``example`` with the notebook's tokenizer.

    Truncation is enabled; no padding argument is passed here, so padding is
    left to the data collator at batching time.
    """
    texts = example["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

# batched=True hands tokenize_function batches of examples rather than one
# example at a time.
tokenized_datasets = datasets.map(tokenize_function, batched=True)

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

In [11]:
# Inspect the columns that tokenization added to each split.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4000
    })
})

A _collate_ function applies the right amount of padding to the items of the dataset we want to batch together: each batch is padded only up to the length of its longest sample (dynamic padding). Without dynamic padding, all of the samples would have to be padded to the maximum length in the whole dataset, or to the maximum length the model can accept.

`DataCollatorWithPadding` takes a tokenizer when instantiated so that it knows which padding token to use, and whether the model expects padding to be on the left or on the right of the inputs.

In [12]:
# Collator that pads the examples of a batch using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [13]:
# Training configuration: 10 epochs, with evaluation and a checkpoint at the
# end of every epoch.
# NOTE(review): newer transformers releases spell `evaluation_strategy` as
# `eval_strategy` - confirm against the installed version before upgrading.
training_args = TrainingArguments(
    # Where checkpoints and trainer state are written.
    output_dir="stackoverflow-classifier-training-dir",
    # Schedule length and batch sizes.
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Per-epoch checkpointing and evaluation.
    save_strategy="epoch",
    evaluation_strategy="epoch",
)

In [14]:
# Load the pretrained encoder from `checkpoint` with a sequence-classification
# head sized for 4 labels (num_labels=4).
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=4)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


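The warning appears because the `bert-base-uncased` checkpoint was not pretrained for sequence classification: the pretraining head has been discarded and a new, randomly initialized classification head (`classifier.weight`, `classifier.bias`) with four outputs has been added instead. Fine-tuning on our labelled data is what trains this new head.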

In [15]:
# Metric objects are loaded lazily and cached here, so every evaluation pass
# reuses them instead of calling evaluate.load() again.
_METRIC_CACHE = {}


def _get_metric(*load_args):
    """Return the `evaluate` metric for `load_args`, loading it on first use only."""
    if load_args not in _METRIC_CACHE:
        _METRIC_CACHE[load_args] = evaluate.load(*load_args)
    return _METRIC_CACHE[load_args]


def _softmax(logits):
    """Turn an array of per-class logits into row-wise probabilities."""
    logits = np.asarray(logits, dtype=np.float64)
    # Subtract the row maximum before exponentiating for numerical stability.
    shifted = logits - logits.max(axis=-1, keepdims=True)
    exp_scores = np.exp(shifted)
    return exp_scores / exp_scores.sum(axis=-1, keepdims=True)


def compute_metrics(eval_preds):
    """Compute accuracy, macro precision/recall and one-vs-rest ROC AUC.

    Args:
        eval_preds: A ``(logits, labels)`` pair, where ``logits`` holds one
            score per class for each sample and ``labels`` holds the integer
            class ids.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and
        ``roc_auc``.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    metrics = _get_metric("accuracy").compute(predictions=predictions, references=labels)

    # The task has more than two classes, so precision/recall need an explicit
    # averaging mode; the default ("binary") raises on multiclass targets.
    metrics.update(
        _get_metric("precision").compute(
            predictions=predictions, references=labels, average="macro"
        )
    )
    metrics.update(
        _get_metric("recall").compute(
            predictions=predictions, references=labels, average="macro"
        )
    )

    # ROC AUC is defined on scores, not hard labels: use the multiclass config
    # and feed it class probabilities derived from the logits.
    metrics.update(
        _get_metric("roc_auc", "multiclass").compute(
            prediction_scores=_softmax(logits),
            references=labels,
            multi_class="ovr",
        )
    )

    return metrics

In [16]:
# Wire the model, training configuration, data splits, collator and metric
# function into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
)

This will start the fine-tuning.

In [17]:
# Launch fine-tuning with the Trainer built in the previous cell.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss



KeyboardInterrupt

