# Dependency Installation and Repository Cloning

### Run this cell only if you are using this notebook in Google Colab. It restarts the kernel when it finishes, so continue from the next cell afterwards.

In [None]:
# Fetch the course repository and switch to the homework branch and directory.
!git clone 'https://github.com/dakopecky/nlp-course-itmo.git'

%cd nlp-course-itmo
!git checkout hw6
%cd hw6

# Install the project's dependencies with Poetry.
!pip install poetry
# Drop torch from the Poetry project before installing
# (presumably so the torch build already present in Colab is kept; TODO confirm).
!poetry remove torch
# Disable virtualenv creation so packages are installed into the active environment.
!poetry config virtualenvs.create false
!poetry install --no-ansi
# Pin exact versions of the Hugging Face stack.
!pip install transformers=="4.35.0" datasets=="2.14.6" accelerate=="0.24.1"

# Restart the kernel (the argument to do_shutdown is the restart flag);
# presumably so the freshly installed package versions are the ones imported below.
import IPython
IPython.Application.instance().kernel.do_shutdown(True)

# Fine-Tuning and Evaluating a DistilBERT Model for Text Classification

Import the dependencies used throughout the notebook.

In [None]:
# This code includes software developed by the following open-source projects:
# - numpy (License: BSD-3-Clause license, Authors: NumPy Developers)
# - pandas (License: BSD-3-Clause License, Authors: Pandas Development Team)
# - datasets (License: Apache License 2.0, Authors: Hugging Face Inc.)
# - transformers (License: Apache License 2.0, Authors: Hugging Face Inc.)
# - accelerate (License: Apache License 2.0, Authors: Hugging Face Inc.)
# - scikit-learn (License: BSD License, Authors: scikit-learn Developers)
# - Jupyter Notebook (License: Modified BSD License, Authors: Project Jupyter)
# For the full license information, please see the `licenses` directory.


import numpy as np
import pandas as pd
from datasets import load_dataset
from IPython.display import display
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from transformers import (
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
    Trainer,
    TrainingArguments,
    set_seed,
)

Define the random seed used to make the results reproducible.

In [None]:
# Seed for reproducibility; passed as `seed` to TrainingArguments in the fine-tuning cell.
RANDOM_STATE = 42

## Preparing data

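Load the "AG News" dataset, take the first 10,000 training examples and the first 2,000 test examples, and tokenize them for DistilBERT.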

In [None]:
# Sizes of the subsets used for fine-tuning / evaluation and the fixed token length.
N_TRAIN_EXAMPLES = 10000
N_TEST_EXAMPLES = 2000
MAX_SEQ_LEN = 256
# Columns exposed to the model as torch tensors.
TORCH_COLUMNS = ['input_ids', 'attention_mask', 'label']

dataset = load_dataset("ag_news")
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")


def encode_batch(batch):
    """Tokenize the 'text' field of a batch.

    Every example is truncated and padded to exactly MAX_SEQ_LEN tokens.
    """
    return tokenizer(
        batch['text'],
        padding='max_length',
        truncation=True,
        max_length=MAX_SEQ_LEN,
    )


# Take the leading examples of each split, then tokenize them in batches.
train_subset = dataset['train'].select(range(N_TRAIN_EXAMPLES))
train_data = train_subset.map(encode_batch, batched=True)

test_subset = dataset['test'].select(range(N_TEST_EXAMPLES))
test_data = test_subset.map(encode_batch, batched=True)

# Expose only the model inputs and the label, as torch tensors.
for tokenized_split in (train_data, test_data):
    tokenized_split.set_format(type='torch', columns=TORCH_COLUMNS)

## Fine-tuning

In [None]:
# Seed the RNGs before the model is built. The classification head
# (pre_classifier / classifier) is randomly initialized, and the `seed` given to
# TrainingArguments is only applied once the Trainer is constructed, i.e. after
# these weights already exist.
set_seed(RANDOM_STATE)
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=4)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=4,  # effective train batch: 32 * 4 = 128 per device
    # 10,000 examples / 128 per optimizer step * 3 epochs is only ~234 steps, so a
    # fixed 500-step warmup would never finish and the learning rate would never
    # reach its peak. A ratio scales the warmup with the actual run length.
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    do_train=True,
    do_eval=True,
    use_cpu=False,
    seed=RANDOM_STATE,
    # Evaluate and checkpoint every 100 optimizer steps.
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=100,
)

def compute_metrics(p):
    """Compute classification accuracy for the Trainer.

    Args:
        p: A (predictions, labels) pair; predictions hold one score per class
           along axis 1.

    Returns:
        A dict with a single "accuracy" entry: the fraction of rows whose
        highest-scoring class equals the label.
    """
    logits, labels = p
    predicted = np.argmax(logits, axis=1)
    hits = predicted == labels
    return {"accuracy": np.mean(hits)}

# Wire the model, hyper-parameters, datasets and metric callback together.
# NOTE: the test subset doubles as the periodic evaluation set during training.
trainer = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=test_data,
    train_dataset=train_data,
    args=training_args,
    model=model,
)

# Run fine-tuning; as the cell's last expression, the returned TrainOutput is displayed.
trainer.train()

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
100,1.1929,0.598452,0.8635
200,0.4028,0.320334,0.9015


TrainOutput(global_step=234, training_loss=0.7235767168876452, metrics={'train_runtime': 649.7852, 'train_samples_per_second': 46.169, 'train_steps_per_second': 0.36, 'total_flos': 1981782965944320.0, 'train_loss': 0.7235767168876452, 'epoch': 2.99})

## Evaluation

In [None]:
# Seed before loading so the randomly initialized classification head of the
# "Before fine-tuning" baseline is identical on every run.
set_seed(RANDOM_STATE)

# Models to compare, keyed by the heading printed above their results.
models = {
    'Before fine-tuning': DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=4),
    # Last checkpoint written during training (save_steps=100 over a 234-step run);
    # update this path if the training schedule changes.
    'After fine-tuning': DistilBertForSequenceClassification.from_pretrained("./results/checkpoint-200", num_labels=4)
}

def compute_metrics(p):
    """Compute accuracy and macro-averaged F1 for the final evaluation.

    This redefinition replaces the accuracy-only version used during training.

    Args:
        p: A (predictions, labels) pair; predictions hold one score per class
           along axis 1.

    Returns:
        A dict with "accuracy" and "f1" (macro average) entries.
    """
    logits, labels = p
    predicted = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_score(labels, predicted),
        "f1": f1_score(labels, predicted, average='macro'),
    }

for model_name, eval_model in models.items():
    # Dedicated names keep the fine-tuning `model` / `trainer` objects from the
    # training cell from being overwritten by this loop.
    eval_trainer = Trainer(model=eval_model, compute_metrics=compute_metrics)

    # A single predict() pass returns the logits, the labels and the metrics from
    # compute_metrics (keys prefixed with "test_"), so a separate evaluate() pass
    # over the same data is not needed.
    predictions = eval_trainer.predict(test_data)
    true_labels = predictions.label_ids
    pred_labels = np.argmax(predictions.predictions, axis=1)

    print(f"\n{model_name}:")

    metrics_df = pd.DataFrame({
        'Metric': ['Accuracy', 'F1 Score'],
        'Value': [predictions.metrics['test_accuracy'], predictions.metrics['test_f1']]
    })
    display(metrics_df)

    # Rows are true classes, columns are predicted classes.
    print("\nConfusion Matrix:")
    display(pd.DataFrame(confusion_matrix(true_labels, pred_labels)))
    # zero_division=0 reports 0 instead of warning for classes that are never predicted.
    print("\nClassification Report:")
    display(pd.DataFrame.from_dict(classification_report(true_labels, pred_labels, output_dict=True, zero_division=0)).T)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Before fine-tuning:


Unnamed: 0,Metric,Value
0,Accuracy,0.2405
1,F1 Score,0.130206



Confusion Matrix:


Unnamed: 0,0,1,2,3
0,1,2,498,10
1,2,2,479,43
2,0,0,436,13
3,0,5,467,42



Classification Report:


Unnamed: 0,precision,recall,f1-score,support
0,0.333333,0.001957,0.003891,511.0
1,0.222222,0.003802,0.007477,526.0
2,0.231915,0.971047,0.37441,449.0
3,0.388889,0.081712,0.135048,514.0
accuracy,0.2405,0.2405,0.2405,0.2405
macro avg,0.29409,0.26463,0.130206,2000.0
weighted avg,0.29562,0.2405,0.121723,2000.0



After fine-tuning:


Unnamed: 0,Metric,Value
0,Accuracy,0.9015
1,F1 Score,0.898412



Confusion Matrix:


Unnamed: 0,0,1,2,3
0,435,25,24,27
1,1,520,2,3
2,12,3,354,80
3,4,3,13,494



Classification Report:


Unnamed: 0,precision,recall,f1-score,support
0,0.962389,0.851272,0.903427,511.0
1,0.943739,0.988593,0.965645,526.0
2,0.900763,0.788419,0.840855,449.0
3,0.817881,0.961089,0.883721,514.0
accuracy,0.9015,0.9015,0.9015,0.9015
macro avg,0.906193,0.897343,0.898412,2000.0
weighted avg,0.90651,0.9015,0.900679,2000.0
