In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m527.3/527.3 kB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import matplotlib.pyplot as plt
from tqdm import tqdm

In [None]:
def compute_metrics(pred):
    """Compute classification metrics for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores); presumably the ``EvalPrediction`` that the
            Hugging Face ``Trainer`` passes to this callback - confirm.

    Returns:
        dict with keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``. Precision, recall and F1 are computed with
        ``average='weighted'``.
    """
    y_true = pred.label_ids
    # The predicted class is the index of the highest score on the last axis.
    y_pred = pred.predictions.argmax(-1)

    # The fourth element returned by sklearn (support) is not needed here.
    precision, recall, f1 = precision_recall_fscore_support(
        y_true, y_pred, average='weighted'
    )[:3]

    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        f1=f1,
        precision=precision,
        recall=recall,
    )

In [None]:
# Fetch the 'ag_news' dataset (the download log below reports a train and a test split).
dataset = load_dataset('ag_news')
# Load the tokenizer for the 'bert-base-uncased' checkpoint, the same checkpoint
# name the classification model is later loaded from.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [None]:
def tokenize_function(examples):
    """Tokenize the ``'text'`` field of a batch with the notebook's tokenizer.

    Args:
        examples: Mapping with a ``'text'`` entry holding the raw strings to
            encode (as supplied by ``dataset.map(..., batched=True)``).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts, with
        truncation enabled and padding set to ``"max_length"``.
    """
    # NOTE(review): padding="max_length" presumably pads every example to the
    # model's maximum length; dynamic per-batch padding may be much cheaper
    # for short texts - confirm before changing.
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

In [None]:
# Run tokenize_function over the dataset in batches of 1000 examples;
# `desc` is the label shown on the progress bar for each split.
tokenized_datasets = dataset.map(tokenize_function, batched=True, batch_size=1000, desc='Tokenizing')

Tokenizing:   0%|          | 0/120000 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [None]:
# Drop the raw "text" column to save some memory; only the tokenized fields
# are needed from here on. The guard makes this cell safe to re-run: because
# it rebinds `tokenized_datasets`, an unguarded second run would raise since
# the column is already gone.
if any("text" in columns for columns in tokenized_datasets.column_names.values()):
    tokenized_datasets = tokenized_datasets.remove_columns(["text"])

In [None]:
# Load the pretrained 'bert-base-uncased' weights with a sequence-classification
# head sized for 4 labels. As the load message below reports, the classifier
# weight/bias are not in the checkpoint and are newly initialised, so the model
# has to be fine-tuned before its predictions are meaningful.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Hyperparameters and bookkeeping options handed to the Trainer.
training_args = TrainingArguments(
    # --- where artefacts go ---
    output_dir='./results',
    logging_dir='./logs',
    save_total_limit=1,
    # --- reporting cadence ---
    logging_steps=10,
    # NOTE(review): newer transformers releases may expect `eval_strategy`
    # instead of `evaluation_strategy` - confirm against the installed version.
    evaluation_strategy="epoch",
    # --- optimisation schedule ---
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # --- batch sizes (kept small for less memory usage) ---
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

# Bundle the model, tokenizer, data splits, metric callback and
# hyperparameters into a single Trainer instance.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets['train'],
    # NOTE(review): the 'test' split doubles as the evaluation set here.
    eval_dataset=tokenized_datasets['test'],
)

# Trainer.train() itself iterates over all `num_train_epochs` epochs and shows
# its own progress bar, so it must be called exactly once. Wrapping it in a
# per-epoch loop re-ran the complete 3-epoch schedule three times (9 epochs in
# total) on the same model, tripling the run time and overfitting the model:
# the validation loss kept rising across the repeated runs.
print("Training the model...")
trainer.train()

# Logged training / evaluation entries recorded by the Trainer during the run.
log_history = trainer.state.log_history




Training the model...


Epochs:   0%|          | 0/3 [00:00<?, ?it/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1058,0.202364,0.945263,0.945302,0.945745,0.945263
2,0.117,0.218446,0.948026,0.948087,0.948171,0.948026
3,0.0871,0.267796,0.949342,0.949379,0.949501,0.949342


Epochs:  33%|███▎      | 1/3 [2:16:21<4:32:42, 8181.04s/it]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0744,0.29725,0.942368,0.942455,0.942604,0.942368


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0744,0.29725,0.942368,0.942455,0.942604,0.942368
2,0.0012,0.326299,0.944211,0.944208,0.944228,0.944211
3,0.0001,0.422793,0.944605,0.944607,0.944618,0.944605


Epochs:  67%|██████▋   | 2/3 [4:32:42<2:16:21, 8181.16s/it]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1036,0.373029,0.942632,0.942615,0.942726,0.942632
2,0.0641,0.429971,0.943816,0.943813,0.943829,0.943816
3,0.0,0.463944,0.944342,0.944314,0.944304,0.944342


Epochs: 100%|██████████| 3/3 [6:48:57<00:00, 8179.23s/it]


In [None]:
# Run a final evaluation pass with the trainer and keep the returned metrics
# (a dict of eval_* values, as printed in the next cell).
evaluation_results = trainer.evaluate()

In [None]:
# Show the final evaluation metrics.
print(evaluation_results)

{'eval_loss': 0.4639444947242737, 'eval_accuracy': 0.9443421052631579, 'eval_f1': 0.9443138389906125, 'eval_precision': 0.944304000914162, 'eval_recall': 0.9443421052631579, 'eval_runtime': 55.2698, 'eval_samples_per_second': 137.507, 'eval_steps_per_second': 17.188, 'epoch': 3.0}


In [None]:
from google.colab import files

In [None]:
import shutil

# Save the fine-tuned model to disk, pack the directory into a zip archive
# and hand the archive to the browser for download (Colab helper).
output_dir = "./saved_model"
zipfile_name = "saved_model.zip"

trainer.save_model(output_dir)

# make_archive appends the extension itself, so strip ".zip" from the base name.
shutil.make_archive(
    base_name=zipfile_name.replace(".zip", ""),
    format='zip',
    root_dir=output_dir,
)

files.download(zipfile_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import json
# Serialise the final metrics and the trainer's full log history to JSON
# files, then offer both files for download (Colab helper).
with open('evaluation_results.json', 'w') as results_file:
    results_file.write(json.dumps(evaluation_results))

with open('log_history.json', 'w') as history_file:
    history_file.write(json.dumps(trainer.state.log_history))

for exported_name in ('evaluation_results.json', 'log_history.json'):
    files.download(exported_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Pull the individual metric series out of the trainer's log history.
# Each entry is a dict; an entry contributes to a series only when it
# carries the corresponding key.
train_loss = [
    entry['loss']
    for entry in log_history
    if 'loss' in entry
]
eval_loss = [
    entry['eval_loss']
    for entry in log_history
    if 'eval_loss' in entry
]
eval_accuracy = [
    entry['eval_accuracy']
    for entry in log_history
    if 'eval_accuracy' in entry
]