In [1]:
import torch
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import numpy as np
import random
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split

In [2]:
# the model we are going to fine-tune: base uncased BERT
# browse other text classification models here: https://huggingface.co/models?filter=text-classification
model_name = "bert-base-uncased"
# max sequence length (in tokens) for each document/sentence sample
max_length = 512

In [3]:
# load the pretrained fast tokenizer that matches `model_name`; `do_lower_case=True` asks it to lowercase the input text
tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

In [4]:
def read_20newsgroups(test_size=0.2, max_samples=200, random_state=None):
  """Load a slice of the 20 Newsgroups corpus and split it into train/test parts.

  Args:
    test_size: forwarded to `train_test_split` as the size of the held-out part.
    max_samples: how many leading documents (and labels) of the fetched dataset
      to keep. Defaults to 200, the value that used to be hard-coded; pass
      `None` to keep the whole dataset.
    random_state: forwarded to `train_test_split` to make the split
      reproducible. The default `None` leaves the split unseeded, as before.

  Returns:
    A 2-tuple `(split, target_names)` where `split` is the list
    `[train_texts, test_texts, train_labels, test_labels]` produced by
    `train_test_split` and `target_names` holds the label names of the dataset.
  """
  # download & load 20newsgroups dataset from sklearn's repos
  dataset = fetch_20newsgroups(subset="all", shuffle=True, remove=("headers", "footers", "quotes"))
  # slicing with `None` as the upper bound keeps every element
  documents = dataset.data[:max_samples]
  labels = dataset.target[:max_samples]
  # split into training & testing and return the data as well as the label names
  split = train_test_split(documents, labels, test_size=test_size, random_state=random_state)
  return split, dataset.target_names

In [5]:
# load the data: texts and labels for the training/validation splits, plus the label names
(train_texts, valid_texts, train_labels, valid_labels), target_names = read_20newsgroups()

In [6]:
# both splits are tokenized with the same settings: truncate to `max_length`
# tokens and pad the shorter samples
encode_options = dict(truncation=True, padding=True, max_length=max_length)
train_encodings = tokenizer(train_texts, **encode_options)
valid_encodings = tokenizer(valid_texts, **encode_options)

In [7]:
class NewsGroupsDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with their class labels.

    `encodings` is a mapping from field name to a per-sample sequence of
    values; `labels` is an indexable sequence with one label per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # one sample per label
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample `idx` as a dict of tensors, one per encoding field plus "labels"."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # the label is wrapped in a list, so it comes out as a 1-element tensor
        sample["labels"] = torch.tensor([self.labels[idx]])
        return sample

In [8]:
# wrap the tokenized splits and their labels as torch datasets
train_dataset = NewsGroupsDataset(train_encodings, train_labels)
valid_dataset = NewsGroupsDataset(valid_encodings, valid_labels)

In [9]:
# use the GPU when one is available and fall back to the CPU otherwise,
# so this cell also runs on machines without CUDA
device = "cuda" if torch.cuda.is_available() else "cpu"
# load the pretrained model with a classification head that has one output per target name
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(target_names)).to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [10]:
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
  """Compute the accuracy of a prediction object.

  `pred` must expose `label_ids` (the reference labels) and `predictions`
  (per-class scores; the arg-max over the last axis is taken as the
  predicted class). Returns a dict with the single key 'accuracy'.
  """
  true_ids = pred.label_ids
  predicted_ids = pred.predictions.argmax(-1)
  # accuracy is delegated to sklearn's accuracy_score
  return {'accuracy': accuracy_score(true_ids, predicted_ids)}

In [11]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory (checkpoints are written here)
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=4,   # batch size per device during training
    per_device_eval_batch_size=4,    # batch size for evaluation
    warmup_steps=20,                 # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    load_best_model_at_end=True,     # load the best model when finished training (default metric is loss)
    # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
    logging_steps=10,                # log every 10 steps
    # Checkpoint on the same schedule as evaluation. Without this, `save_steps`
    # stays at its default of 500, which is more than the 120 optimization steps
    # of this run, so no checkpoint is ever written and `load_best_model_at_end`
    # has nothing to load.
    save_steps=10,
    save_total_limit=2,              # cap the number of checkpoints kept on disk
    evaluation_strategy="steps",     # evaluate every `logging_steps`
)

In [12]:
# bundle the model, the training configuration, both datasets and the metric callback
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=valid_dataset,          # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
)

In [13]:
# run the fine-tuning loop; the returned training summary is displayed as the cell output
trainer.train()

***** Running training *****
  Num examples = 160
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 120


Step,Training Loss,Validation Loss,Accuracy
10,3.0022,3.041451,0.125
20,2.9809,3.046293,0.0
30,3.0729,3.021334,0.075
40,2.962,3.020222,0.075
50,2.7876,3.064489,0.175
60,2.7968,3.017297,0.15
70,2.7512,2.914529,0.05
80,2.8636,2.846666,0.125
90,2.3908,2.884681,0.175
100,2.5526,2.831975,0.225


***** Running Evaluation *****
  Num examples = 40
  Batch size = 4
***** Running Evaluation *****
  Num examples = 40
  Batch size = 4
***** Running Evaluation *****
  Num examples = 40
  Batch size = 4
***** Running Evaluation *****
  Num examples = 40
  Batch size = 4
***** Running Evaluation *****
  Num examples = 40
  Batch size = 4
***** Running Evaluation *****
  Num examples = 40
  Batch size = 4
***** Running Evaluation *****
  Num examples = 40
  Batch size = 4
***** Running Evaluation *****
  Num examples = 40
  Batch size = 4
***** Running Evaluation *****
  Num examples = 40
  Batch size = 4
***** Running Evaluation *****
  Num examples = 40
  Batch size = 4
***** Running Evaluation *****
  Num examples = 40
  Batch size = 4
***** Running Evaluation *****
  Num examples = 40
  Batch size = 4


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=120, training_loss=2.7366894245147706, metrics={'train_runtime': 24.9501, 'train_samples_per_second': 19.238, 'train_steps_per_second': 4.81, 'total_flos': 126313717432320.0, 'train_loss': 2.7366894245147706, 'epoch': 3.0})

In [14]:
# evaluate the current model on the evaluation dataset given to the Trainer (the validation split)
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 40
  Batch size = 4


{'eval_loss': 2.778193950653076,
 'eval_accuracy': 0.225,
 'eval_runtime': 0.5009,
 'eval_samples_per_second': 79.861,
 'eval_steps_per_second': 19.965,
 'epoch': 3.0}

In [15]:
def get_prediction(text):
    """Return the predicted target name for a single piece of text.

    The text is tokenized with the notebook's `tokenizer` (truncated to
    `max_length` tokens), run through `model`, and the name in `target_names`
    with the highest probability is returned.

    Note: the arg-max is taken over the whole probability tensor, so this
    function is meant for one text at a time, not for a batch of texts.
    """
    # prepare our text into tokenized sequence, placed on the same device as the
    # model (instead of assuming a CUDA device is present)
    inputs = tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to(model.device)
    # perform inference to our model; no gradients are needed for prediction
    with torch.no_grad():
        outputs = model(**inputs)
    # get output probabilities by doing softmax
    probs = outputs[0].softmax(1)
    # executing argmax function to get the candidate label
    return target_names[probs.argmax().item()]