In [1]:
import torch
import transformers
from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import numpy as np
import random
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split

In [2]:
from absl import app
from absl import flags
from absl import logging
from lit_nlp import dev_server
from lit_nlp import server_flags
from lit_nlp.api import model as lit_model
from lit_nlp.api import types as lit_types
# General-purpose LIT helpers; `utils.unbatch_preds` is used below to split batched model outputs.
from lit_nlp.lib import utils

In [3]:
# Checkpoint to fine-tune: base uncased BERT.
# Other text-classification checkpoints are listed at: https://huggingface.co/models?filter=text-classification
model_name = "bert-base-uncased"
# Maximum sequence length passed to the tokenizer for each document/sentence sample
max_length = 512

In [4]:
# Load the fast BERT tokenizer for the `model_name` checkpoint (lower-casing requested explicitly).
tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

In [5]:
def read_20newsgroups(test_size=0.2, n_samples=200, random_state=None):
  """Load a slice of the 20 Newsgroups corpus and split it into train/test parts.

  Args:
    test_size: forwarded to `train_test_split` as the held-out proportion.
    n_samples: number of leading documents (after the loader's shuffle) to keep.
      Defaults to 200, the value that used to be hard-coded; pass `None` to
      keep every document.
    random_state: forwarded to `train_test_split`. The default `None` keeps the
      previous unseeded behaviour; pass an int for a reproducible split.

  Returns:
    A pair `(split, target_names)` where `split` is the 4-element result of
    `train_test_split` (train texts, test texts, train labels, test labels)
    and `target_names` is the list of label names from the dataset.
  """
  # download & load the 20newsgroups dataset from sklearn's repos
  dataset = fetch_20newsgroups(subset="all", shuffle=True, remove=("headers", "footers", "quotes"))
  # slicing with `None` keeps the full sequence, so `n_samples=None` means "all"
  documents = dataset.data[:n_samples]
  labels = dataset.target[:n_samples]
  # split into training & testing and return the data as well as the label names
  split = train_test_split(documents, labels, test_size=test_size, random_state=random_state)
  return split, dataset.target_names

In [6]:
# Load the data: unpack the four split parts and the list of label names.
(train_texts, valid_texts, train_labels, valid_labels), target_names = read_20newsgroups()

In [7]:
# Number of label names; this is also used as `num_labels` when the model is created.
len(target_names)

20

In [7]:
# Tokenize both splits with identical settings: truncate to `max_length` and pad each split.
train_encodings, valid_encodings = (
    tokenizer(split_texts, truncation=True, padding=True, max_length=max_length)
    for split_texts in (train_texts, valid_texts)
)

In [8]:
class NewsGroupsDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with their labels.

    `encodings` is a mapping from field name to a per-sample sequence and
    `labels` is a sequence with one label per sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of samples is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoding field of sample `idx` into a tensor.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # The label is wrapped in a list, so it becomes a 1-element tensor.
        sample["labels"] = torch.tensor([self.labels[idx]])
        return sample

In [9]:
# Wrap the tokenized texts and their labels as torch datasets for training and validation.
train_dataset = NewsGroupsDataset(train_encodings, train_labels)
valid_dataset = NewsGroupsDataset(valid_encodings, valid_labels)

In [10]:
# Load the pretrained checkpoint with a classification head sized to the label set.
# Use the GPU when one is available and fall back to the CPU otherwise, so this
# cell does not crash on machines without CUDA.
model = BertForSequenceClassification.from_pretrained(
    model_name, num_labels=len(target_names)
).to("cuda" if torch.cuda.is_available() else "cpu")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [11]:
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
  """Compute the accuracy of a prediction object.

  `pred` must expose `predictions` (per-class scores; the argmax over the last
  axis is taken as the predicted class) and `label_ids` (the reference labels).

  Returns a dict with the single key 'accuracy'.
  """
  predicted_classes = pred.predictions.argmax(-1)
  # accuracy is delegated to sklearn's implementation
  return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}

In [12]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory (checkpoints are written here)
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=4,   # batch size per device during training
    per_device_eval_batch_size=4,    # batch size for evaluation
    warmup_steps=20,                 # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    load_best_model_at_end=True,     # load the best checkpoint when finished training (default metric is loss)
    # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
    logging_steps=10,                # log every 10 optimization steps
    evaluation_strategy="steps",     # evaluate every `eval_steps`
    eval_steps=10,                   # made explicit so it visibly matches `save_steps`
    # Checkpoint on the same schedule as evaluation. Without this the library's
    # default save interval is longer than this whole run (120 optimization steps),
    # so no checkpoint would exist for `load_best_model_at_end` to reload.
    save_steps=10,
    save_total_limit=2,              # cap the number of checkpoints kept on disk
)

In [13]:
# Bundle the model, the training configuration, both datasets and the metric callback.
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=valid_dataset,          # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
)

In [14]:
# Fine-tune the model; the returned TrainOutput is displayed as the cell result.
trainer.train()

***** Running training *****
  Num examples = 160
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 120


Step,Training Loss,Validation Loss,Accuracy
10,3.0782,3.098712,0.025
20,3.0931,2.963238,0.05
30,2.9591,2.926141,0.1
40,3.013,2.926519,0.125
50,2.9249,2.926198,0.025
60,2.8548,3.027048,0.025
70,2.9077,2.967653,0.075
80,2.8089,2.932856,0.05
90,2.7027,2.770093,0.225
100,2.5974,2.659444,0.325


***** Running Evaluation *****
  Num examples = 40
  Batch size = 4
***** Running Evaluation *****
  Num examples = 40
  Batch size = 4
***** Running Evaluation *****
  Num examples = 40
  Batch size = 4
***** Running Evaluation *****
  Num examples = 40
  Batch size = 4
***** Running Evaluation *****
  Num examples = 40
  Batch size = 4
***** Running Evaluation *****
  Num examples = 40
  Batch size = 4
***** Running Evaluation *****
  Num examples = 40
  Batch size = 4
***** Running Evaluation *****
  Num examples = 40
  Batch size = 4
***** Running Evaluation *****
  Num examples = 40
  Batch size = 4
***** Running Evaluation *****
  Num examples = 40
  Batch size = 4
***** Running Evaluation *****
  Num examples = 40
  Batch size = 4
***** Running Evaluation *****
  Num examples = 40
  Batch size = 4


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=120, training_loss=2.822476323445638, metrics={'train_runtime': 25.0234, 'train_samples_per_second': 19.182, 'train_steps_per_second': 4.796, 'total_flos': 126313717432320.0, 'train_loss': 2.822476323445638, 'epoch': 3.0})

In [15]:
# Evaluate on the validation dataset given to the Trainer; the metrics dict is displayed.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 40
  Batch size = 4


{'eval_loss': 2.6060421466827393,
 'eval_accuracy': 0.275,
 'eval_runtime': 0.4881,
 'eval_samples_per_second': 81.954,
 'eval_steps_per_second': 20.488,
 'epoch': 3.0}

In [15]:
def get_prediction(text):
    """Return the predicted newsgroup name for `text`.

    Args:
        text: the document to classify (a single string).

    Returns:
        The entry of `target_names` whose class has the highest probability.
    """
    # tokenize and move the tensors to whichever device the model lives on,
    # instead of assuming a CUDA device is present
    inputs = tokenizer(
        text, padding=True, truncation=True, max_length=max_length, return_tensors="pt"
    ).to(model.device)
    # inference only: disable dropout and skip autograd bookkeeping
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    # the first output holds the logits; softmax over the class axis gives probabilities
    probs = outputs[0].softmax(1)
    # index of the most probable class, converted to a plain int for list indexing
    return target_names[probs.argmax().item()]

In [16]:
class SimpleModelWrapper(lit_model.Model):
    """Expose a sequence-classification model and its tokenizer to LIT.

    Args:
        tokenizer: tokenizer used to encode the input texts.
        model: the classification model to run.
        labels: list of class names; index i names class i of the model output.
        max_length: truncation length passed to the tokenizer (defaults to 512).
    """

    def __init__(self, tokenizer, model, labels, max_length=512):
        self.tokenizer = tokenizer
        self.model = model
        self.labels = labels
        self.max_length = max_length

    def max_minibatch_size(self):
        """Return the largest batch size handed to `predict_minibatch`."""
        return 32

    def predict_minibatch(self, inputs):
        """Run the model on a batch of examples.

        Args:
            inputs: list of example dicts; the "text" field of each is classified.

        Yields:
            One dict per example with keys "probas" (class probabilities),
            "cls_emb" (last-layer embedding of the first token) and "tokens"
            (the tokens between the first and last non-padding positions).
        """
        if not inputs:
            # Nothing to tokenize; finish the generator without yielding.
            return

        # Tokenize the texts of THIS batch with the wrapper's own tokenizer, and
        # move the tensors to the device the model lives on.
        texts = [example["text"] for example in inputs]
        encoded = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        ).to(self.model.device)

        self.model.eval()  # inference: make sure dropout is disabled
        with torch.no_grad():  # remove this if you need gradients.
            # hidden states are only returned when explicitly requested
            out = self.model(**encoded, output_hidden_states=True)

        batched_outputs = {
            "probas": torch.nn.functional.softmax(out.logits, dim=-1),
            "input_ids": encoded["input_ids"],
            "ntok": torch.sum(encoded["attention_mask"], dim=1),
            "cls_emb": out.hidden_states[-1][:, 0],  # last layer, first token
        }

        # Return as NumPy for further processing.
        detached_outputs = {k: v.cpu().numpy() for k, v in batched_outputs.items()}
        # Unbatch outputs so we get one record per input example.
        for output in utils.unbatch_preds(detached_outputs):
            ntok = int(output.pop("ntok"))
            # Drop the first and last non-padding ids (the special tokens added
            # by the tokenizer) and everything after them (padding).
            output["tokens"] = self.tokenizer.convert_ids_to_tokens(output.pop("input_ids")[1:ntok - 1])
            yield output

    def input_spec(self) -> lit_types.Spec:
        """Describe the example fields the model reads.

        The text field is named "text" so that it matches the records built by
        the dataset class in this notebook.
        """
        return {
            "text": lit_types.TextSegment(),
            "label": lit_types.CategoryLabel(vocab=self.labels, required=False)
        }

    def output_spec(self) -> lit_types.Spec:
        """Describe the fields yielded by `predict_minibatch`."""
        return {
            "tokens": lit_types.Tokens(),
            # No `null_idx`: this is a multi-way task without a negative/null class.
            "probas": lit_types.MulticlassPreds(parent="label", vocab=self.labels),
            "cls_emb": lit_types.Embeddings()
        }
# end class

In [17]:
# Wrap the tokenizer, the model and the label names for use with LIT.
model_wrapper = SimpleModelWrapper(tokenizer, model, target_names)

In [26]:
from lit_nlp.api import dataset as lit_dataset
class LITDataset(lit_dataset.Dataset):
    """LIT dataset of text records labelled with class names.

    `texts` and `labels` are parallel sequences; `heads` maps a label id to its
    class name and also serves as the label vocabulary in `spec()`.
    """

    def __init__(self, texts, labels, heads):
        self.texts = texts
        self.labels = labels
        self.heads = heads

        # One record per (text, label id) pair, with the id resolved to its name.
        self._examples = [
            {"text": document, "label": heads[label_id]}
            for document, label_id in zip(texts, labels)
        ]

    def spec(self) -> lit_types.Spec:
        """Describe the fields of every record in `_examples`."""
        field_types = {
            "text": lit_types.TextSegment(),
            "label": lit_types.CategoryLabel(vocab=self.heads),
        }
        return field_types

In [27]:
# Name -> object registries passed to the LIT widget; the validation split is the dataset.
datasets = {'jinyuj': LITDataset(valid_texts, valid_labels, target_names)}
models = {'jinyuj': model_wrapper}

In [28]:
# Create the LIT notebook widget for the registered models and datasets.
from lit_nlp import notebook
widget = notebook.LitWidget(models, datasets, height=1024)

INFO:absl:
 (    (           
 )\ ) )\ )  *   ) 
(()/((()/(` )  /( 
 /(_))/(_))( )(_))
(_)) (_)) (_(_()) 
| |  |_ _||_   _| 
| |__ | |   | |   
|____|___|  |_|   


INFO:absl:Starting LIT server...
INFO:absl:CachingModelWrapper 'jinyuj': no cache path specified, not loading.
INFO:absl:Warm-start of model 'jinyuj' on dataset '_union_empty'
INFO:absl:CachingModelWrapper 'jinyuj': misses (dataset=_union_empty): []
INFO:absl:CachingModelWrapper 'jinyuj': 0 misses out of 0 inputs
INFO:absl:Prepared 0 inputs for model
INFO:absl:Received 0 predictions from model
INFO:absl:Requested types: ['LitType']
INFO:absl:Will return keys: {'cls_emb', 'tokens', 'probas'}
INFO:absl:CachingModelWrapper 'jinyuj': no cache path specified, not saving.


In [29]:
# Display the LIT UI in the notebook output.
widget.render()