# Visualize your 🤗 Hugging Face data
#### 🛠️ Installation and set-up

In [5]:
import numpy as np
from transformers import RobertaTokenizer
import torch

### 🛫 Data and model preparation
#### 🏷️ Loading a dataset

In [2]:
from datasets import load_dataset
dataset = load_dataset("sst2")

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset sst2 (/zhome/94/5/127021/.cache/huggingface/datasets/sst2/default/2.0.0/9896208a8d85db057ac50c72282bcb8fe755accc671a57dd8059d4e130961ed5)
100%|██████████| 3/3 [00:00<00:00, 489.91it/s]


In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 872
    })
    test: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 1821
    })
})

For demo sub-sample dataset 

In [4]:
small_data_train = dataset['train'].select(range(dataset['train'].num_rows // 10))
# alternative methods
# dataset["train"].shuffle(seed=42).select([i for i in list(range(100))])
small_data_val = dataset['validation'].select(range(dataset['validation'].num_rows // 10)) # dataset["validation"].shuffle(seed=42).select([i for i in list(range(50))])

### ⚙️ Tokenizing the dataset
In a typical NLP workflow, we must first tokenize our dataset.

Converting the stream of characters in the text into a stream of defined "tokens", which can be anything from a smaller set of characters to words from a vocabulary.

We will use a pretrained model, so we inherit its tokenization scheme.

In [6]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

Downloading: 100%|██████████| 899k/899k [00:00<00:00, 1.42MB/s] 
Downloading: 100%|██████████| 456k/456k [00:00<00:00, 863kB/s] 
Downloading: 100%|██████████| 481/481 [00:00<00:00, 378kB/s]


In [13]:
small_data_train['sentence'][:5]

['hide new secretions from the parental units ',
 'contains no wit , only labored gags ',
 'that loves its characters and communicates something rather beautiful about human nature ',
 'remains utterly satisfied to remain the same throughout ',
 'on the worst revenge-of-the-nerds clichés the filmmakers could dredge up ']

In [26]:
test_text = ['hello world',' hello world', 'hello world.','hello world?',' hello world ','hello world .']
test_text2 = ['in store and dog',' in store ','in store ',' in store','in store?','in store.','in store .']

In [28]:
tokenizer(test_text2, truncation = True)

{'input_ids': [[0, 179, 1400, 8, 2335, 2], [0, 11, 1400, 1437, 2], [0, 179, 1400, 1437, 2], [0, 11, 1400, 2], [0, 179, 1400, 116, 2], [0, 179, 1400, 4, 2], [0, 179, 1400, 479, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]}

In [19]:
tokenizer(small_data_train['sentence'][:5])#, truncation = True)

{'input_ids': [[0, 37265, 92, 3556, 2485, 31, 5, 20536, 2833, 1437, 2], [0, 10800, 5069, 117, 22094, 2156, 129, 6348, 3995, 821, 8299, 1437, 2], [0, 6025, 6138, 63, 3768, 8, 39906, 402, 1195, 2721, 59, 1050, 2574, 1437, 2], [0, 5593, 5069, 19223, 10028, 7, 1091, 5, 276, 1328, 1437, 2], [0, 261, 5, 2373, 13543, 12, 1116, 12, 627, 12, 1396, 11622, 43848, 5739, 5, 17504, 115, 31120, 1899, 62, 1437, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [5]:
    
# tokenizer(examples["sentence"], truncation=True)




tokenized_train = small_train_dataset.map(tokenizer(examples["sentence"], truncation=True), batched=True)
tokenized_val = small_val_dataset.map(preprocess_function, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


# tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

We then map the tokenizer over our dataset:

In [22]:
from wandb.sdk.integration_utils.data_logging import ValidationDataLogger
#from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import RobertaTokenizer, DataCollatorWithPadding, RobertaForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset
import torch

import wandb
wandb.init(project="SST2_sentiment_analysis",
            entity="mmfogh")

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.0166690305651476, max=1.0))…

In [3]:
def preprocess_function(examples):
    return tokenizer(examples["sentence"], truncation=True)

dataset = load_dataset("sst2")

small_train_dataset = dataset["train"].shuffle(seed=42).select([i for i in list(range(100))])
small_val_dataset = dataset["validation"].shuffle(seed=42).select([i for i in list(range(50))])

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

tokenized_train = small_train_dataset.map(preprocess_function, batched=True)
tokenized_val = small_val_dataset.map(preprocess_function, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

#model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)
model = RobertaForSequenceClassification.from_pretrained('roberta-base')

The history saving thread hit an unexpected error (OperationalError('unable to open database file')).History will not be written to the database.
Downloading and preparing dataset sst2/default (download: 7.09 MiB, generated: 4.78 MiB, post-processed: Unknown size, total: 11.88 MiB) to /zhome/94/5/127021/.cache/huggingface/datasets/sst2/default/2.0.0/9896208a8d85db057ac50c72282bcb8fe755accc671a57dd8059d4e130961ed5...


OSError: [Errno 28] No space left on device: '/zhome/94/5/127021/.cache/huggingface/datasets/sst2/default/2.0.0/9896208a8d85db057ac50c72282bcb8fe755accc671a57dd8059d4e130961ed5.incomplete'

In [36]:
validation_inputs = tokenized_val.remove_columns(['label', 'idx'])
validation_targets = [tokenized_val.features['label'].int2str(x) for x in tokenized_val['label']]

validation_inputs[0], validation_targets[0]

validation_logger = ValidationDataLogger(
    inputs = validation_inputs[:],
    targets = validation_targets
)

In this case, we are loading a pre-trained network to which a custom head has been added for sequence classification.


In [18]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifi


Let's make a function to return the topic prediction from a sample question.

In [16]:
import torch

def get_topic(sentence, tokenize=tokenizer, model=model):
    # tokenize the input
    inputs = tokenizer(sentence, return_tensors='pt')
    # ensure model and inputs are on the same device (GPU)
    #inputs = {name: tensor.cuda() for name, tensor in inputs.items()}
    #model = model.cuda()
    # get prediction - 10 classes "probabilities" (not really true because they still need to be normalized)
    with torch.no_grad():
        predictions = model(**inputs)[0].cpu().numpy()
    # get the top prediction class and convert it to its associated label
    top_prediction = predictions.argmax().item()
    return dataset['train'].features['label'].int2str(top_prediction)

We can test our prediction pipeline on a sample sentence.

In [17]:
get_topic('Why is cheese so much better with wine?')

'positive'

When we ran our model, we got the answer `Politics & Government`,
which doesn't seem quite right for a question about cheese and wine.

That's because the model has not been trained yet so the outputs are still random. But at least we have a working pipeline!

# 📊 Log your data for better visualization


Before we train our model, let's set up some better logging during training.

Without the ability to inspect model behavior, it can be hard to debug or understand models.
So we'll log a table of information about the model's behavior on the validation set --
not just the loss or accuracy, but the inputs and outputs as well.

Our data is already in a pandas `DataFrame`, so there's not much we have to do besides
slightly reformat them
and then use them to define a `ValidationDataLogger` instance.

In [10]:
from wandb.sdk.integration_utils.data_logging import ValidationDataLogger

In [11]:
dataset['test']

Dataset({
    features: ['idx', 'sentence', 'label', 'input_ids', 'attention_mask'],
    num_rows: 87
})

In [12]:
validation_inputs = dataset['test'].remove_columns(['label', 'attention_mask', 'input_ids'])
validation_targets = [dataset['test'].features['label'].int2str(x) for x in dataset['test']['label']]

validation_inputs[0], validation_targets[0]

({'idx': 0, 'sentence': "it 's a charming and often affecting journey . "},
 'positive')

In [13]:
validation_logger = ValidationDataLogger(
    inputs = validation_inputs[:],
    targets = validation_targets
)

We can now log our predictions for visualization with `validation_logger.log_predictions(prediction_labels)`.

# 👟 Training the model and logging to W&B

We are now ready to fine-tune the model to solve our task.

The Hugging Face [`Trainer` class](https://huggingface.co/transformers/main_classes/trainer.html)
lets us easily train a model and is very flexible.

**Note:** set `report_to` to `wandb` in `TrainingArguments` to enable logging through W&B.

In [14]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    report_to='wandb',                    # enable logging to W&B
    output_dir='topic_classification',    # set output directory
    overwrite_output_dir=True,
    evaluation_strategy='steps',          # check evaluation metrics on a given # of steps
    learning_rate=5e-5,                   # we can customize learning rate
    max_steps=1000,
    logging_steps=100,                    # we will log every 100 steps
    eval_steps=500,                       # we will perform evaluation every 1000 steps
    eval_accumulation_steps=1,            # report evaluation results after each step
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    run_name='my_training_run'            # name of the W&B run
)

In [15]:
# automatically log model to W&B at the end
%env WANDB_LOG_MODEL=true

env: WANDB_LOG_MODEL=true


For more customization, refer to [`TrainingArguments` documentation](https://huggingface.co/transformers/main_classes/trainer.html#transformers.TrainingArguments).

We can optionally define metrics to calculate in addition to the loss through the `compute_metrics` function.

Several [metrics](https://huggingface.co/metrics) are readily available from the datasets library to monitor model performance.

We'll also use it to log all of our predictions at each evaluation loop,
using the `validation_logger`.

In [16]:
from datasets import load_metric
import numpy as np

accuracy_metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    # convert predictions from class (0, 1, 2…) to label (Health, Science…)
    prediction_labels = [dataset['test'].features['labels'].int2str(x.item())
                         for x in predictions]
    
    # log predictions
    validation_logger.log_predictions(prediction_labels)

    # metrics from the datasets library have a compute method
    return accuracy_metric.compute(predictions=predictions, references=labels)

  after removing the cwd from sys.path.


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

The `Trainer` handles all the training & evaluation logic.

In [19]:
trainer = Trainer(
    model=model,                  # model to be trained
    args=args,                    # training args
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    tokenizer=tokenizer,            # for padding batched data
    compute_metrics=compute_metrics # for custom metrics
)

max_steps is given, it will override any value given in num_train_epochs
