In [142]:
from transformers import Trainer, TrainingArguments, DistilBertForSequenceClassification, DistilBertTokenizerFast, DataCollatorWithPadding, pipeline
from datasets import Dataset
from evaluate import load
import numpy as np
import torch

In [106]:
snips_file = open('data/snips.train.txt', 'rb')
snips_rows = snips_file.readlines()
print(f'{len(snips_rows):,}')
snips_rows[:20]

143,868


[b'listen O\r\n',
 b'to O\r\n',
 b'westbam B-artist\r\n',
 b'alumb O\r\n',
 b'allergic B-album\r\n',
 b'on O\r\n',
 b'google B-service\r\n',
 b'music I-service\r\n',
 b'PlayMusic\r\n',
 b'\r\n',
 b'add O\r\n',
 b'step B-entity_name\r\n',
 b'to I-entity_name\r\n',
 b'me I-entity_name\r\n',
 b'to O\r\n',
 b'the O\r\n',
 b'50 B-playlist\r\n',
 b'cl\xc3\xa1sicos I-playlist\r\n',
 b'playlist O\r\n',
 b'AddToPlaylist\r\n']

In [107]:
utterances = []
tokenized_utterances = []
labels_for_tokens = []
sequence_labels = []

utterance, tokenized_utterance, labels_for_utterances = '', [], []
for snip_row in snips_rows:
    if len(snip_row) == 2:
        continue
    if ' ' not in snip_row.decode():
        sequence_labels.append(snip_row.decode().strip())
        utterances.append(utterance.strip())
        tokenized_utterances.append(tokenized_utterance)
        labels_for_tokens.append(labels_for_utterances)
        utterance, tokenized_utterance, labels_for_utterances = '', [], []
        continue
    token, token_label = snip_row.decode().split(' ')
    token_label = token_label.strip()
    utterance += f'{token} '
    tokenized_utterance.append(token)
    labels_for_utterances.append(token_label)


In [108]:
utterances

['listen to westbam alumb allergic on google music',
 'add step to me to the 50 clásicos playlist',
 'i give this current textbook a rating value of 1 and a best rating of 6',
 'play the song little robin redbreast',
 'please add iris dement to my playlist this is selena',
 'add slimm cutta calhoun to my this is prince playlist',
 'i want to listen to seventies music',
 'play a popular chant by brian epstein',
 'find fish story',
 'book a spot for 3 in mt',
 'i need a forecast for jetmore massachusetts in 1 hour and 1 second from now',
 'rate this series a 5',
 'give me a list of movie times for films in the area',
 'can you play me some eighties music by adele',
 'please let me know the weather forcast of stanislaus national forest far in nine months',
 'book a restaurant for eight people in six years',
 'i need to book a restaurant in fork mountain sc for valarie mari and i',
 'add to playlist confidence boost here comes santa claus',
 'book a restaurant at sixteen o clock in sc',
 '

In [109]:
tokenized_utterances

[['listen', 'to', 'westbam', 'alumb', 'allergic', 'on', 'google', 'music'],
 ['add', 'step', 'to', 'me', 'to', 'the', '50', 'clásicos', 'playlist'],
 ['i',
  'give',
  'this',
  'current',
  'textbook',
  'a',
  'rating',
  'value',
  'of',
  '1',
  'and',
  'a',
  'best',
  'rating',
  'of',
  '6'],
 ['play', 'the', 'song', 'little', 'robin', 'redbreast'],
 ['please',
  'add',
  'iris',
  'dement',
  'to',
  'my',
  'playlist',
  'this',
  'is',
  'selena'],
 ['add',
  'slimm',
  'cutta',
  'calhoun',
  'to',
  'my',
  'this',
  'is',
  'prince',
  'playlist'],
 ['i', 'want', 'to', 'listen', 'to', 'seventies', 'music'],
 ['play', 'a', 'popular', 'chant', 'by', 'brian', 'epstein'],
 ['find', 'fish', 'story'],
 ['book', 'a', 'spot', 'for', '3', 'in', 'mt'],
 ['i',
  'need',
  'a',
  'forecast',
  'for',
  'jetmore',
  'massachusetts',
  'in',
  '1',
  'hour',
  'and',
  '1',
  'second',
  'from',
  'now'],
 ['rate', 'this', 'series', 'a', '5'],
 ['give',
  'me',
  'a',
  'list',
  'of',

In [110]:
labels_for_tokens

[['O', 'O', 'B-artist', 'O', 'B-album', 'O', 'B-service', 'I-service'],
 ['O',
  'B-entity_name',
  'I-entity_name',
  'I-entity_name',
  'O',
  'O',
  'B-playlist',
  'I-playlist',
  'O'],
 ['O',
  'O',
  'O',
  'B-object_select',
  'B-object_type',
  'O',
  'O',
  'O',
  'O',
  'B-rating_value',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-best_rating'],
 ['O', 'O', 'B-music_item', 'B-track', 'I-track', 'I-track'],
 ['O',
  'O',
  'B-artist',
  'I-artist',
  'O',
  'B-playlist_owner',
  'O',
  'B-playlist',
  'I-playlist',
  'I-playlist'],
 ['O',
  'B-artist',
  'I-artist',
  'I-artist',
  'O',
  'B-playlist_owner',
  'B-playlist',
  'I-playlist',
  'I-playlist',
  'O'],
 ['O', 'O', 'O', 'O', 'O', 'B-year', 'O'],
 ['O', 'O', 'B-sort', 'B-music_item', 'O', 'B-artist', 'I-artist'],
 ['O', 'B-movie_name', 'I-movie_name'],
 ['O', 'O', 'O', 'O', 'B-party_size_number', 'O', 'B-state'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'B-city',
  'B-state',
  'O',
  'B-timeRange',
  'I-timeRange',
  'I-time

In [111]:
sequence_labels

['PlayMusic',
 'AddToPlaylist',
 'RateBook',
 'PlayMusic',
 'AddToPlaylist',
 'AddToPlaylist',
 'PlayMusic',
 'PlayMusic',
 'SearchScreeningEvent',
 'BookRestaurant',
 'GetWeather',
 'RateBook',
 'SearchScreeningEvent',
 'PlayMusic',
 'GetWeather',
 'BookRestaurant',
 'BookRestaurant',
 'AddToPlaylist',
 'BookRestaurant',
 'AddToPlaylist',
 'SearchScreeningEvent',
 'AddToPlaylist',
 'SearchCreativeWork',
 'SearchCreativeWork',
 'AddToPlaylist',
 'SearchScreeningEvent',
 'SearchCreativeWork',
 'RateBook',
 'SearchScreeningEvent',
 'SearchScreeningEvent',
 'PlayMusic',
 'RateBook',
 'AddToPlaylist',
 'SearchScreeningEvent',
 'SearchCreativeWork',
 'BookRestaurant',
 'GetWeather',
 'PlayMusic',
 'AddToPlaylist',
 'SearchCreativeWork',
 'SearchCreativeWork',
 'BookRestaurant',
 'SearchScreeningEvent',
 'PlayMusic',
 'GetWeather',
 'AddToPlaylist',
 'GetWeather',
 'RateBook',
 'SearchScreeningEvent',
 'SearchScreeningEvent',
 'BookRestaurant',
 'PlayMusic',
 'PlayMusic',
 'BookRestaurant',


In [112]:
len(labels_for_tokens), len(tokenized_utterances), len(utterances), len(sequence_labels)

(13084, 13084, 13084, 13084)

In [113]:
print(tokenized_utterances[0])
print(labels_for_tokens[0])
print(utterances[0])
print(sequence_labels[0])

['listen', 'to', 'westbam', 'alumb', 'allergic', 'on', 'google', 'music']
['O', 'O', 'B-artist', 'O', 'B-album', 'O', 'B-service', 'I-service']
listen to westbam alumb allergic on google music
PlayMusic


In [114]:
unique_sequence_labels = list(set(sequence_labels))
unique_sequence_labels

['AddToPlaylist',
 'SearchCreativeWork',
 'BookRestaurant',
 'RateBook',
 'GetWeather',
 'SearchScreeningEvent',
 'PlayMusic']

In [115]:
sequence_labels = [unique_sequence_labels.index(l) for l in sequence_labels]

In [116]:
labels_for_tokens

[['O', 'O', 'B-artist', 'O', 'B-album', 'O', 'B-service', 'I-service'],
 ['O',
  'B-entity_name',
  'I-entity_name',
  'I-entity_name',
  'O',
  'O',
  'B-playlist',
  'I-playlist',
  'O'],
 ['O',
  'O',
  'O',
  'B-object_select',
  'B-object_type',
  'O',
  'O',
  'O',
  'O',
  'B-rating_value',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-best_rating'],
 ['O', 'O', 'B-music_item', 'B-track', 'I-track', 'I-track'],
 ['O',
  'O',
  'B-artist',
  'I-artist',
  'O',
  'B-playlist_owner',
  'O',
  'B-playlist',
  'I-playlist',
  'I-playlist'],
 ['O',
  'B-artist',
  'I-artist',
  'I-artist',
  'O',
  'B-playlist_owner',
  'B-playlist',
  'I-playlist',
  'I-playlist',
  'O'],
 ['O', 'O', 'O', 'O', 'O', 'B-year', 'O'],
 ['O', 'O', 'B-sort', 'B-music_item', 'O', 'B-artist', 'I-artist'],
 ['O', 'B-movie_name', 'I-movie_name'],
 ['O', 'O', 'O', 'O', 'B-party_size_number', 'O', 'B-state'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'B-city',
  'B-state',
  'O',
  'B-timeRange',
  'I-timeRange',
  'I-time

In [117]:
from functools import reduce

unique_token_labels = list(set(reduce(lambda x, y: x + y, labels_for_tokens)))
len(unique_token_labels), unique_token_labels

(72,
 ['B-movie_name',
  'I-poi',
  'B-object_part_of_series_type',
  'B-restaurant_name',
  'I-object_part_of_series_type',
  'B-party_size_number',
  'B-genre',
  'I-current_location',
  'B-rating_value',
  'I-object_name',
  'B-spatial_relation',
  'I-party_size_description',
  'I-movie_type',
  'B-state',
  'I-movie_name',
  'B-restaurant_type',
  'I-location_name',
  'B-current_location',
  'B-facility',
  'B-city',
  'B-timeRange',
  'B-geographic_poi',
  'B-track',
  'I-track',
  'B-party_size_description',
  'I-city',
  'I-service',
  'I-artist',
  'I-entity_name',
  'B-music_item',
  'I-object_location_type',
  'I-playlist_owner',
  'O',
  'B-object_location_type',
  'B-poi',
  'I-country',
  'B-album',
  'B-object_type',
  'B-artist',
  'I-spatial_relation',
  'I-sort',
  'I-restaurant_type',
  'B-best_rating',
  'I-state',
  'B-location_name',
  'B-rating_unit',
  'I-facility',
  'B-entity_name',
  'B-served_dish',
  'I-cuisine',
  'I-timeRange',
  'I-object_select',
  'B-co

In [118]:
labels_for_tokens = [[unique_token_labels.index(_) for _ in l] for l in labels_for_tokens]
labels_for_tokens

[[32, 32, 38, 32, 36, 32, 68, 26],
 [32, 47, 28, 28, 32, 32, 60, 65, 32],
 [32, 32, 32, 53, 37, 32, 32, 32, 32, 8, 32, 32, 32, 32, 32, 42],
 [32, 32, 29, 22, 23, 23],
 [32, 32, 38, 27, 32, 54, 32, 60, 65, 65],
 [32, 38, 27, 27, 32, 54, 60, 65, 65, 32],
 [32, 32, 32, 32, 32, 67, 32],
 [32, 32, 55, 29, 32, 38, 27],
 [32, 0, 14],
 [32, 32, 32, 32, 5, 32, 13],
 [32, 32, 32, 32, 32, 19, 13, 32, 20, 50, 50, 50, 50, 50, 50],
 [32, 53, 2, 32, 8],
 [32, 32, 32, 32, 32, 37, 66, 32, 70, 10, 39, 39],
 [32, 32, 32, 32, 32, 67, 32, 32, 38],
 [32, 32, 32, 32, 32, 32, 32, 32, 21, 58, 58, 10, 20, 50, 50],
 [32, 32, 15, 32, 5, 32, 20, 50, 50],
 [32, 32, 32, 32, 32, 15, 32, 19, 25, 13, 32, 24, 11, 11, 11],
 [32, 32, 32, 60, 65, 47, 28, 28, 28],
 [32, 32, 15, 32, 20, 50, 50, 32, 13],
 [32, 32, 29, 32, 32, 60, 65, 65, 65, 32],
 [32, 32, 33, 30, 32, 0, 14, 14, 14],
 [32, 47, 28, 32, 54, 32, 60, 65, 65],
 [32, 32, 32, 37, 71, 9, 9, 9, 9],
 [32, 32, 71, 9, 9, 9, 9, 9],
 [32, 32, 29, 32, 54, 60, 65],
 [32, 70,

In [119]:
print(tokenized_utterances[0])
print(labels_for_tokens[0])
print([unique_token_labels[l] for l in labels_for_tokens[0]])
print(utterances[0])
print(sequence_labels[0])
print(unique_sequence_labels[sequence_labels[0]])

['listen', 'to', 'westbam', 'alumb', 'allergic', 'on', 'google', 'music']
[32, 32, 38, 32, 36, 32, 68, 26]
['O', 'O', 'B-artist', 'O', 'B-album', 'O', 'B-service', 'I-service']
listen to westbam alumb allergic on google music
6
PlayMusic


In [120]:
snips_dataset = Dataset.from_dict(
    dict(
        utterance=utterances,
        label=sequence_labels,
        tokens=tokenized_utterances,
        token_labels=labels_for_tokens
    )
)
snips_dataset = snips_dataset.train_test_split(test_size=0.2)

In [121]:
type(snips_dataset)

datasets.dataset_dict.DatasetDict

In [122]:
snips_dataset['train'][0]

{'utterance': 'rate homicide: a year on the killing streets five stars',
 'label': 3,
 'tokens': ['rate',
  'homicide:',
  'a',
  'year',
  'on',
  'the',
  'killing',
  'streets',
  'five',
  'stars'],
 'token_labels': [32, 71, 9, 9, 9, 9, 9, 9, 8, 45]}

In [123]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
tokenizer

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/vocab.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/tokenizer_config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropo

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [124]:
preprocess_function = lambda examples: tokenizer(examples['utterance'], truncation=True)
preprocess_function

<function __main__.<lambda>(examples)>

In [125]:
def preprocess_function(examples):
    return tokenizer(examples['utterance'], truncation=True)
preprocess_function

<function __main__.preprocess_function(examples)>

In [126]:
seq_clf_tokenized_snips = snips_dataset.map(preprocess_function, batched=True)
seq_clf_tokenized_snips

Map:   0%|          | 0/10467 [00:00<?, ? examples/s]

Map:   0%|          | 0/2617 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['utterance', 'label', 'tokens', 'token_labels', 'input_ids', 'attention_mask'],
        num_rows: 10467
    })
    test: Dataset({
        features: ['utterance', 'label', 'tokens', 'token_labels', 'input_ids', 'attention_mask'],
        num_rows: 2617
    })
})

In [127]:
seq_clf_tokenized_snips['train'][0]

{'utterance': 'rate homicide: a year on the killing streets five stars',
 'label': 3,
 'tokens': ['rate',
  'homicide:',
  'a',
  'year',
  'on',
  'the',
  'killing',
  'streets',
  'five',
  'stars'],
 'token_labels': [32, 71, 9, 9, 9, 9, 9, 9, 8, 45],
 'input_ids': [101,
  3446,
  18268,
  1024,
  1037,
  2095,
  2006,
  1996,
  4288,
  4534,
  2274,
  3340,
  102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [128]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
data_collator

DataCollatorWithPadding(tokenizer=DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [129]:
sequence_clf_model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(unique_sequence_labels),
)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.26.1",
  "vocab_size": 30522
}

loading 

In [130]:
unique_sequence_labels

['AddToPlaylist',
 'SearchCreativeWork',
 'BookRestaurant',
 'RateBook',
 'GetWeather',
 'SearchScreeningEvent',
 'PlayMusic']

In [131]:
{i: l for i, l in enumerate(unique_sequence_labels)}

{0: 'AddToPlaylist',
 1: 'SearchCreativeWork',
 2: 'BookRestaurant',
 3: 'RateBook',
 4: 'GetWeather',
 5: 'SearchScreeningEvent',
 6: 'PlayMusic'}

In [132]:
sequence_clf_model.config.id2label = {i: l for i, l in enumerate(unique_sequence_labels)}

In [133]:
sequence_clf_model.config.id2label[0]

'AddToPlaylist'

In [134]:
metric = load('accuracy')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [135]:
len(seq_clf_tokenized_snips['train']) // 5

2093

In [136]:
epochs = 5
training_args = TrainingArguments(
    output_dir='snips_clf/results',
    num_train_epochs=epochs,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    load_best_model_at_end=True,

    warmup_steps=len(seq_clf_tokenized_snips['train']) // 5,
    weight_decay=0.05,

    logging_steps=1,
    log_level='info',
    evaluation_strategy='epoch',
    save_strategy='epoch'
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [137]:
trainer = Trainer(
    model=sequence_clf_model,
    args=training_args,
    train_dataset=seq_clf_tokenized_snips['train'],
    eval_dataset=seq_clf_tokenized_snips['test'],
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

In [138]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, token_labels, tokens. If utterance, token_labels, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 1.9488182067871094,
 'eval_accuracy': 0.13526939243408484,
 'eval_runtime': 2.3445,
 'eval_samples_per_second': 1116.253,
 'eval_steps_per_second': 34.976}

In [139]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, token_labels, tokens. If utterance, token_labels, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10467
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1640
  Number of trainable parameters = 66958855


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1924,0.235212,0.973634


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, token_labels, tokens. If utterance, token_labels, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
Saving model checkpoint to snips_clf/results/checkpoint-328
Configuration saved in snips_clf/results/checkpoint-328/config.json
Saving model checkpoint to snips_clf/results/checkpoint-328
Configuration saved in snips_clf/results/checkpoint-328/config.json
Model weights saved in snips_clf/results/checkpoint-328/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, token_labels, tokens. If utterance, token

TrainOutput(global_step=1640, training_loss=0.310913683539512, metrics={'train_runtime': 115.9147, 'train_samples_per_second': 451.496, 'train_steps_per_second': 14.148, 'total_flos': 293283097192344.0, 'train_loss': 0.310913683539512, 'epoch': 5.0})

In [140]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, token_labels, tokens. If utterance, token_labels, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'eval_loss': 0.04882054403424263,
 'eval_accuracy': 0.9893007260221628,
 'eval_runtime': 2.353,
 'eval_samples_per_second': 1112.2,
 'eval_steps_per_second': 34.849,
 'epoch': 5.0}

In [155]:
device = torch.device(torch.cuda.current_device() if torch.cuda.is_available() else 'cpu')
pipe = pipeline('text-classification', sequence_clf_model, tokenizer=tokenizer, device=device)
pipe('Add Two Coins by Dispatch to my road trip playlist')

[{'label': 'AddToPlaylist', 'score': 0.9988709092140198}]

In [156]:
trainer.save_model()

Saving model checkpoint to snips_clf/results
Configuration saved in snips_clf/results/config.json
Model weights saved in snips_clf/results/pytorch_model.bin


In [158]:
pipe = pipeline('text-classification', 'snips_clf/results', tokenizer=tokenizer, device=device)
pipe('Add Two Coins by Dispatch to my road trip playlist')

loading configuration file snips_clf/results/config.json
Model config DistilBertConfig {
  "_name_or_path": "snips_clf/results",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "AddToPlaylist",
    "1": "SearchCreativeWork",
    "2": "BookRestaurant",
    "3": "RateBook",
    "4": "GetWeather",
    "5": "SearchScreeningEvent",
    "6": "PlayMusic"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers

[{'label': 'AddToPlaylist', 'score': 0.9988709092140198}]

In [159]:
frozen_sequence_clf_model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(unique_sequence_labels),
)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.26.1",
  "vocab_size": 30522
}

loading 

In [161]:
frozen_sequence_clf_model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [160]:
for param in frozen_sequence_clf_model.distilbert.parameters():
    param.requires_grad = False

In [165]:
epochs = 3
training_args = TrainingArguments(
    output_dir='snips_clf/results',
    num_train_epochs=epochs,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    load_best_model_at_end=True,

    warmup_steps=len(seq_clf_tokenized_snips['train']) // 5,
    weight_decay=0.05,

    logging_steps=1,
    log_level='info',
    evaluation_strategy='epoch',
    save_strategy='epoch'
)

trainer = Trainer(
    model=frozen_sequence_clf_model,
    args=training_args,
    train_dataset=seq_clf_tokenized_snips['train'],
    eval_dataset=seq_clf_tokenized_snips['test'],
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [166]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, token_labels, tokens. If utterance, token_labels, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'eval_loss': 1.952977180480957,
 'eval_accuracy': 0.10584638899503249,
 'eval_runtime': 1.2053,
 'eval_samples_per_second': 2171.182,
 'eval_steps_per_second': 68.031}

In [167]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, token_labels, tokens. If utterance, token_labels, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10467
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 984
  Number of trainable parameters = 595975


Epoch,Training Loss,Validation Loss


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, token_labels, tokens. If utterance, token_labels, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
Saving model checkpoint to snips_clf/results/checkpoint-328
Configuration saved in snips_clf/results/checkpoint-328/config.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, token_labels, tokens. If utterance, token_labels, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
Saving mode

TrainOutput(global_step=984, training_loss=1.6876993434337098, metrics={'train_runtime': 29.5383, 'train_samples_per_second': 1063.06, 'train_steps_per_second': 33.313, 'total_flos': 175579717390434.0, 'train_loss': 1.6876993434337098, 'epoch': 3.0})

In [168]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: utterance, token_labels, tokens. If utterance, token_labels, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'eval_loss': 1.058956503868103,
 'eval_accuracy': 0.8926251432938479,
 'eval_runtime': 1.1883,
 'eval_samples_per_second': 2202.213,
 'eval_steps_per_second': 69.003,
 'epoch': 3.0}