## Fine-tuning BERT by adding a FFL on top of the pooler layer, in the artitecture

#### Codes adapted from: https://github.com/sinanuozdemir

### Import Packages

In [1]:
from transformers import Trainer, TrainingArguments, DistilBertForSequenceClassification, DistilBertTokenizerFast, \
     DataCollatorWithPadding, pipeline
from datasets import load_metric, Dataset
import numpy as np
from sklearn.preprocessing import LabelEncoder

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
snips_file = open('Data/snips.train.txt', 'rb')

snips_rows = snips_file.readlines()

snips_rows[:20]

[b'listen O\r\n',
 b'to O\r\n',
 b'westbam B-artist\r\n',
 b'alumb O\r\n',
 b'allergic B-album\r\n',
 b'on O\r\n',
 b'google B-service\r\n',
 b'music I-service\r\n',
 b'PlayMusic\r\n',
 b'\r\n',
 b'add O\r\n',
 b'step B-entity_name\r\n',
 b'to I-entity_name\r\n',
 b'me I-entity_name\r\n',
 b'to O\r\n',
 b'the O\r\n',
 b'50 B-playlist\r\n',
 b'cl\xc3\xa1sicos I-playlist\r\n',
 b'playlist O\r\n',
 b'AddToPlaylist\r\n']

In [3]:
# This code segment parses the snips dataset into a more manageable format

utterances = []
tokenized_utterances = []
labels_for_tokens = []
sequence_labels = []

utterance, tokenized_utterance, label_for_utterances = '', [], []
for snip_row in snips_rows:
    if len(snip_row) == 2:  # skip over rows with no data
        continue
    if ' ' not in snip_row.decode():  # we've hit a sequence label
        sequence_labels.append(snip_row.decode().strip())
        utterances.append(utterance.strip())
        tokenized_utterances.append(tokenized_utterance)
        labels_for_tokens.append(label_for_utterances)
        utterance = ''
        tokenized_utterance = []
        label_for_utterances = []
        continue
    token, token_label = snip_row.decode().split(' ')
    token_label = token_label.strip()
    utterance += f'{token} '
    tokenized_utterance.append(token)
    label_for_utterances.append(token_label)

In [4]:
len(labels_for_tokens), len(tokenized_utterances), len(utterances), len(sequence_labels)

(13084, 13084, 13084, 13084)

In [5]:
print(tokenized_utterances[0])
print(labels_for_tokens[0])
print(utterances[0])
print(sequence_labels[0])

['listen', 'to', 'westbam', 'alumb', 'allergic', 'on', 'google', 'music']
['O', 'O', 'B-artist', 'O', 'B-album', 'O', 'B-service', 'I-service']
listen to westbam alumb allergic on google music
PlayMusic


In [6]:
unique_sequence_labels = list(set(sequence_labels))
unique_sequence_labels

['AddToPlaylist',
 'SearchCreativeWork',
 'RateBook',
 'PlayMusic',
 'GetWeather',
 'BookRestaurant',
 'SearchScreeningEvent']

In [7]:
sequence_labels = [unique_sequence_labels.index(l) for l in sequence_labels]
print(f'There are {len(unique_sequence_labels)} unique sequence labels')

There are 7 unique sequence labels


In [8]:
from functools import reduce

unique_token_labels = list(set(reduce(lambda x, y: x + y, labels_for_tokens)))
labels_for_tokens = [[unique_token_labels.index(_) for _ in l] for l in labels_for_tokens]

print(f'There are {len(unique_token_labels)} unique token labels')

There are 72 unique token labels


In [9]:
print(tokenized_utterances[0])
print(labels_for_tokens[0])
print([unique_token_labels[l] for l in labels_for_tokens[0]])
print(utterances[0])
print(sequence_labels[0])
print(unique_sequence_labels[sequence_labels[0]])

['listen', 'to', 'westbam', 'alumb', 'allergic', 'on', 'google', 'music']
[11, 11, 14, 11, 41, 11, 55, 64]
['O', 'O', 'B-artist', 'O', 'B-album', 'O', 'B-service', 'I-service']
listen to westbam alumb allergic on google music
3
PlayMusic


In [10]:
snips_dataset = Dataset.from_dict(
    dict(
        utterance=utterances, 
        label=sequence_labels,
        tokens=tokenized_utterances,
        token_labels=labels_for_tokens
    )
)
snips_dataset = snips_dataset.train_test_split(test_size=0.2)

In [11]:
snips_dataset['train'][0]

{'utterance': 'find check in',
 'label': 1,
 'tokens': ['find', 'check', 'in'],
 'token_labels': [11, 12, 43]}

In [12]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [13]:
# simple function to batch tokenize utterances with truncation
def preprocess_function(examples):
    return tokenizer(examples["utterance"], truncation=True)

In [14]:
seq_clf_tokenized_snips = snips_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/10467 [00:00<?, ? examples/s]

Map: 100%|██████████| 10467/10467 [00:00<00:00, 12086.95 examples/s]
Map: 100%|██████████| 2617/2617 [00:00<00:00, 13699.26 examples/s]


In [15]:
seq_clf_tokenized_snips['train'][0]

{'utterance': 'find check in',
 'label': 1,
 'tokens': ['find', 'check', 'in'],
 'token_labels': [11, 12, 43],
 'input_ids': [101, 2424, 4638, 1999, 102],
 'attention_mask': [1, 1, 1, 1, 1]}

In [16]:
# DataCollatorWithPadding creates batch of data. It also dynamically pads text to the 
#  length of the longest element in the batch, making them all the same length. 
#  It's possible to pad your text in the tokenizer function with padding=True, dynamic padding is more efficient.
# Data Collator will pad data so that all examples are the same input length.
#  Attention mask is how we ignore attention scores for padding tokens

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [17]:
sequence_clf_model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', 
    num_labels=len(unique_sequence_labels),
)

# set an index -> label dictionary
sequence_clf_model.config.id2label = {i: l for i, l in enumerate(unique_sequence_labels)}

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
sequence_clf_model.config.id2label[0]

'AddToPlaylist'

In [19]:
metric = load_metric("accuracy")

def compute_metrics(eval_pred):  # custom method to take in logits and calculate accuracy of the eval set
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

  metric = load_metric("accuracy")


In [20]:
epochs = 2

training_args = TrainingArguments(
    output_dir="./snips_clf",
    num_train_epochs=epochs,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    load_best_model_at_end=True,
    
    # some deep learning parameters that the Trainer is able to take in
    warmup_steps=len(seq_clf_tokenized_snips['train']) // 5,  # number of warmup steps for learning rate scheduler,
    weight_decay = 0.05,
    
    logging_steps=1,
    log_level='info',
    evaluation_strategy='epoch',
    eval_steps=50,
    save_strategy='epoch'
)

# Define the trainer:

trainer = Trainer(
    model=sequence_clf_model,
    args=training_args,
    train_dataset=seq_clf_tokenized_snips['train'],
    eval_dataset=seq_clf_tokenized_snips['test'],
    compute_metrics=compute_metrics,
    data_collator=data_collator
)


In [21]:

# Get initial metrics before training (based randomly)
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, utterance, tokens. If token_labels, utterance, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 82/82 [00:29<00:00,  2.80it/s]


{'eval_loss': 1.9434489011764526,
 'eval_accuracy': 0.14520443255636226,
 'eval_runtime': 29.7363,
 'eval_samples_per_second': 88.007,
 'eval_steps_per_second': 2.758}

In [22]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, utterance, tokens. If token_labels, utterance, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10,467
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 656
  Number of trainable parameters = 66,958,855
  0%|          | 1/656 [00:01<18:29,  1.69s/it]

{'loss': 1.9381, 'learning_rate': 2.3889154323936934e-08, 'epoch': 0.0}


  0%|          | 2/656 [00:03<17:49,  1.64s/it]

{'loss': 1.9401, 'learning_rate': 4.777830864787387e-08, 'epoch': 0.01}


  0%|          | 3/656 [00:05<19:22,  1.78s/it]

{'loss': 1.9522, 'learning_rate': 7.16674629718108e-08, 'epoch': 0.01}


  1%|          | 4/656 [00:07<21:36,  1.99s/it]

{'loss': 1.9548, 'learning_rate': 9.555661729574773e-08, 'epoch': 0.01}


  1%|          | 5/656 [00:09<22:48,  2.10s/it]

{'loss': 1.964, 'learning_rate': 1.1944577161968468e-07, 'epoch': 0.02}


  1%|          | 6/656 [00:12<24:02,  2.22s/it]

{'loss': 1.9442, 'learning_rate': 1.433349259436216e-07, 'epoch': 0.02}


  1%|          | 7/656 [00:14<25:16,  2.34s/it]

{'loss': 1.9725, 'learning_rate': 1.6722408026755853e-07, 'epoch': 0.02}


  1%|          | 8/656 [00:17<25:13,  2.34s/it]

{'loss': 1.9612, 'learning_rate': 1.9111323459149547e-07, 'epoch': 0.02}


  1%|▏         | 9/656 [00:19<24:53,  2.31s/it]

{'loss': 1.9665, 'learning_rate': 2.150023889154324e-07, 'epoch': 0.03}


  2%|▏         | 10/656 [00:21<25:34,  2.37s/it]

{'loss': 1.9446, 'learning_rate': 2.3889154323936937e-07, 'epoch': 0.03}


  2%|▏         | 11/656 [00:25<29:19,  2.73s/it]

{'loss': 1.9457, 'learning_rate': 2.6278069756330625e-07, 'epoch': 0.03}


  2%|▏         | 12/656 [00:28<28:56,  2.70s/it]

{'loss': 1.9578, 'learning_rate': 2.866698518872432e-07, 'epoch': 0.04}


  2%|▏         | 13/656 [00:30<27:28,  2.56s/it]

{'loss': 1.9414, 'learning_rate': 3.1055900621118013e-07, 'epoch': 0.04}


  2%|▏         | 14/656 [00:33<27:52,  2.61s/it]

{'loss': 1.9301, 'learning_rate': 3.3444816053511706e-07, 'epoch': 0.04}


  2%|▏         | 15/656 [00:35<26:55,  2.52s/it]

{'loss': 1.9664, 'learning_rate': 3.58337314859054e-07, 'epoch': 0.05}


  2%|▏         | 16/656 [00:38<28:33,  2.68s/it]

{'loss': 1.929, 'learning_rate': 3.8222646918299094e-07, 'epoch': 0.05}


  3%|▎         | 17/656 [00:40<28:00,  2.63s/it]

{'loss': 1.9643, 'learning_rate': 4.0611562350692793e-07, 'epoch': 0.05}


  3%|▎         | 18/656 [00:43<27:47,  2.61s/it]

{'loss': 1.9514, 'learning_rate': 4.300047778308648e-07, 'epoch': 0.05}


  3%|▎         | 19/656 [00:46<27:12,  2.56s/it]

{'loss': 1.9324, 'learning_rate': 4.5389393215480175e-07, 'epoch': 0.06}


  3%|▎         | 20/656 [00:48<26:01,  2.46s/it]

{'loss': 1.94, 'learning_rate': 4.777830864787387e-07, 'epoch': 0.06}


  3%|▎         | 21/656 [00:50<26:14,  2.48s/it]

{'loss': 1.9471, 'learning_rate': 5.016722408026756e-07, 'epoch': 0.06}


  3%|▎         | 22/656 [00:52<25:26,  2.41s/it]

{'loss': 1.9258, 'learning_rate': 5.255613951266125e-07, 'epoch': 0.07}


  4%|▎         | 23/656 [00:55<25:52,  2.45s/it]

{'loss': 1.9436, 'learning_rate': 5.494505494505495e-07, 'epoch': 0.07}


  4%|▎         | 24/656 [00:57<25:19,  2.40s/it]

{'loss': 1.9363, 'learning_rate': 5.733397037744864e-07, 'epoch': 0.07}


  4%|▍         | 25/656 [01:00<25:05,  2.39s/it]

{'loss': 1.948, 'learning_rate': 5.972288580984234e-07, 'epoch': 0.08}


  4%|▍         | 26/656 [01:02<26:02,  2.48s/it]

{'loss': 1.9586, 'learning_rate': 6.211180124223603e-07, 'epoch': 0.08}


  4%|▍         | 27/656 [01:05<26:03,  2.49s/it]

{'loss': 1.9609, 'learning_rate': 6.450071667462972e-07, 'epoch': 0.08}


  4%|▍         | 28/656 [01:07<25:58,  2.48s/it]

{'loss': 1.9427, 'learning_rate': 6.688963210702341e-07, 'epoch': 0.09}


  4%|▍         | 29/656 [01:10<25:48,  2.47s/it]

{'loss': 1.9423, 'learning_rate': 6.92785475394171e-07, 'epoch': 0.09}


  5%|▍         | 30/656 [01:13<26:35,  2.55s/it]

{'loss': 1.9402, 'learning_rate': 7.16674629718108e-07, 'epoch': 0.09}


  5%|▍         | 31/656 [01:15<26:07,  2.51s/it]

{'loss': 1.9306, 'learning_rate': 7.405637840420449e-07, 'epoch': 0.09}


  5%|▍         | 32/656 [01:18<29:08,  2.80s/it]

{'loss': 1.9235, 'learning_rate': 7.644529383659819e-07, 'epoch': 0.1}


  5%|▌         | 33/656 [01:21<27:39,  2.66s/it]

{'loss': 1.9595, 'learning_rate': 7.883420926899189e-07, 'epoch': 0.1}


  5%|▌         | 34/656 [01:24<29:29,  2.84s/it]

{'loss': 1.9616, 'learning_rate': 8.122312470138559e-07, 'epoch': 0.1}


  5%|▌         | 35/656 [01:28<34:06,  3.30s/it]

{'loss': 1.9548, 'learning_rate': 8.361204013377926e-07, 'epoch': 0.11}


  5%|▌         | 36/656 [01:31<32:05,  3.11s/it]

{'loss': 1.937, 'learning_rate': 8.600095556617296e-07, 'epoch': 0.11}


  6%|▌         | 37/656 [01:34<30:50,  2.99s/it]

{'loss': 1.9392, 'learning_rate': 8.838987099856666e-07, 'epoch': 0.11}


  6%|▌         | 38/656 [01:36<29:06,  2.83s/it]

{'loss': 1.9197, 'learning_rate': 9.077878643096035e-07, 'epoch': 0.12}


  6%|▌         | 39/656 [01:39<29:14,  2.84s/it]

{'loss': 1.9618, 'learning_rate': 9.316770186335405e-07, 'epoch': 0.12}


  6%|▌         | 40/656 [01:42<29:27,  2.87s/it]

{'loss': 1.9409, 'learning_rate': 9.555661729574775e-07, 'epoch': 0.12}


  6%|▋         | 41/656 [01:45<28:35,  2.79s/it]

{'loss': 1.9526, 'learning_rate': 9.794553272814141e-07, 'epoch': 0.12}


  6%|▋         | 42/656 [01:48<29:27,  2.88s/it]

{'loss': 1.9375, 'learning_rate': 1.0033444816053512e-06, 'epoch': 0.13}


  7%|▋         | 43/656 [01:50<27:23,  2.68s/it]

{'loss': 1.9227, 'learning_rate': 1.0272336359292883e-06, 'epoch': 0.13}


  7%|▋         | 44/656 [01:52<26:43,  2.62s/it]

{'loss': 1.9305, 'learning_rate': 1.051122790253225e-06, 'epoch': 0.13}


  7%|▋         | 45/656 [01:55<27:09,  2.67s/it]

{'loss': 1.9256, 'learning_rate': 1.0750119445771621e-06, 'epoch': 0.14}


  7%|▋         | 46/656 [01:57<25:46,  2.54s/it]

{'loss': 1.9353, 'learning_rate': 1.098901098901099e-06, 'epoch': 0.14}


  7%|▋         | 47/656 [02:00<26:59,  2.66s/it]

{'loss': 1.9367, 'learning_rate': 1.1227902532250359e-06, 'epoch': 0.14}


  7%|▋         | 48/656 [02:03<28:09,  2.78s/it]

{'loss': 1.9507, 'learning_rate': 1.1466794075489728e-06, 'epoch': 0.15}


  7%|▋         | 49/656 [02:07<29:18,  2.90s/it]

{'loss': 1.9265, 'learning_rate': 1.1705685618729096e-06, 'epoch': 0.15}


  8%|▊         | 50/656 [02:09<28:52,  2.86s/it]

{'loss': 1.9559, 'learning_rate': 1.1944577161968467e-06, 'epoch': 0.15}


  8%|▊         | 51/656 [02:12<27:12,  2.70s/it]

{'loss': 1.9446, 'learning_rate': 1.2183468705207836e-06, 'epoch': 0.16}


  8%|▊         | 52/656 [02:14<27:06,  2.69s/it]

{'loss': 1.9396, 'learning_rate': 1.2422360248447205e-06, 'epoch': 0.16}


  8%|▊         | 53/656 [02:18<28:41,  2.86s/it]

{'loss': 1.9346, 'learning_rate': 1.2661251791686574e-06, 'epoch': 0.16}


  8%|▊         | 54/656 [02:21<29:49,  2.97s/it]

{'loss': 1.9219, 'learning_rate': 1.2900143334925945e-06, 'epoch': 0.16}


  8%|▊         | 55/656 [02:24<29:15,  2.92s/it]

{'loss': 1.9224, 'learning_rate': 1.3139034878165314e-06, 'epoch': 0.17}


  9%|▊         | 56/656 [02:27<30:04,  3.01s/it]

{'loss': 1.9288, 'learning_rate': 1.3377926421404683e-06, 'epoch': 0.17}


  9%|▊         | 57/656 [02:30<29:55,  3.00s/it]

{'loss': 1.9343, 'learning_rate': 1.3616817964644054e-06, 'epoch': 0.17}


  9%|▉         | 58/656 [02:34<32:55,  3.30s/it]

{'loss': 1.9382, 'learning_rate': 1.385570950788342e-06, 'epoch': 0.18}


  9%|▉         | 59/656 [02:37<32:37,  3.28s/it]

{'loss': 1.9286, 'learning_rate': 1.4094601051122791e-06, 'epoch': 0.18}


  9%|▉         | 60/656 [02:40<31:51,  3.21s/it]

{'loss': 1.9283, 'learning_rate': 1.433349259436216e-06, 'epoch': 0.18}


  9%|▉         | 61/656 [02:43<30:23,  3.07s/it]

{'loss': 1.9233, 'learning_rate': 1.4572384137601529e-06, 'epoch': 0.19}


  9%|▉         | 62/656 [02:46<31:54,  3.22s/it]

{'loss': 1.9385, 'learning_rate': 1.4811275680840898e-06, 'epoch': 0.19}


 10%|▉         | 63/656 [02:49<30:51,  3.12s/it]

{'loss': 1.9041, 'learning_rate': 1.5050167224080269e-06, 'epoch': 0.19}


 10%|▉         | 64/656 [02:52<29:08,  2.95s/it]

{'loss': 1.933, 'learning_rate': 1.5289058767319638e-06, 'epoch': 0.2}


 10%|▉         | 65/656 [02:54<27:43,  2.81s/it]

{'loss': 1.9213, 'learning_rate': 1.5527950310559006e-06, 'epoch': 0.2}


 10%|█         | 66/656 [02:57<27:59,  2.85s/it]

{'loss': 1.9216, 'learning_rate': 1.5766841853798377e-06, 'epoch': 0.2}


 10%|█         | 67/656 [03:00<27:40,  2.82s/it]

{'loss': 1.9284, 'learning_rate': 1.6005733397037744e-06, 'epoch': 0.2}


 10%|█         | 68/656 [03:03<27:47,  2.84s/it]

{'loss': 1.9327, 'learning_rate': 1.6244624940277117e-06, 'epoch': 0.21}


 11%|█         | 69/656 [03:06<27:50,  2.85s/it]

{'loss': 1.9136, 'learning_rate': 1.6483516483516484e-06, 'epoch': 0.21}


 11%|█         | 70/656 [03:09<27:22,  2.80s/it]

{'loss': 1.9262, 'learning_rate': 1.6722408026755853e-06, 'epoch': 0.21}


 11%|█         | 71/656 [03:12<28:33,  2.93s/it]

{'loss': 1.9326, 'learning_rate': 1.6961299569995224e-06, 'epoch': 0.22}


 11%|█         | 72/656 [03:14<27:10,  2.79s/it]

{'loss': 1.904, 'learning_rate': 1.7200191113234592e-06, 'epoch': 0.22}


 11%|█         | 73/656 [03:17<27:59,  2.88s/it]

{'loss': 1.8994, 'learning_rate': 1.7439082656473961e-06, 'epoch': 0.22}


 11%|█▏        | 74/656 [03:20<27:41,  2.85s/it]

{'loss': 1.9135, 'learning_rate': 1.7677974199713332e-06, 'epoch': 0.23}


 11%|█▏        | 75/656 [03:23<27:09,  2.81s/it]

{'loss': 1.9135, 'learning_rate': 1.7916865742952701e-06, 'epoch': 0.23}


 12%|█▏        | 76/656 [03:26<27:39,  2.86s/it]

{'loss': 1.9111, 'learning_rate': 1.815575728619207e-06, 'epoch': 0.23}


 12%|█▏        | 77/656 [03:29<27:22,  2.84s/it]

{'loss': 1.895, 'learning_rate': 1.839464882943144e-06, 'epoch': 0.23}


 12%|█▏        | 78/656 [03:32<27:41,  2.87s/it]

{'loss': 1.9019, 'learning_rate': 1.863354037267081e-06, 'epoch': 0.24}


 12%|█▏        | 79/656 [03:35<28:49,  3.00s/it]

{'loss': 1.9108, 'learning_rate': 1.8872431915910176e-06, 'epoch': 0.24}


 12%|█▏        | 80/656 [03:38<28:23,  2.96s/it]

{'loss': 1.9149, 'learning_rate': 1.911132345914955e-06, 'epoch': 0.24}


 12%|█▏        | 81/656 [03:41<30:09,  3.15s/it]

{'loss': 1.92, 'learning_rate': 1.935021500238892e-06, 'epoch': 0.25}


 12%|█▎        | 82/656 [03:44<29:35,  3.09s/it]

{'loss': 1.885, 'learning_rate': 1.9589106545628283e-06, 'epoch': 0.25}


 13%|█▎        | 83/656 [03:47<28:33,  2.99s/it]

{'loss': 1.8982, 'learning_rate': 1.9827998088867656e-06, 'epoch': 0.25}


 13%|█▎        | 84/656 [03:50<27:27,  2.88s/it]

{'loss': 1.9031, 'learning_rate': 2.0066889632107025e-06, 'epoch': 0.26}


 13%|█▎        | 85/656 [03:53<28:42,  3.02s/it]

{'loss': 1.9105, 'learning_rate': 2.0305781175346394e-06, 'epoch': 0.26}


 13%|█▎        | 86/656 [03:56<28:18,  2.98s/it]

{'loss': 1.9067, 'learning_rate': 2.0544672718585767e-06, 'epoch': 0.26}


 13%|█▎        | 87/656 [03:59<29:24,  3.10s/it]

{'loss': 1.9125, 'learning_rate': 2.078356426182513e-06, 'epoch': 0.27}


 13%|█▎        | 88/656 [04:02<28:17,  2.99s/it]

{'loss': 1.8697, 'learning_rate': 2.10224558050645e-06, 'epoch': 0.27}


 14%|█▎        | 89/656 [04:06<30:05,  3.18s/it]

{'loss': 1.9022, 'learning_rate': 2.1261347348303873e-06, 'epoch': 0.27}


 14%|█▎        | 90/656 [04:08<28:24,  3.01s/it]

{'loss': 1.9016, 'learning_rate': 2.1500238891543242e-06, 'epoch': 0.27}


 14%|█▍        | 91/656 [04:11<28:08,  2.99s/it]

{'loss': 1.8808, 'learning_rate': 2.173913043478261e-06, 'epoch': 0.28}


 14%|█▍        | 92/656 [04:14<27:14,  2.90s/it]

{'loss': 1.897, 'learning_rate': 2.197802197802198e-06, 'epoch': 0.28}


 14%|█▍        | 93/656 [04:17<27:37,  2.94s/it]

{'loss': 1.9097, 'learning_rate': 2.221691352126135e-06, 'epoch': 0.28}


 14%|█▍        | 94/656 [04:20<27:37,  2.95s/it]

{'loss': 1.9082, 'learning_rate': 2.2455805064500718e-06, 'epoch': 0.29}


 14%|█▍        | 95/656 [04:23<27:44,  2.97s/it]

{'loss': 1.8818, 'learning_rate': 2.269469660774009e-06, 'epoch': 0.29}


 15%|█▍        | 96/656 [04:26<28:05,  3.01s/it]

{'loss': 1.8644, 'learning_rate': 2.2933588150979455e-06, 'epoch': 0.29}


 15%|█▍        | 97/656 [04:29<28:22,  3.05s/it]

{'loss': 1.8963, 'learning_rate': 2.3172479694218824e-06, 'epoch': 0.3}


 15%|█▍        | 98/656 [04:32<29:17,  3.15s/it]

{'loss': 1.8689, 'learning_rate': 2.3411371237458193e-06, 'epoch': 0.3}


 15%|█▌        | 99/656 [04:35<28:08,  3.03s/it]

{'loss': 1.8721, 'learning_rate': 2.3650262780697566e-06, 'epoch': 0.3}


 15%|█▌        | 100/656 [04:38<26:56,  2.91s/it]

{'loss': 1.8857, 'learning_rate': 2.3889154323936935e-06, 'epoch': 0.3}


 15%|█▌        | 101/656 [04:41<26:45,  2.89s/it]

{'loss': 1.8874, 'learning_rate': 2.41280458671763e-06, 'epoch': 0.31}


 16%|█▌        | 102/656 [04:44<26:37,  2.88s/it]

{'loss': 1.8731, 'learning_rate': 2.4366937410415673e-06, 'epoch': 0.31}


 16%|█▌        | 103/656 [04:47<26:55,  2.92s/it]

{'loss': 1.8798, 'learning_rate': 2.460582895365504e-06, 'epoch': 0.31}


 16%|█▌        | 104/656 [04:50<27:30,  2.99s/it]

{'loss': 1.8787, 'learning_rate': 2.484472049689441e-06, 'epoch': 0.32}


 16%|█▌        | 105/656 [04:53<27:30,  3.00s/it]

{'loss': 1.8778, 'learning_rate': 2.508361204013378e-06, 'epoch': 0.32}


 16%|█▌        | 106/656 [04:56<27:30,  3.00s/it]

{'loss': 1.8608, 'learning_rate': 2.5322503583373148e-06, 'epoch': 0.32}


 16%|█▋        | 107/656 [04:59<27:44,  3.03s/it]

{'loss': 1.8539, 'learning_rate': 2.5561395126612517e-06, 'epoch': 0.33}


 16%|█▋        | 108/656 [05:03<29:40,  3.25s/it]

{'loss': 1.8604, 'learning_rate': 2.580028666985189e-06, 'epoch': 0.33}


 17%|█▋        | 109/656 [05:06<29:10,  3.20s/it]

{'loss': 1.8397, 'learning_rate': 2.603917821309126e-06, 'epoch': 0.33}


 17%|█▋        | 110/656 [05:09<28:09,  3.09s/it]

{'loss': 1.8524, 'learning_rate': 2.6278069756330627e-06, 'epoch': 0.34}


 17%|█▋        | 111/656 [05:11<26:22,  2.90s/it]

{'loss': 1.8386, 'learning_rate': 2.6516961299569996e-06, 'epoch': 0.34}


 17%|█▋        | 112/656 [05:14<26:03,  2.87s/it]

{'loss': 1.8464, 'learning_rate': 2.6755852842809365e-06, 'epoch': 0.34}


 17%|█▋        | 113/656 [05:17<26:47,  2.96s/it]

{'loss': 1.8262, 'learning_rate': 2.6994744386048734e-06, 'epoch': 0.34}


 17%|█▋        | 114/656 [05:20<26:41,  2.96s/it]

{'loss': 1.8698, 'learning_rate': 2.7233635929288107e-06, 'epoch': 0.35}


 18%|█▊        | 115/656 [05:23<27:45,  3.08s/it]

{'loss': 1.8195, 'learning_rate': 2.747252747252747e-06, 'epoch': 0.35}


 18%|█▊        | 116/656 [05:31<40:44,  4.53s/it]

{'loss': 1.8378, 'learning_rate': 2.771141901576684e-06, 'epoch': 0.35}


 18%|█▊        | 117/656 [05:36<42:47,  4.76s/it]

{'loss': 1.8118, 'learning_rate': 2.7950310559006214e-06, 'epoch': 0.36}


 18%|█▊        | 118/656 [05:40<40:22,  4.50s/it]

{'loss': 1.8591, 'learning_rate': 2.8189202102245582e-06, 'epoch': 0.36}


 18%|█▊        | 119/656 [05:44<37:12,  4.16s/it]

{'loss': 1.8149, 'learning_rate': 2.842809364548495e-06, 'epoch': 0.36}


 18%|█▊        | 120/656 [05:47<33:45,  3.78s/it]

{'loss': 1.8252, 'learning_rate': 2.866698518872432e-06, 'epoch': 0.37}


 18%|█▊        | 121/656 [05:50<31:27,  3.53s/it]

{'loss': 1.8305, 'learning_rate': 2.890587673196369e-06, 'epoch': 0.37}


 19%|█▊        | 122/656 [05:53<29:54,  3.36s/it]

{'loss': 1.7864, 'learning_rate': 2.9144768275203058e-06, 'epoch': 0.37}


 19%|█▉        | 123/656 [05:56<28:56,  3.26s/it]

{'loss': 1.7932, 'learning_rate': 2.938365981844243e-06, 'epoch': 0.38}


 19%|█▉        | 124/656 [05:59<28:12,  3.18s/it]

{'loss': 1.764, 'learning_rate': 2.9622551361681795e-06, 'epoch': 0.38}


 19%|█▉        | 125/656 [06:02<29:27,  3.33s/it]

{'loss': 1.7714, 'learning_rate': 2.9861442904921164e-06, 'epoch': 0.38}


 19%|█▉        | 126/656 [06:05<27:52,  3.16s/it]

{'loss': 1.8183, 'learning_rate': 3.0100334448160537e-06, 'epoch': 0.38}


 19%|█▉        | 127/656 [06:08<27:26,  3.11s/it]

{'loss': 1.7755, 'learning_rate': 3.0339225991399906e-06, 'epoch': 0.39}


 20%|█▉        | 128/656 [06:11<28:10,  3.20s/it]

{'loss': 1.7723, 'learning_rate': 3.0578117534639275e-06, 'epoch': 0.39}


 20%|█▉        | 129/656 [06:15<28:03,  3.19s/it]

{'loss': 1.7572, 'learning_rate': 3.0817009077878644e-06, 'epoch': 0.39}


 20%|█▉        | 130/656 [06:18<27:26,  3.13s/it]

{'loss': 1.7637, 'learning_rate': 3.1055900621118013e-06, 'epoch': 0.4}


 20%|█▉        | 131/656 [06:21<28:08,  3.22s/it]

{'loss': 1.747, 'learning_rate': 3.1294792164357386e-06, 'epoch': 0.4}


 20%|██        | 132/656 [06:25<29:55,  3.43s/it]

{'loss': 1.799, 'learning_rate': 3.1533683707596755e-06, 'epoch': 0.4}


 20%|██        | 133/656 [06:30<34:05,  3.91s/it]

{'loss': 1.7508, 'learning_rate': 3.1772575250836123e-06, 'epoch': 0.41}


 20%|██        | 134/656 [06:34<33:15,  3.82s/it]

{'loss': 1.7636, 'learning_rate': 3.201146679407549e-06, 'epoch': 0.41}


 21%|██        | 135/656 [06:37<31:22,  3.61s/it]

{'loss': 1.7308, 'learning_rate': 3.2250358337314857e-06, 'epoch': 0.41}


 21%|██        | 136/656 [06:40<30:12,  3.48s/it]

{'loss': 1.7013, 'learning_rate': 3.2489249880554234e-06, 'epoch': 0.41}


 21%|██        | 137/656 [06:43<28:17,  3.27s/it]

{'loss': 1.7594, 'learning_rate': 3.2728141423793603e-06, 'epoch': 0.42}


 21%|██        | 138/656 [06:46<29:00,  3.36s/it]

{'loss': 1.7071, 'learning_rate': 3.2967032967032968e-06, 'epoch': 0.42}


 21%|██        | 139/656 [06:49<27:08,  3.15s/it]

{'loss': 1.7235, 'learning_rate': 3.3205924510272337e-06, 'epoch': 0.42}


 21%|██▏       | 140/656 [06:52<26:30,  3.08s/it]

{'loss': 1.756, 'learning_rate': 3.3444816053511705e-06, 'epoch': 0.43}


 21%|██▏       | 141/656 [06:55<27:00,  3.15s/it]

{'loss': 1.6627, 'learning_rate': 3.3683707596751074e-06, 'epoch': 0.43}


 22%|██▏       | 142/656 [06:59<27:42,  3.23s/it]

{'loss': 1.7523, 'learning_rate': 3.3922599139990447e-06, 'epoch': 0.43}


 22%|██▏       | 143/656 [07:01<26:57,  3.15s/it]

{'loss': 1.7134, 'learning_rate': 3.4161490683229816e-06, 'epoch': 0.44}


 22%|██▏       | 144/656 [07:05<27:43,  3.25s/it]

{'loss': 1.6807, 'learning_rate': 3.4400382226469185e-06, 'epoch': 0.44}


 22%|██▏       | 145/656 [07:08<26:34,  3.12s/it]

{'loss': 1.7133, 'learning_rate': 3.4639273769708554e-06, 'epoch': 0.44}


 22%|██▏       | 146/656 [07:11<25:59,  3.06s/it]

{'loss': 1.6957, 'learning_rate': 3.4878165312947923e-06, 'epoch': 0.45}


 22%|██▏       | 147/656 [07:14<25:25,  3.00s/it]

{'loss': 1.6754, 'learning_rate': 3.511705685618729e-06, 'epoch': 0.45}


 23%|██▎       | 148/656 [07:17<26:14,  3.10s/it]

{'loss': 1.6592, 'learning_rate': 3.5355948399426665e-06, 'epoch': 0.45}


 23%|██▎       | 149/656 [07:20<25:31,  3.02s/it]

{'loss': 1.6504, 'learning_rate': 3.5594839942666033e-06, 'epoch': 0.45}


 23%|██▎       | 150/656 [07:23<25:05,  2.98s/it]

{'loss': 1.6612, 'learning_rate': 3.5833731485905402e-06, 'epoch': 0.46}


 23%|██▎       | 151/656 [07:25<24:52,  2.95s/it]

{'loss': 1.7191, 'learning_rate': 3.607262302914477e-06, 'epoch': 0.46}


 23%|██▎       | 152/656 [07:29<25:22,  3.02s/it]

{'loss': 1.6459, 'learning_rate': 3.631151457238414e-06, 'epoch': 0.46}


 23%|██▎       | 153/656 [07:31<24:49,  2.96s/it]

{'loss': 1.7107, 'learning_rate': 3.6550406115623505e-06, 'epoch': 0.47}


 23%|██▎       | 154/656 [07:34<23:38,  2.83s/it]

{'loss': 1.6219, 'learning_rate': 3.678929765886288e-06, 'epoch': 0.47}


 24%|██▎       | 155/656 [07:37<23:51,  2.86s/it]

{'loss': 1.5918, 'learning_rate': 3.702818920210225e-06, 'epoch': 0.47}


 24%|██▍       | 156/656 [07:40<25:18,  3.04s/it]

{'loss': 1.6346, 'learning_rate': 3.726708074534162e-06, 'epoch': 0.48}


 24%|██▍       | 157/656 [07:44<27:18,  3.28s/it]

{'loss': 1.6574, 'learning_rate': 3.7505972288580984e-06, 'epoch': 0.48}


 24%|██▍       | 158/656 [07:47<26:53,  3.24s/it]

{'loss': 1.6143, 'learning_rate': 3.7744863831820353e-06, 'epoch': 0.48}


 24%|██▍       | 159/656 [07:50<25:21,  3.06s/it]

{'loss': 1.6179, 'learning_rate': 3.798375537505972e-06, 'epoch': 0.48}


 24%|██▍       | 160/656 [07:53<24:56,  3.02s/it]

{'loss': 1.6304, 'learning_rate': 3.82226469182991e-06, 'epoch': 0.49}


 25%|██▍       | 161/656 [07:56<24:00,  2.91s/it]

{'loss': 1.6144, 'learning_rate': 3.846153846153847e-06, 'epoch': 0.49}


 25%|██▍       | 162/656 [07:59<24:22,  2.96s/it]

{'loss': 1.6102, 'learning_rate': 3.870043000477784e-06, 'epoch': 0.49}


 25%|██▍       | 163/656 [08:02<25:08,  3.06s/it]

{'loss': 1.5717, 'learning_rate': 3.8939321548017206e-06, 'epoch': 0.5}


 25%|██▌       | 164/656 [08:05<25:14,  3.08s/it]

{'loss': 1.5706, 'learning_rate': 3.917821309125657e-06, 'epoch': 0.5}


 25%|██▌       | 165/656 [08:08<25:16,  3.09s/it]

{'loss': 1.6062, 'learning_rate': 3.9417104634495935e-06, 'epoch': 0.5}


 25%|██▌       | 166/656 [08:11<24:41,  3.02s/it]

{'loss': 1.6454, 'learning_rate': 3.965599617773531e-06, 'epoch': 0.51}


 25%|██▌       | 167/656 [08:14<24:34,  3.02s/it]

{'loss': 1.5555, 'learning_rate': 3.989488772097468e-06, 'epoch': 0.51}


 26%|██▌       | 168/656 [08:17<24:04,  2.96s/it]

{'loss': 1.4528, 'learning_rate': 4.013377926421405e-06, 'epoch': 0.51}


 26%|██▌       | 169/656 [08:20<24:51,  3.06s/it]

{'loss': 1.5804, 'learning_rate': 4.037267080745342e-06, 'epoch': 0.52}


 26%|██▌       | 170/656 [08:23<24:58,  3.08s/it]

{'loss': 1.4911, 'learning_rate': 4.061156235069279e-06, 'epoch': 0.52}


 26%|██▌       | 171/656 [08:26<24:03,  2.98s/it]

{'loss': 1.5174, 'learning_rate': 4.085045389393216e-06, 'epoch': 0.52}


 26%|██▌       | 172/656 [08:30<26:27,  3.28s/it]

{'loss': 1.5404, 'learning_rate': 4.108934543717153e-06, 'epoch': 0.52}


 26%|██▋       | 173/656 [08:34<28:22,  3.53s/it]

{'loss': 1.564, 'learning_rate': 4.132823698041089e-06, 'epoch': 0.53}


 27%|██▋       | 174/656 [08:38<30:06,  3.75s/it]

{'loss': 1.5464, 'learning_rate': 4.156712852365026e-06, 'epoch': 0.53}


 27%|██▋       | 175/656 [08:42<28:56,  3.61s/it]

{'loss': 1.494, 'learning_rate': 4.180602006688963e-06, 'epoch': 0.53}


 27%|██▋       | 176/656 [08:45<27:20,  3.42s/it]

{'loss': 1.4922, 'learning_rate': 4.2044911610129e-06, 'epoch': 0.54}


 27%|██▋       | 177/656 [08:48<26:36,  3.33s/it]

{'loss': 1.5078, 'learning_rate': 4.228380315336837e-06, 'epoch': 0.54}


 27%|██▋       | 178/656 [08:51<25:53,  3.25s/it]

{'loss': 1.5305, 'learning_rate': 4.252269469660775e-06, 'epoch': 0.54}


 27%|██▋       | 179/656 [08:54<24:45,  3.11s/it]

{'loss': 1.4523, 'learning_rate': 4.2761586239847116e-06, 'epoch': 0.55}


 27%|██▋       | 180/656 [08:56<23:35,  2.97s/it]

{'loss': 1.4055, 'learning_rate': 4.3000477783086484e-06, 'epoch': 0.55}


 28%|██▊       | 181/656 [08:59<23:27,  2.96s/it]

{'loss': 1.4896, 'learning_rate': 4.323936932632585e-06, 'epoch': 0.55}


 28%|██▊       | 182/656 [09:03<25:02,  3.17s/it]

{'loss': 1.4603, 'learning_rate': 4.347826086956522e-06, 'epoch': 0.55}


 28%|██▊       | 183/656 [09:06<24:19,  3.09s/it]

{'loss': 1.4832, 'learning_rate': 4.371715241280458e-06, 'epoch': 0.56}


 28%|██▊       | 184/656 [09:09<24:45,  3.15s/it]

{'loss': 1.3973, 'learning_rate': 4.395604395604396e-06, 'epoch': 0.56}


 28%|██▊       | 185/656 [09:12<23:18,  2.97s/it]

{'loss': 1.3811, 'learning_rate': 4.419493549928333e-06, 'epoch': 0.56}


 28%|██▊       | 186/656 [09:14<22:45,  2.91s/it]

{'loss': 1.376, 'learning_rate': 4.44338270425227e-06, 'epoch': 0.57}


 29%|██▊       | 187/656 [09:17<22:24,  2.87s/it]

{'loss': 1.3462, 'learning_rate': 4.467271858576207e-06, 'epoch': 0.57}


 29%|██▊       | 188/656 [09:20<22:41,  2.91s/it]

{'loss': 1.4016, 'learning_rate': 4.4911610129001435e-06, 'epoch': 0.57}


 29%|██▉       | 189/656 [09:24<24:38,  3.16s/it]

{'loss': 1.3549, 'learning_rate': 4.51505016722408e-06, 'epoch': 0.58}


 29%|██▉       | 190/656 [09:28<27:00,  3.48s/it]

{'loss': 1.3403, 'learning_rate': 4.538939321548018e-06, 'epoch': 0.58}


 29%|██▉       | 191/656 [09:31<26:02,  3.36s/it]

{'loss': 1.3275, 'learning_rate': 4.562828475871954e-06, 'epoch': 0.58}


 29%|██▉       | 192/656 [09:35<25:52,  3.35s/it]

{'loss': 1.3306, 'learning_rate': 4.586717630195891e-06, 'epoch': 0.59}


 29%|██▉       | 193/656 [09:38<25:53,  3.35s/it]

{'loss': 1.2866, 'learning_rate': 4.610606784519828e-06, 'epoch': 0.59}


 30%|██▉       | 194/656 [09:44<31:21,  4.07s/it]

{'loss': 1.3727, 'learning_rate': 4.634495938843765e-06, 'epoch': 0.59}


 30%|██▉       | 195/656 [09:49<34:16,  4.46s/it]

{'loss': 1.3253, 'learning_rate': 4.658385093167702e-06, 'epoch': 0.59}


 30%|██▉       | 196/656 [09:52<31:47,  4.15s/it]

{'loss': 1.2829, 'learning_rate': 4.682274247491639e-06, 'epoch': 0.6}


 30%|███       | 197/656 [09:56<29:28,  3.85s/it]

{'loss': 1.2733, 'learning_rate': 4.706163401815576e-06, 'epoch': 0.6}


 30%|███       | 198/656 [09:58<27:08,  3.56s/it]

{'loss': 1.231, 'learning_rate': 4.730052556139513e-06, 'epoch': 0.6}


 30%|███       | 199/656 [10:02<25:51,  3.40s/it]

{'loss': 1.3417, 'learning_rate': 4.75394171046345e-06, 'epoch': 0.61}


 30%|███       | 200/656 [10:04<24:50,  3.27s/it]

{'loss': 1.2355, 'learning_rate': 4.777830864787387e-06, 'epoch': 0.61}


 31%|███       | 201/656 [10:07<23:50,  3.14s/it]

{'loss': 1.2452, 'learning_rate': 4.801720019111324e-06, 'epoch': 0.61}


 31%|███       | 202/656 [10:11<24:01,  3.18s/it]

{'loss': 1.1995, 'learning_rate': 4.82560917343526e-06, 'epoch': 0.62}


 31%|███       | 203/656 [10:14<24:52,  3.29s/it]

{'loss': 1.163, 'learning_rate': 4.849498327759198e-06, 'epoch': 0.62}


 31%|███       | 204/656 [10:18<25:37,  3.40s/it]

{'loss': 1.1905, 'learning_rate': 4.8733874820831345e-06, 'epoch': 0.62}


 31%|███▏      | 205/656 [10:21<24:36,  3.27s/it]

{'loss': 1.1404, 'learning_rate': 4.897276636407071e-06, 'epoch': 0.62}


 31%|███▏      | 206/656 [10:24<25:22,  3.38s/it]

{'loss': 1.1898, 'learning_rate': 4.921165790731008e-06, 'epoch': 0.63}


 32%|███▏      | 207/656 [10:27<23:47,  3.18s/it]

{'loss': 1.1523, 'learning_rate': 4.945054945054945e-06, 'epoch': 0.63}


 32%|███▏      | 208/656 [10:30<23:48,  3.19s/it]

{'loss': 1.1058, 'learning_rate': 4.968944099378882e-06, 'epoch': 0.63}


 32%|███▏      | 209/656 [10:33<23:34,  3.16s/it]

{'loss': 1.1382, 'learning_rate': 4.99283325370282e-06, 'epoch': 0.64}


 32%|███▏      | 210/656 [10:37<23:21,  3.14s/it]

{'loss': 1.1242, 'learning_rate': 5.016722408026756e-06, 'epoch': 0.64}


 32%|███▏      | 211/656 [10:40<24:35,  3.32s/it]

{'loss': 1.0913, 'learning_rate': 5.040611562350693e-06, 'epoch': 0.64}


 32%|███▏      | 212/656 [10:43<24:20,  3.29s/it]

{'loss': 1.1689, 'learning_rate': 5.0645007166746296e-06, 'epoch': 0.65}


 32%|███▏      | 213/656 [10:46<23:33,  3.19s/it]

{'loss': 1.1821, 'learning_rate': 5.0883898709985665e-06, 'epoch': 0.65}


 33%|███▎      | 214/656 [10:49<22:30,  3.06s/it]

{'loss': 1.0881, 'learning_rate': 5.112279025322503e-06, 'epoch': 0.65}


 33%|███▎      | 215/656 [10:52<22:00,  2.99s/it]

{'loss': 1.1589, 'learning_rate': 5.136168179646441e-06, 'epoch': 0.66}


 33%|███▎      | 216/656 [10:55<21:34,  2.94s/it]

{'loss': 1.0496, 'learning_rate': 5.160057333970378e-06, 'epoch': 0.66}


 33%|███▎      | 217/656 [10:58<21:13,  2.90s/it]

{'loss': 1.031, 'learning_rate': 5.183946488294315e-06, 'epoch': 0.66}


 33%|███▎      | 218/656 [11:01<23:00,  3.15s/it]

{'loss': 1.0284, 'learning_rate': 5.207835642618252e-06, 'epoch': 0.66}


 33%|███▎      | 219/656 [11:04<21:54,  3.01s/it]

{'loss': 1.0048, 'learning_rate': 5.231724796942189e-06, 'epoch': 0.67}


 34%|███▎      | 220/656 [11:07<21:23,  2.94s/it]

{'loss': 1.0197, 'learning_rate': 5.2556139512661255e-06, 'epoch': 0.67}


 34%|███▎      | 221/656 [11:10<21:08,  2.92s/it]

{'loss': 1.0859, 'learning_rate': 5.279503105590062e-06, 'epoch': 0.67}


 34%|███▍      | 222/656 [11:12<20:44,  2.87s/it]

{'loss': 0.9883, 'learning_rate': 5.303392259913999e-06, 'epoch': 0.68}


 34%|███▍      | 223/656 [11:15<20:23,  2.83s/it]

{'loss': 0.9424, 'learning_rate': 5.327281414237936e-06, 'epoch': 0.68}


 34%|███▍      | 224/656 [11:18<21:21,  2.97s/it]

{'loss': 0.9435, 'learning_rate': 5.351170568561873e-06, 'epoch': 0.68}


 34%|███▍      | 225/656 [11:22<22:11,  3.09s/it]

{'loss': 1.0415, 'learning_rate': 5.37505972288581e-06, 'epoch': 0.69}


 34%|███▍      | 226/656 [11:25<22:41,  3.17s/it]

{'loss': 0.966, 'learning_rate': 5.398948877209747e-06, 'epoch': 0.69}


 35%|███▍      | 227/656 [11:29<23:30,  3.29s/it]

{'loss': 0.8793, 'learning_rate': 5.4228380315336845e-06, 'epoch': 0.69}


 35%|███▍      | 228/656 [11:32<23:01,  3.23s/it]

{'loss': 0.8737, 'learning_rate': 5.446727185857621e-06, 'epoch': 0.7}


 35%|███▍      | 229/656 [11:35<22:39,  3.18s/it]

{'loss': 0.8914, 'learning_rate': 5.4706163401815574e-06, 'epoch': 0.7}


 35%|███▌      | 230/656 [11:38<22:57,  3.23s/it]

{'loss': 0.8968, 'learning_rate': 5.494505494505494e-06, 'epoch': 0.7}


 35%|███▌      | 231/656 [11:41<21:55,  3.10s/it]

{'loss': 0.8822, 'learning_rate': 5.518394648829431e-06, 'epoch': 0.7}


 35%|███▌      | 232/656 [11:44<22:12,  3.14s/it]

{'loss': 0.8522, 'learning_rate': 5.542283803153368e-06, 'epoch': 0.71}


 36%|███▌      | 233/656 [11:48<22:50,  3.24s/it]

{'loss': 0.8443, 'learning_rate': 5.566172957477306e-06, 'epoch': 0.71}


 36%|███▌      | 234/656 [11:51<23:34,  3.35s/it]

{'loss': 0.8794, 'learning_rate': 5.590062111801243e-06, 'epoch': 0.71}


 36%|███▌      | 235/656 [11:55<24:13,  3.45s/it]

{'loss': 0.9124, 'learning_rate': 5.61395126612518e-06, 'epoch': 0.72}


 36%|███▌      | 236/656 [11:59<24:10,  3.45s/it]

{'loss': 0.922, 'learning_rate': 5.6378404204491165e-06, 'epoch': 0.72}


 36%|███▌      | 237/656 [12:02<25:02,  3.59s/it]

{'loss': 0.8267, 'learning_rate': 5.661729574773053e-06, 'epoch': 0.72}


 36%|███▋      | 238/656 [12:06<25:24,  3.65s/it]

{'loss': 0.8166, 'learning_rate': 5.68561872909699e-06, 'epoch': 0.73}


 36%|███▋      | 239/656 [12:10<24:39,  3.55s/it]

{'loss': 0.8765, 'learning_rate': 5.709507883420927e-06, 'epoch': 0.73}


 37%|███▋      | 240/656 [12:13<23:47,  3.43s/it]

{'loss': 0.8227, 'learning_rate': 5.733397037744864e-06, 'epoch': 0.73}


 37%|███▋      | 241/656 [12:16<23:16,  3.36s/it]

{'loss': 0.8696, 'learning_rate': 5.757286192068801e-06, 'epoch': 0.73}


 37%|███▋      | 242/656 [12:19<23:00,  3.33s/it]

{'loss': 0.8475, 'learning_rate': 5.781175346392738e-06, 'epoch': 0.74}


 37%|███▋      | 243/656 [12:23<23:42,  3.45s/it]

{'loss': 0.7548, 'learning_rate': 5.805064500716675e-06, 'epoch': 0.74}


 37%|███▋      | 244/656 [12:27<24:16,  3.54s/it]

{'loss': 0.8178, 'learning_rate': 5.8289536550406116e-06, 'epoch': 0.74}


 37%|███▋      | 245/656 [12:30<23:41,  3.46s/it]

{'loss': 0.765, 'learning_rate': 5.852842809364549e-06, 'epoch': 0.75}


 38%|███▊      | 246/656 [12:34<24:23,  3.57s/it]

{'loss': 0.7517, 'learning_rate': 5.876731963688486e-06, 'epoch': 0.75}


 38%|███▊      | 247/656 [12:37<24:06,  3.54s/it]

{'loss': 0.7034, 'learning_rate': 5.900621118012423e-06, 'epoch': 0.75}


 38%|███▊      | 248/656 [12:41<24:01,  3.53s/it]

{'loss': 0.8063, 'learning_rate': 5.924510272336359e-06, 'epoch': 0.76}


 38%|███▊      | 249/656 [12:44<23:19,  3.44s/it]

{'loss': 0.7407, 'learning_rate': 5.948399426660296e-06, 'epoch': 0.76}


 38%|███▊      | 250/656 [12:48<24:20,  3.60s/it]

{'loss': 0.7087, 'learning_rate': 5.972288580984233e-06, 'epoch': 0.76}


 38%|███▊      | 251/656 [12:51<23:17,  3.45s/it]

{'loss': 0.6723, 'learning_rate': 5.996177735308171e-06, 'epoch': 0.77}


 38%|███▊      | 252/656 [12:54<22:24,  3.33s/it]

{'loss': 0.7706, 'learning_rate': 6.0200668896321075e-06, 'epoch': 0.77}


 39%|███▊      | 253/656 [12:57<21:57,  3.27s/it]

{'loss': 0.751, 'learning_rate': 6.043956043956044e-06, 'epoch': 0.77}


 39%|███▊      | 254/656 [13:00<21:28,  3.21s/it]

{'loss': 0.6379, 'learning_rate': 6.067845198279981e-06, 'epoch': 0.77}


 39%|███▉      | 255/656 [13:04<21:36,  3.23s/it]

{'loss': 0.6375, 'learning_rate': 6.091734352603918e-06, 'epoch': 0.78}


 39%|███▉      | 256/656 [13:07<22:29,  3.37s/it]

{'loss': 0.6234, 'learning_rate': 6.115623506927855e-06, 'epoch': 0.78}


 39%|███▉      | 257/656 [13:11<22:14,  3.34s/it]

{'loss': 0.6768, 'learning_rate': 6.139512661251792e-06, 'epoch': 0.78}


 39%|███▉      | 258/656 [13:14<21:46,  3.28s/it]

{'loss': 0.5849, 'learning_rate': 6.163401815575729e-06, 'epoch': 0.79}


 39%|███▉      | 259/656 [13:17<21:41,  3.28s/it]

{'loss': 0.5848, 'learning_rate': 6.187290969899666e-06, 'epoch': 0.79}


 40%|███▉      | 260/656 [13:20<21:37,  3.28s/it]

{'loss': 0.5782, 'learning_rate': 6.2111801242236025e-06, 'epoch': 0.79}


 40%|███▉      | 261/656 [13:24<22:57,  3.49s/it]

{'loss': 0.6509, 'learning_rate': 6.2350692785475394e-06, 'epoch': 0.8}


 40%|███▉      | 262/656 [13:28<23:52,  3.64s/it]

{'loss': 0.5083, 'learning_rate': 6.258958432871477e-06, 'epoch': 0.8}


 40%|████      | 263/656 [13:32<23:35,  3.60s/it]

{'loss': 0.5918, 'learning_rate': 6.282847587195413e-06, 'epoch': 0.8}


 40%|████      | 264/656 [13:35<22:29,  3.44s/it]

{'loss': 0.6159, 'learning_rate': 6.306736741519351e-06, 'epoch': 0.8}


 40%|████      | 265/656 [13:38<22:56,  3.52s/it]

{'loss': 0.4722, 'learning_rate': 6.330625895843287e-06, 'epoch': 0.81}


 41%|████      | 266/656 [13:42<23:39,  3.64s/it]

{'loss': 0.4878, 'learning_rate': 6.354515050167225e-06, 'epoch': 0.81}


 41%|████      | 267/656 [13:46<23:30,  3.63s/it]

{'loss': 0.6239, 'learning_rate': 6.378404204491162e-06, 'epoch': 0.81}


 41%|████      | 268/656 [13:50<24:00,  3.71s/it]

{'loss': 0.5498, 'learning_rate': 6.402293358815098e-06, 'epoch': 0.82}


 41%|████      | 269/656 [13:53<23:29,  3.64s/it]

{'loss': 0.4741, 'learning_rate': 6.426182513139035e-06, 'epoch': 0.82}


 41%|████      | 270/656 [13:57<24:00,  3.73s/it]

{'loss': 0.6102, 'learning_rate': 6.450071667462971e-06, 'epoch': 0.82}


 41%|████▏     | 271/656 [14:01<23:40,  3.69s/it]

{'loss': 0.5519, 'learning_rate': 6.473960821786909e-06, 'epoch': 0.83}


 41%|████▏     | 272/656 [14:05<23:57,  3.74s/it]

{'loss': 0.468, 'learning_rate': 6.497849976110847e-06, 'epoch': 0.83}


 42%|████▏     | 273/656 [14:09<24:58,  3.91s/it]

{'loss': 0.6419, 'learning_rate': 6.521739130434783e-06, 'epoch': 0.83}


 42%|████▏     | 274/656 [14:13<24:05,  3.79s/it]

{'loss': 0.4962, 'learning_rate': 6.545628284758721e-06, 'epoch': 0.84}


 42%|████▏     | 275/656 [14:16<23:47,  3.75s/it]

{'loss': 0.6095, 'learning_rate': 6.569517439082657e-06, 'epoch': 0.84}


 42%|████▏     | 276/656 [14:21<25:26,  4.02s/it]

{'loss': 0.509, 'learning_rate': 6.5934065934065935e-06, 'epoch': 0.84}


 42%|████▏     | 277/656 [14:25<25:23,  4.02s/it]

{'loss': 0.4489, 'learning_rate': 6.61729574773053e-06, 'epoch': 0.84}


 42%|████▏     | 278/656 [14:29<25:48,  4.10s/it]

{'loss': 0.4274, 'learning_rate': 6.641184902054467e-06, 'epoch': 0.85}


 43%|████▎     | 279/656 [14:34<27:12,  4.33s/it]

{'loss': 0.4882, 'learning_rate': 6.665074056378405e-06, 'epoch': 0.85}


 43%|████▎     | 280/656 [14:39<27:52,  4.45s/it]

{'loss': 0.3734, 'learning_rate': 6.688963210702341e-06, 'epoch': 0.85}


 43%|████▎     | 281/656 [14:43<27:42,  4.43s/it]

{'loss': 0.4172, 'learning_rate': 6.712852365026279e-06, 'epoch': 0.86}


 43%|████▎     | 282/656 [14:49<29:46,  4.78s/it]

{'loss': 0.4655, 'learning_rate': 6.736741519350215e-06, 'epoch': 0.86}


 43%|████▎     | 283/656 [14:53<29:08,  4.69s/it]

{'loss': 0.4496, 'learning_rate': 6.7606306736741526e-06, 'epoch': 0.86}


 43%|████▎     | 284/656 [14:58<29:21,  4.73s/it]

{'loss': 0.4708, 'learning_rate': 6.7845198279980895e-06, 'epoch': 0.87}


 43%|████▎     | 285/656 [15:03<29:28,  4.77s/it]

{'loss': 0.4488, 'learning_rate': 6.808408982322026e-06, 'epoch': 0.87}


 44%|████▎     | 286/656 [15:07<28:27,  4.61s/it]

{'loss': 0.4429, 'learning_rate': 6.832298136645963e-06, 'epoch': 0.87}


 44%|████▍     | 287/656 [15:11<27:05,  4.41s/it]

{'loss': 0.4176, 'learning_rate': 6.856187290969899e-06, 'epoch': 0.88}


 44%|████▍     | 288/656 [15:16<27:01,  4.41s/it]

{'loss': 0.4006, 'learning_rate': 6.880076445293837e-06, 'epoch': 0.88}


 44%|████▍     | 289/656 [15:21<28:03,  4.59s/it]

{'loss': 0.3264, 'learning_rate': 6.903965599617773e-06, 'epoch': 0.88}


 44%|████▍     | 290/656 [15:25<28:31,  4.68s/it]

{'loss': 0.3582, 'learning_rate': 6.927854753941711e-06, 'epoch': 0.88}


 44%|████▍     | 291/656 [15:29<26:19,  4.33s/it]

{'loss': 0.3655, 'learning_rate': 6.9517439082656485e-06, 'epoch': 0.89}


 45%|████▍     | 292/656 [15:32<24:38,  4.06s/it]

{'loss': 0.3715, 'learning_rate': 6.9756330625895845e-06, 'epoch': 0.89}


 45%|████▍     | 293/656 [15:36<23:44,  3.93s/it]

{'loss': 0.4427, 'learning_rate': 6.999522216913522e-06, 'epoch': 0.89}


 45%|████▍     | 294/656 [15:39<22:09,  3.67s/it]

{'loss': 0.4087, 'learning_rate': 7.023411371237458e-06, 'epoch': 0.9}


 45%|████▍     | 295/656 [15:42<21:27,  3.57s/it]

{'loss': 0.302, 'learning_rate': 7.047300525561395e-06, 'epoch': 0.9}


 45%|████▌     | 296/656 [15:46<21:13,  3.54s/it]

{'loss': 0.4179, 'learning_rate': 7.071189679885333e-06, 'epoch': 0.9}


 45%|████▌     | 297/656 [15:49<20:21,  3.40s/it]

{'loss': 0.3984, 'learning_rate': 7.095078834209269e-06, 'epoch': 0.91}


 45%|████▌     | 298/656 [15:52<20:21,  3.41s/it]

{'loss': 0.3615, 'learning_rate': 7.118967988533207e-06, 'epoch': 0.91}


 46%|████▌     | 299/656 [15:56<20:56,  3.52s/it]

{'loss': 0.3471, 'learning_rate': 7.142857142857143e-06, 'epoch': 0.91}


 46%|████▌     | 300/656 [15:59<20:24,  3.44s/it]

{'loss': 0.4097, 'learning_rate': 7.1667462971810804e-06, 'epoch': 0.91}


 46%|████▌     | 301/656 [16:03<20:57,  3.54s/it]

{'loss': 0.3719, 'learning_rate': 7.1906354515050165e-06, 'epoch': 0.92}


 46%|████▌     | 302/656 [16:06<20:24,  3.46s/it]

{'loss': 0.3668, 'learning_rate': 7.214524605828954e-06, 'epoch': 0.92}


 46%|████▌     | 303/656 [16:10<20:00,  3.40s/it]

{'loss': 0.3039, 'learning_rate': 7.238413760152891e-06, 'epoch': 0.92}


 46%|████▋     | 304/656 [16:13<19:23,  3.31s/it]

{'loss': 0.2988, 'learning_rate': 7.262302914476828e-06, 'epoch': 0.93}


 46%|████▋     | 305/656 [16:16<18:42,  3.20s/it]

{'loss': 0.3165, 'learning_rate': 7.286192068800765e-06, 'epoch': 0.93}


 47%|████▋     | 306/656 [16:20<20:05,  3.44s/it]

{'loss': 0.3435, 'learning_rate': 7.310081223124701e-06, 'epoch': 0.93}


 47%|████▋     | 307/656 [16:23<20:33,  3.53s/it]

{'loss': 0.3139, 'learning_rate': 7.333970377448639e-06, 'epoch': 0.94}


 47%|████▋     | 308/656 [16:27<20:56,  3.61s/it]

{'loss': 0.3407, 'learning_rate': 7.357859531772576e-06, 'epoch': 0.94}


 47%|████▋     | 309/656 [16:30<19:41,  3.40s/it]

{'loss': 0.3227, 'learning_rate': 7.381748686096512e-06, 'epoch': 0.94}


 47%|████▋     | 310/656 [16:34<20:24,  3.54s/it]

{'loss': 0.2833, 'learning_rate': 7.40563784042045e-06, 'epoch': 0.95}


 47%|████▋     | 311/656 [16:38<20:49,  3.62s/it]

{'loss': 0.4602, 'learning_rate': 7.429526994744386e-06, 'epoch': 0.95}


 48%|████▊     | 312/656 [16:41<20:29,  3.58s/it]

{'loss': 0.2664, 'learning_rate': 7.453416149068324e-06, 'epoch': 0.95}


 48%|████▊     | 313/656 [16:45<20:04,  3.51s/it]

{'loss': 0.3526, 'learning_rate': 7.47730530339226e-06, 'epoch': 0.95}


 48%|████▊     | 314/656 [16:48<20:22,  3.58s/it]

{'loss': 0.3436, 'learning_rate': 7.501194457716197e-06, 'epoch': 0.96}


 48%|████▊     | 315/656 [16:52<19:53,  3.50s/it]

{'loss': 0.3299, 'learning_rate': 7.5250836120401346e-06, 'epoch': 0.96}


 48%|████▊     | 316/656 [16:54<18:16,  3.22s/it]

{'loss': 0.4545, 'learning_rate': 7.548972766364071e-06, 'epoch': 0.96}


 48%|████▊     | 317/656 [16:59<20:03,  3.55s/it]

{'loss': 0.254, 'learning_rate': 7.572861920688008e-06, 'epoch': 0.97}


 48%|████▊     | 318/656 [17:03<20:47,  3.69s/it]

{'loss': 0.2996, 'learning_rate': 7.596751075011944e-06, 'epoch': 0.97}


 49%|████▊     | 319/656 [17:06<20:15,  3.61s/it]

{'loss': 0.3744, 'learning_rate': 7.620640229335882e-06, 'epoch': 0.97}


 49%|████▉     | 320/656 [17:09<19:20,  3.45s/it]

{'loss': 0.3924, 'learning_rate': 7.64452938365982e-06, 'epoch': 0.98}


 49%|████▉     | 321/656 [17:12<18:06,  3.24s/it]

{'loss': 0.2613, 'learning_rate': 7.668418537983756e-06, 'epoch': 0.98}


 49%|████▉     | 322/656 [17:15<17:13,  3.09s/it]

{'loss': 0.3635, 'learning_rate': 7.692307692307694e-06, 'epoch': 0.98}


 49%|████▉     | 323/656 [17:18<17:20,  3.12s/it]

{'loss': 0.2801, 'learning_rate': 7.71619684663163e-06, 'epoch': 0.98}


 49%|████▉     | 324/656 [17:21<16:45,  3.03s/it]

{'loss': 0.2361, 'learning_rate': 7.740086000955567e-06, 'epoch': 0.99}


 50%|████▉     | 325/656 [17:24<16:58,  3.08s/it]

{'loss': 0.261, 'learning_rate': 7.763975155279503e-06, 'epoch': 0.99}


 50%|████▉     | 326/656 [17:27<17:13,  3.13s/it]

{'loss': 0.1906, 'learning_rate': 7.787864309603441e-06, 'epoch': 0.99}


 50%|████▉     | 327/656 [17:32<19:35,  3.57s/it]

{'loss': 0.2314, 'learning_rate': 7.811753463927377e-06, 'epoch': 1.0}


 50%|█████     | 328/656 [17:33<16:29,  3.02s/it]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, utterance, tokens. If token_labels, utterance, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'loss': 0.1786, 'learning_rate': 7.835642618251313e-06, 'epoch': 1.0}


                                                 
 50%|█████     | 328/656 [18:46<16:29,  3.02s/it]Saving model checkpoint to ./snips_clf\tmp-checkpoint-328
Configuration saved in ./snips_clf\tmp-checkpoint-328\config.json


{'eval_loss': 0.2234005630016327, 'eval_accuracy': 0.9778372181887658, 'eval_runtime': 72.1664, 'eval_samples_per_second': 36.263, 'eval_steps_per_second': 1.136, 'epoch': 1.0}


Model weights saved in ./snips_clf\tmp-checkpoint-328\pytorch_model.bin
 50%|█████     | 329/656 [18:52<2:20:07, 25.71s/it]

{'loss': 0.2186, 'learning_rate': 7.859531772575251e-06, 'epoch': 1.0}


 50%|█████     | 330/656 [18:55<1:42:58, 18.95s/it]

{'loss': 0.2469, 'learning_rate': 7.883420926899187e-06, 'epoch': 1.01}


 50%|█████     | 331/656 [18:59<1:17:24, 14.29s/it]

{'loss': 0.187, 'learning_rate': 7.907310081223125e-06, 'epoch': 1.01}


 51%|█████     | 332/656 [19:02<59:23, 11.00s/it]  

{'loss': 0.2792, 'learning_rate': 7.931199235547062e-06, 'epoch': 1.01}


 51%|█████     | 333/656 [19:05<46:28,  8.63s/it]

{'loss': 0.3252, 'learning_rate': 7.955088389870998e-06, 'epoch': 1.02}


 51%|█████     | 334/656 [19:08<37:29,  6.99s/it]

{'loss': 0.1969, 'learning_rate': 7.978977544194936e-06, 'epoch': 1.02}


 51%|█████     | 335/656 [19:11<30:31,  5.70s/it]

{'loss': 0.2262, 'learning_rate': 8.002866698518872e-06, 'epoch': 1.02}


 51%|█████     | 336/656 [19:14<26:24,  4.95s/it]

{'loss': 0.2577, 'learning_rate': 8.02675585284281e-06, 'epoch': 1.02}


 51%|█████▏    | 337/656 [19:17<22:39,  4.26s/it]

{'loss': 0.1717, 'learning_rate': 8.050645007166746e-06, 'epoch': 1.03}


 52%|█████▏    | 338/656 [19:19<20:00,  3.77s/it]

{'loss': 0.298, 'learning_rate': 8.074534161490684e-06, 'epoch': 1.03}


 52%|█████▏    | 339/656 [19:22<18:31,  3.51s/it]

{'loss': 0.2194, 'learning_rate': 8.098423315814621e-06, 'epoch': 1.03}


 52%|█████▏    | 340/656 [19:25<17:09,  3.26s/it]

{'loss': 0.4545, 'learning_rate': 8.122312470138558e-06, 'epoch': 1.04}


 52%|█████▏    | 341/656 [19:27<15:36,  2.97s/it]

{'loss': 0.2716, 'learning_rate': 8.146201624462495e-06, 'epoch': 1.04}


 52%|█████▏    | 342/656 [19:30<15:24,  2.95s/it]

{'loss': 0.3032, 'learning_rate': 8.170090778786431e-06, 'epoch': 1.04}


 52%|█████▏    | 343/656 [19:33<14:56,  2.87s/it]

{'loss': 0.1707, 'learning_rate': 8.193979933110369e-06, 'epoch': 1.05}


 52%|█████▏    | 344/656 [19:37<16:09,  3.11s/it]

{'loss': 0.2254, 'learning_rate': 8.217869087434307e-06, 'epoch': 1.05}


 53%|█████▎    | 345/656 [19:39<15:27,  2.98s/it]

{'loss': 0.1396, 'learning_rate': 8.241758241758243e-06, 'epoch': 1.05}


 53%|█████▎    | 346/656 [19:42<14:54,  2.89s/it]

{'loss': 0.2042, 'learning_rate': 8.265647396082179e-06, 'epoch': 1.05}


 53%|█████▎    | 347/656 [19:45<14:41,  2.85s/it]

{'loss': 0.1984, 'learning_rate': 8.289536550406115e-06, 'epoch': 1.06}


 53%|█████▎    | 348/656 [19:47<14:33,  2.84s/it]

{'loss': 0.2025, 'learning_rate': 8.313425704730053e-06, 'epoch': 1.06}


 53%|█████▎    | 349/656 [19:50<14:38,  2.86s/it]

{'loss': 0.2588, 'learning_rate': 8.337314859053989e-06, 'epoch': 1.06}


 53%|█████▎    | 350/656 [19:53<13:44,  2.70s/it]

{'loss': 0.1668, 'learning_rate': 8.361204013377926e-06, 'epoch': 1.07}


 54%|█████▎    | 351/656 [19:56<14:33,  2.86s/it]

{'loss': 0.2581, 'learning_rate': 8.385093167701864e-06, 'epoch': 1.07}


 54%|█████▎    | 352/656 [19:59<15:16,  3.01s/it]

{'loss': 0.2449, 'learning_rate': 8.4089823220258e-06, 'epoch': 1.07}


 54%|█████▍    | 353/656 [20:02<15:20,  3.04s/it]

{'loss': 0.2722, 'learning_rate': 8.432871476349738e-06, 'epoch': 1.08}


 54%|█████▍    | 354/656 [20:05<15:00,  2.98s/it]

{'loss': 0.1445, 'learning_rate': 8.456760630673674e-06, 'epoch': 1.08}


 54%|█████▍    | 355/656 [20:08<14:52,  2.96s/it]

{'loss': 0.2004, 'learning_rate': 8.480649784997612e-06, 'epoch': 1.08}


 54%|█████▍    | 356/656 [20:11<15:11,  3.04s/it]

{'loss': 0.1378, 'learning_rate': 8.50453893932155e-06, 'epoch': 1.09}


 54%|█████▍    | 357/656 [20:14<15:11,  3.05s/it]

{'loss': 0.1481, 'learning_rate': 8.528428093645485e-06, 'epoch': 1.09}


 55%|█████▍    | 358/656 [20:17<14:41,  2.96s/it]

{'loss': 0.1565, 'learning_rate': 8.552317247969423e-06, 'epoch': 1.09}


 55%|█████▍    | 359/656 [20:20<14:45,  2.98s/it]

{'loss': 0.282, 'learning_rate': 8.576206402293359e-06, 'epoch': 1.09}


 55%|█████▍    | 360/656 [20:24<16:05,  3.26s/it]

{'loss': 0.2595, 'learning_rate': 8.600095556617297e-06, 'epoch': 1.1}


 55%|█████▌    | 361/656 [20:27<15:11,  3.09s/it]

{'loss': 0.1629, 'learning_rate': 8.623984710941233e-06, 'epoch': 1.1}


 55%|█████▌    | 362/656 [20:30<14:57,  3.05s/it]

{'loss': 0.1731, 'learning_rate': 8.64787386526517e-06, 'epoch': 1.1}


 55%|█████▌    | 363/656 [20:33<14:30,  2.97s/it]

{'loss': 0.1548, 'learning_rate': 8.671763019589108e-06, 'epoch': 1.11}


 55%|█████▌    | 364/656 [20:36<14:45,  3.03s/it]

{'loss': 0.1346, 'learning_rate': 8.695652173913044e-06, 'epoch': 1.11}


 56%|█████▌    | 365/656 [20:39<14:40,  3.03s/it]

{'loss': 0.1406, 'learning_rate': 8.71954132823698e-06, 'epoch': 1.11}


 56%|█████▌    | 366/656 [20:41<14:09,  2.93s/it]

{'loss': 0.1945, 'learning_rate': 8.743430482560916e-06, 'epoch': 1.12}


 56%|█████▌    | 367/656 [20:44<14:08,  2.94s/it]

{'loss': 0.1709, 'learning_rate': 8.767319636884854e-06, 'epoch': 1.12}


 56%|█████▌    | 368/656 [20:48<14:26,  3.01s/it]

{'loss': 0.1568, 'learning_rate': 8.791208791208792e-06, 'epoch': 1.12}


 56%|█████▋    | 369/656 [20:50<14:01,  2.93s/it]

{'loss': 0.1867, 'learning_rate': 8.815097945532728e-06, 'epoch': 1.12}


 56%|█████▋    | 370/656 [20:54<14:33,  3.05s/it]

{'loss': 0.1324, 'learning_rate': 8.838987099856666e-06, 'epoch': 1.13}


 57%|█████▋    | 371/656 [20:57<14:39,  3.09s/it]

{'loss': 0.1337, 'learning_rate': 8.862876254180602e-06, 'epoch': 1.13}


 57%|█████▋    | 372/656 [21:00<14:21,  3.03s/it]

{'loss': 0.1291, 'learning_rate': 8.88676540850454e-06, 'epoch': 1.13}


 57%|█████▋    | 373/656 [21:03<14:06,  2.99s/it]

{'loss': 0.2092, 'learning_rate': 8.910654562828476e-06, 'epoch': 1.14}


 57%|█████▋    | 374/656 [21:05<13:43,  2.92s/it]

{'loss': 0.1248, 'learning_rate': 8.934543717152413e-06, 'epoch': 1.14}


 57%|█████▋    | 375/656 [21:08<13:25,  2.87s/it]

{'loss': 0.1822, 'learning_rate': 8.958432871476351e-06, 'epoch': 1.14}


 57%|█████▋    | 376/656 [21:12<14:02,  3.01s/it]

{'loss': 0.1166, 'learning_rate': 8.982322025800287e-06, 'epoch': 1.15}


 57%|█████▋    | 377/656 [21:15<14:38,  3.15s/it]

{'loss': 0.1251, 'learning_rate': 9.006211180124225e-06, 'epoch': 1.15}


 58%|█████▊    | 378/656 [21:18<14:17,  3.08s/it]

{'loss': 0.241, 'learning_rate': 9.03010033444816e-06, 'epoch': 1.15}


 58%|█████▊    | 379/656 [21:21<14:53,  3.23s/it]

{'loss': 0.3768, 'learning_rate': 9.053989488772099e-06, 'epoch': 1.16}


 58%|█████▊    | 380/656 [21:25<14:50,  3.22s/it]

{'loss': 0.2215, 'learning_rate': 9.077878643096036e-06, 'epoch': 1.16}


 58%|█████▊    | 381/656 [21:28<14:40,  3.20s/it]

{'loss': 0.2204, 'learning_rate': 9.101767797419972e-06, 'epoch': 1.16}


 58%|█████▊    | 382/656 [21:31<14:10,  3.11s/it]

{'loss': 0.2086, 'learning_rate': 9.125656951743908e-06, 'epoch': 1.16}


 58%|█████▊    | 383/656 [21:34<13:56,  3.06s/it]

{'loss': 0.094, 'learning_rate': 9.149546106067846e-06, 'epoch': 1.17}


 59%|█████▊    | 384/656 [21:36<13:26,  2.96s/it]

{'loss': 0.2061, 'learning_rate': 9.173435260391782e-06, 'epoch': 1.17}


 59%|█████▊    | 385/656 [21:40<14:00,  3.10s/it]

{'loss': 0.1234, 'learning_rate': 9.197324414715718e-06, 'epoch': 1.17}


 59%|█████▉    | 386/656 [21:43<14:10,  3.15s/it]

{'loss': 0.1646, 'learning_rate': 9.221213569039656e-06, 'epoch': 1.18}


 59%|█████▉    | 387/656 [21:46<13:56,  3.11s/it]

{'loss': 0.1994, 'learning_rate': 9.245102723363594e-06, 'epoch': 1.18}


 59%|█████▉    | 388/656 [21:49<13:32,  3.03s/it]

{'loss': 0.1983, 'learning_rate': 9.26899187768753e-06, 'epoch': 1.18}


 59%|█████▉    | 389/656 [21:52<13:19,  2.99s/it]

{'loss': 0.1189, 'learning_rate': 9.292881032011467e-06, 'epoch': 1.19}


 59%|█████▉    | 390/656 [21:55<13:08,  2.96s/it]

{'loss': 0.1595, 'learning_rate': 9.316770186335403e-06, 'epoch': 1.19}


 60%|█████▉    | 391/656 [21:58<13:13,  3.00s/it]

{'loss': 0.243, 'learning_rate': 9.340659340659341e-06, 'epoch': 1.19}


 60%|█████▉    | 392/656 [22:01<13:31,  3.07s/it]

{'loss': 0.1402, 'learning_rate': 9.364548494983277e-06, 'epoch': 1.2}


 60%|█████▉    | 393/656 [22:05<14:15,  3.25s/it]

{'loss': 0.101, 'learning_rate': 9.388437649307215e-06, 'epoch': 1.2}


 60%|██████    | 394/656 [22:08<14:39,  3.36s/it]

{'loss': 0.1408, 'learning_rate': 9.412326803631153e-06, 'epoch': 1.2}


 60%|██████    | 395/656 [22:11<13:56,  3.20s/it]

{'loss': 0.1264, 'learning_rate': 9.436215957955089e-06, 'epoch': 1.2}


 60%|██████    | 396/656 [22:14<13:34,  3.13s/it]

{'loss': 0.1749, 'learning_rate': 9.460105112279026e-06, 'epoch': 1.21}


 61%|██████    | 397/656 [22:18<14:33,  3.37s/it]

{'loss': 0.3023, 'learning_rate': 9.483994266602962e-06, 'epoch': 1.21}


 61%|██████    | 398/656 [22:22<14:35,  3.40s/it]

{'loss': 0.0951, 'learning_rate': 9.5078834209269e-06, 'epoch': 1.21}


 61%|██████    | 399/656 [22:26<15:37,  3.65s/it]

{'loss': 0.0981, 'learning_rate': 9.531772575250838e-06, 'epoch': 1.22}


 61%|██████    | 400/656 [22:29<15:21,  3.60s/it]

{'loss': 0.1356, 'learning_rate': 9.555661729574774e-06, 'epoch': 1.22}


 61%|██████    | 401/656 [22:33<15:10,  3.57s/it]

{'loss': 0.1247, 'learning_rate': 9.57955088389871e-06, 'epoch': 1.22}


 61%|██████▏   | 402/656 [22:36<14:44,  3.48s/it]

{'loss': 0.0951, 'learning_rate': 9.603440038222648e-06, 'epoch': 1.23}


 61%|██████▏   | 403/656 [22:40<14:47,  3.51s/it]

{'loss': 0.0872, 'learning_rate': 9.627329192546584e-06, 'epoch': 1.23}


 62%|██████▏   | 404/656 [22:43<15:06,  3.60s/it]

{'loss': 0.1257, 'learning_rate': 9.65121834687052e-06, 'epoch': 1.23}


 62%|██████▏   | 405/656 [22:47<14:37,  3.49s/it]

{'loss': 0.2127, 'learning_rate': 9.675107501194458e-06, 'epoch': 1.23}


 62%|██████▏   | 406/656 [22:49<13:39,  3.28s/it]

{'loss': 0.0864, 'learning_rate': 9.698996655518395e-06, 'epoch': 1.24}


 62%|██████▏   | 407/656 [22:53<13:22,  3.22s/it]

{'loss': 0.0839, 'learning_rate': 9.722885809842331e-06, 'epoch': 1.24}


 62%|██████▏   | 408/656 [22:55<12:43,  3.08s/it]

{'loss': 0.0914, 'learning_rate': 9.746774964166269e-06, 'epoch': 1.24}


 62%|██████▏   | 409/656 [22:58<12:37,  3.07s/it]

{'loss': 0.0843, 'learning_rate': 9.770664118490205e-06, 'epoch': 1.25}


 62%|██████▎   | 410/656 [23:01<12:15,  2.99s/it]

{'loss': 0.1274, 'learning_rate': 9.794553272814143e-06, 'epoch': 1.25}


 63%|██████▎   | 411/656 [23:04<12:11,  2.99s/it]

{'loss': 0.0822, 'learning_rate': 9.81844242713808e-06, 'epoch': 1.25}


 63%|██████▎   | 412/656 [23:07<12:08,  2.98s/it]

{'loss': 0.147, 'learning_rate': 9.842331581462017e-06, 'epoch': 1.26}


 63%|██████▎   | 413/656 [23:10<11:59,  2.96s/it]

{'loss': 0.0922, 'learning_rate': 9.866220735785954e-06, 'epoch': 1.26}


 63%|██████▎   | 414/656 [23:12<11:20,  2.81s/it]

{'loss': 0.1168, 'learning_rate': 9.89010989010989e-06, 'epoch': 1.26}


 63%|██████▎   | 415/656 [23:15<11:10,  2.78s/it]

{'loss': 0.158, 'learning_rate': 9.913999044433828e-06, 'epoch': 1.27}


 63%|██████▎   | 416/656 [23:18<11:06,  2.78s/it]

{'loss': 0.0997, 'learning_rate': 9.937888198757764e-06, 'epoch': 1.27}


 64%|██████▎   | 417/656 [23:23<13:27,  3.38s/it]

{'loss': 0.0976, 'learning_rate': 9.961777353081702e-06, 'epoch': 1.27}


 64%|██████▎   | 418/656 [23:26<13:08,  3.31s/it]

{'loss': 0.0913, 'learning_rate': 9.98566650740564e-06, 'epoch': 1.27}


 64%|██████▍   | 419/656 [23:29<13:04,  3.31s/it]

{'loss': 0.1308, 'learning_rate': 1.0009555661729576e-05, 'epoch': 1.28}


 64%|██████▍   | 420/656 [23:33<13:09,  3.34s/it]

{'loss': 0.224, 'learning_rate': 1.0033444816053512e-05, 'epoch': 1.28}


 64%|██████▍   | 421/656 [23:35<12:22,  3.16s/it]

{'loss': 0.3396, 'learning_rate': 1.005733397037745e-05, 'epoch': 1.28}


 64%|██████▍   | 422/656 [23:38<11:53,  3.05s/it]

{'loss': 0.2471, 'learning_rate': 1.0081223124701385e-05, 'epoch': 1.29}


 64%|██████▍   | 423/656 [23:42<12:28,  3.21s/it]

{'loss': 0.0864, 'learning_rate': 1.0105112279025323e-05, 'epoch': 1.29}


 65%|██████▍   | 424/656 [23:44<11:51,  3.07s/it]

{'loss': 0.2202, 'learning_rate': 1.0129001433349259e-05, 'epoch': 1.29}


 65%|██████▍   | 425/656 [23:47<11:22,  2.95s/it]

{'loss': 0.1544, 'learning_rate': 1.0152890587673197e-05, 'epoch': 1.3}


 65%|██████▍   | 426/656 [23:50<11:20,  2.96s/it]

{'loss': 0.0905, 'learning_rate': 1.0176779741997133e-05, 'epoch': 1.3}


 65%|██████▌   | 427/656 [23:53<11:06,  2.91s/it]

{'loss': 0.111, 'learning_rate': 1.020066889632107e-05, 'epoch': 1.3}


 65%|██████▌   | 428/656 [23:56<10:49,  2.85s/it]

{'loss': 0.1118, 'learning_rate': 1.0224558050645007e-05, 'epoch': 1.3}


 65%|██████▌   | 429/656 [23:59<11:18,  2.99s/it]

{'loss': 0.0979, 'learning_rate': 1.0248447204968944e-05, 'epoch': 1.31}


 66%|██████▌   | 430/656 [24:02<11:10,  2.97s/it]

{'loss': 0.2306, 'learning_rate': 1.0272336359292882e-05, 'epoch': 1.31}


 66%|██████▌   | 431/656 [24:05<10:50,  2.89s/it]

{'loss': 0.0957, 'learning_rate': 1.0296225513616818e-05, 'epoch': 1.31}


 66%|██████▌   | 432/656 [24:08<11:55,  3.19s/it]

{'loss': 0.2691, 'learning_rate': 1.0320114667940756e-05, 'epoch': 1.32}


 66%|██████▌   | 433/656 [24:12<11:49,  3.18s/it]

{'loss': 0.2412, 'learning_rate': 1.0344003822264692e-05, 'epoch': 1.32}


 66%|██████▌   | 434/656 [24:14<11:17,  3.05s/it]

{'loss': 0.1844, 'learning_rate': 1.036789297658863e-05, 'epoch': 1.32}


 66%|██████▋   | 435/656 [24:17<10:48,  2.93s/it]

{'loss': 0.1571, 'learning_rate': 1.0391782130912567e-05, 'epoch': 1.33}


 66%|██████▋   | 436/656 [24:20<10:36,  2.89s/it]

{'loss': 0.2615, 'learning_rate': 1.0415671285236503e-05, 'epoch': 1.33}


 67%|██████▋   | 437/656 [24:23<10:30,  2.88s/it]

{'loss': 0.0927, 'learning_rate': 1.0439560439560441e-05, 'epoch': 1.33}


 67%|██████▋   | 438/656 [24:25<10:16,  2.83s/it]

{'loss': 0.0677, 'learning_rate': 1.0463449593884377e-05, 'epoch': 1.34}


 67%|██████▋   | 439/656 [24:29<10:36,  2.93s/it]

{'loss': 0.0669, 'learning_rate': 1.0487338748208313e-05, 'epoch': 1.34}


 67%|██████▋   | 440/656 [24:31<10:24,  2.89s/it]

{'loss': 0.2185, 'learning_rate': 1.0511227902532251e-05, 'epoch': 1.34}


 67%|██████▋   | 441/656 [24:34<10:14,  2.86s/it]

{'loss': 0.0725, 'learning_rate': 1.0535117056856187e-05, 'epoch': 1.34}


 67%|██████▋   | 442/656 [24:37<10:01,  2.81s/it]

{'loss': 0.0866, 'learning_rate': 1.0559006211180125e-05, 'epoch': 1.35}


 68%|██████▊   | 443/656 [24:40<10:32,  2.97s/it]

{'loss': 0.144, 'learning_rate': 1.058289536550406e-05, 'epoch': 1.35}


 68%|██████▊   | 444/656 [24:43<10:32,  2.98s/it]

{'loss': 0.0732, 'learning_rate': 1.0606784519827999e-05, 'epoch': 1.35}


 68%|██████▊   | 445/656 [24:47<11:27,  3.26s/it]

{'loss': 0.2196, 'learning_rate': 1.0630673674151935e-05, 'epoch': 1.36}


 68%|██████▊   | 446/656 [24:50<10:45,  3.07s/it]

{'loss': 0.0721, 'learning_rate': 1.0654562828475872e-05, 'epoch': 1.36}


 68%|██████▊   | 447/656 [24:52<10:14,  2.94s/it]

{'loss': 0.0698, 'learning_rate': 1.067845198279981e-05, 'epoch': 1.36}


 68%|██████▊   | 448/656 [24:55<09:53,  2.85s/it]

{'loss': 0.0628, 'learning_rate': 1.0702341137123746e-05, 'epoch': 1.37}


 68%|██████▊   | 449/656 [24:58<10:21,  3.00s/it]

{'loss': 0.0659, 'learning_rate': 1.0726230291447684e-05, 'epoch': 1.37}


 69%|██████▊   | 450/656 [25:01<10:06,  2.94s/it]

{'loss': 0.069, 'learning_rate': 1.075011944577162e-05, 'epoch': 1.37}


 69%|██████▉   | 451/656 [25:04<10:00,  2.93s/it]

{'loss': 0.0747, 'learning_rate': 1.0774008600095558e-05, 'epoch': 1.38}


 69%|██████▉   | 452/656 [25:07<09:39,  2.84s/it]

{'loss': 0.0873, 'learning_rate': 1.0797897754419494e-05, 'epoch': 1.38}


 69%|██████▉   | 453/656 [25:09<09:22,  2.77s/it]

{'loss': 0.0791, 'learning_rate': 1.0821786908743431e-05, 'epoch': 1.38}


 69%|██████▉   | 454/656 [25:13<09:46,  2.90s/it]

{'loss': 0.0636, 'learning_rate': 1.0845676063067369e-05, 'epoch': 1.38}


 69%|██████▉   | 455/656 [25:16<10:13,  3.05s/it]

{'loss': 0.0697, 'learning_rate': 1.0869565217391305e-05, 'epoch': 1.39}


 70%|██████▉   | 456/656 [25:19<10:22,  3.11s/it]

{'loss': 0.1703, 'learning_rate': 1.0893454371715243e-05, 'epoch': 1.39}


 70%|██████▉   | 457/656 [25:22<10:07,  3.05s/it]

{'loss': 0.0533, 'learning_rate': 1.0917343526039179e-05, 'epoch': 1.39}


 70%|██████▉   | 458/656 [25:25<09:35,  2.91s/it]

{'loss': 0.2039, 'learning_rate': 1.0941232680363115e-05, 'epoch': 1.4}


 70%|██████▉   | 459/656 [25:28<09:48,  2.99s/it]

{'loss': 0.1781, 'learning_rate': 1.0965121834687053e-05, 'epoch': 1.4}


 70%|███████   | 460/656 [25:31<09:40,  2.96s/it]

{'loss': 0.09, 'learning_rate': 1.0989010989010989e-05, 'epoch': 1.4}


 70%|███████   | 461/656 [25:34<10:19,  3.18s/it]

{'loss': 0.0549, 'learning_rate': 1.1012900143334926e-05, 'epoch': 1.41}


 70%|███████   | 462/656 [25:38<10:39,  3.30s/it]

{'loss': 0.0587, 'learning_rate': 1.1036789297658862e-05, 'epoch': 1.41}


 71%|███████   | 463/656 [25:41<10:06,  3.14s/it]

{'loss': 0.067, 'learning_rate': 1.10606784519828e-05, 'epoch': 1.41}


 71%|███████   | 464/656 [25:43<09:39,  3.02s/it]

{'loss': 0.0839, 'learning_rate': 1.1084567606306736e-05, 'epoch': 1.41}


 71%|███████   | 465/656 [25:47<09:41,  3.04s/it]

{'loss': 0.086, 'learning_rate': 1.1108456760630674e-05, 'epoch': 1.42}


 71%|███████   | 466/656 [25:49<09:27,  2.99s/it]

{'loss': 0.0493, 'learning_rate': 1.1132345914954612e-05, 'epoch': 1.42}


 71%|███████   | 467/656 [25:52<08:46,  2.78s/it]

{'loss': 0.2674, 'learning_rate': 1.1156235069278548e-05, 'epoch': 1.42}


 71%|███████▏  | 468/656 [25:55<08:51,  2.82s/it]

{'loss': 0.0584, 'learning_rate': 1.1180124223602485e-05, 'epoch': 1.43}


 71%|███████▏  | 469/656 [25:58<09:01,  2.89s/it]

{'loss': 0.2451, 'learning_rate': 1.1204013377926421e-05, 'epoch': 1.43}


 72%|███████▏  | 470/656 [26:01<08:52,  2.86s/it]

{'loss': 0.113, 'learning_rate': 1.122790253225036e-05, 'epoch': 1.43}


 72%|███████▏  | 471/656 [26:03<08:46,  2.85s/it]

{'loss': 0.1049, 'learning_rate': 1.1251791686574297e-05, 'epoch': 1.44}


 72%|███████▏  | 472/656 [26:06<08:41,  2.84s/it]

{'loss': 0.0527, 'learning_rate': 1.1275680840898233e-05, 'epoch': 1.44}


 72%|███████▏  | 473/656 [26:09<09:05,  2.98s/it]

{'loss': 0.0934, 'learning_rate': 1.129956999522217e-05, 'epoch': 1.44}


 72%|███████▏  | 474/656 [26:12<08:33,  2.82s/it]

{'loss': 0.0617, 'learning_rate': 1.1323459149546107e-05, 'epoch': 1.45}


 72%|███████▏  | 475/656 [26:15<08:50,  2.93s/it]

{'loss': 0.226, 'learning_rate': 1.1347348303870044e-05, 'epoch': 1.45}


 73%|███████▎  | 476/656 [26:18<08:50,  2.95s/it]

{'loss': 0.0563, 'learning_rate': 1.137123745819398e-05, 'epoch': 1.45}


 73%|███████▎  | 477/656 [26:22<09:20,  3.13s/it]

{'loss': 0.0521, 'learning_rate': 1.1395126612517917e-05, 'epoch': 1.45}


 73%|███████▎  | 478/656 [26:25<09:03,  3.05s/it]

{'loss': 0.0522, 'learning_rate': 1.1419015766841854e-05, 'epoch': 1.46}


 73%|███████▎  | 479/656 [26:27<08:41,  2.95s/it]

{'loss': 0.2157, 'learning_rate': 1.144290492116579e-05, 'epoch': 1.46}


 73%|███████▎  | 480/656 [26:30<08:17,  2.83s/it]

{'loss': 0.059, 'learning_rate': 1.1466794075489728e-05, 'epoch': 1.46}


 73%|███████▎  | 481/656 [26:32<08:05,  2.78s/it]

{'loss': 0.1555, 'learning_rate': 1.1490683229813664e-05, 'epoch': 1.47}


 73%|███████▎  | 482/656 [26:35<07:53,  2.72s/it]

{'loss': 0.0528, 'learning_rate': 1.1514572384137602e-05, 'epoch': 1.47}


 74%|███████▎  | 483/656 [26:38<07:54,  2.75s/it]

{'loss': 0.088, 'learning_rate': 1.153846153846154e-05, 'epoch': 1.47}


 74%|███████▍  | 484/656 [26:42<09:25,  3.29s/it]

{'loss': 0.0475, 'learning_rate': 1.1562350692785476e-05, 'epoch': 1.48}


 74%|███████▍  | 485/656 [26:46<09:18,  3.27s/it]

{'loss': 0.0758, 'learning_rate': 1.1586239847109413e-05, 'epoch': 1.48}


 74%|███████▍  | 486/656 [26:48<08:49,  3.12s/it]

{'loss': 0.0465, 'learning_rate': 1.161012900143335e-05, 'epoch': 1.48}


 74%|███████▍  | 487/656 [26:52<08:58,  3.18s/it]

{'loss': 0.1706, 'learning_rate': 1.1634018155757287e-05, 'epoch': 1.48}


 74%|███████▍  | 488/656 [26:54<08:24,  3.00s/it]

{'loss': 0.0875, 'learning_rate': 1.1657907310081223e-05, 'epoch': 1.49}


 75%|███████▍  | 489/656 [26:57<08:18,  2.99s/it]

{'loss': 0.0407, 'learning_rate': 1.168179646440516e-05, 'epoch': 1.49}


 75%|███████▍  | 490/656 [27:00<08:18,  3.01s/it]

{'loss': 0.0433, 'learning_rate': 1.1705685618729099e-05, 'epoch': 1.49}


 75%|███████▍  | 491/656 [27:03<07:57,  2.89s/it]

{'loss': 0.0399, 'learning_rate': 1.1729574773053035e-05, 'epoch': 1.5}


 75%|███████▌  | 492/656 [27:07<09:09,  3.35s/it]

{'loss': 0.1277, 'learning_rate': 1.1753463927376972e-05, 'epoch': 1.5}


 75%|███████▌  | 493/656 [27:10<08:41,  3.20s/it]

{'loss': 0.0515, 'learning_rate': 1.1777353081700908e-05, 'epoch': 1.5}


 75%|███████▌  | 494/656 [27:13<08:26,  3.13s/it]

{'loss': 0.0939, 'learning_rate': 1.1801242236024846e-05, 'epoch': 1.51}


 75%|███████▌  | 495/656 [27:16<08:09,  3.04s/it]

{'loss': 0.1802, 'learning_rate': 1.1825131390348782e-05, 'epoch': 1.51}


 76%|███████▌  | 496/656 [27:19<07:45,  2.91s/it]

{'loss': 0.0749, 'learning_rate': 1.1849020544672718e-05, 'epoch': 1.51}


 76%|███████▌  | 497/656 [27:21<07:31,  2.84s/it]

{'loss': 0.1754, 'learning_rate': 1.1872909698996656e-05, 'epoch': 1.52}


 76%|███████▌  | 498/656 [27:24<07:45,  2.94s/it]

{'loss': 0.0897, 'learning_rate': 1.1896798853320592e-05, 'epoch': 1.52}


 76%|███████▌  | 499/656 [27:27<07:44,  2.96s/it]

{'loss': 0.1326, 'learning_rate': 1.192068800764453e-05, 'epoch': 1.52}


 76%|███████▌  | 500/656 [27:30<07:24,  2.85s/it]

{'loss': 0.1297, 'learning_rate': 1.1944577161968466e-05, 'epoch': 1.52}


 76%|███████▋  | 501/656 [27:33<07:19,  2.84s/it]

{'loss': 0.049, 'learning_rate': 1.1968466316292403e-05, 'epoch': 1.53}


 77%|███████▋  | 502/656 [27:36<07:41,  3.00s/it]

{'loss': 0.0397, 'learning_rate': 1.1992355470616341e-05, 'epoch': 1.53}


 77%|███████▋  | 503/656 [27:39<07:33,  2.96s/it]

{'loss': 0.221, 'learning_rate': 1.2016244624940277e-05, 'epoch': 1.53}


 77%|███████▋  | 504/656 [27:42<07:33,  2.98s/it]

{'loss': 0.1383, 'learning_rate': 1.2040133779264215e-05, 'epoch': 1.54}


 77%|███████▋  | 505/656 [27:46<07:50,  3.12s/it]

{'loss': 0.0411, 'learning_rate': 1.2064022933588151e-05, 'epoch': 1.54}


 77%|███████▋  | 506/656 [27:49<07:39,  3.07s/it]

{'loss': 0.2089, 'learning_rate': 1.2087912087912089e-05, 'epoch': 1.54}


 77%|███████▋  | 507/656 [27:51<07:25,  2.99s/it]

{'loss': 0.0545, 'learning_rate': 1.2111801242236026e-05, 'epoch': 1.55}


 77%|███████▋  | 508/656 [27:54<07:23,  2.99s/it]

{'loss': 0.0644, 'learning_rate': 1.2135690396559962e-05, 'epoch': 1.55}


 78%|███████▊  | 509/656 [27:57<07:05,  2.89s/it]

{'loss': 0.0478, 'learning_rate': 1.21595795508839e-05, 'epoch': 1.55}


 78%|███████▊  | 510/656 [28:00<06:56,  2.86s/it]

{'loss': 0.2594, 'learning_rate': 1.2183468705207836e-05, 'epoch': 1.55}


 78%|███████▊  | 511/656 [28:04<07:45,  3.21s/it]

{'loss': 0.0402, 'learning_rate': 1.2207357859531774e-05, 'epoch': 1.56}


 78%|███████▊  | 512/656 [28:07<07:25,  3.09s/it]

{'loss': 0.2094, 'learning_rate': 1.223124701385571e-05, 'epoch': 1.56}


 78%|███████▊  | 513/656 [28:10<07:48,  3.27s/it]

{'loss': 0.0431, 'learning_rate': 1.2255136168179648e-05, 'epoch': 1.56}


 78%|███████▊  | 514/656 [28:13<07:31,  3.18s/it]

{'loss': 0.0402, 'learning_rate': 1.2279025322503584e-05, 'epoch': 1.57}


 79%|███████▊  | 515/656 [28:16<07:14,  3.08s/it]

{'loss': 0.1203, 'learning_rate': 1.230291447682752e-05, 'epoch': 1.57}


 79%|███████▊  | 516/656 [28:20<07:25,  3.18s/it]

{'loss': 0.0417, 'learning_rate': 1.2326803631151458e-05, 'epoch': 1.57}


 79%|███████▉  | 517/656 [28:22<07:11,  3.11s/it]

{'loss': 0.0806, 'learning_rate': 1.2350692785475394e-05, 'epoch': 1.58}


 79%|███████▉  | 518/656 [28:25<07:01,  3.05s/it]

{'loss': 0.0522, 'learning_rate': 1.2374581939799331e-05, 'epoch': 1.58}


 79%|███████▉  | 519/656 [28:29<07:10,  3.14s/it]

{'loss': 0.0409, 'learning_rate': 1.2398471094123269e-05, 'epoch': 1.58}


 79%|███████▉  | 520/656 [28:32<07:06,  3.14s/it]

{'loss': 0.0458, 'learning_rate': 1.2422360248447205e-05, 'epoch': 1.59}


 79%|███████▉  | 521/656 [28:35<07:20,  3.27s/it]

{'loss': 0.1974, 'learning_rate': 1.2446249402771143e-05, 'epoch': 1.59}


 80%|███████▉  | 522/656 [28:38<06:55,  3.10s/it]

{'loss': 0.1366, 'learning_rate': 1.2470138557095079e-05, 'epoch': 1.59}


 80%|███████▉  | 523/656 [28:41<06:37,  2.99s/it]

{'loss': 0.0711, 'learning_rate': 1.2494027711419017e-05, 'epoch': 1.59}


 80%|███████▉  | 524/656 [28:44<06:48,  3.10s/it]

{'loss': 0.0408, 'learning_rate': 1.2517916865742954e-05, 'epoch': 1.6}


 80%|████████  | 525/656 [28:47<06:26,  2.95s/it]

{'loss': 0.2369, 'learning_rate': 1.254180602006689e-05, 'epoch': 1.6}


 80%|████████  | 526/656 [28:50<06:13,  2.87s/it]

{'loss': 0.06, 'learning_rate': 1.2565695174390826e-05, 'epoch': 1.6}


 80%|████████  | 527/656 [28:52<06:10,  2.87s/it]

{'loss': 0.0302, 'learning_rate': 1.2589584328714766e-05, 'epoch': 1.61}


 80%|████████  | 528/656 [28:55<06:16,  2.94s/it]

{'loss': 0.0847, 'learning_rate': 1.2613473483038702e-05, 'epoch': 1.61}


 81%|████████  | 529/656 [28:59<06:16,  2.97s/it]

{'loss': 0.0377, 'learning_rate': 1.2637362637362638e-05, 'epoch': 1.61}


 81%|████████  | 530/656 [29:01<06:04,  2.89s/it]

{'loss': 0.0349, 'learning_rate': 1.2661251791686574e-05, 'epoch': 1.62}


 81%|████████  | 531/656 [29:04<05:46,  2.77s/it]

{'loss': 0.1808, 'learning_rate': 1.2685140946010512e-05, 'epoch': 1.62}


 81%|████████  | 532/656 [29:07<05:47,  2.80s/it]

{'loss': 0.1485, 'learning_rate': 1.270903010033445e-05, 'epoch': 1.62}


 81%|████████▏ | 533/656 [29:10<05:50,  2.85s/it]

{'loss': 0.0378, 'learning_rate': 1.2732919254658385e-05, 'epoch': 1.62}


 81%|████████▏ | 534/656 [29:13<06:25,  3.16s/it]

{'loss': 0.047, 'learning_rate': 1.2756808408982323e-05, 'epoch': 1.63}


 82%|████████▏ | 535/656 [29:17<06:19,  3.14s/it]

{'loss': 0.0463, 'learning_rate': 1.278069756330626e-05, 'epoch': 1.63}


 82%|████████▏ | 536/656 [29:19<06:07,  3.06s/it]

{'loss': 0.0391, 'learning_rate': 1.2804586717630195e-05, 'epoch': 1.63}


 82%|████████▏ | 537/656 [29:22<06:03,  3.05s/it]

{'loss': 0.2932, 'learning_rate': 1.2828475871954135e-05, 'epoch': 1.64}


 82%|████████▏ | 538/656 [29:25<06:00,  3.06s/it]

{'loss': 0.0326, 'learning_rate': 1.285236502627807e-05, 'epoch': 1.64}


 82%|████████▏ | 539/656 [29:29<06:04,  3.12s/it]

{'loss': 0.0512, 'learning_rate': 1.2876254180602007e-05, 'epoch': 1.64}


 82%|████████▏ | 540/656 [29:32<05:56,  3.07s/it]

{'loss': 0.0695, 'learning_rate': 1.2900143334925943e-05, 'epoch': 1.65}


 82%|████████▏ | 541/656 [29:34<05:36,  2.93s/it]

{'loss': 0.1056, 'learning_rate': 1.2924032489249882e-05, 'epoch': 1.65}


 83%|████████▎ | 542/656 [29:38<05:44,  3.02s/it]

{'loss': 0.0505, 'learning_rate': 1.2947921643573818e-05, 'epoch': 1.65}


 83%|████████▎ | 543/656 [29:40<05:32,  2.95s/it]

{'loss': 0.0333, 'learning_rate': 1.2971810797897754e-05, 'epoch': 1.66}


 83%|████████▎ | 544/656 [29:43<05:27,  2.92s/it]

{'loss': 0.1373, 'learning_rate': 1.2995699952221694e-05, 'epoch': 1.66}


 83%|████████▎ | 545/656 [29:46<05:24,  2.92s/it]

{'loss': 0.0887, 'learning_rate': 1.301958910654563e-05, 'epoch': 1.66}


 83%|████████▎ | 546/656 [29:49<05:14,  2.86s/it]

{'loss': 0.0569, 'learning_rate': 1.3043478260869566e-05, 'epoch': 1.66}


 83%|████████▎ | 547/656 [29:52<05:12,  2.87s/it]

{'loss': 0.0516, 'learning_rate': 1.3067367415193502e-05, 'epoch': 1.67}


 84%|████████▎ | 548/656 [29:55<05:28,  3.04s/it]

{'loss': 0.0734, 'learning_rate': 1.3091256569517441e-05, 'epoch': 1.67}


 84%|████████▎ | 549/656 [29:59<05:37,  3.15s/it]

{'loss': 0.0705, 'learning_rate': 1.3115145723841377e-05, 'epoch': 1.67}


 84%|████████▍ | 550/656 [30:02<05:45,  3.26s/it]

{'loss': 0.0393, 'learning_rate': 1.3139034878165313e-05, 'epoch': 1.68}


 84%|████████▍ | 551/656 [30:05<05:42,  3.27s/it]

{'loss': 0.1383, 'learning_rate': 1.3162924032489251e-05, 'epoch': 1.68}


 84%|████████▍ | 552/656 [30:08<05:33,  3.21s/it]

{'loss': 0.0411, 'learning_rate': 1.3186813186813187e-05, 'epoch': 1.68}


 84%|████████▍ | 553/656 [30:12<05:47,  3.38s/it]

{'loss': 0.1631, 'learning_rate': 1.3210702341137123e-05, 'epoch': 1.69}


 84%|████████▍ | 554/656 [30:16<05:45,  3.39s/it]

{'loss': 0.0411, 'learning_rate': 1.323459149546106e-05, 'epoch': 1.69}


 85%|████████▍ | 555/656 [30:19<05:27,  3.25s/it]

{'loss': 0.0323, 'learning_rate': 1.3258480649784999e-05, 'epoch': 1.69}


 85%|████████▍ | 556/656 [30:22<05:22,  3.22s/it]

{'loss': 0.0324, 'learning_rate': 1.3282369804108935e-05, 'epoch': 1.7}


 85%|████████▍ | 557/656 [30:25<05:08,  3.12s/it]

{'loss': 0.0955, 'learning_rate': 1.330625895843287e-05, 'epoch': 1.7}


 85%|████████▌ | 558/656 [30:27<04:54,  3.00s/it]

{'loss': 0.0458, 'learning_rate': 1.333014811275681e-05, 'epoch': 1.7}


 85%|████████▌ | 559/656 [30:30<04:55,  3.05s/it]

{'loss': 0.0338, 'learning_rate': 1.3354037267080746e-05, 'epoch': 1.7}


 85%|████████▌ | 560/656 [30:33<04:43,  2.95s/it]

{'loss': 0.0382, 'learning_rate': 1.3377926421404682e-05, 'epoch': 1.71}


 86%|████████▌ | 561/656 [30:36<04:40,  2.96s/it]

{'loss': 0.1102, 'learning_rate': 1.3401815575728622e-05, 'epoch': 1.71}


 86%|████████▌ | 562/656 [30:39<04:39,  2.97s/it]

{'loss': 0.0313, 'learning_rate': 1.3425704730052558e-05, 'epoch': 1.71}


 86%|████████▌ | 563/656 [30:42<04:24,  2.85s/it]

{'loss': 0.033, 'learning_rate': 1.3449593884376494e-05, 'epoch': 1.72}


 86%|████████▌ | 564/656 [30:44<04:18,  2.81s/it]

{'loss': 0.0347, 'learning_rate': 1.347348303870043e-05, 'epoch': 1.72}


 86%|████████▌ | 565/656 [30:47<04:21,  2.88s/it]

{'loss': 0.0423, 'learning_rate': 1.3497372193024369e-05, 'epoch': 1.72}


 86%|████████▋ | 566/656 [30:50<04:15,  2.83s/it]

{'loss': 0.0666, 'learning_rate': 1.3521261347348305e-05, 'epoch': 1.73}


 86%|████████▋ | 567/656 [30:53<04:10,  2.82s/it]

{'loss': 0.152, 'learning_rate': 1.3545150501672241e-05, 'epoch': 1.73}


 87%|████████▋ | 568/656 [30:56<04:07,  2.81s/it]

{'loss': 0.194, 'learning_rate': 1.3569039655996179e-05, 'epoch': 1.73}


 87%|████████▋ | 569/656 [30:58<03:59,  2.75s/it]

{'loss': 0.0317, 'learning_rate': 1.3592928810320115e-05, 'epoch': 1.73}


 87%|████████▋ | 570/656 [31:03<04:33,  3.18s/it]

{'loss': 0.0458, 'learning_rate': 1.3616817964644053e-05, 'epoch': 1.74}


 87%|████████▋ | 571/656 [31:06<04:29,  3.17s/it]

{'loss': 0.1448, 'learning_rate': 1.3640707118967989e-05, 'epoch': 1.74}


 87%|████████▋ | 572/656 [31:09<04:20,  3.10s/it]

{'loss': 0.1669, 'learning_rate': 1.3664596273291926e-05, 'epoch': 1.74}


 87%|████████▋ | 573/656 [31:11<04:09,  3.00s/it]

{'loss': 0.1054, 'learning_rate': 1.3688485427615862e-05, 'epoch': 1.75}


 88%|████████▊ | 574/656 [31:14<04:04,  2.98s/it]

{'loss': 0.0356, 'learning_rate': 1.3712374581939799e-05, 'epoch': 1.75}


 88%|████████▊ | 575/656 [31:17<03:57,  2.94s/it]

{'loss': 0.3312, 'learning_rate': 1.3736263736263738e-05, 'epoch': 1.75}


 88%|████████▊ | 576/656 [31:21<04:06,  3.08s/it]

{'loss': 0.0466, 'learning_rate': 1.3760152890587674e-05, 'epoch': 1.76}


 88%|████████▊ | 577/656 [31:23<03:55,  2.98s/it]

{'loss': 0.1763, 'learning_rate': 1.378404204491161e-05, 'epoch': 1.76}


 88%|████████▊ | 578/656 [31:26<03:53,  2.99s/it]

{'loss': 0.037, 'learning_rate': 1.3807931199235546e-05, 'epoch': 1.76}


 88%|████████▊ | 579/656 [31:29<03:49,  2.97s/it]

{'loss': 0.2124, 'learning_rate': 1.3831820353559485e-05, 'epoch': 1.77}


 88%|████████▊ | 580/656 [31:32<03:43,  2.94s/it]

{'loss': 0.1034, 'learning_rate': 1.3855709507883422e-05, 'epoch': 1.77}


 89%|████████▊ | 581/656 [31:36<04:06,  3.28s/it]

{'loss': 0.07, 'learning_rate': 1.3879598662207358e-05, 'epoch': 1.77}


 89%|████████▊ | 582/656 [31:40<04:02,  3.28s/it]

{'loss': 0.0292, 'learning_rate': 1.3903487816531297e-05, 'epoch': 1.77}


 89%|████████▉ | 583/656 [31:43<04:11,  3.44s/it]

{'loss': 0.0482, 'learning_rate': 1.3927376970855233e-05, 'epoch': 1.78}


 89%|████████▉ | 584/656 [31:48<04:39,  3.88s/it]

{'loss': 0.0475, 'learning_rate': 1.3951266125179169e-05, 'epoch': 1.78}


 89%|████████▉ | 585/656 [31:53<04:47,  4.05s/it]

{'loss': 0.0739, 'learning_rate': 1.3975155279503105e-05, 'epoch': 1.78}


 89%|████████▉ | 586/656 [31:56<04:29,  3.84s/it]

{'loss': 0.033, 'learning_rate': 1.3999044433827045e-05, 'epoch': 1.79}


 89%|████████▉ | 587/656 [31:59<04:15,  3.70s/it]

{'loss': 0.0303, 'learning_rate': 1.402293358815098e-05, 'epoch': 1.79}


 90%|████████▉ | 588/656 [32:02<03:51,  3.40s/it]

{'loss': 0.1455, 'learning_rate': 1.4046822742474917e-05, 'epoch': 1.79}


 90%|████████▉ | 589/656 [32:05<03:36,  3.23s/it]

{'loss': 0.0306, 'learning_rate': 1.4070711896798854e-05, 'epoch': 1.8}


 90%|████████▉ | 590/656 [32:09<03:42,  3.37s/it]

{'loss': 0.1367, 'learning_rate': 1.409460105112279e-05, 'epoch': 1.8}


 90%|█████████ | 591/656 [32:12<03:37,  3.35s/it]

{'loss': 0.0257, 'learning_rate': 1.4118490205446726e-05, 'epoch': 1.8}


 90%|█████████ | 592/656 [32:15<03:22,  3.16s/it]

{'loss': 0.0253, 'learning_rate': 1.4142379359770666e-05, 'epoch': 1.8}


 90%|█████████ | 593/656 [32:17<03:09,  3.01s/it]

{'loss': 0.1087, 'learning_rate': 1.4166268514094602e-05, 'epoch': 1.81}


 91%|█████████ | 594/656 [32:20<03:00,  2.92s/it]

{'loss': 0.0252, 'learning_rate': 1.4190157668418538e-05, 'epoch': 1.81}


 91%|█████████ | 595/656 [32:23<02:59,  2.95s/it]

{'loss': 0.0274, 'learning_rate': 1.4214046822742474e-05, 'epoch': 1.81}


 91%|█████████ | 596/656 [32:26<02:58,  2.97s/it]

{'loss': 0.0262, 'learning_rate': 1.4237935977066413e-05, 'epoch': 1.82}


 91%|█████████ | 597/656 [32:29<02:49,  2.87s/it]

{'loss': 0.0253, 'learning_rate': 1.426182513139035e-05, 'epoch': 1.82}


 91%|█████████ | 598/656 [32:31<02:43,  2.82s/it]

{'loss': 0.1031, 'learning_rate': 1.4285714285714285e-05, 'epoch': 1.82}


 91%|█████████▏| 599/656 [32:34<02:43,  2.87s/it]

{'loss': 0.0537, 'learning_rate': 1.4309603440038225e-05, 'epoch': 1.83}


 91%|█████████▏| 600/656 [32:38<02:51,  3.06s/it]

{'loss': 0.0265, 'learning_rate': 1.4333492594362161e-05, 'epoch': 1.83}


 92%|█████████▏| 601/656 [32:41<02:45,  3.01s/it]

{'loss': 0.041, 'learning_rate': 1.4357381748686097e-05, 'epoch': 1.83}


 92%|█████████▏| 602/656 [32:44<02:39,  2.95s/it]

{'loss': 0.137, 'learning_rate': 1.4381270903010033e-05, 'epoch': 1.84}


 92%|█████████▏| 603/656 [32:47<02:36,  2.95s/it]

{'loss': 0.0289, 'learning_rate': 1.4405160057333972e-05, 'epoch': 1.84}


 92%|█████████▏| 604/656 [32:50<02:46,  3.21s/it]

{'loss': 0.1646, 'learning_rate': 1.4429049211657908e-05, 'epoch': 1.84}


 92%|█████████▏| 605/656 [32:54<02:46,  3.27s/it]

{'loss': 0.0209, 'learning_rate': 1.4452938365981844e-05, 'epoch': 1.84}


 92%|█████████▏| 606/656 [32:57<02:48,  3.37s/it]

{'loss': 0.026, 'learning_rate': 1.4476827520305782e-05, 'epoch': 1.85}


 93%|█████████▎| 607/656 [33:01<02:42,  3.32s/it]

{'loss': 0.1352, 'learning_rate': 1.4500716674629718e-05, 'epoch': 1.85}


 93%|█████████▎| 608/656 [33:03<02:29,  3.11s/it]

{'loss': 0.0293, 'learning_rate': 1.4524605828953656e-05, 'epoch': 1.85}


 93%|█████████▎| 609/656 [33:06<02:19,  2.97s/it]

{'loss': 0.1029, 'learning_rate': 1.4548494983277592e-05, 'epoch': 1.86}


 93%|█████████▎| 610/656 [33:08<02:09,  2.81s/it]

{'loss': 0.0245, 'learning_rate': 1.457238413760153e-05, 'epoch': 1.86}


 93%|█████████▎| 611/656 [33:11<01:58,  2.64s/it]

{'loss': 0.0248, 'learning_rate': 1.4596273291925466e-05, 'epoch': 1.86}


 93%|█████████▎| 612/656 [33:13<01:59,  2.71s/it]

{'loss': 0.0405, 'learning_rate': 1.4620162446249402e-05, 'epoch': 1.87}


 93%|█████████▎| 613/656 [33:16<01:53,  2.63s/it]

{'loss': 0.0802, 'learning_rate': 1.4644051600573341e-05, 'epoch': 1.87}


 94%|█████████▎| 614/656 [33:18<01:44,  2.48s/it]

{'loss': 0.0245, 'learning_rate': 1.4667940754897277e-05, 'epoch': 1.87}


 94%|█████████▍| 615/656 [35:11<24:19, 35.60s/it]

{'loss': 0.0623, 'learning_rate': 1.4691829909221213e-05, 'epoch': 1.88}


 94%|█████████▍| 616/656 [35:12<16:50, 25.25s/it]

{'loss': 0.0427, 'learning_rate': 1.4715719063545153e-05, 'epoch': 1.88}


 94%|█████████▍| 617/656 [35:13<11:43, 18.04s/it]

{'loss': 0.1501, 'learning_rate': 1.4739608217869089e-05, 'epoch': 1.88}


 94%|█████████▍| 618/656 [35:15<08:15, 13.03s/it]

{'loss': 0.1278, 'learning_rate': 1.4763497372193025e-05, 'epoch': 1.88}


 94%|█████████▍| 619/656 [35:16<05:52,  9.52s/it]

{'loss': 0.0283, 'learning_rate': 1.478738652651696e-05, 'epoch': 1.89}


 95%|█████████▍| 620/656 [35:17<04:14,  7.06s/it]

{'loss': 0.0288, 'learning_rate': 1.48112756808409e-05, 'epoch': 1.89}


 95%|█████████▍| 621/656 [35:18<03:04,  5.28s/it]

{'loss': 0.0195, 'learning_rate': 1.4835164835164836e-05, 'epoch': 1.89}


 95%|█████████▍| 622/656 [35:20<02:20,  4.13s/it]

{'loss': 0.0265, 'learning_rate': 1.4859053989488772e-05, 'epoch': 1.9}


 95%|█████████▍| 623/656 [35:21<01:49,  3.30s/it]

{'loss': 0.024, 'learning_rate': 1.4882943143812712e-05, 'epoch': 1.9}


 95%|█████████▌| 624/656 [35:22<01:26,  2.70s/it]

{'loss': 0.0287, 'learning_rate': 1.4906832298136648e-05, 'epoch': 1.9}


 95%|█████████▌| 625/656 [35:24<01:09,  2.23s/it]

{'loss': 0.0191, 'learning_rate': 1.4930721452460584e-05, 'epoch': 1.91}


 95%|█████████▌| 626/656 [35:25<00:58,  1.95s/it]

{'loss': 0.1508, 'learning_rate': 1.495461060678452e-05, 'epoch': 1.91}


 96%|█████████▌| 627/656 [35:26<00:51,  1.77s/it]

{'loss': 0.0217, 'learning_rate': 1.4978499761108458e-05, 'epoch': 1.91}


 96%|█████████▌| 628/656 [35:28<00:46,  1.65s/it]

{'loss': 0.1772, 'learning_rate': 1.5002388915432394e-05, 'epoch': 1.91}


 96%|█████████▌| 629/656 [35:29<00:39,  1.45s/it]

{'loss': 0.0215, 'learning_rate': 1.502627806975633e-05, 'epoch': 1.92}


 96%|█████████▌| 630/656 [35:30<00:36,  1.40s/it]

{'loss': 0.1043, 'learning_rate': 1.5050167224080269e-05, 'epoch': 1.92}


 96%|█████████▌| 631/656 [35:31<00:32,  1.28s/it]

{'loss': 0.0246, 'learning_rate': 1.5074056378404205e-05, 'epoch': 1.92}


 96%|█████████▋| 632/656 [35:32<00:29,  1.21s/it]

{'loss': 0.0566, 'learning_rate': 1.5097945532728141e-05, 'epoch': 1.93}


 96%|█████████▋| 633/656 [35:33<00:27,  1.22s/it]

{'loss': 0.0227, 'learning_rate': 1.5121834687052077e-05, 'epoch': 1.93}


 97%|█████████▋| 634/656 [35:34<00:27,  1.23s/it]

{'loss': 0.0601, 'learning_rate': 1.5145723841376017e-05, 'epoch': 1.93}


 97%|█████████▋| 635/656 [35:36<00:25,  1.24s/it]

{'loss': 0.0257, 'learning_rate': 1.5169612995699953e-05, 'epoch': 1.94}


 97%|█████████▋| 636/656 [35:37<00:23,  1.19s/it]

{'loss': 0.1353, 'learning_rate': 1.5193502150023889e-05, 'epoch': 1.94}


 97%|█████████▋| 637/656 [35:38<00:22,  1.16s/it]

{'loss': 0.1379, 'learning_rate': 1.5217391304347828e-05, 'epoch': 1.94}


 97%|█████████▋| 638/656 [35:39<00:20,  1.14s/it]

{'loss': 0.0192, 'learning_rate': 1.5241280458671764e-05, 'epoch': 1.95}


 97%|█████████▋| 639/656 [35:40<00:18,  1.10s/it]

{'loss': 0.0238, 'learning_rate': 1.52651696129957e-05, 'epoch': 1.95}


 98%|█████████▊| 640/656 [35:42<00:19,  1.24s/it]

{'loss': 0.0871, 'learning_rate': 1.528905876731964e-05, 'epoch': 1.95}


 98%|█████████▊| 641/656 [35:43<00:19,  1.28s/it]

{'loss': 0.1058, 'learning_rate': 1.5312947921643576e-05, 'epoch': 1.95}


 98%|█████████▊| 642/656 [35:45<00:20,  1.45s/it]

{'loss': 0.0225, 'learning_rate': 1.5336837075967512e-05, 'epoch': 1.96}


 98%|█████████▊| 643/656 [35:47<00:20,  1.56s/it]

{'loss': 0.0234, 'learning_rate': 1.5360726230291448e-05, 'epoch': 1.96}


 98%|█████████▊| 644/656 [35:48<00:18,  1.55s/it]

{'loss': 0.02, 'learning_rate': 1.5384615384615387e-05, 'epoch': 1.96}


 98%|█████████▊| 645/656 [35:50<00:17,  1.57s/it]

{'loss': 0.0218, 'learning_rate': 1.5408504538939323e-05, 'epoch': 1.97}


 98%|█████████▊| 646/656 [35:52<00:16,  1.65s/it]

{'loss': 0.0914, 'learning_rate': 1.543239369326326e-05, 'epoch': 1.97}


 99%|█████████▊| 647/656 [35:53<00:15,  1.71s/it]

{'loss': 0.1259, 'learning_rate': 1.54562828475872e-05, 'epoch': 1.97}


 99%|█████████▉| 648/656 [35:55<00:13,  1.73s/it]

{'loss': 0.0849, 'learning_rate': 1.5480172001911135e-05, 'epoch': 1.98}


 99%|█████████▉| 649/656 [35:57<00:12,  1.82s/it]

{'loss': 0.0456, 'learning_rate': 1.550406115623507e-05, 'epoch': 1.98}


 99%|█████████▉| 650/656 [35:59<00:11,  1.87s/it]

{'loss': 0.1953, 'learning_rate': 1.5527950310559007e-05, 'epoch': 1.98}


 99%|█████████▉| 651/656 [36:01<00:09,  1.98s/it]

{'loss': 0.0418, 'learning_rate': 1.5551839464882946e-05, 'epoch': 1.98}


 99%|█████████▉| 652/656 [36:04<00:08,  2.18s/it]

{'loss': 0.0183, 'learning_rate': 1.5575728619206882e-05, 'epoch': 1.99}


100%|█████████▉| 653/656 [36:07<00:06,  2.27s/it]

{'loss': 0.0224, 'learning_rate': 1.5599617773530818e-05, 'epoch': 1.99}


100%|█████████▉| 654/656 [36:09<00:04,  2.41s/it]

{'loss': 0.0294, 'learning_rate': 1.5623506927854754e-05, 'epoch': 1.99}


100%|█████████▉| 655/656 [36:11<00:02,  2.27s/it]

{'loss': 0.0206, 'learning_rate': 1.564739608217869e-05, 'epoch': 2.0}


100%|██████████| 656/656 [36:12<00:00,  1.82s/it]The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, utterance, tokens. If token_labels, utterance, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32


{'loss': 0.0192, 'learning_rate': 1.5671285236502626e-05, 'epoch': 2.0}


                                                 
100%|██████████| 656/656 [37:04<00:00,  1.82s/it]Saving model checkpoint to ./snips_clf\tmp-checkpoint-656
Configuration saved in ./snips_clf\tmp-checkpoint-656\config.json


{'eval_loss': 0.058428067713975906, 'eval_accuracy': 0.9862437905999236, 'eval_runtime': 52.2054, 'eval_samples_per_second': 50.129, 'eval_steps_per_second': 1.571, 'epoch': 2.0}


Model weights saved in ./snips_clf\tmp-checkpoint-656\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./snips_clf\checkpoint-656 (score: 0.058428067713975906).
100%|██████████| 656/656 [37:07<00:00,  3.39s/it]

{'train_runtime': 2227.0718, 'train_samples_per_second': 9.4, 'train_steps_per_second': 0.295, 'train_loss': 0.7253907033463758, 'epoch': 2.0}





TrainOutput(global_step=656, training_loss=0.7253907033463758, metrics={'train_runtime': 2227.0718, 'train_samples_per_second': 9.4, 'train_steps_per_second': 0.295, 'train_loss': 0.7253907033463758, 'epoch': 2.0})

In [23]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_labels, utterance, tokens. If token_labels, utterance, tokens are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2617
  Batch size = 32
100%|██████████| 82/82 [00:28<00:00,  2.91it/s]


{'eval_loss': 0.058428067713975906,
 'eval_accuracy': 0.9862437905999236,
 'eval_runtime': 28.748,
 'eval_samples_per_second': 91.033,
 'eval_steps_per_second': 2.852,
 'epoch': 2.0}

In [31]:
sequence_clf_model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [25]:
pipe = pipeline("text-classification", sequence_clf_model, tokenizer=tokenizer)
pipe('Please add stairway to heaven by Led Zepplin to my playlist')

[{'label': 'AddToPlaylist', 'score': 0.989997148513794}]

In [26]:
trainer.save_model()

Saving model checkpoint to ./snips_clf
Configuration saved in ./snips_clf\config.json
Model weights saved in ./snips_clf\pytorch_model.bin


#### Freezing the entire pre-trained BERT except the classifification layer to Finetune it

In [28]:
frozen_sequence_clf_model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-cased', 
    num_labels=len(unique_sequence_labels),
)

config.json: 100%|██████████| 465/465 [00:00<00:00, 464kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
loading configuration file config.json from cache at C:\Users\anzel\.cache\huggingface\hub\models--distilbert-base-cased\snapshots\0dacbb01d604f8adeeb5b87c9339e485ac40d5c0\config.json
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
   

In [30]:
frozen_sequence_clf_model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [29]:
for param in frozen_sequence_clf_model.distilbert.parameters():
    param.requires_grad = False

In [38]:
epochs = 1

training_args = TrainingArguments(
    output_dir="./snips_clf/frozen-seq-classifier",
    num_train_epochs=epochs,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    load_best_model_at_end=True,
    
    # some deep learning parameters that the Trainer is able to take in
    warmup_steps=len(seq_clf_tokenized_snips['train']) // 5,  # number of warmup steps for learning rate scheduler,
    weight_decay = 0.05,
    
    logging_steps=1,
    log_level='info',
    evaluation_strategy='epoch',
    eval_steps=50,
    save_strategy='epoch'
)

# Define the trainer:

trainer = Trainer(
    model=frozen_sequence_clf_model,
    args=training_args,
    train_dataset=seq_clf_tokenized_snips['train'],
    eval_dataset=seq_clf_tokenized_snips['test'],
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [37]:
# trainer.evaluate()

In [40]:
# trainer.train()
# THis will yield worse score than updating all the parameters