## Import and setup

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
%cd gdrive/MyDrive/Colab\ Notebooks

/content/gdrive/MyDrive/Colab Notebooks


In [11]:
!pip install transformers
!pip install datasets

#dl imports
from transformers import pipeline
from datasets import load_dataset, Dataset, ClassLabel, load_from_disk, DatasetDict
#from huggingface_hub import notebook_login

#import data science packages
import pandas as pd
import numpy as np
import seaborn as sns

#import file helper packages
import glob
import requests



In [7]:
from huggingface_hub import notebook_login

ImportError: cannot import name 'notebook_login' from 'huggingface_hub' (C:\Users\bookw\anaconda3\lib\site-packages\huggingface_hub\__init__.py)

## Import data and convert to HF Dataset object

In [None]:
dreams = pd.read_csv('dream_text_corpus.csv')

In [None]:
dataset_pd = Dataset.from_pandas(dreams)

In [None]:
dataset_pd

Dataset({
    features: ['Unnamed: 0', 'counts', 'dreams', 'titles'],
    num_rows: 26401
})

In [None]:
dataset_pd.features

{'Unnamed: 0': Value(dtype='int64', id=None),
 'counts': Value(dtype='string', id=None),
 'dreams': Value(dtype='string', id=None),
 'titles': Value(dtype='string', id=None)}

## Make train/valid/test split

In [None]:
dataset = dataset_pd.train_test_split(test_size=0.2)

NameError: ignored

In [None]:
dataset['train']['text']

In [None]:
train_val_dataset = dataset['train'].train_test_split(test_size = 0.3)

In [None]:
dataset['train'] = train_val_dataset['train']
dataset['valid'] = train_val_dataset['test']

In [None]:
dataset['train']['text']

## Upload to HF hub

In [5]:
!git config --global credential.helper store

In [6]:
notebook_login()

NameError: name 'notebook_login' is not defined

In [None]:
dataset.push_to_hub('eadsa1998/dataset', private=True)

Pushing split train to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split test to the Hub.
The repository already exists: the `private` keyword argument will be ignored.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split valid to the Hub.
The repository already exists: the `private` keyword argument will be ignored.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

## Following HF Courses 'Fine-tuning an MLM'

In [10]:
#if this doesn't work, reload notebook_login
from datasets import load_dataset

dataset = load_dataset("eadsa1998/dataset")

FileNotFoundError: Couldn't find file locally at dataset\dataset.py, or remotely at https://raw.githubusercontent.com/huggingface/datasets/1.7.0/datasets/dataset/dataset.py.
The file is also not present on the master branch on github.

In [None]:
from transformers import AutoModelForMaskedLM

model_checkpoint = "distilbert-base-uncased"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

In [None]:
distilbert_num_parameters = model.num_parameters() / 1_000_000
print(f"'>>> DistilBERT number of parameters: {round(distilbert_num_parameters)}M'")
print(f"'>>> BERT number of parameters: 110M'")

'>>> DistilBERT number of parameters: 67M'
'>>> BERT number of parameters: 110M'


In [None]:
text = "This is a great [MASK]."

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [None]:
import torch

inputs = tokenizer(text, return_tensors="pt")
token_logits = model(**inputs).logits
# Find the location of [MASK] and extract its logits
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

'>>> This is a great deal.'
'>>> This is a great success.'
'>>> This is a great adventure.'
'>>> This is a great idea.'
'>>> This is a great feat.'


In [None]:
sample = dataset["train"].shuffle(seed=42).select(range(3))

for row in sample:
    print(f"\n'>>> Dream: {row['dreams']}'")
    print(f"'>>> Dreamer: {row['titles']}'")


'>>> Dream:  I'm at Mud Lake with my friend Kevin. His boss John is there, as well as John's kids. The family is on the fishing pier. Kevin and I are going fishing. Our boat in heading out from shore by the pier. Kevin and John are talking about wages. Kevin is using his characteristic cautiousness. John says something about the wage of $6.00/hour. John isn't looking at me, because he knows $6.00/hour is a poor wage and I will tell him so. I say that if I'm able to get a better paying job working less hours I'm going to have to take it. Kevin says something jokingly about getting paid $12.00/hour. We head out fishing. There are two boats: Kevin in one and I in the other. But we are not in the boats, or, at least, Kevin is not in a boat. John tells Kevin to roll the boat, for some reason having to do with what will happen to the contents of it. I am impatient to start fishing, and am annoyed with Kevin pondering whether to roll the boat and risk losing our fishing gear. I ask Kevin if 

In [None]:
#could it be this??
def tokenize_function(examples):
    result = tokenizer(examples["dreams"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# Use batched=True to activate fast multithreading!
tokenized_datasets = dataset.map(
    tokenize_function, batched=True, remove_columns=['Unnamed: 0', 'counts', 'dreams', 'titles']
)
tokenized_datasets

  0%|          | 0/15 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1093 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'word_ids'],
        num_rows: 14784
    })
    valid: Dataset({
        features: ['attention_mask', 'input_ids', 'word_ids'],
        num_rows: 6336
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'word_ids'],
        num_rows: 5281
    })
})

In [None]:
chunk_size = 128

In [None]:
# Slicing produces a list of lists for each feature
tokenized_samples = tokenized_datasets["train"][:10]

for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Dream {idx} length: {len(sample)}'")

'>>> Dream 0 length: 84'
'>>> Dream 1 length: 46'
'>>> Dream 2 length: 1093'
'>>> Dream 3 length: 110'
'>>> Dream 4 length: 70'
'>>> Dream 5 length: 93'
'>>> Dream 6 length: 99'
'>>> Dream 7 length: 395'
'>>> Dream 8 length: 108'
'>>> Dream 9 length: 237'


In [None]:
concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated dreams length: {total_length}'")

'>>> Concatenated dreams length: 2335'


In [None]:
chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 31'


In [None]:
def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'word_ids'],
        num_rows: 24019
    })
    valid: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'word_ids'],
        num_rows: 10556
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'word_ids'],
        num_rows: 8592
    })
})

In [None]:
tokenizer.decode(lm_datasets["train"][1]["input_ids"])

". [SEP] [CLS] i had gone to the doctor for some flu like symptoms. i was standing back in the nurse's area when i man came from another area. he was wearing greenish - blue scrubs. doctor pf wasn't available so i was going to be seeing doctor waddle [ unknown in waking life ]. i assumed this man was doctor waddle but his first name was steve. i said, hi steve. as soon as i did i thought, that's odd. why would i call him steve and i had never met him before and he's a doctor. that's not like me to call"

In [None]:
tokenizer.decode(lm_datasets["train"][1]["labels"])

". [SEP] [CLS] i had gone to the doctor for some flu like symptoms. i was standing back in the nurse's area when i man came from another area. he was wearing greenish - blue scrubs. doctor pf wasn't available so i was going to be seeing doctor waddle [ unknown in waking life ]. i assumed this man was doctor waddle but his first name was steve. i said, hi steve. as soon as i did i thought, that's odd. why would i call him steve and i had never met him before and he's a doctor. that's not like me to call"

In [None]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
samples = [lm_datasets["train"][i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] ezra had this thing which would make peoples legs go out to the side and their heart would burst [MASK] it didn't affect him though and i didn't want it to affect me. [MASK] i backed away, but [MASK] [MASK] saying it [MASK] [MASK] affect स far away and then i argued that people over the other side of [MASK] [MASK] [MASK]'t getting affected. i didn [MASK] [MASK] get a reply though. [SEP] [CLS] [MASK] s. was in the shop to visit. he talked with us. we visited a relation or friend of ours and he came home riding [MASK] big mountain lion. it was tame [MASK] we were not afraid of it'

'>>> . [SEP] [CLS] i had [MASK] [MASK] the [MASK] for some flu [MASK] symptoms. i was [MASK] back in the [MASK]'[MASK] area when i man [MASK] [MASK] [MASK] area. he was wearing greenish - blue scrubs. doctor pf [MASK]'t available so i was [MASK] to be seeing doctor waddle [ unknown [MASK] waking life ]. [MASK] assumed this [MASK] was doctor waddle but his first name was steve [MASK] i said, [MASK] 

In [None]:
import collections
import numpy as np

from transformers import default_data_collator

wwm_probability = 0.15


def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id

    return default_data_collator(features)

In [None]:
samples = [lm_datasets["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")



'>>> [CLS] ezra had this thing which would make [MASK] legs go out to the [MASK] [MASK] their heart would burst. [MASK] didn't affect him though and i [MASK]'t want it to affect [MASK]. so i backed away, [MASK] [MASK] followed [MASK] it would only affect people far away and then i argued [MASK] people over the other side [MASK] [MASK] world weren't [MASK] affected. i didn't get a reply though. [SEP] [CLS] hank s [MASK] was in the shop to visit. he talked with us [MASK] we [MASK] a relation or friend of ours and [MASK] came [MASK] [MASK] a big [MASK] lion. it was [MASK] [MASK] we [MASK] not [MASK] of it'

'>>> . [SEP] [CLS] i had gone to the doctor for some flu like symptoms. i was [MASK] back in the nurse's [MASK] when i man came from another area. he [MASK] wearing greenish - blue [MASK] [MASK]. doctor pf wasn't available so [MASK] was going to be seeing doctor waddle [MASK] unknown in waking life ]. i assumed this man was doctor waddle but his [MASK] name was [MASK]. i said, hi stev

## Test training example to make sure it works!

In [None]:
train_size = 10_000
test_size = int(0.1 * train_size)

downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset


DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'word_ids'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'word_ids'],
        num_rows: 1000
    })
})

In [None]:
from huggingface_hub import notebook_login

notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token


In [None]:
from datasets import load_metric
#load a metric
#metric = load_metric('accuracy')

#define the metric behavior
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
from sklearn.metrics import precision_recall_fscore_support

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    
    #get predictions by using index of max logit
    predictions = np.argmax(logits, axis=-1)
    
    #calculate classification report
    perfs = precision_recall_fscore_support(labels, predictions, average='macro', zero_division=0)
    perf_dict = dict(zip(['precision', 'recall', 'fscore'], perfs[:3]))
    
    #return dictionary
    return perf_dict

In [None]:
#add an evaluation metric????
from transformers import TrainingArguments

batch_size = 32
# Show the training loss with every epoch
logging_steps = len(downsampled_dataset["train"]) // batch_size
model_name = model_checkpoint.split("/")[-1]

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-dreams",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,
    fp16=True,
    logging_steps=logging_steps
)

In [None]:
!apt install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  git-lfs
0 upgraded, 1 newly installed, 0 to remove and 37 not upgraded.
Need to get 2,129 kB of archives.
After this operation, 7,662 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 git-lfs amd64 2.3.4-1 [2,129 kB]
Fetched 2,129 kB in 1s (1,838 kB/s)
Selecting previously unselected package git-lfs.
(Reading database ... 155229 files and directories currently installed.)
Preparing to unpack .../git-lfs_2.3.4-1_amd64.deb ...
Unpacking git-lfs (2.3.4-1) ...
Setting up git-lfs (2.3.4-1) ...
Processing triggers for man-db (2.8.3-2ubuntu0.1) ...


In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator
    #compute_metrics=compute_metrics
)

Cloning https://huggingface.co/eadsa1998/distilbert-base-uncased-finetuned-dreams into local empty directory.
Using amp half precision backend


In [None]:
torch.cuda.empty_cache()

In [None]:
import math

eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 32


RuntimeError: ignored

In [None]:
#Does the comment here matter either?
trainer.train()

The following columns in the training set  don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids.
***** Running training *****
  Num examples = 10000
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 939


Epoch,Training Loss,Validation Loss
1,2.3058,2.15259
2,2.1826,2.032212
3,2.1442,2.081021


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 32
Saving model checkpoint to distilbert-base-uncased-finetuned-dreams/checkpoint-500
Configuration saved in distilbert-base-uncased-finetuned-dreams/checkpoint-500/config.json
Model weights saved in distilbert-base-uncased-finetuned-dreams/checkpoint-500/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 32
The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 32


Training completed. Do not forget to share your model on huggingfac

TrainOutput(global_step=939, training_loss=2.2107567172715714, metrics={'train_runtime': 1912.9795, 'train_samples_per_second': 15.682, 'train_steps_per_second': 0.491, 'total_flos': 994208670720000.0, 'train_loss': 2.2107567172715714, 'epoch': 3.0})

In [None]:
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64


>>> Perplexity: 8.03


In [None]:
trainer.push_to_hub()

Saving model checkpoint to distilbert-base-uncased-finetuned-dreams
Configuration saved in distilbert-base-uncased-finetuned-dreams/config.json
Model weights saved in distilbert-base-uncased-finetuned-dreams/pytorch_model.bin
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


## Fine-tuning with Accelerate

In [None]:
def insert_random_mask(batch):
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]
    masked_inputs = data_collator(features)
    # Create a new "masked" column for each column in the dataset
    return {"masked_" + k: v.numpy() for k, v in masked_inputs.items()}

In [None]:
downsampled_dataset = downsampled_dataset.remove_columns(["word_ids"])
eval_dataset = downsampled_dataset["test"].map(
    insert_random_mask,
    batched=True,
    remove_columns=downsampled_dataset["test"].column_names,
)
eval_dataset = eval_dataset.rename_columns(
    {
        "masked_input_ids": "input_ids",
        "masked_attention_mask": "attention_mask",
        "masked_labels": "labels",
    }
)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

batch_size = 64
train_dataloader = DataLoader(
    downsampled_dataset["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    eval_dataset, batch_size=batch_size, collate_fn=default_data_collator
)

In [None]:
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.15.0",
  "vocab_size": 30522
}

loading weights file https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/9c169103d7e5a

In [None]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [None]:
!pip install accelerate
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

Collecting accelerate
  Downloading accelerate-0.5.1-py3-none-any.whl (58 kB)
[?25l[K     |█████▋                          | 10 kB 33.1 MB/s eta 0:00:01[K     |███████████▎                    | 20 kB 7.9 MB/s eta 0:00:01[K     |█████████████████               | 30 kB 7.2 MB/s eta 0:00:01[K     |██████████████████████▋         | 40 kB 6.8 MB/s eta 0:00:01[K     |████████████████████████████▎   | 51 kB 4.2 MB/s eta 0:00:01[K     |████████████████████████████████| 58 kB 2.5 MB/s 
Installing collected packages: accelerate
Successfully installed accelerate-0.5.1


In [None]:
from transformers import get_scheduler

num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
from huggingface_hub import get_full_repo_name

model_name = "distilbert-base-uncased-finetuned-dreams-accelerate"
repo_name = get_full_repo_name(model_name)
repo_name

'eadsa1998/distilbert-base-uncased-finetuned-dreams-accelerate'

In [None]:
from huggingface_hub import Repository

output_dir = model_name
repo = Repository(output_dir, clone_from=repo_name)

Cloning https://huggingface.co/eadsa1998/distilbert-base-uncased-finetuned-dreams-accelerate into local empty directory.


In [None]:
from tqdm.auto import tqdm
import torch
import math

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        losses.append(accelerator.gather(loss.repeat(batch_size)))

    losses = torch.cat(losses)
    losses = losses[: len(eval_dataset)]
    try:
        perplexity = math.exp(torch.mean(losses))
    except OverflowError:
        perplexity = float("inf")

    print(f">>> Epoch {epoch}: Perplexity: {perplexity}")

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}", blocking=False
        )

  0%|          | 0/471 [00:00<?, ?it/s]

Configuration saved in distilbert-base-uncased-finetuned-dreams-accelerate/config.json


>>> Epoch 0: Perplexity: 8.034567003820205


Model weights saved in distilbert-base-uncased-finetuned-dreams-accelerate/pytorch_model.bin
tokenizer config file saved in distilbert-base-uncased-finetuned-dreams-accelerate/tokenizer_config.json
Special tokens file saved in distilbert-base-uncased-finetuned-dreams-accelerate/special_tokens_map.json
Configuration saved in distilbert-base-uncased-finetuned-dreams-accelerate/config.json


>>> Epoch 1: Perplexity: 7.6300251443303075


Model weights saved in distilbert-base-uncased-finetuned-dreams-accelerate/pytorch_model.bin
tokenizer config file saved in distilbert-base-uncased-finetuned-dreams-accelerate/tokenizer_config.json
Special tokens file saved in distilbert-base-uncased-finetuned-dreams-accelerate/special_tokens_map.json
Several commits (2) will be pushed upstream.
Configuration saved in distilbert-base-uncased-finetuned-dreams-accelerate/config.json


>>> Epoch 2: Perplexity: 7.492879794820154


Model weights saved in distilbert-base-uncased-finetuned-dreams-accelerate/pytorch_model.bin
tokenizer config file saved in distilbert-base-uncased-finetuned-dreams-accelerate/tokenizer_config.json
Special tokens file saved in distilbert-base-uncased-finetuned-dreams-accelerate/special_tokens_map.json
Several commits (3) will be pushed upstream.


## Use it!!

In [None]:
from transformers import pipeline

#mask_filler = pipeline(
#    "fill-mask", model="eadsa1998/distilbert-base-uncased-finetuned-dreams-accelerate", use_auth_token=True
#)

mask_filler = pipeline(
    "fill-mask", model="distilbert-base-uncased-finetuned-dreams"
)

In [1]:
preds = mask_filler("I had a dream I was [MASK].")

for pred in preds:
    print(f">>> {pred['sequence']}")

NameError: ignored