In [1]:
from google.colab import drive
drive.mount('drive')

Drive already mounted at drive; to attempt to forcibly remount, call drive.mount("drive", force_remount=True).


In [2]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.3-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m94.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m88.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.3
Looking in indexes: https://pypi.org/simple, http

In [3]:
#Testing an example

example_text = "Her word had the strength of titanium.Her promises can be [MASK]."

In [4]:
from transformers import TFAutoModelForMaskedLM

model_checkpoint = "distilbert-base-uncased"
model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/363M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForMaskedLM: ['activation_13']
- This IS expected if you are initializing TFDistilBertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertForMaskedLM were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForMaskedLM for predictions without further training.


In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [6]:
import numpy as np
import tensorflow as tf

inputs = tokenizer(example_text, return_tensors="np")
token_logits = model(**inputs).logits
# Find the location of [MASK] and extract its logits
mask_token_index = np.argwhere(inputs["input_ids"] == tokenizer.mask_token_id)[0, 1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the [MASK] candidates with the highest logits
# We negate the array before argsort to get the largest, not the smallest, logits
top_5_tokens = np.argsort(-mask_token_logits)[:5].tolist()

for token in top_5_tokens:
    print(f">>> {example_text.replace(tokenizer.mask_token, tokenizer.decode([token]))}")

>>> Her word had the strength of titanium.Her promises can be broken.
>>> Her word had the strength of titanium.Her promises can be trusted.
>>> Her word had the strength of titanium.Her promises can be fulfilled.
>>> Her word had the strength of titanium.Her promises can be forged.
>>> Her word had the strength of titanium.Her promises can be damned.


In [7]:
from datasets import load_dataset

dataset = load_dataset('text', data_files="/content/drive/MyDrive/Deep learning project /commonsense.txt")

print(dataset)

Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-7f645fb7f7b288d9/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-7f645fb7f7b288d9/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1458
    })
})


In [8]:
def tokenize_function(examples):
    result = tokenizer(examples['text'])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result

# Use batched=True to activate fast multithreading!
tokenized_datasets = dataset.map(
    tokenize_function, batched=True, remove_columns=['text']
)
tokenized_datasets

Map:   0%|          | 0/1458 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 1458
    })
})

In [9]:
tokenizer.model_max_length

512

In [10]:
chunk_size = 128

In [11]:
# Slicing produces a list of lists for each feature
tokenized_samples = tokenized_datasets["train"][:3]

for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Review {idx} length: {len(sample)}'")

'>>> Review 0 length: 16'
'>>> Review 1 length: 18'
'>>> Review 2 length: 24'


In [12]:
concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")

'>>> Concatenated reviews length: 58'


In [13]:
chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

'>>> Chunk length: 58'


In [14]:
def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [15]:
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Map:   0%|          | 0/1458 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 222
    })
})

In [16]:
tokenizer.decode(lm_datasets["train"][1]["input_ids"])

'grumpy as an old man. he was mean. [SEP] [CLS] the dog was as grumpy as a kindergarten teacher. he was nice. [SEP] [CLS] the voyage was as long as a lifetime. the voyage was very long. [SEP] [CLS] the voyage was as long as the blink of an eye. the voyage as short. [SEP] [CLS] he had all the wealth of a tycoon. he was rich. [SEP] [CLS] he had all the wealth of a hobo. he was poor. [SEP] [CLS] it smells like a freshly baked cookies on christmas morning. it smells great. [SEP] [CLS] it smells like a cesspool sitting in the'

In [17]:
tokenizer.decode(lm_datasets["train"][1]["labels"])

'grumpy as an old man. he was mean. [SEP] [CLS] the dog was as grumpy as a kindergarten teacher. he was nice. [SEP] [CLS] the voyage was as long as a lifetime. the voyage was very long. [SEP] [CLS] the voyage was as long as the blink of an eye. the voyage as short. [SEP] [CLS] he had all the wealth of a tycoon. he was rich. [SEP] [CLS] he had all the wealth of a hobo. he was poor. [SEP] [CLS] it smells like a freshly baked cookies on christmas morning. it smells great. [SEP] [CLS] it smells like a cesspool sitting in the'

In [18]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [19]:
samples = [lm_datasets["train"][i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.



'>>> [CLS] her word had [MASK] strength of [MASK]. [MASK] promises can be believed. [SEP] [CLS] her word [MASK] the [MASK] of a wine glass. [MASK] promises cannot be trusted. [SEP] [CLS] his [MASK] have the passion of lovers meeting after a long separation. his kisses are demonstrative and intense. [SEP] [CLS] his kisses have the conflict of a couple in a loveless marriage [MASK] his kiss is unemotional. [SEP] [CLS] this winter is as cold as my mother - in - law towards me. it's very cold. [SEP] [CLS] this winterreus as cold as [MASK] [MASK] at [MASK]. it'[MASK] [MASK] warm. [SEP] [CLS]athy dog was as'

'>>> grumpy [MASK] an [MASK] man. he was mean. [SEP] [CLS] the dog was [MASK] [MASK] [MASK]y [MASK] a kindergarten teacher. he was [MASK]. [SEP] [CLS] the [MASK] was as [MASK] as [MASK] lifetime. the voyage was very long. [SEP] [CLS] the voyage was [MASK] long as the blink of an eye. the voyage as short [MASK] [SEP] [CLS] he had all the wealth of [MASK] tycoon. he was rich. [SEP] [CLS]

In [20]:
train_size = 200
test_size = int(0.1 * train_size)

downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 200
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 20
    })
})

In [21]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [22]:
#Starting the training

tf_train_dataset = model.prepare_tf_dataset(
    downsampled_dataset["train"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=32,
)

tf_eval_dataset = model.prepare_tf_dataset(
    downsampled_dataset["test"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=32,
)

In [23]:
from transformers import create_optimizer
from transformers.keras_callbacks import PushToHubCallback
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

num_train_steps = len(tf_train_dataset)
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=1_000,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
#loss_function = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
#model.compile(loss=loss_function, optimizer=optimizer)
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [24]:
import math

eval_loss = model.evaluate(tf_eval_dataset)
print(f"Perplexity: {math.exp(eval_loss):.2f}")

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


Perplexity: 11.76


In [25]:
model.fit(tf_train_dataset, validation_data=tf_eval_dataset)
eval_loss = model.evaluate(tf_eval_dataset)
print(f"Perplexity: {math.exp(eval_loss):.2f}")

Perplexity: 17.77


In [26]:
#Saving the model

path_model = "/content/drive/MyDrive/Deep learning project /my_model.pt"
model.save_pretrained(path_model)