<a href="https://colab.research.google.com/github/edgarbc/My_medium_posts/blob/main/tokenizer_embeddings_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tokenizer and embeddings model tutorial
by Edgar Bermudez

medium:

In [4]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load the tokenizer and the encoder from the same pretrained checkpoint
# so that the token ids produced below match the model's vocabulary.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

# Encode one sentence; return_tensors='pt' requests PyTorch tensors.
text = "I love machine learning"
tokens = tokenizer(text, return_tensors='pt')

# Show the tokenizer output (header line, then the encoded batch).
print("Tokenized Input:", tokens, sep="\n")

# Forward pass with gradient tracking disabled: we only want the
# per-token hidden states, not to train anything.
with torch.no_grad():
    embeddings = model(**tokens).last_hidden_state

# Report only the shape of the embedding tensor, not its values.
print("\nEmbedding Vectors (Shape):", embeddings.shape)

Tokenized Input:
{'input_ids': tensor([[ 101, 1045, 2293, 3698, 4083,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

Embedding Vectors (Shape): torch.Size([1, 6, 768])


## Embedding

In [7]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [5]:
from sentence_transformers import SentenceTransformer, util

# Pretrained sentence-embedding model (a lightweight, high-performance one).
model = SentenceTransformer('all-MiniLM-L6-v2')

# Sentences to compare: two about rulers of a kingdom, one about the weather.
sentences = [
    "A king is a male ruler of a kingdom.",
    "A queen is a female ruler of a kingdom.",
    "The weather today is sunny and bright.",
]

# One embedding per sentence.
embeddings = model.encode(sentences)

# Cosine similarity of every sentence against every other sentence
# (the embeddings are compared with themselves to get a pairwise matrix).
similarity_matrix = util.cos_sim(embeddings, embeddings)

# Print a header line followed by the matrix.
print("Pairwise Similarity Matrix:", similarity_matrix, sep="\n")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Pairwise Similarity Matrix:
tensor([[1.0000, 0.7328, 0.0671],
        [0.7328, 1.0000, 0.0427],
        [0.0671, 0.0427, 1.0000]])


## Embeddings model fine-tuning

For this part, make sure you change the Colab runtime type to a T4 GPU (Runtime → Change runtime type) so that the fine-tuning does not take forever.

In [1]:
!pip install datasets


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [2]:
from datasets import load_dataset

# Download/load the IMDb dataset; the 'train' and 'test' splits are used below.
dataset = load_dataset('imdb')

# Keep the demo fast: shuffle each split with a fixed seed (so the same rows
# are picked on every run) and keep only the first 2000 / 500 examples.
small_train_dataset = (
    dataset['train']
    .shuffle(seed=42)
    .select(range(2000))
)
small_test_dataset = (
    dataset['test']
    .shuffle(seed=42)
    .select(range(500))
)

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [3]:
from transformers import AutoTokenizer

# Tokenizer that matches the DistilBERT checkpoint fine-tuned later on.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')


def tokenize_function(example):
    """Tokenize the 'text' field of a batch of dataset rows.

    Called through ``Dataset.map(..., batched=True)`` below, so ``example``
    holds a batch of rows rather than a single one. Inputs are truncated and
    padded to the tokenizer's maximum length so every row has the same size.
    """
    return tokenizer(example['text'], truncation=True, padding="max_length")


# Tokenize each split, drop the raw 'text' column (no longer needed once the
# token ids exist) and have the datasets return PyTorch tensors.
tokenized_train = (
    small_train_dataset
    .map(tokenize_function, batched=True)
    .remove_columns(['text'])
    .with_format('torch')
)
tokenized_test = (
    small_test_dataset
    .map(tokenize_function, batched=True)
    .remove_columns(['text'])
    .with_format('torch')
)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [4]:
from transformers import AutoModelForSequenceClassification

# Pretrained DistilBERT body with a 2-label classification head on top.
# The head's weights are not part of the checkpoint, so they are freshly
# initialized and must be trained before the predictions mean anything.
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


You will need to log in to your Weights & Biases (wandb) account so that the training run can be logged there. Once you log in, you will be given a token to paste into the prompt shown by the cell below.

In [5]:
from transformers import TrainingArguments, Trainer

# Training configuration for the fine-tuning run.
training_args = TrainingArguments(
    output_dir='./results',          # Directory to save the model
    # `eval_strategy` is the current name of this option. The older
    # `evaluation_strategy` spelling is deprecated and is rejected with a
    # TypeError by recent transformers releases.
    eval_strategy="epoch",          # Evaluate at the end of each epoch
    learning_rate=2e-5,             # Learning rate
    per_device_train_batch_size=8,  # Batch size for training
    per_device_eval_batch_size=8,   # Batch size for evaluation
    num_train_epochs=3,             # Number of epochs
    weight_decay=0.01,              # Regularization
    logging_dir='./logs',           # Directory for logs
    logging_steps=10,               # Log training metrics every 10 steps
    save_strategy="epoch"           # Save model after every epoch
)

# The Trainer ties together the model, the configuration above and the
# tokenized train/eval splits.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

# Fine-tune the model; as the last expression of the cell, the returned
# training summary is displayed as the cell output.
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss
1,0.2902,0.310921
2,0.2114,0.390879
3,0.0504,0.475531


TrainOutput(global_step=750, training_loss=0.24530404072999953, metrics={'train_runtime': 327.8096, 'train_samples_per_second': 18.303, 'train_steps_per_second': 2.288, 'total_flos': 794804391936000.0, 'train_loss': 0.24530404072999953, 'epoch': 3.0})

In [6]:
# Run evaluation on the eval dataset that was given to the Trainer and keep
# the returned metrics dictionary.
results = trainer.evaluate()

# Print a header line followed by the metrics.
print("Evaluation Results:", results, sep="\n")

Evaluation Results:
{'eval_loss': 0.47553136944770813, 'eval_runtime': 6.9555, 'eval_samples_per_second': 71.885, 'eval_steps_per_second': 9.058, 'epoch': 3.0}
