# 🧠 ReplicateAI Demo Notebook — Attention Is All You Need (2017)
Use this notebook to test model implementation, visualize metrics, and document key observations.

## test multi30k dataset

In [1]:
from datasets import load_dataset

ds = load_dataset("bentrevett/multi30k")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ds

DatasetDict({
    train: Dataset({
        features: ['en', 'de'],
        num_rows: 29000
    })
    validation: Dataset({
        features: ['en', 'de'],
        num_rows: 1014
    })
    test: Dataset({
        features: ['en', 'de'],
        num_rows: 1000
    })
})

In [3]:
ds['train']

Dataset({
    features: ['en', 'de'],
    num_rows: 29000
})

In [8]:
ds['validation']

Dataset({
    features: ['en', 'de'],
    num_rows: 1014
})

In [9]:
ds['test']

Dataset({
    features: ['en', 'de'],
    num_rows: 1000
})

## Test Tiktoken

In [10]:
import tiktoken
enc = tiktoken.get_encoding("gpt2")

# Encode sample pairs
for example in ds['train'].select(range(10)):
    en_tokens = enc.encode(example['en'])
    de_tokens = enc.encode(example['de'])

    print(f"EN: {example['en']}")
    print(f"DE: {example['de']}")
    print(f"EN tokens: {en_tokens} | DE tokens: {de_tokens}\n")

EN: Two young, White males are outside near many bushes.
DE: Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.
EN tokens: [7571, 1862, 11, 2635, 10835, 389, 2354, 1474, 867, 37413, 13] | DE tokens: [57, 42990, 10891, 469, 356, 72, 39683, 68, 337, 11033, 77, 1008, 264, 521, 545, 4848, 2013, 287, 4587, 399, 11033, 258, 410, 8207, 263, 347, 9116, 15952, 13]

EN: Several men in hard hats are operating a giant pulley system.
DE: Mehrere Männer mit Schutzhelmen bedienen ein Antriebsradsystem.
EN tokens: [14945, 1450, 287, 1327, 23910, 389, 5361, 257, 6175, 17472, 1636, 1080, 13] | DE tokens: [5308, 71, 260, 260, 337, 11033, 77, 1008, 10255, 3059, 27839, 2978, 3653, 3996, 2013, 268, 304, 259, 3738, 5034, 1443, 6335, 10057, 13]

EN: A little girl climbing into a wooden playhouse.
DE: Ein kleines Mädchen klettert in ein Spielhaus aus Holz.
EN tokens: [32, 1310, 2576, 14281, 656, 257, 13510, 711, 4803, 13] | DE tokens: [36, 259, 479, 293, 1127, 337, 11033, 67, 6607, 479, 9291, 83

## Test Tokenizers

In [5]:
from tokenizers import Tokenizer
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE())

from tokenizers.trainers import BpeTrainer
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

tokenizer.train_from_iterator(ds['train']['en'], trainer)






In [13]:
output = tokenizer.encode(ds['train']['en'][1])

In [14]:
output.tokens


['Several men in ',
 'hard hats are ',
 'operating a ',
 'giant ',
 'pulley ',
 'system.']

In [15]:
output.ids

[9099, 18266, 6291, 2529, 16857, 29072]