In [49]:
tokenizer = DistilBertTokenizer.from_pretrained(
    "distilbert-base-uncased"
)

MAX_LENGTH = 256

def preprocess_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )

encoded_dataset = dataset.map(preprocess_function, batched=True)


In [50]:
encoded_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
})

In [48]:
from datasets import load_dataset

dataset = load_dataset(
    "imdb",
    split={"train": "train", "test": "test"}
)

In [47]:
dataset = dataset.remove_columns("unsupervised")

ValueError: Column name ['unsupervised'] not in the dataset. Current columns in the dataset: ['text', 'label']

In [42]:
def token_length(example):
    return len(tokenizer(example["text"])["input_ids"])

lengths = dataset["train"].map(token_length)
print(max(lengths))

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]


TypeError: Provided `function` which is applied to all elements of table returns a variable of type <class 'int'>. Make sure provided `function` returns a variable of type `dict` (or a pyarrow table) to update the dataset or `None` if you are only interested in side effects.

In [15]:
dataset.values()

dict_values([Dataset({
    features: ['text', 'label'],
    num_rows: 25000
}), Dataset({
    features: ['text', 'label'],
    num_rows: 25000
}), Dataset({
    features: ['text', 'label'],
    num_rows: 50000
})])

In [1]:
import torch
import numpy as np

x = torch.randn(3)
print(x.numpy())

[1.0867946 1.532958  0.1886419]


In [10]:
from transformers import pipeline
import numpy

classifier = pipeline(
    task="text-classification",
    model="distilbert-base-uncased-finetuned-sst-2-english"
)

result = classifier("I love using Hugging Face Transformers!")
print(result)

Device set to use cpu


[{'label': 'POSITIVE', 'score': 0.9971315860748291}]


In [25]:
from transformers import AutoTokenizer

# Load pre-trained BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

sentence = "I love NLP."
tokens = tokenizer.tokenize(sentence)
print(tokens)

['I', 'love', 'NL', '##P', '.']


In [26]:
token_ids = tokenizer.encode(sentence)
print(token_ids)

[101, 146, 1567, 21239, 2101, 119, 102]


In [31]:
unknown_word = "unbelievable"
tokens = tokenizer.tokenize(unknown_word)
print(tokens)

decoded_sentence = tokenizer.decode(token_ids)
print(decoded_sentence)

['un', '##believable']
[CLS] I love NLP. [SEP]


In [32]:
sentences = [
    "BERT is great.",
    "Transformers excel in NLP.",
    "Tokenization matters."
]

batch = tokenizer(sentences, padding=True)
print(batch)

{'input_ids': [[101, 139, 9637, 1942, 1110, 1632, 119, 102, 0], [101, 25267, 4252, 18389, 1107, 21239, 2101, 119, 102], [101, 1706, 6378, 2734, 5218, 119, 102, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 0, 0]]}


In [40]:
for ids in batch["input_ids"]:
    print(tokenizer.decode(ids))

[CLS] BERT is great. [SEP] [PAD]
[CLS] Transformers excel in NLP. [SEP]
[CLS] Tokenization matters. [SEP] [PAD] [PAD]


In [29]:
from transformers import DistilBertTokenizer

def preprocess_function(examples, max_length):
    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

sentence = "This is an example sentence for tokenization."
max_length = 10
example = {"text": sentence}
tokenized_output = preprocess_function(example, max_length)
print(tokenized_output)


{'input_ids': [101, 2023, 2003, 2019, 2742, 6251, 2005, 19204, 3989, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
