### HuggingFace - Tokenizers


---


In [2]:
import pandas as pd
from datasets import load_dataset
from transformers import DistilBertTokenizer

# Load the dataset registered under the name "imdb".
dataset = load_dataset(path="imdb")

# Checkpoint identifier (a plain string) used to fetch the matching tokenizer.
model = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = DistilBertTokenizer.from_pretrained(model)


# Tokenization
def tokenize_function(examples):
    """Run the notebook-level ``tokenizer`` over the ``"text"`` entry of ``examples``.

    Args:
        examples: Mapping that provides a ``"text"`` key. It is passed to
            ``Dataset.map(..., batched=True)`` below, so this is presumably
            a batch of several texts at once.

    Returns:
        Whatever ``tokenizer`` returns for those texts when called with
        ``padding=True`` and ``truncation=True``.
    """
    texts = examples["text"]
    # NOTE(review): padding=True presumably pads to the longest text of each
    # call (i.e. each map batch), so rows from different batches may end up
    # with different lengths -- confirm against the tokenizer documentation.
    encode_options = {"padding": True, "truncation": True}
    return tokenizer(texts, **encode_options)


# Apply tokenize_function to the "train" split; batched=True is forwarded to map().
tokenized_train = dataset["train"].map(function=tokenize_function, batched=True)

# Materialise the tokenized split as a pandas DataFrame for inspection.
# NOTE(review): Dataset.to_pandas() may be a faster route -- confirm the
# resulting cell types are acceptable before switching.
df = pd.DataFrame(data=tokenized_train)

In [3]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() would append a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   text            25000 non-null  object
 1   label           25000 non-null  int64 
 2   input_ids       25000 non-null  object
 3   attention_mask  25000 non-null  object
dtypes: int64(1), object(3)
memory usage: 781.4+ KB
None


In [4]:
# Plain-text preview of the first five rows.
print(df.head(5))

                                                text  label  \
0  I rented I AM CURIOUS-YELLOW from my video sto...      0   
1  "I Am Curious: Yellow" is a risible and preten...      0   
2  If only to avoid making this type of film in t...      0   
3  This film was probably inspired by Godard's Ma...      0   
4  Oh, brother...after hearing about this ridicul...      0   

                                           input_ids  \
0  [101, 1045, 12524, 1045, 2572, 8025, 1011, 375...   
1  [101, 1000, 1045, 2572, 8025, 1024, 3756, 1000...   
2  [101, 2065, 2069, 2000, 4468, 2437, 2023, 2828...   
3  [101, 2023, 2143, 2001, 2763, 4427, 2011, 2643...   
4  [101, 2821, 1010, 2567, 1012, 1012, 1012, 2044...   

                                      attention_mask  
0  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...  
1  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...  
2  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...  
3  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...  
4  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...  

In [5]:
# First five values of the "label" column.
print(df["label"].head(5))

0    0
1    0
2    0
3    0
4    0
Name: label, dtype: int64


In [6]:
# First five values of the "input_ids" column.
print(df["input_ids"].head(5))

0    [101, 1045, 12524, 1045, 2572, 8025, 1011, 375...
1    [101, 1000, 1045, 2572, 8025, 1024, 3756, 1000...
2    [101, 2065, 2069, 2000, 4468, 2437, 2023, 2828...
3    [101, 2023, 2143, 2001, 2763, 4427, 2011, 2643...
4    [101, 2821, 1010, 2567, 1012, 1012, 1012, 2044...
Name: input_ids, dtype: object


In [7]:
# First five values of the "attention_mask" column.
print(df["attention_mask"].head(5))

0    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
1    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
2    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
3    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
4    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
Name: attention_mask, dtype: object
