In [16]:
import collections

import datasets
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
import tqdm
# Print the installed library versions so the environment is recorded alongside the outputs.
print("torchtext version : " , torchtext.__version__)
print("torch version : ",torch.__version__)

torchtext version :  0.18.0+cpu
torch version :  2.3.0+cpu


In [17]:
# Load the "train" and "test" splits of the "imdb" dataset.
train_data, test_data = datasets.load_dataset("imdb", split=["train", "test"])

In [18]:
# Display both splits to check their columns and row counts.
train_data,test_data

(Dataset({
     features: ['text', 'label'],
     num_rows: 25000
 }),
 Dataset({
     features: ['text', 'label'],
     num_rows: 25000
 }))

In [19]:
# Inspect the column types of the training split.
train_data.features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['neg', 'pos'], id=None)}

In [20]:
# Look at the first training example.
train_data[0]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [21]:
# Installation note: `pip install torchtext==0.4` resolved an issue when trying to import this module.
# NOTE(review): the version printed in the first cell is 0.18.0 — confirm whether this note still applies.
from torchtext.data import get_tokenizer
tokenizer = get_tokenizer("basic_english")


In [22]:
# Example use of the tokenizer:

tokenizer("Hello world ! How are you doing today")

['hello', 'world', '!', 'how', 'are', 'you', 'doing', 'today']

In [23]:
def tokenize_example(example, tokenizer, max_length):
    """Tokenize one example's text and keep at most ``max_length`` tokens.

    Args:
        example: Mapping with a ``"text"`` entry holding the raw string.
        tokenizer: Callable that turns a string into a sequence of tokens.
        max_length: Maximum number of leading tokens to keep.

    Returns:
        A dict with a single ``"tokens"`` key holding the truncated tokens.
    """
    all_tokens = tokenizer(example["text"])
    truncated = all_tokens[:max_length]
    return dict(tokens=truncated)

In [24]:
max_length = 256

# Tokenize both splits with identical settings; `tokenize_example` keeps at
# most `max_length` tokens per review.
train_data, test_data = (
    split.map(
        tokenize_example,
        fn_kwargs={"tokenizer": tokenizer, "max_length": max_length},
    )
    for split in (train_data, test_data)
)

In [25]:
# The training split after the tokenization `map` call.
train_data

Dataset({
    features: ['text', 'label', 'tokens'],
    num_rows: 25000
})

In [26]:
# Inspect the column types again, now including the "tokens" column.
train_data.features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['neg', 'pos'], id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

In [27]:
# First 25 tokens of the first training example.
train_data[0]["tokens"][:25]

['i',
 'rented',
 'i',
 'am',
 'curious-yellow',
 'from',
 'my',
 'video',
 'store',
 'because',
 'of',
 'all',
 'the',
 'controversy',
 'that',
 'surrounded',
 'it',
 'when',
 'it',
 'was',
 'first',
 'released',
 'in',
 '1967',
 '.']

In [28]:
# Hold out a fraction of the training set as a validation set.
# The validation set is used to evaluate the model during development, while the
# test set is reserved for the final evaluation only, so that modelling choices
# are never tuned against the test data.
test_size = 0.25
split_seed = 1234  # fixed seed so the train/validation split is identical on every run

train_valid_data = train_data.train_test_split(test_size=test_size, seed=split_seed)
train_data = train_valid_data["train"]
valid_data = train_valid_data["test"]

In [29]:
# Sizes of the training, validation and test splits:
len(train_data),len(valid_data), len(test_data)


(18750, 6250, 25000)

In [34]:
from torchtext.vocab import build_vocab_from_iterator

# Vocabulary settings: the special tokens and the minimum-frequency threshold
# passed to the vocabulary builder.
special_token = ["<unk>", "<pad>"]
min_freq = 5

# Build the vocabulary from the tokenized training split only.
vocab = build_vocab_from_iterator(
    train_data["tokens"], specials=special_token, min_freq=min_freq
)



In [35]:
# Number of entries in the vocabulary.
len(vocab)

21498

In [40]:
# First ten entries of the vocabulary's index-to-token list.
vocab.get_itos()[:10]

['<unk>', '<pad>', 'the', '.', ',', 'a', 'and', 'of', 'to', "'"]

In [42]:
# Look up the indices of the two special tokens.
unk_index = vocab["<unk>"]
pad_index = vocab["<pad>"]

# Without a default index, looking up a token that is not in the vocabulary
# raises an error; map such tokens to <unk> instead.
vocab.set_default_index(unk_index)

print(unk_index)
print(pad_index)

0
1


In [43]:
# We can check whether a token is in our vocabulary using the `in` operator.

"some_token" in vocab

False

In [None]:
def numericalize_example(example, vocab):
    """Convert one example's tokens into vocabulary indices.

    Args:
        example: Mapping with a ``"tokens"`` entry holding the tokens to look up.
        vocab: Object exposing ``lookup_indices(tokens)``.

    Returns:
        A dict with a single ``"ids"`` key holding the result of the lookup.
    """
    token_ids = vocab.lookup_indices(example["tokens"])
    return dict(ids=token_ids)