In [1]:
from torchtext import vocab
from torchtext.legacy.data import Field, TabularDataset, BucketIterator
from torchtext.datasets import AG_NEWS
from torchtext.data.utils import get_tokenizer

train_iter = AG_NEWS(split='train')
tokenizer = get_tokenizer('basic_english')
print(next(train_iter))


def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)

dataset_vocab = vocab.build_vocab_from_iterator(yield_tokens(train_iter))


5211lines [00:00, 26045.03lines/s]

(3, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")


119999lines [00:06, 19464.50lines/s]


In [2]:
print(tokenizer("hello my name is"))

['hello', 'my', 'name', 'is']


In [3]:
def yield_token():
    for i in tokenizer("hello my name is"):
        yield dataset_vocab[i]

some_list = list(yield_token())
print(some_list)
#print(tokenizer.spacy.len())

[12545, 1301, 952, 22]


### Split Dataset

In [7]:
import torch
from torchtext.legacy import data
from torchtext.legacy import datasets


from torchtext.datasets import IMDB
train_iter, test_iter = IMDB(split=('train', 'test'))
print(next(train_iter))

C:\Users\Dicks\Documents\Python\Sequence\Coursera_course\my_coursera\.data\aclImdb_v1.tar.gz: 100%|█| 84.1M/84.1M [00:2


('neg', 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far bet

In [8]:
from torchtext.data.utils import get_tokenizer
tokenizer = get_tokenizer('basic_english')


For more flexibility, users can build the vocabulary directly with the **Vocab class** For example the argument 
- min_freq = cutoff sentence to the vocab.

The special tokens, like <BOS> and <EOS> can be assigned to the special symbols in the constructor of the Vocab class

In [9]:
from collections import Counter
from torchtext.vocab import Vocab

train_iter = IMDB(split='train')
counter = Counter()
for (label, line) in train_iter:
    counter.update(tokenizer(line))
vocab = Vocab(counter, min_freq=10, specials=('<unk>', '<BOS>', '<EOS>', '<PAD>'))

In [10]:
print("The length of the new vocab is", len(vocab))
new_stoi = vocab.stoi
print("The index of '<BOS>' is", new_stoi['<BOS>'])
new_itos = vocab.itos
print("The token at index 2 is", new_itos[2])

The length of the new vocab is 20439
The index of '<BOS>' is 1
The token at index 2 is <EOS>


In [11]:

text_transform = lambda x: [vocab['<BOS>']] + [vocab[token] for token in tokenizer(x)] + [vocab['<EOS>']]
label_transform = lambda x: 1 if x == 'pos' else 0

# Print out the output of text_transform
print("input to the text_transform:", "here is an example")
print("output of the text_transform:", text_transform("here is an example"))

input to the text_transform: here is an example
output of the text_transform: [1, 134, 12, 43, 467, 2]


In [14]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import random

train_iter = IMDB(split='train')
train_list = list(train_iter)
batch_size = 8  # A batch size of 8

def collate_batch(batch):
   label_list, text_list = [], []
   for (_label, _text) in batch:
        label_list.append(label_transform(_label))
        processed_text = torch.tensor(text_transform(_text))
        text_list.append(processed_text)
   return torch.tensor(label_list), pad_sequence(text_list, padding_value=3.0)

def batch_sampler():
    indices = [(i, len(tokenizer(s[1]))) for i, s in enumerate(train_list)]
    random.shuffle(indices)
    pooled_indices = []
    # create pool of indices with similar lengths 
    for i in range(0, len(indices), batch_size * 100):
        pooled_indices.extend(sorted(indices[i:i + batch_size * 100], key=lambda x: x[1]))

    pooled_indices = [x[0] for x in pooled_indices]

    # yield indices for current batch
    for i in range(0, len(pooled_indices), batch_size):
        yield pooled_indices[i:i + batch_size]

bucket_dataloader = DataLoader(train_list, batch_sampler=batch_sampler(),
                               collate_fn=collate_batch)

#print(next(iter(bucket_dataloader)), next(iter(bucket_dataloader)).shape)

AttributeError: 'tuple' object has no attribute 'shape'

In [23]:
#print(next(iter(bucket_dataloader)))
test = next(iter(bucket_dataloader))
print(test[0], test[1].shape)

tensor([0, 0, 0, 0, 0, 0, 1, 0]) torch.Size([113, 8])


In [25]:
test = next(iter(bucket_dataloader))
print(test[0], test[1].shape)

tensor([0, 0, 0, 1, 1, 1, 1, 0]) torch.Size([127, 8])


In [26]:
test = next(iter(bucket_dataloader))
print(test[0], test[1].shape)

tensor([1, 0, 0, 1, 1, 1, 1, 0]) torch.Size([130, 8])


In [27]:
test = next(iter(bucket_dataloader))
print(test[0], test[1].shape)

tensor([1, 0, 0, 1, 0, 0, 0, 0]) torch.Size([132, 8])


# Following Pytorch tutorial in this youtube link
https://www.youtube.com/watch?v=InUqeaOSPpA&list=RDCMUCkzW5JSFwvKRjXABI-UTAkQ&index=2 

In [45]:
import spacy
from torchtext.datasets import IWSLT2017
train_data, valid_data, test_data = IWSLT2017(language_pair=('en','de'))


In [46]:
#python -m spacy download de_dep_news_trf
#python -m spacy download en_core_web_trf
#python -m spacy download fr_dep_news_trf
#python -m spacy download zh_core_web_trf
dict_spacy = {'en':'en_core_web_trf', 'de':'de_dep_news_trf', 'fr':'fr_dep_news_trf', 'cn': 'zh_core_web_trf', 
             'cat': 'ca_core_news_trf'}
# https://spacy.io/usage/models
# english, german, french, chinese, catalan
spacy_eng = spacy.load(dict_spacy['en'])
spacy_ger = spacy.load(dict_spacy['de'])

OSError: [E050] Can't find model 'en_core_web_trf'. It doesn't seem to be a Python package or a valid path to a data directory.