## Vocab

In [2]:
from torchtext.vocab import vocab
from collections import Counter, OrderedDict

In [7]:
# Tally token occurrences: "a" is seen twice, then "b" three times.
counter = Counter(["a"] * 2 + ["b"] * 3)
# Show the Counter next to its (token, count) item view.
counter, counter.items()

(Counter({'b': 3, 'a': 2}), dict_items([('a', 2), ('b', 3)]))

In [8]:
# (token, count) pairs ordered from most to least frequent; ties keep
# their original insertion order because the underlying sort is stable.
sorted_by_freq_tuples = counter.most_common()
sorted_by_freq_tuples

[('b', 3), ('a', 2)]

In [11]:
# Preserve the frequency ordering in a token -> count mapping.
ordered_dict = OrderedDict(
    (token, freq) for token, freq in sorted_by_freq_tuples
)
ordered_dict

OrderedDict([('b', 3), ('a', 2)])

In [26]:
# Build a Vocab from the frequency-ordered mapping.
v1 = vocab(ordered_dict)

# Print the index assigned to each token, one per line.
for token in ("a", "b"):
    print(v1[token])

1
0


In [27]:
# v1 has no default index, so looking up an unknown token raises RuntimeError.
# The error is the point of this cell: catch it and print it so the
# demonstration does not abort a "Restart & Run All" execution.
try:
    print(v1['out of vocab'])
except RuntimeError as err:
    print(f"{type(err).__name__}: {err}")

RuntimeError: Token out of vocab not found and default index is not set

In [14]:
# Five single-character tokens, in reverse alphabetical order.
tokens = list("edcba")
# The special unknown-word token and the index later installed as the
# vocab's default index.
unk_token, default_index = "<unk>", -1

In [22]:
# Give every token a frequency of 1 and register the unknown-word special.
v2 = vocab(
    OrderedDict.fromkeys(tokens, 1),
    specials=[unk_token],
    min_freq=1,
)
# From here on, unknown tokens resolve to default_index.
v2.set_default_index(default_index)
print(v2['<unk>']) #prints 0
print(v2['out of vocab']) #prints -1

0
-1


In [16]:
#make default index same as index of unk_token
v2.set_default_index(v2[unk_token])
# Compare the two indices by value. `is` tests object identity, which only
# holds for ints through CPython's small-integer cache.
v2['out of vocab'] == v2[unk_token] #prints True

True

---

## build_vocab_from_iterator

Build a `Vocab` from an iterator that yields lists of tokens.

In [36]:
#generating vocab from text file
import io
from torchtext.vocab import build_vocab_from_iterator
def yield_tokens(file_path):
    """Lazily yield the whitespace-separated tokens of each line of a text file.

    The file at ``file_path`` is read as UTF-8. Each line is split on runs
    of whitespace, echoed to stdout as ``val=[...]``, and then yielded as a
    list of strings. A blank line yields an empty list.
    """
    with open(file_path, encoding="utf-8") as handle:
        for raw_line in handle:
            # split() with no separator already ignores leading and trailing
            # whitespace, so no separate strip() is needed.
            val = raw_line.split()
            print(f"{val=}")
            yield val

In [40]:
# NOTE(review): this rebinds `vocab`, shadowing the `vocab` factory imported
# from torchtext.vocab at the top of the notebook. Re-running an earlier
# `vocab(...)` cell after this one would no longer call that factory.
# Consider renaming this variable here and in the cells that use it.
# special_first=False places "<unk>" after the regular tokens.
vocab = build_vocab_from_iterator(
    yield_tokens("hello.txt"),
    specials=["<unk>"],
    special_first=False,
)

vocab

val=['My', 'name', 'is', 'Deependu.']
val=["I'm", 'a', 'NLP', '&', 'Cloud', 'Engineer.']
val=["I'm", 'a', 'tech', 'enthusiast', 'and', 'a', 'lifelong', 'learner.']
val=["I'm", 'willing', 'to', 'join', 'a', 'small', 'team', 'of', 'people', 'who', 'are', 'passionate', 'about', 'what', 'they', 'do', 'and', 'are', 'looking', 'to', 'make', 'a', 'positive', 'impact', 'on', 'the', 'world.']
val=['A', 'big', 'firm', 'is', 'not', 'my', 'cup', 'of', 'tea.', "I'm", 'looking', 'for', 'a', 'small', 'team', 'of', 'people', 'who', 'are', 'passionate', 'about', 'what', 'they', 'do', 'and', 'are', 'looking', 'to', 'make', 'a', 'positive', 'impact', 'on', 'the', 'world.']


Vocab()

In [41]:
# Indices of a token seen once, the most frequent token, and the <unk> special.
tuple(vocab[token] for token in ('Deependu.', 'a', '<unk>'))

(26, 0, 44)

In [44]:
# Print every index alongside its token. len(vocab) replaces the hard-coded
# 45 so the loop neither raises nor truncates when hello.txt (and with it
# the vocab size) changes.
for i in range(len(vocab)):
    print(f"{i=}; {vocab.lookup_token(i)}")

i=0; a
i=1; I'm
i=2; are
i=3; and
i=4; looking
i=5; of
i=6; to
i=7; about
i=8; do
i=9; impact
i=10; is
i=11; make
i=12; on
i=13; passionate
i=14; people
i=15; positive
i=16; small
i=17; team
i=18; the
i=19; they
i=20; what
i=21; who
i=22; world.
i=23; &
i=24; A
i=25; Cloud
i=26; Deependu.
i=27; Engineer.
i=28; My
i=29; NLP
i=30; big
i=31; cup
i=32; enthusiast
i=33; firm
i=34; for
i=35; join
i=36; learner.
i=37; lifelong
i=38; my
i=39; name
i=40; not
i=41; tea.
i=42; tech
i=43; willing
i=44; <unk>
