# Outline

Here is the implementation including tokenize, embedding and simple training for given text.

In [51]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
import re
from collections import  Counter
import random

random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7f879908de10>

In [None]:
class Tokenizer:
    def __init__(self, texts, min_freq=1, max_vocab_size=10000):
        self.texts = texts
        self.min_freq = min_freq
        self.max_vocab_size = max_vocab_size
        self.vocab = self.build_vocab()

    def preprocess(self, text):
        text = text.lower()
        # keep only a-z, A-Z, 0-9, whitespace
        text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
        return text.split()
    
    def build_vocab(self):
        word_counts = Counter()
        for text in self.texts:
            tokens = self.preprocess(text)
            word_counts.update(tokens)
        
        vocab = {"<pad>": 0, "<unk>": 1}

        """
        `most_common()` returns words from most frequent â†’ least frequent.
        Example:
        [("hello", 2), ("world", 1), ("Jacy", 1)]
        This ensures high-frequency words get smaller IDs (useful for models).
        """
        for word, freq in word_counts.most_common(self.max_vocab_size-2): # 2 for <pad> and <unk>
            if freq >= self.min_freq:
                vocab[word] = len(vocab)
        
        """
        Ex:
        vocab = {
            "<pad>": 0,
            "<unk>": 1,
            "hello": 2,
            "world": 3,
            "sean": 4
        }
        """
        return vocab

    def text_to_sequence(self, text):
        tokens = self.preprocess(text)
        [""]
        return [
            self.vocab.get(token, self.vocab["<unk>"]) for token in tokens
        ]
    
    def pad_sequences(self, sequences, pad_value=0):
        # convert each list to a tensor
        tensors = [torch.tensor(seq, dtype=torch.long) for seq in sequences]
        # pad to same length
        padded = pad_sequence(tensors, batch_first=True, padding_value=pad_value)
        return padded
    
    def tokenize_all_text(self):
        tokenized_seq = []
        for text in self.texts:
            tokenized_seq.append(self.text_to_sequence(text))
        padded_tensor = self.pad_sequences(tokenized_seq, pad_value=self.vocab["<pad>"])
        return padded_tensor

In [33]:
texts = [
    "The quick brown fox jumps over the lazy dog.",
    "PyTorch is widely used for deep learning tasks and neural networks.",
    "Tokenizers help convert raw text into numerical representations.",
    "Machine learning models rely on large datasets to generalize well.",
    "The fox and the dog became friends after many adventures.",
]


In [64]:
tokenizer = Tokenizer(texts)
tokenizer.text_to_sequence("The quick brown for jumps over the lazy dog")

[2, 7, 8, 16, 9, 10, 2, 11, 4]

In [65]:
text_seq = tokenizer.tokenize_all_text()
text_seq

Ddd: 0


tensor([[ 2,  7,  8,  3,  9, 10,  2, 11,  4,  0,  0],
        [12, 13, 14, 15, 16, 17,  5, 18,  6, 19, 20],
        [21, 22, 23, 24, 25, 26, 27, 28,  0,  0,  0],
        [29,  5, 30, 31, 32, 33, 34, 35, 36, 37,  0],
        [ 2,  3,  6,  2,  4, 38, 39, 40, 41, 42,  0]])

In [47]:
VOCAB_SIZE = len(tokenizer.vocab)
EMB_DIM = 8
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE

device(type='cuda')

build embedding

In [48]:
class TextEmbedding(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
    
    def forward(self, x):
        return self.embedding(x)

In [66]:
embedding_layer = TextEmbedding(VOCAB_SIZE, EMB_DIM).to(DEVICE)
embedding_layer

TextEmbedding(
  (embedding): Embedding(43, 8)
)

In [67]:
text_tensor = torch.tensor(text_seq, dtype=torch.long).to(DEVICE)

  text_tensor = torch.tensor(text_seq, dtype=torch.long).to(DEVICE)
