In [3]:
import re

class SimpleTokenizer:
    """Word-level tokenizer that maps words to integer ids.

    Words are maximal runs of ``\\w`` characters, so punctuation and
    whitespace act as separators and are never emitted as tokens.
    Matching is case-sensitive ("Hello" and "hello" get different ids).

    Attributes:
        token_to_id: dict mapping each known word to its integer id.
            Ids 0 and 1 are reserved for "[PAD]" and "[UNK]".
        num_tokens: the next id to hand out (equals the vocabulary size).
    """

    # Reserved ids; they match the initial contents of ``token_to_id``.
    PAD_ID = 0
    UNK_ID = 1

    def __init__(self):
        self.token_to_id = {"[PAD]": self.PAD_ID, "[UNK]": self.UNK_ID}
        self.num_tokens = 2

    # Fit the tokenizer onto the dataset
    def fit(self, texts):
        """Add every previously unseen word in ``texts`` to the vocabulary.

        Ids are assigned in first-seen order. Calling ``fit`` again only
        appends new words; existing ids never change.

        Args:
            texts: an iterable of strings, or a single string (treated as
                one document).

        Returns:
            None. The vocabulary is updated in place; use ``tokenize`` to
            obtain ids.
        """
        # Iterating a bare str would walk it character by character and add
        # single letters/digits to the vocabulary, so wrap it in a list first.
        if isinstance(texts, str):
            texts = [texts]
        for text in texts:
            for word in re.findall(r'\w+', text):
                if word not in self.token_to_id:
                    self.token_to_id[word] = self.num_tokens
                    self.num_tokens += 1

    def tokenize(self, text):
        """Convert ``text`` into a list of token ids.

        Args:
            text: the string to encode.

        Returns:
            A list with one id per word found in ``text``; words that are
            not in the vocabulary map to ``UNK_ID``. An input without any
            word characters yields an empty list.
        """
        return [
            self.token_to_id.get(word, self.UNK_ID)
            for word in re.findall(r'\w+', text)
        ]

In [18]:
import torch
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    """Dataset whose items are raw text pieces read from one UTF-8 file.

    Args:
        filename: path of the text file to load.
        is_multi_line: when true, every line of the file becomes one item
            (as returned by ``readlines()``, so line endings are kept).
            Otherwise the file is cut into consecutive fixed-size pieces.
        chunk_size: size passed to ``read()`` for each piece when
            ``is_multi_line`` is false; the final piece may be shorter.

    Attributes:
        chunks: list of the loaded strings, in file order.
    """

    def __init__(self, filename, is_multi_line=True, chunk_size=1024):
        self.chunks = []
        with open(filename, 'r', encoding="utf-8") as handle:
            if is_multi_line:
                self.chunks = handle.readlines()
            else:
                # Two-argument iter(): call read() repeatedly and stop as
                # soon as it returns the empty string, i.e. at end of file.
                read_piece = lambda: handle.read(chunk_size)
                self.chunks.extend(iter(read_piece, ''))

    def __len__(self):
        """Return the number of loaded text pieces."""
        pieces = self.chunks
        return len(pieces)

    def __getitem__(self, idx):
        """Return the text piece stored at position ``idx``."""
        pieces = self.chunks
        return pieces[idx]

In [19]:
# Example usage: learn a small vocabulary from two sentences, then encode
# the first sentence with it.
texts = ["Hello world!", "How are you?"]
tokenizer = SimpleTokenizer()
tokenizer.fit(texts)
tokenized_text = tokenizer.tokenize(texts[0])
print(tokenized_text)  # Output: [2, 3]

[2, 3]


In [22]:
# Usage: load the corpus as fixed-size chunks, build the vocabulary from all
# of them, then encode the first shuffled batch as a sanity check.
dataset = TextDataset('datasets/iac_mini.txt', is_multi_line=False)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

texts = dataset.chunks
tokenizer.fit(texts)

# Process data in batches (only the first batch, to keep the output short)
for batch in dataloader:
    for line in batch:
        # tokenize() returns the list of ids. fit() has no return value and
        # only mutates the vocabulary, so it must not be used here.
        tokens = tokenizer.tokenize(line)
        print(f"Original text: {line}")
        print(f"Tokenized text: {tokens}")
    break

Original text: t out of his mind. The ledge turned into the cliff and became a tunnel that had been carved into solid stone. The walls of the tunnel were as smooth as polished marble. What tools could men have used in the old days to cut a tunnel with walls so smooth that they looked like glass? Modern equipment could not have done the job so well. Niches in the wall of the tunnel admitted light and gave them glimpses of the island. "Where the hell will we find -- Oh, Pedro!" Retch spoke. The Indian messenger of the night before had appeared in the tunnel. He beckoned to them. They followed him into a large room cut out of solid stone. It was one of the cleanest and most simply furnished rooms Parker had ever seen. It contained handmade chairs along the wall and a big table, also hand-made. Light from a wall slit flowed into the room. Seated behind the table, illumined by tile light flowing in from the wall slit behind them, were Rozeno and Ulnar. Rozeno had a thin nose, the narrow fac