<a href="https://colab.research.google.com/github/dhruv20047228/Pytorch-Notebooks/blob/main/tokenizer_practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from importlib.metadata import version

# Report the installed package versions. Plain string literals are used
# because the labels contain nothing to interpolate.
print("Torch version: ", version("torch"))
print("tiktoken version: ", version("tiktoken"))

In [None]:
import os
import requests

In [None]:
# Fetch the sample text once; the download is skipped when the file is
# already present on disk.
file_path = "the-verdict.txt"
if not os.path.exists(file_path):
    url = (
        "https://raw.githubusercontent.com/rasbt/"
        "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
        "the-verdict.txt"
    )
    response = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of saving an error page as the text.
    response.raise_for_status()
    with open(file_path, "wb") as f:
        f.write(response.content)

In [None]:
# Read the whole file into memory, then report its length and a short preview.
with open("the-verdict.txt", encoding="utf-8") as f:  # read-only text mode is the default
    raw_text = f.read()

print("Total number of character: ", len(raw_text))
print(raw_text[0:99])


In [None]:
import re

# Split on each single whitespace character; the capturing group makes
# re.split keep the separators in the returned list.
text = "Hello World, This is a test."
result = re.split(pattern=r'(\s)', string=text)

print(result)

In [None]:
# Drop the empty and whitespace-only entries from the split result
result = [piece for piece in result if piece.strip()]
print(result)

In [None]:
text = "Hello, world. Is this a test?"  # sample sentence containing punctuation

# Split on punctuation, "--" and whitespace; strip every piece and keep only
# the ones that are non-empty afterwards.
result = [
    stripped
    for stripped in map(str.strip, re.split(r'([,.:;?_!"()\']|--|\s)', text))
    if stripped
]
print(result)

In [None]:
# Tokenize the full text: split on punctuation, "--" and whitespace, strip
# each piece, and discard the pieces that end up empty.
preprocessed = [
    token
    for token in map(str.strip, re.split(r'([,.:;?_!"()\']|--|\s)', raw_text))
    if token
]
print(preprocessed[0:30])

In [None]:
# Number of entries (repeats included) in the preprocessed token list
print(len(preprocessed))

In [None]:
# Unique tokens plus an "[UNK]" placeholder entry, in sorted order
all_words = sorted(set(preprocessed) | {"[UNK]"})
vocab_size = len(all_words)

print(vocab_size)

In [None]:
# Map every token to its position in the sorted word list
vocab = dict(zip(all_words, range(len(all_words))))

In [None]:
# Preview the first 51 (token, id) pairs of the vocabulary
for i, item in enumerate(vocab.items()):
    print(item)
    if i == 50:
        break


In [None]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed token-to-id vocabulary.

    Tokens missing from the vocabulary are not handled: ``encode`` raises
    ``KeyError`` for them.
    """

    def __init__(self, vocab):
        """Keep ``vocab`` (token -> id) and build the reverse id -> token map."""
        self.str_to_int = vocab
        self.int_to_str = {}
        for token, token_id in vocab.items():
            self.int_to_str[token_id] = token

    def encode(self, text):
        """Split ``text`` into tokens and return the list of their ids."""
        token_ids = []
        for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
            token = piece.strip()
            if token:
                token_ids.append(self.str_to_int[token])
        return token_ids

    def decode(self, ids):
        """Join the tokens for ``ids`` with spaces and tidy punctuation spacing."""
        joined = " ".join(self.int_to_str[i] for i in ids)
        # Remove the whitespace the join placed in front of these punctuation marks.
        return re.sub(r'\s+([,.?!"()\'])', r'\1', joined)

In [None]:
# Build a tokenizer over the current vocabulary
tokenizer = SimpleTokenizerV1(vocab)

# Two-line sample passage; the literal keeps the newline and the indentation
text = """"It's the last he painted, you know,"
           Mrs. Gisburn said with pardonable pride."""
# Convert the passage into a list of token ids
ids = tokenizer.encode(text)
print(ids)

In [None]:
# Turn the token ids back into text
tokenizer.decode(ids)

In [None]:
# Round trip: encode the passage and decode the result in one step
tokenizer.decode(tokenizer.encode(text))

In [None]:
tokenizer = SimpleTokenizerV1(vocab)

text = "Hello, do you like tea. Is this-- a test?"

# NOTE: SimpleTokenizerV1.encode looks each token up directly in the vocabulary,
# so this raises KeyError if any token here (e.g. "Hello") is absent from it.
tokenizer.encode(text)

In [None]:
# Unique tokens in sorted order, followed by the two special tokens
all_tokens = sorted(set(preprocessed)) + ["<|endoftext|>", "<|unk|>"]

# Token -> integer id, following the order of all_tokens
vocab = {tok: idx for idx, tok in enumerate(all_tokens)}

In [None]:
# Number of entries in the vocabulary
len(vocab)

In [None]:
# Show the last five (token, id) pairs of the vocabulary. The index from
# enumerate was never used, so iterate over the slice directly.
for item in list(vocab.items())[-5:]:
    print(item)

In [None]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    Args:
        vocab: Mapping from token string to integer id. It must contain the
            ``"<|unk|>"`` token, which ``encode`` substitutes for every token
            that is not in the mapping.
    """

    def __init__(self, vocab):
        self.str_to_int = vocab
        # Invert the mapping pair by pair so that each id points to its token.
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Split ``text`` into tokens and return their ids.

        Tokens that are not in the vocabulary are replaced by ``"<|unk|>"``
        before the lookup.

        Raises:
            KeyError: If an unknown token occurs and the vocabulary has no
                ``"<|unk|>"`` entry.
        """
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [
            item if item in self.str_to_int else "<|unk|>"
            for item in preprocessed
        ]
        return [self.str_to_int[s] for s in preprocessed]

    def decode(self, ids):
        """Return the text for ``ids``: tokens joined by single spaces.

        Raises:
            KeyError: If an id is not present in the vocabulary.
        """
        # Look up each id individually; the tokens are what gets joined.
        text = " ".join(self.int_to_str[i] for i in ids)
        # Remove the whitespace the join placed in front of punctuation marks.
        return re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)

In [None]:
# Switch to SimpleTokenizerV2 with the current vocabulary so that unknown
# tokens are mapped to "<|unk|>" instead of raising KeyError.
tokenizer = SimpleTokenizerV2(vocab)
tokenizer.encode(text)

In [None]:
# Round trip: encode the sample text and decode the resulting ids
tokenizer.decode(tokenizer.encode(text))

BPE

In [None]:
!pip install tiktoken

In [None]:
# Import the submodule explicitly: a bare `import importlib` does not load
# `importlib.metadata`, so the attribute access below would depend on some
# other code having imported it first.
import importlib.metadata

import tiktoken

print("tiktoken version: ", importlib.metadata.version("tiktoken"))

In [None]:
# Replace the hand-written tokenizer with tiktoken's "gpt2" encoding
tokenizer = tiktoken.get_encoding("gpt2")

In [None]:
# Adjacent string literals are concatenated as-is, so the first piece ends
# with a space to keep "terraces" and "of" as separate words.
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces "
    "of someunknownPlace."
)
# Explicitly allow the "<|endoftext|>" marker that appears in the text.
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

In [None]:
# Decode the token ids back into a string
strings = tokenizer.decode(integers)
print(strings)

In [None]:
# Re-read the source text and report how many ids the tokenizer produces for it
with open("the-verdict.txt", encoding="utf-8") as f:  # read-only text mode is the default
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

In [None]:
# Skip the first 50 token ids and work with the remainder
enc_sample = enc_text[50:]

In [None]:
context_size = 4

# Inputs are the first `context_size` token ids; targets are the same window
# shifted one position to the right, so x and y have the same length.
x = enc_sample[:context_size]
y = enc_sample[1:context_size + 1]

print(f"x: {x}")
print(f"y: {y}")

In [None]:
# For each prefix length 1..context_size, show the prefix and the id after it
for i in range(1, context_size + 1):
    context, desired = enc_sample[:i], enc_sample[i]
    print(context, "---->", desired)

In [None]:
# Show each prefix and the token that follows it as decoded text
for i in range(1, context_size + 1):
    context, desired = enc_sample[:i], enc_sample[i]
    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))

In [None]:
#attention mechanism
import torch
# Plain string literal: the label contains nothing to interpolate.
print("The version of PyTorch: ", torch.__version__)

In [None]:
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
  def __init__(self, txt, tokenizer, max_length, stride):
    self.input_ids = []
    self.target_ids = []

    #tokenization of entire text
    token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
    assert len(token_ids) > max_length


    for i in range(0 , len(token_ids) - max_length, stride):
      input_chunk = token_ids[i:i + max_length]

