# CREATE INPUT OUTPUT PAIRS:

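This step comes before the vector-embedding step: the tokenized text is split into input–target pairs, where each target is the input shifted one token to the right.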

In [1]:
pip install tiktoken



In [2]:
import tiktoken
# Load the tiktoken encoding registered under the name "gpt2"; the cells
# below use it to turn text into token IDs and back.
tokenizer = tiktoken.get_encoding("gpt2")

In [3]:
# Read the whole text file into memory as a single UTF-8 string.
# NOTE(review): the absolute /content/... path looks like a Colab upload
# location — adjust it when running the notebook elsewhere.
with open("/content/the-verdict.txt", "r", encoding = "utf-8") as file:
  raw_text = file.read()

# Encode the raw text into token IDs; the final expression displays how many
# tokens the text produced.
encoded_text = tokenizer.encode(raw_text)
len(encoded_text)

5145

In [4]:
# Skip the first 50 token IDs and keep the remainder as the working sample.
# NOTE(review): the bare expression below displays the whole remaining list;
# showing a short slice such as sampled_one[:10] would keep the output small.
sampled_one = encoded_text[50:]
sampled_one

[290,
 4920,
 2241,
 287,
 257,
 4489,
 64,
 319,
 262,
 34686,
 41976,
 13,
 357,
 10915,
 314,
 2138,
 1807,
 340,
 561,
 423,
 587,
 10598,
 393,
 28537,
 2014,
 198,
 198,
 1,
 464,
 6001,
 286,
 465,
 13476,
 1,
 438,
 5562,
 373,
 644,
 262,
 1466,
 1444,
 340,
 13,
 314,
 460,
 3285,
 9074,
 13,
 46606,
 536,
 5469,
 438,
 14363,
 938,
 4842,
 1650,
 353,
 438,
 2934,
 489,
 3255,
 465,
 48422,
 540,
 450,
 67,
 3299,
 13,
 366,
 5189,
 1781,
 340,
 338,
 1016,
 284,
 3758,
 262,
 1988,
 286,
 616,
 4286,
 705,
 1014,
 510,
 26,
 475,
 314,
 836,
 470,
 892,
 286,
 326,
 11,
 1770,
 13,
 8759,
 2763,
 438,
 1169,
 2994,
 284,
 943,
 17034,
 318,
 477,
 314,
 892,
 286,
 526,
 383,
 1573,
 11,
 319,
 9074,
 13,
 536,
 5469,
 338,
 11914,
 11,
 33096,
 663,
 4808,
 3808,
 62,
 355,
 996,
 484,
 547,
 12548,
 287,
 281,
 13079,
 410,
 12523,
 286,
 22353,
 13,
 843,
 340,
 373,
 407,
 691,
 262,
 9074,
 13,
 536,
 48819,
 508,
 25722,
 276,
 13,
 11161,
 407,
 262,
 40123,
 18113,


In [5]:

context_size = 4 # example value, chosen small so the printed windows are easy to read

# context_size is the length of the input, counted in tokens

# x is the first context_size tokens of the sample; y is the same window
# shifted right by one token, so y[k] is the token that follows x[k].
x = sampled_one[:context_size]
y = sampled_one[1:context_size + 1]
print(x)
print(y)



[290, 4920, 2241, 287]
[4920, 2241, 287, 257]


In [6]:
# For every prefix length i from 1 to context_size, the first i token IDs are
# the context and the token at position i is the target to predict.
# The prefix is named `context` rather than `input` so the built-in input()
# is not overwritten in the notebook's global namespace.
for i in range(1, context_size + 1):
  context = sampled_one[:i]
  target = sampled_one[i]

  print(context, " --------> " , target)

[290]  -------->  4920
[290, 4920]  -------->  2241
[290, 4920, 2241]  -------->  287
[290, 4920, 2241, 287]  -------->  257


In [7]:
# Same prefix -> next-token pairs as token IDs, but decoded back to text so the
# pairs are human-readable. decode() takes a list of IDs, hence [target].
# The prefix is named `context` rather than `input` so the built-in input()
# is not overwritten in the notebook's global namespace.
for i in range(1, context_size + 1):
  context = sampled_one[:i]
  target = sampled_one[i]

  print(tokenizer.decode(context), " --------> " , tokenizer.decode([target]))

 and  -------->   established
 and established  -------->   himself
 and established himself  -------->   in
 and established himself in  -------->   a


# Create Dataset:

In [8]:
import torch

In [11]:
from torch.utils.data import Dataset, DataLoader

In [10]:
class GPTDatasetV1(Dataset):
  """Sliding-window dataset of (input, target) token-ID tensors.

  The text is encoded once, then cut into windows of `max_length` tokens whose
  start positions are `stride` tokens apart. Each target window is its input
  window shifted one token to the right.

  Args:
    data: Text to encode.
    stride: Distance, in tokens, between the starts of consecutive windows.
      Must be at least 1.
    max_length: Number of tokens in every input and target window.
      Must be at least 1.
    tokenizer: Object whose `encode(data)` returns a sequence of integer
      token IDs.

  Raises:
    ValueError: If `stride` or `max_length` is smaller than 1.

  Note:
    Text that encodes to `max_length` tokens or fewer yields an empty dataset,
    because no window has a full shifted target.
  """

  def __init__(self, data, stride, max_length, tokenizer):
    # Reject values that would otherwise build empty windows (max_length <= 0)
    # or an empty dataset (stride < 0) without any error.
    if max_length < 1:
      raise ValueError(f"max_length must be at least 1, got {max_length}")
    if stride < 1:
      raise ValueError(f"stride must be at least 1, got {stride}")

    self.input_ids = []
    self.target_ids = []

    encoded_data = tokenizer.encode(data)

    # Stop at len - max_length: the target window sits one token further right
    # than the input window and must still fit inside the encoded text.
    for start in range(0, len(encoded_data) - max_length, stride):
      end = start + max_length

      self.input_ids.append(torch.tensor(encoded_data[start : end]))
      self.target_ids.append(torch.tensor(encoded_data[start + 1 : end + 1]))

  def __len__(self):
    """Return the number of (input, target) windows."""
    return len(self.input_ids)

  def __getitem__(self, index):
    """Return the (input, target) tensor pair stored at `index`."""
    return self.input_ids[index], self.target_ids[index]

In [15]:
def create_dataloader_V1(data, batch_size = 4, shuffle = True, num_workers = 0, stride = 4, max_length = 4, drop_last = True):
  """Build a DataLoader over sliding-window (input, target) pairs cut from `data`.

  The text is tokenized with the tiktoken "gpt2" encoding and wrapped in a
  GPTDatasetV1; the remaining arguments are forwarded to DataLoader.

  Args:
    data: Text to tokenize and window.
    batch_size: Number of windows per batch.
    shuffle: Passed to DataLoader as `shuffle`.
    num_workers: Passed to DataLoader as `num_workers`.
    stride: Step, in tokens, between the starts of consecutive windows.
    max_length: Number of tokens per window.
    drop_last: Passed to DataLoader as `drop_last`.

  Returns:
    The DataLoader built over the windowed dataset.
  """
  windows = GPTDatasetV1(
    data,
    stride = stride,
    max_length = max_length,
    tokenizer = tiktoken.get_encoding("gpt2"),
  )

  return DataLoader(
    windows,
    batch_size = batch_size,
    shuffle = shuffle,
    num_workers = num_workers,
    drop_last = drop_last,
  )

In [22]:
# Small, unshuffled loader for inspection: batches of 2 windows, each 4 tokens
# long, with consecutive windows starting 1 token apart.
test_dataloader = create_dataloader_V1(
  raw_text,
  batch_size = 2,
  shuffle = False,
  num_workers = 0,
  stride = 1,
  max_length = 4,
  drop_last = True,
)

In [23]:
# Pull the first batch out of the dataloader and display it. The batch shows
# up as two tensors: the input windows, then the target windows (the same
# windows shifted one token to the right).
data_iter = iter(test_dataloader)
first_batch = next(data_iter)
first_batch

[tensor([[  40,  367, 2885, 1464],
         [ 367, 2885, 1464, 1807]]),
 tensor([[ 367, 2885, 1464, 1807],
         [2885, 1464, 1807, 3619]])]