### Loading Input Data

In [1]:
import requests
# I'm loading my preprocessing data from my repository 
url = "https://raw.githubusercontent.com/eliashossain001/LLM-Development/main/outputs/clean_text.txt"

## Step 1: Creating Tokens

In [10]:
import requests
import re

# Fetch the content
response = requests.get(url)

if response.status_code == 200:
    raw_text = response.text

    # Tokenization: Extract words, ignoring punctuation
    tokenized_text = re.findall(r"\b\w+\b", raw_text)

    # Extract unique words and sort them
    distinct_tokens = sorted(set(tokenized_text))

    # Get the vocabulary size
    total_unique_words = len(distinct_tokens)

    # Display results
    print("Total unique words:", total_unique_words)
    
else:
    print("Failed to fetch the file. Status code:", response.status_code)


Total unique words: 2957


## Step 2: Creating Token IDs

In [11]:
distinct_tokens = sorted(set(tokenized_text))
total_unique_words = len(distinct_tokens)

print(total_unique_words)

2957


In [12]:
import requests
import re
# Fetch the content
response = requests.get(url)

if response.status_code == 200:
    raw_text = response.text

    # Tokenization: Extract words, ignoring punctuation
    tokenized_text = re.findall(r"\b\w+\b", raw_text)

    # Get unique words and sort them
    distinct_tokens = sorted(set(tokenized_text))
    total_unique_words = len(distinct_tokens)

    # Create token-to-ID mapping
    token_to_id = {word: idx for idx, word in enumerate(distinct_tokens)}

    # Convert tokenized text into token IDs
    token_ids = [token_to_id[word] for word in tokenized_text]

    # Display results
    print("Vocabulary size:", total_unique_words)
    print("First 10 token IDs:", token_ids[:10])  # Preview first 10 token IDs
else:
    print("Failed to fetch the file. Status code:", response.status_code)


Vocabulary size: 2957
First 10 token IDs: [1374, 1450, 507, 1244, 2869, 241, 2670, 737, 755, 1450]


In [14]:
# Create a dictionary mapping each unique token to an integer ID
vocab = {token: integer for integer, token in enumerate(distinct_tokens)}

# Display the first 10 token mappings as an example
print("First 10 token mappings:", list(vocab.items())[:10])

First 10 token mappings: [('000', 0), ('1', 1), ('10', 2), ('100', 3), ('100000', 4), ('10001550', 5), ('1001', 6), ('10relations', 7), ('11', 8), ('11road', 9)]


In [15]:
for i, item in enumerate(vocab.items()):
    print(item)
    if i >= 50:
        break

('000', 0)
('1', 1)
('10', 2)
('100', 3)
('100000', 4)
('10001550', 5)
('1001', 6)
('10relations', 7)
('11', 8)
('11road', 9)
('12000', 10)
('12stamps', 11)
('13', 12)
('130', 13)
('1330foot', 14)
('13th', 15)
('13to', 16)
('1415', 17)
('1492', 18)
('14point', 19)
('14the', 20)
('15', 21)
('1500', 22)
('1500s', 23)
('1513', 24)
('1522', 25)
('1539', 26)
('1540', 27)
('1600s', 28)
('1617', 29)
('1620', 30)
('16721695', 31)
('168889', 32)
('1690', 33)
('16artists', 34)
('17', 35)
('170', 36)
('17000', 37)
('1700s', 38)
('17381820', 39)
('1750000', 40)
('1750s', 41)
('1763', 42)
('1764', 43)
('1765', 44)
('1770s', 45)
('1773', 46)
('1774', 47)
('1775', 48)
('1776', 49)
('1778', 50)


In [16]:
import re

class BasicTextTokenizer:
    def __init__(self, token_dict):
        # Mapping words to unique IDs
        self.token_to_id = token_dict
        # Reverse mapping: IDs back to words
        self.id_to_token = {idx: token for token, idx in token_dict.items()}

    def encode(self, input_text):
        # Tokenize text by splitting based on punctuation and whitespace
        token_list = re.split(r'([,.:;?_!"()\']|--|\s)', input_text)

        # Remove empty tokens and extra spaces
        token_list = [token.strip() for token in token_list if token.strip()]

        # Convert tokens into their corresponding IDs
        token_ids = [self.token_to_id[token] for token in token_list]

        return token_ids

    def decode(self, token_ids):
        # Convert token IDs back to words
        reconstructed_text = " ".join([self.id_to_token[idx] for idx in token_ids])

        # Remove unnecessary spaces before punctuation marks
        reconstructed_text = re.sub(r'\s+([,.?!"()\'])', r'\1', reconstructed_text)

        return reconstructed_text


In [20]:
import requests
import re

class BasicWordTokenizer:
    def __init__(self, vocabulary):
        # Store vocabulary
        self.word_list = vocabulary
        # Create a mapping of words to unique IDs
        self.word_to_id = {word: idx for idx, word in enumerate(vocabulary)}
        # Define an unknown token placeholder
        self.unknown_token = "<|unk|>"
        # Assign an ID for unknown words (out of vocabulary)
        self.unknown_id = len(vocabulary)

    def encode(self, input_text):
        # Split text into tokens using punctuation and whitespace as delimiters
        tokenized_words = re.findall(r"\b\w+\b", input_text)
        # Convert tokens to their respective IDs, using the unknown ID for out-of-vocabulary words
        token_ids = [self.word_to_id.get(word, self.unknown_id) for word in tokenized_words]
        return tokenized_words, token_ids  # Returning tokenized words and token IDs


### ADDING SPECIAL CONTEXT TOKENS

In the previous section, we implemented a simple tokenizer and applied it to a passage
from the training set. 

In this section, we will modify this tokenizer to handle unknown
words.


In particular, we will modify the vocabulary and tokenizer we implemented in the
previous section, SimpleTokenizerV2, to support two new tokens, <|unk|> and
<|endoftext|>

In [21]:
response = requests.get(url)

if response.status_code == 200:
    raw_text = response.text

    # Tokenize text
    tokenized_words, _ = BasicWordTokenizer([]).encode(raw_text)  # Extract tokens without a predefined vocab

    # Extract unique words and sort them
    distinct_tokens = sorted(list(set(tokenized_words)))

    # Create a vocabulary mapping from tokens to integer IDs
    vocab = {token: idx for idx, token in enumerate(distinct_tokens)}

    print("Vocabulary size before adding special tokens:", len(vocab))
    print("First 10 tokens:", list(vocab.items())[:10])

    # **Separate Section: Adding Special Tokens**
    special_tokens = ["<|endoftext|>", "<|unk|>"]
    for special_token in special_tokens:
        vocab[special_token] = len(vocab)  # Assign a new ID

    print("Vocabulary size after adding special tokens:", len(vocab))
    print("Special tokens added:", special_tokens)

else:
    print("Failed to fetch the file. Status code:", response.status_code)


Vocabulary size before adding special tokens: 2957
First 10 tokens: [('000', 0), ('1', 1), ('10', 2), ('100', 3), ('100000', 4), ('10001550', 5), ('1001', 6), ('10relations', 7), ('11', 8), ('11road', 9)]
Vocabulary size after adding special tokens: 2959
Special tokens added: ['<|endoftext|>', '<|unk|>']


In [22]:
len(vocab.items())


2959

In [23]:
for i, item in enumerate(list(vocab.items())[-5:]):
    print(item)

('youth', 2954)
('zedong', 2955)
('zuni', 2956)
('<|endoftext|>', 2957)
('<|unk|>', 2958)


In [24]:
import re

class CustomTextTokenizer:
    def __init__(self, token_mapping):
        # Store mappings of tokens to unique IDs
        self.token_to_id = token_mapping
        # Reverse mapping: IDs back to tokens
        self.id_to_token = {idx: token for token, idx in token_mapping.items()}

    def encode(self, input_text):
        # Tokenize text using punctuation and whitespace as delimiters
        tokens = re.split(r'([,.:;?_!"()\']|--|\s)', input_text)
        # Remove empty and whitespace-only tokens
        tokens = [token.strip() for token in tokens if token.strip()]
        # Replace unknown tokens with a placeholder "<|unk|>"
        tokens = [token if token in self.token_to_id else "<|unk|>" for token in tokens]

        # Convert tokens to their corresponding IDs
        token_ids = [self.token_to_id[token] for token in tokens]
        return token_ids

    def decode(self, token_ids):
        # Convert token IDs back to tokens
        reconstructed_text = " ".join([self.id_to_token[idx] for idx in token_ids])
        # Remove unnecessary spaces before punctuation marks
        reconstructed_text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', reconstructed_text)
        return reconstructed_text


In [25]:
tokenizer = CustomTextTokenizer(vocab)

text1 = "Hello, my name is Elias Hossain"
text2 = "I love Pizza and Music"

text = " <|endoftext|> ".join((text1, text2))

print(text)

Hello, my name is Elias Hossain <|endoftext|> I love Pizza and Music


In [26]:
tokenizer.encode(text)


[2958, 2958, 2958, 1819, 1522, 2958, 2958, 2957, 2958, 2958, 2958, 326, 2958]

In [27]:
tokenizer.decode(tokenizer.encode(text))

'<|unk|> <|unk|> <|unk|> name is <|unk|> <|unk|> <|endoftext|> <|unk|> <|unk|> <|unk|> and <|unk|>'

### BYTE PAIR ENCODING (BPE)


In [29]:
! pip3 install tiktoken




[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [31]:
import importlib_metadata 
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")

In [32]:
tokenizer

<Encoding 'gpt2'>

In [33]:
text = (
    "Hello, what is your first name? <|endoftext|> you are not good at all"
     "of math."
)

integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print(integers)

[15496, 11, 644, 318, 534, 717, 1438, 30, 220, 50256, 345, 389, 407, 922, 379, 477, 1659, 10688, 13]


In [34]:
strings = tokenizer.decode(integers)

print(strings)

Hello, what is your first name? <|endoftext|> you are not good at allof math.


### CREATING INPUT-TARGET PAIRS

In [36]:
import requests

# Retrieve the file content
response = requests.get(url)

if response.status_code == 200:
    text_data = response.text  # Extract the text from the response

    # Convert text into tokenized form
    encoded_tokens = tokenizer.encode(text_data)

    # Display the number of tokens
    print("Total number of encoded tokens:", len(encoded_tokens))
else:
    print("Error: Unable to fetch the file. HTTP Status Code:", response.status_code)


Total number of encoded tokens: 12320


In [37]:
encoded_tokens = encoded_tokens[50:]

In [38]:
# Define the length of the input context
window_size = 4  # Number of tokens the model looks at for prediction

# Explanation:
# - The model processes a sequence of 4 tokens to predict the next token.
# - `input_tokens` represents the first 4 tokens.
# - `target_tokens` represents the next 4 tokens shifted by one position.

input_tokens = encoded_tokens[:window_size]  # First 4 tokens as input
target_tokens = encoded_tokens[1:window_size+1]  # Next 4 tokens as target

# Display the results
print(f"Input sequence (x): {input_tokens}")
print(f"Target sequence (y): {target_tokens}")


Input sequence (x): [1263, 1230, 9051, 1402]
Target sequence (y): [1230, 9051, 1402, 1230]


In [39]:
# Iterate over different input lengths from 1 to window_size
for i in range(1, window_size + 1):
    # Extract the context (previous i tokens)
    context_tokens = encoded_tokens[:i]
    # The desired target token (next token in sequence)
    target_token = encoded_tokens[i]

    # Display the context-to-target mapping
    print(context_tokens, "---->", target_token)


[1263] ----> 1230
[1263, 1230] ----> 9051
[1263, 1230, 9051] ----> 1402
[1263, 1230, 9051, 1402] ----> 1230


In [40]:
# Iterate over different input lengths from 1 to window_size
for i in range(1, window_size + 1):
    # Extract the context (previous i tokens)
    context_tokens = encoded_tokens[:i]
    # The desired target token (next token in sequence)
    target_token = encoded_tokens[i]

    # Decode token IDs back into text for readability
    context_text = tokenizer.decode(context_tokens)
    target_text = tokenizer.decode([target_token])

    # Display the mapping between input sequence and expected output
    print(context_text, "---->", target_text)


 big ---->  government
 big government ---->  versus
 big government versus ---->  small
 big government versus small ---->  government


### IMPLEMENTING A DATA LOADER

In [41]:
import torch
from torch.utils.data import Dataset, DataLoader

class TextSequenceDataset(Dataset):
    def __init__(self, text_data, tokenizer, sequence_length, step_size):
        self.input_sequences = []
        self.target_sequences = []

        # Convert the text into token IDs
        tokenized_ids = tokenizer.encode(text_data, allowed_special={"<|endoftext|>"})

        # Apply a sliding window approach to generate overlapping sequences
        for start_idx in range(0, len(tokenized_ids) - sequence_length, step_size):
            input_segment = tokenized_ids[start_idx:start_idx + sequence_length]
            target_segment = tokenized_ids[start_idx + 1:start_idx + sequence_length + 1]
            self.input_sequences.append(torch.tensor(input_segment))
            self.target_sequences.append(torch.tensor(target_segment))

    def __len__(self):
        return len(self.input_sequences)

    def __getitem__(self, index):
        return self.input_sequences[index], self.target_sequences[index]




In [42]:
def build_dataloader(text_data, batch_size=4, sequence_length=256, 
                     step_size=128, shuffle=True, drop_last=True, 
                     num_workers=0):

    # Load the tokenizer (GPT-2 encoding)
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset instance
    dataset = TextSequenceDataset(text_data, tokenizer, sequence_length, step_size)

    # Configure and return the DataLoader
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )


In [43]:
import requests
# Request the file content from the given URL
response = requests.get(url)

if response.status_code == 200:
    text_content = response.text  # Extract text data from the response
    print("Text data retrieved successfully!")
else:
    print(f"Error: Unable to retrieve the file. HTTP Status Code: {response.status_code}")


Text data retrieved successfully!


In [44]:
import torch

# Display the PyTorch version
print("Installed PyTorch version:", torch.__version__)

# Build the DataLoader using the fetched text data
dataloader = build_dataloader(
    text_content, batch_size=1, sequence_length=4, step_size=1, shuffle=False
)

# Create an iterator for the DataLoader
data_iterator = iter(dataloader)

# Retrieve the first batch
initial_batch = next(data_iterator)

# Display the first batch
print(initial_batch)


Installed PyTorch version: 2.6.0+cpu
[tensor([[23569,   287,  4506, 22064]]), tensor([[  287,  4506, 22064,  4903]])]


In [45]:
second_batch = next(data_iterator)
print(second_batch)

[tensor([[  287,  4506, 22064,  4903]]), tensor([[ 4506, 22064,  4903,  3643]])]


In [46]:
# Initialize the DataLoader with specified parameters
dataloader = build_dataloader(text_content, batch_size=8, sequence_length=4, step_size=4, shuffle=False)

# Create an iterator for the DataLoader
data_iterator = iter(dataloader)

# Retrieve the first batch of inputs and targets
input_sequences, target_sequences = next(data_iterator)

# Display the input and target sequences
print("Input Sequences:\n", input_sequences)
print("\nTarget Sequences:\n", target_sequences)


Input Sequences:
 tensor([[23569,   287,  4506, 22064],
        [ 4903,  3643, 20518,  1122],
        [13593,   262,  9758,  9831],
        [  287,  5206,  8273,  1596],
        [ 5774,  9793,   262,  2106],
        [  286,   262, 16503,  2585],
        [  468,   587,   281,  6306],
        [  287,  7996,   329,   517]])

Target Sequences:
 tensor([[  287,  4506, 22064,  4903],
        [ 3643, 20518,  1122, 13593],
        [  262,  9758,  9831,   287],
        [ 5206,  8273,  1596,  5774],
        [ 9793,   262,  2106,   286],
        [  262, 16503,  2585,   468],
        [  587,   281,  6306,   287],
        [ 7996,   329,   517,   621]])


### CREATING TOKEN EMBEDDINGS

In [47]:
input_ids = torch.tensor([2, 3, 5, 1])


In [48]:
# Define vocabulary size and embedding dimension
num_tokens = 6
embedding_dim = 3

# Set the random seed for reproducibility
torch.manual_seed(123)

# Initialize an embedding layer with specified parameters
embedding_layer = torch.nn.Embedding(num_tokens, embedding_dim)


In [49]:
print(embedding_layer.weight)


Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [50]:
print(embedding_layer(torch.tensor([3])))


tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


In [51]:
print(embedding_layer(input_ids))


tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


### POSITIONAL EMBEDDINGS (ENCODING WORD POSITIONS)

In [54]:
vocab_size = 50257
output_dim = 256

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [58]:
# Define sequence length
sequence_length = 4

# Initialize DataLoader with specified parameters
dataloader = build_dataloader(
    text_content, batch_size=2, sequence_length=4, step_size=4, shuffle=False
)


# Check if the dataset is empty before proceeding
if len(dataloader.dataset) == 0:
    raise ValueError("Dataset is empty. Ensure the text content is long enough.")

# Create an iterator for the DataLoader
data_iterator = iter(dataloader)

# Retrieve the first batch of inputs and targets
try:
    input_tokens, target_tokens = next(data_iterator)
    
    # Display the tokenized inputs and targets
    print("Input Tokens:\n", input_tokens)
    print("\nTarget Tokens:\n", target_tokens)

except StopIteration:
    print("No data available in the DataLoader. Try increasing text length or reducing batch size/sequence length.")


Input Tokens:
 tensor([[ 1212,   318,   281,  1672],
        [ 2420,   329,  4856, 11525]])

Target Tokens:
 tensor([[  318,   281,  1672,  2420],
        [  329,  4856, 11525,    67]])


In [60]:
token_embeddings = token_embedding_layer(input_tokens)
print(token_embeddings.shape)

torch.Size([2, 4, 256])


In [61]:
# Define context length based on sequence length
context_size = sequence_length

# Initialize position embedding layer
positional_embedding_layer = torch.nn.Embedding(context_size, embedding_dim)

In [62]:
# Generate position indices
position_indices = torch.arange(sequence_length)

# Compute position embeddings
positional_embeddings = positional_embedding_layer(position_indices)

# Display the shape of the position embeddings
print(positional_embeddings.shape)

torch.Size([4, 3])


In [67]:
import torch

# Define sequence length and embedding dimensions
sequence_length = 4  # Make sure this matches max_length
embedding_dim = 256  # Ensure this is consistent

# Initialize Token Embedding Layer (Assuming vocab_size=50257)
token_embedding_layer = torch.nn.Embedding(50257, embedding_dim)

# Generate some dummy token IDs (batch_size=8, sequence_length=4)
batch_size = 8
dummy_token_ids = torch.randint(0, 50257, (batch_size, sequence_length))

# Compute token embeddings
token_embeddings = token_embedding_layer(dummy_token_ids)  # Shape: (batch_size, sequence_length, embedding_dim)

# Ensure position embeddings use the correct sequence length and embedding dimension
positional_embedding_layer = torch.nn.Embedding(sequence_length, embedding_dim)  # Ensure correct embedding size

# Generate position indices and obtain positional embeddings
position_indices = torch.arange(sequence_length).unsqueeze(0).expand(batch_size, -1)  # Shape: (batch_size, sequence_length)
positional_embeddings = positional_embedding_layer(position_indices)  # Shape: (batch_size, sequence_length, embedding_dim)

# Verify shape alignment
assert token_embeddings.shape == positional_embeddings.shape, \
    f"Shape mismatch! Token Embeddings: {token_embeddings.shape}, Positional Embeddings: {positional_embeddings.shape}"

# Compute final input embeddings by adding token and position embeddings
combined_embeddings = token_embeddings + positional_embeddings

# Display the shape of the resulting embeddings
print("Final Input Embeddings Shape:", combined_embeddings.shape)


Final Input Embeddings Shape: torch.Size([8, 4, 256])


<div class="alert alert-block alert-warning">

The input_embeddings we created are the embedded input
examples that can now be processed by the main LLM modules
    
</div>