In [1]:
!pip install PyMuPDF





In [3]:
import re
from collections import defaultdict, Counter
from pathlib import Path

import fitz

# Cell 2: Read and clean text from a PDF file
def read_and_clean_pdf(filename):
    """Extract the text of a PDF and return it as a list of lowercase words.

    Args:
        filename: Path of the PDF file; passed straight to ``fitz.open``.

    Returns:
        list[str]: The whitespace-separated words of all pages, in page order,
        after every character other than ASCII letters and whitespace has been
        deleted and the remainder lowercased.

    Raises:
        Whatever ``fitz.open`` or ``page.get_text`` raises; the document is
        closed before the exception propagates.
    """
    # The context manager closes the document even when extraction fails,
    # and join() avoids rebuilding the string once per page.
    with fitz.open(filename) as doc:
        text = "".join(page.get_text() for page in doc)

    # Deleting (not replacing) non-letters fuses neighbours such as
    # "doi.org" -> "doiorg"; whitespace is kept so words stay separated.
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()

    # str.split() with no argument never yields empty strings
    return text.split()

# Resolve the PDF from the current user's home directory instead of hard-coding
# one machine's absolute path; edit PDF_PATH if the file lives elsewhere.
PDF_PATH = Path.home() / "Downloads" / "Generative_Artificial_Intelligence_Evolving_Techno.pdf"

corpus = read_and_clean_pdf(str(PDF_PATH))
print(f"Corpus size: {len(corpus)} words")
print(f"First 10 words: {corpus[:10]}")

Corpus size: 17012 words
First 10 words: ['vol', 'information', 'systems', 'frontiers', 'httpsdoiorgs', 'generative', 'artificial', 'intelligence', 'evolving', 'technology']


In [5]:
def preprocess_for_bpe(corpus):
    """Build the initial (character-level) BPE vocabulary from a list of words.

    Args:
        corpus: Iterable of words.

    Returns:
        defaultdict(int) mapping each distinct word, written as a tuple of its
        characters followed by the end-of-word marker '</w>', to the number of
        times the word occurs in ``corpus``.
    """
    occurrences = Counter(corpus)

    vocab = defaultdict(int)
    for word, count in occurrences.items():
        # Characters of the word plus the end-of-word marker
        symbols = (*word, '</w>')
        vocab[symbols] += count
    return vocab

vocab = preprocess_for_bpe(corpus)

# Preview only the first ten entries rather than the whole vocabulary
print("Initial vocabulary (first 10 entries):")
for word_tokens, freq in list(vocab.items())[:10]:
    print(f"{word_tokens}: {freq}")

print(f"\nTotal unique words: {len(vocab)}")

Initial vocabulary (first 10 entries):
('v', 'o', 'l', '</w>'): 4
('i', 'n', 'f', 'o', 'r', 'm', 'a', 't', 'i', 'o', 'n', '</w>'): 122
('s', 'y', 's', 't', 'e', 'm', 's', '</w>'): 267
('f', 'r', 'o', 'n', 't', 'i', 'e', 'r', 's', '</w>'): 25
('h', 't', 't', 'p', 's', 'd', 'o', 'i', 'o', 'r', 'g', 's', '</w>'): 1
('g', 'e', 'n', 'e', 'r', 'a', 't', 'i', 'v', 'e', '</w>'): 58
('a', 'r', 't', 'i', 'f', 'i', 'c', 'i', 'a', 'l', '</w>'): 38
('i', 'n', 't', 'e', 'l', 'l', 'i', 'g', 'e', 'n', 'c', 'e', '</w>'): 38
('e', 'v', 'o', 'l', 'v', 'i', 'n', 'g', '</w>'): 2
('t', 'e', 'c', 'h', 'n', 'o', 'l', 'o', 'g', 'y', '</w>'): 56

Total unique words: 3478


In [7]:
def get_pair_frequencies(vocab):
    """Count adjacent token pairs across the vocabulary.

    Args:
        vocab: Mapping of token tuples to their frequency.

    Returns:
        defaultdict(int) mapping each (left, right) pair of neighbouring tokens
        to the summed frequency of the entries in which it occurs; a pair that
        appears several times inside one entry is counted each time.
    """
    pair_counts = defaultdict(int)
    for symbols, count in vocab.items():
        # Zipping the sequence with itself shifted by one yields the neighbours
        for left, right in zip(symbols, symbols[1:]):
            pair_counts[(left, right)] += count
    return pair_counts

# Rank the adjacent pairs of the initial vocabulary and show the ten most frequent
pair_freqs = get_pair_frequencies(vocab)
print("Most frequent pairs (top 10):")
ranked_pairs = sorted(pair_freqs.items(), key=lambda item: item[1], reverse=True)
for pair, freq in ranked_pairs[:10]:
    print(f"{pair}: {freq}")

Most frequent pairs (top 10):
('s', '</w>'): 2890
('e', '</w>'): 2672
('i', 'n'): 1973
('a', 'n'): 1671
('o', 'n'): 1612
('n', '</w>'): 1590
('t', 'i'): 1518
('e', 'r'): 1495
('t', 'h'): 1439
('d', '</w>'): 1369


In [9]:
def merge_pair(pair, vocab):
    """Return a new vocabulary in which every occurrence of ``pair`` is fused.

    Args:
        pair: Tuple (left, right) of adjacent tokens to merge.
        vocab: Mapping of token tuples to their frequency; it is not modified.

    Returns:
        defaultdict(int) keyed by the rewritten token tuples. Entries that
        become identical after merging have their frequencies added together.
    """
    merged_vocab = defaultdict(int)

    for symbols, count in vocab.items():
        rebuilt = []
        pos = 0
        # Left-to-right scan; a fused pair consumes two positions, so
        # overlapping occurrences (a, a, a) merge only the first two.
        while pos < len(symbols):
            hit = pos + 1 < len(symbols) and (symbols[pos], symbols[pos + 1]) == pair
            if hit:
                rebuilt.append(symbols[pos] + symbols[pos + 1])
            else:
                rebuilt.append(symbols[pos])
            pos += 2 if hit else 1

        merged_vocab[tuple(rebuilt)] += count

    return merged_vocab

# Try a single merge using the most frequent pair counted above
test_pair = max(pair_freqs, key=pair_freqs.get)
print(f"Testing merge of pair: {test_pair}")
merged_vocab = merge_pair(test_pair, vocab)
print(
    f"Vocabulary size before merge: {len(vocab)}",
    f"Vocabulary size after merge: {len(merged_vocab)}",
    sep="\n",
)


Testing merge of pair: ('s', '</w>')
Vocabulary size before merge: 3478
Vocabulary size after merge: 3478


In [19]:
def train_bpe(vocab, num_merges, verbose=True):
    """Learn up to ``num_merges`` BPE merge rules from an initial vocabulary.

    On each iteration the most frequent adjacent pair is merged everywhere in
    the vocabulary. Ties are resolved by ``max``, i.e. the first such pair in
    the iteration order of the pair counts wins.

    Args:
        vocab: Mapping of token tuples to frequency. The passed-in mapping is
            not modified; a new one is produced for every merge.
        num_merges: Maximum number of merges to perform.
        verbose: When True (the default, matching the previous behaviour) one
            progress line is printed per merge. Pass False to keep long
            training runs from flooding the notebook output.

    Returns:
        tuple: (final vocabulary, list of merged pairs in the order learned).
        Training stops early when no adjacent pairs remain.
    """
    bpe_merges = []

    for step in range(num_merges):
        pair_freqs = get_pair_frequencies(vocab)

        if not pair_freqs:
            # Every entry is already a single token, so nothing can be merged
            if verbose:
                print(f"No more pairs to merge. Stopped at iteration {step}")
            break

        pair, freq = max(pair_freqs.items(), key=lambda item: item[1])

        if verbose:
            print(f"Merge {step + 1}: {pair} (frequency: {freq})")

        vocab = merge_pair(pair, vocab)
        bpe_merges.append(pair)

    return vocab, bpe_merges

# Upper bound on the number of merge rules to learn
num_merges = 500
final_vocab, bpe_merges = train_bpe(vocab, num_merges)

print(
    f"\nTraining complete!",
    f"Final vocabulary size: {len(final_vocab)}",
    f"Number of merges performed: {len(bpe_merges)}",
    sep="\n",
)


Merge 1: ('s', '</w>') (frequency: 2890)
Merge 2: ('e', '</w>') (frequency: 2672)
Merge 3: ('i', 'n') (frequency: 1973)
Merge 4: ('a', 'n') (frequency: 1671)
Merge 5: ('o', 'n') (frequency: 1612)
Merge 6: ('e', 'r') (frequency: 1495)
Merge 7: ('t', 'h') (frequency: 1439)
Merge 8: ('t', 'i') (frequency: 1394)
Merge 9: ('d', '</w>') (frequency: 1369)
Merge 10: ('e', 'n') (frequency: 1338)
Merge 11: ('t', '</w>') (frequency: 1240)
Merge 12: ('o', 'r') (frequency: 1070)
Merge 13: ('a', 'l') (frequency: 976)
Merge 14: ('y', '</w>') (frequency: 944)
Merge 15: ('a', 'r') (frequency: 886)
Merge 16: ('t', 'e') (frequency: 854)
Merge 17: ('r', 'e') (frequency: 806)
Merge 18: ('ti', 'on') (frequency: 777)
Merge 19: ('c', 'h') (frequency: 752)
Merge 20: ('g', '</w>') (frequency: 738)
Merge 21: ('o', 'f') (frequency: 730)
Merge 22: ('th', 'e</w>') (frequency: 689)
Merge 23: ('of', '</w>') (frequency: 684)
Merge 24: ('an', 'd</w>') (frequency: 644)
Merge 25: ('al', '</w>') (frequency: 636)
Merge 26:

In [21]:
def apply_bpe(word, merges):
    """Tokenize a single word by replaying the learned merges in order.

    Args:
        word: The word to tokenize.
        merges: Sequence of (left, right) token pairs, in the order they were
            learned.

    Returns:
        list[str]: The word's tokens; the end-of-word marker '</w>' is part of
        (or is) the final token.
    """
    # Begin at character level with the end-of-word marker appended
    symbols = [*word, '</w>']

    for rule in merges:
        merged = []
        pos = 0
        while pos < len(symbols):
            if pos + 1 < len(symbols) and (symbols[pos], symbols[pos + 1]) == rule:
                # Fuse the two neighbours and skip past both
                merged.append(symbols[pos] + symbols[pos + 1])
                pos += 2
                continue
            merged.append(symbols[pos])
            pos += 1
        symbols = merged

    return symbols

# Tokenize a few sample words with the learned merges
test_words = ["language", "amazing", "transformers"]
tokenized_examples = {sample: apply_bpe(sample, bpe_merges) for sample in test_words}

print("Tokenization examples:")
for word, tokens in tokenized_examples.items():
    print(f"'{word}' -> {tokens}")


Tokenization examples:
'language' -> ['language</w>']
'amazing' -> ['a', 'ma', 'z', 'ing</w>']
'transformers' -> ['trans', 'form', 'ers</w>']


In [23]:
def detokenize(tokens):
    """Rebuild a word from its BPE tokens.

    The tokens are concatenated and every '</w>' end-of-word marker in the
    concatenated string is removed.
    """
    return ''.join(tokens).replace('</w>', '')

# Round-trip check: the reconstructed word should equal the original
print("Detokenization examples:")
for word, tokens in tokenized_examples.items():
    detokenized = detokenize(tokens)
    print(
        f"{tokens} -> '{detokenized}'",
        f"Original: '{word}', Reconstructed: '{detokenized}', Match: {word == detokenized}",
        sep="\n",
    )


Detokenization examples:
['language</w>'] -> 'language'
Original: 'language', Reconstructed: 'language', Match: True
['a', 'ma', 'z', 'ing</w>'] -> 'amazing'
Original: 'amazing', Reconstructed: 'amazing', Match: True
['trans', 'form', 'ers</w>'] -> 'transformers'
Original: 'transformers', Reconstructed: 'transformers', Match: True


In [25]:
def create_token_vocab(final_vocab):
    """Assign integer IDs to the tokens of a trained BPE vocabulary.

    Args:
        final_vocab: Mapping of token tuples to frequency.

    Returns:
        tuple: (token_to_id, id_to_token, token_frequencies) where
        token_frequencies is a defaultdict(int) of summed frequencies per
        token, and IDs run from 0 upward in order of decreasing frequency.
        Tokens with equal frequency keep the order in which they were first
        encountered, because the sort is stable.
    """
    token_frequencies = defaultdict(int)
    for word_tokens, freq in final_vocab.items():
        for token in word_tokens:
            token_frequencies[token] += freq

    # Most frequent token first
    ranked_tokens = sorted(token_frequencies, key=token_frequencies.get, reverse=True)

    id_to_token = dict(enumerate(ranked_tokens))
    token_to_id = {token: token_id for token_id, token in id_to_token.items()}

    return token_to_id, id_to_token, token_frequencies

token_to_id, id_to_token, token_frequencies = create_token_vocab(final_vocab)

print("Token vocabulary created!")
print(f"Total unique tokens: {len(token_to_id)}")
print("\nTop 10 most frequent tokens with IDs:")
# id_to_token is filled in ascending ID order, so its first ten items are IDs 0-9
for i, token in list(id_to_token.items())[:10]:
    freq = token_frequencies[token]
    print(f"ID {i}: '{token}' (frequency: {freq})")

Token vocabulary created!
Total unique tokens: 520

Top 10 most frequent tokens with IDs:
ID 0: 'the</w>' (frequency: 689)
ID 1: 'of</w>' (frequency: 684)
ID 2: 'and</w>' (frequency: 610)
ID 3: 'a</w>' (frequency: 472)
ID 4: 'in' (frequency: 444)
ID 5: 'to</w>' (frequency: 431)
ID 6: 's' (frequency: 426)
ID 7: 'm' (frequency: 413)
ID 8: 's</w>' (frequency: 409)
ID 9: 'in</w>' (frequency: 397)


In [27]:
def tokenize_to_ids(word, merges, token_to_id):
    """Tokenize a word with ``apply_bpe`` and map each token to its ID.

    Args:
        word: The word to tokenize.
        merges: Merge rules forwarded to ``apply_bpe``.
        token_to_id: Mapping from token string to integer ID.

    Returns:
        tuple: (string tokens, token IDs). A token missing from
        ``token_to_id`` triggers a printed warning and is given the ID stored
        under '<UNK>', or -1 when there is no such entry.
    """
    string_tokens = apply_bpe(word, merges)

    token_ids = []
    for token in string_tokens:
        if token not in token_to_id:
            # Token never seen in the trained vocabulary
            print(f"Warning: Unknown token '{token}' encountered")
            token_ids.append(token_to_id.get('<UNK>', -1))
            continue
        token_ids.append(token_to_id[token])

    return string_tokens, token_ids

# Show both the string tokens and their integer IDs for the sample words
print("\nTokenization with IDs:")
for word in test_words:
    string_tokens, token_ids = tokenize_to_ids(word, bpe_merges, token_to_id)
    print(
        f"'{word}':",
        f"  Tokens: {string_tokens}",
        f"  IDs: {token_ids}",
        sep="\n",
    )


Tokenization with IDs:
'language':
  Tokens: ['language</w>']
  IDs: [157]
'amazing':
  Tokens: ['a', 'ma', 'z', 'ing</w>']
  IDs: [19, 82, 107, 14]
'transformers':
  Tokens: ['trans', 'form', 'ers</w>']
  IDs: [208, 295, 193]


In [29]:
def detokenize_from_ids(token_ids, id_to_token):
    """Turn a sequence of token IDs back into string tokens and a word.

    Args:
        token_ids: Iterable of integer token IDs.
        id_to_token: Mapping from ID to token string.

    Returns:
        tuple: (string tokens, word produced by ``detokenize``). An ID missing
        from ``id_to_token`` triggers a printed warning and contributes the
        literal token '<UNK>'.
    """
    string_tokens = []
    for token_id in token_ids:
        if token_id not in id_to_token:
            print(f"Warning: Unknown token ID {token_id}")
            string_tokens.append('<UNK>')
        else:
            string_tokens.append(id_to_token[token_id])

    return string_tokens, detokenize(string_tokens)

# Full round trip: word -> IDs -> word
print("\nDetokenization from IDs:")
for word in test_words:
    string_tokens, token_ids = tokenize_to_ids(word, bpe_merges, token_to_id)
    reconstructed_tokens, reconstructed_word = detokenize_from_ids(token_ids, id_to_token)
    print(
        f"IDs {token_ids} -> '{reconstructed_word}'",
        f"Original: '{word}', Reconstructed: '{reconstructed_word}', Match: {word == reconstructed_word}",
        sep="\n",
    )


Detokenization from IDs:
IDs [157] -> 'language'
Original: 'language', Reconstructed: 'language', Match: True
IDs [19, 82, 107, 14] -> 'amazing'
Original: 'amazing', Reconstructed: 'amazing', Match: True
IDs [208, 295, 193] -> 'transformers'
Original: 'transformers', Reconstructed: 'transformers', Match: True


In [31]:
def save_tokenizer(token_to_id, id_to_token, bpe_merges, filename="bpe_tokenizer.txt"):
    """Write the merge rules and vocabulary to a UTF-8 text file.

    File layout: a "MERGES" header followed by one "left right" line per
    merge, then a "VOCAB" header followed by one "token id" line per entry of
    ``id_to_token``. ``token_to_id`` is accepted but not written.

    Args:
        token_to_id: Token-to-ID mapping (unused here).
        id_to_token: ID-to-token mapping that is written out.
        bpe_merges: Sequence of merged pairs, in learned order.
        filename: Destination path.
    """
    with open(filename, 'w', encoding='utf-8') as out:
        out.write("MERGES\n")
        out.writelines(f"{pair[0]} {pair[1]}\n" for pair in bpe_merges)

        out.write("VOCAB\n")
        out.writelines(f"{token} {token_id}\n" for token_id, token in id_to_token.items())

    print(f"Tokenizer saved to {filename}")

def load_tokenizer(filename="bpe_tokenizer.txt"):
    """Read a tokenizer file in the layout written by ``save_tokenizer``.

    Lines are stripped before parsing. Lines before the first "MERGES" or
    "VOCAB" header, blank lines, merge lines that do not split into exactly
    two whitespace-separated parts, and vocab lines without a space are
    ignored.

    Args:
        filename: Path of the UTF-8 text file to read.

    Returns:
        tuple: (token_to_id, id_to_token, bpe_merges).

    Raises:
        ValueError: If the ID field of a vocab line is not an integer.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        raw_lines = list(f)

    bpe_merges, token_to_id, id_to_token = [], {}, {}
    section = None

    for line in map(str.strip, raw_lines):
        if line in ("MERGES", "VOCAB"):
            # Header line: switch parsing mode
            section = line.lower()
            continue
        if not line:
            continue

        if section == "merges":
            parts = line.split()
            if len(parts) == 2:
                bpe_merges.append((parts[0], parts[1]))
        elif section == "vocab":
            # Split at the last space so the ID is always the final field
            token, separator, raw_id = line.rpartition(' ')
            if separator:
                token_id = int(raw_id)
                token_to_id[token] = token_id
                id_to_token[token_id] = token

    return token_to_id, id_to_token, bpe_merges

# Persist the trained tokenizer under the default file name
save_tokenizer(
    token_to_id=token_to_id,
    id_to_token=id_to_token,
    bpe_merges=bpe_merges,
)

Tokenizer saved to bpe_tokenizer.txt
