In [None]:
!pip install transformers datasets

In [None]:
from transformers import AutoTokenizer
from datasets import load_dataset
from collections import Counter, defaultdict
import numpy as np
import math


## 1️⃣ Load Tokenizers

In [None]:
# Load the tokenizers published with the 'gpt2' and 'bert-base-uncased'
# checkpoints. Both objects are reused by the later cells for the side-by-side
# comparison.
gpt2_tokenizer = AutoTokenizer.from_pretrained('gpt2')
bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')


## 2️⃣ Tokenize Example Sentences

In [None]:
# Hand-picked examples: an ordinary sentence, one with a long derived word,
# and a single very long word with no spaces.
sentences = [
    "Transformers are powerful models.",
    "Unbelievable tokenization differences!",
    "supercalifragilisticexpialidocious",
]

for sentence in sentences:
    print('=' * 50)
    print('Sentence:', sentence)
    # Tokenize with both tokenizers first, then report them in a fixed order.
    gpt2_tokens = gpt2_tokenizer.tokenize(sentence)
    bert_tokens = bert_tokenizer.tokenize(sentence)
    for label, tokens in (('GPT-2:', gpt2_tokens), ('BERT:', bert_tokens)):
        print(label, tokens, 'Count:', len(tokens))


## 3️⃣ Load Dataset

In [None]:
# Only the first 200 rows of the 'ag_news' train split are requested, which
# keeps the experiment small.
dataset = load_dataset('ag_news', split='train[:200]')
# Rows 0-149 are used to fit the statistics and bigram counts; rows 150-199
# are held out and only used for evaluation.
train_texts = dataset[:150]['text']
test_texts = dataset[150:200]['text']


## 4️⃣ Tokenize Dataset

In [None]:
def tokenize_texts(texts, tokenizer):
    """Split every text into tokens with the given tokenizer.

    Args:
        texts: Iterable of strings to tokenize.
        tokenizer: Any object exposing a ``tokenize(text)`` method.

    Returns:
        A list with one ``tokenizer.tokenize`` result per input text, in the
        same order as ``texts``.
    """
    return [tokenizer.tokenize(text) for text in texts]

# Tokenize both splits with each tokenizer. The *_train lists feed the
# vocabulary statistics and bigram counts computed later; the *_test lists are
# only used when evaluating perplexity.
gpt2_train = tokenize_texts(train_texts, gpt2_tokenizer)
gpt2_test = tokenize_texts(test_texts, gpt2_tokenizer)

bert_train = tokenize_texts(train_texts, bert_tokenizer)
bert_test = tokenize_texts(test_texts, bert_tokenizer)


## 5️⃣ Vocabulary & Sparsity Analysis

In [None]:
def get_vocab(tokenized_texts):
    """Return the set of distinct tokens occurring in the tokenized texts.

    Args:
        tokenized_texts: Iterable of token iterables, one per text.

    Returns:
        A ``set`` containing every token that appears at least once.
    """
    return {token for tokens in tokenized_texts for token in tokens}

# These are the numbers of distinct token strings observed in the training
# texts, not the sizes of the tokenizers' full vocabularies.
print('GPT-2 vocab size:', len(get_vocab(gpt2_train)))
print('BERT vocab size:', len(get_vocab(bert_train)))


In [None]:
def singleton_rate(tokenized_texts):
    """Fraction of distinct tokens that occur exactly once.

    Args:
        tokenized_texts: Iterable of token iterables, one per text.

    Returns:
        ``singletons / distinct_tokens`` as a float, where a singleton is a
        token whose total count over all texts is 1. Returns ``0.0`` when
        there are no tokens at all, instead of dividing by zero.
    """
    counts = Counter(token for sent in tokenized_texts for token in sent)
    if not counts:
        # No token types observed: the rate is undefined, report 0.0.
        return 0.0
    singletons = sum(1 for c in counts.values() if c == 1)
    return singletons / len(counts)

# Share of distinct training tokens that appear only once, per tokenizer.
print('GPT-2 singleton rate:', singleton_rate(gpt2_train))
print('BERT singleton rate:', singleton_rate(bert_train))


## 6️⃣ Build Bigram Language Model

In [None]:
def build_bigram_counts(tokenized_texts):
    """Collect unigram and bigram frequencies from tokenized texts.

    Args:
        tokenized_texts: Iterable of token sequences, one per text.

    Returns:
        A ``(unigram_counts, bigram_counts)`` tuple. ``unigram_counts`` is a
        ``Counter`` mapping each token to its number of occurrences.
        ``bigram_counts`` is a ``defaultdict(Counter)`` mapping a token to a
        ``Counter`` of the tokens that directly followed it. Pairs are only
        formed inside a single text, never across two texts.
    """
    token_totals = Counter()
    follower_totals = defaultdict(Counter)
    for sequence in tokenized_texts:
        token_totals.update(sequence)
        # Pair every token with its immediate successor.
        for left, right in zip(sequence, sequence[1:]):
            follower_totals[left][right] += 1
    return token_totals, follower_totals

def compute_perplexity(tokenized_texts, unigram_counts, bigram_counts):
    """Perplexity of an unsmoothed bigram model over the pairs it can score.

    Every adjacent pair ``(prev, tok)`` inside a text is scored as
    ``bigram_counts[prev][tok] / unigram_counts[prev]``. Pairs whose
    probability is zero (unseen context or unseen pair) are skipped rather
    than smoothed, so the result only reflects bigrams that already occurred
    in the counts and is therefore optimistic.

    Args:
        tokenized_texts: Iterable of token sequences to evaluate.
        unigram_counts: Mapping token -> count (e.g. a ``Counter``).
        bigram_counts: Mapping previous token -> mapping of next token ->
            count (e.g. a ``defaultdict(Counter)``). It is only read; no
            entries are inserted during evaluation.

    Returns:
        ``exp(-mean log-probability)`` over the scored pairs, or
        ``float('inf')`` when no pair could be scored (empty input or no
        overlap with the counts) instead of raising ``ZeroDivisionError``.
    """
    total_log_prob = 0.0
    scored_pairs = 0
    for tokens in tokenized_texts:
        for prev_token, token in zip(tokens, tokens[1:]):
            context_count = unigram_counts.get(prev_token, 0)
            # .get() avoids creating empty entries in a defaultdict model.
            followers = bigram_counts.get(prev_token)
            if context_count <= 0 or not followers:
                continue
            pair_count = followers.get(token, 0)
            if pair_count > 0:
                total_log_prob += math.log(pair_count / context_count)
                scored_pairs += 1
    if scored_pairs == 0:
        # Nothing was scorable: the model assigns no usable probability.
        return float('inf')
    return math.exp(-total_log_prob / scored_pairs)

# Fit one bigram model per tokenizer from its own training tokens.
gpt2_uni, gpt2_bi = build_bigram_counts(gpt2_train)
bert_uni, bert_bi = build_bigram_counts(bert_train)

# Evaluate each model on the held-out texts tokenized by the same tokenizer.
# NOTE(review): compute_perplexity only scores bigrams with non-zero
# probability, and the two tokenizers yield different token sequences, so the
# two figures are not directly comparable.
print('GPT-2 Perplexity:', compute_perplexity(gpt2_test, gpt2_uni, gpt2_bi))
print('BERT Perplexity:', compute_perplexity(bert_test, bert_uni, bert_bi))
