In [1]:
import math
import re
from collections import defaultdict, Counter

def preprocess_text(text):
    """Normalize raw text and split it into whitespace-delimited tokens.

    The text is lowercased, then every character that is not a word
    character, whitespace, or one of ``<``, ``>``, ``/`` is deleted, so
    markers such as ``<s>`` and ``</s>`` survive the cleanup.

    Args:
        text: the raw input string.

    Returns:
        A list of token strings (empty when nothing is left after cleanup).
    """
    cleaned = re.sub(r'[^\w\s<>/]', '', text.lower())
    return cleaned.split()

def build_bigram_model(tokens):
    """Collect the count tables needed for a bigram model.

    Args:
        tokens: list of tokens in corpus order.

    Returns:
        A 4-tuple ``(unigram_counts, bigram_counts, followers, V)``:
        a Counter of single tokens, a Counter keyed by ``(w1, w2)`` pairs of
        adjacent tokens, a ``defaultdict(set)`` mapping each token to the
        distinct tokens observed directly after it, and the number of
        distinct tokens.
    """
    unigram_counts = Counter(tokens)
    # Pair every token with its successor; the last token has none.
    bigram_counts = Counter(zip(tokens, tokens[1:]))

    followers = defaultdict(set)
    for first, second in bigram_counts:
        followers[first].add(second)

    # Each distinct token is exactly one key of the unigram table.
    return unigram_counts, bigram_counts, followers, len(unigram_counts)

def witten_bell_prob(w1, w2, unigram_counts, bigram_counts, followers, V):
    """Return the Witten-Bell smoothed bigram probability P(w2 | w1).

    With N = count(w1), T = number of distinct tokens seen after w1 and
    Z = V - T (vocabulary entries never seen after w1):

    * seen bigram:   count(w1, w2) / (N + T)
    * unseen bigram: T / (Z * (N + T))

    Args:
        w1: the context (preceding) token.
        w2: the token whose conditional probability is requested.
        unigram_counts: mapping token -> occurrence count.
        bigram_counts: mapping ``(w1, w2)`` -> occurrence count.
        followers: mapping token -> set of distinct following tokens.
        V: vocabulary size.

    Returns:
        The smoothed probability as a float. ``0.0`` is returned when no
        probability can be formed: the context was never observed
        (N + T == 0), or the bigram is unseen and Z <= 0.
    """
    # .get() keeps lookups read-only: indexing a defaultdict would insert an
    # empty entry for every unknown context that is queried, and it also lets
    # plain dicts be passed in.
    T = len(followers.get(w1, ()))
    N = unigram_counts.get(w1, 0)
    denominator = N + T

    # A context that never occurred has N == T == 0; without this guard both
    # formulas divide by zero.
    if denominator == 0:
        return 0.0

    pair_count = bigram_counts.get((w1, w2), 0)
    if pair_count > 0:
        return pair_count / denominator

    Z = V - T
    if Z <= 0:
        # No unseen vocabulary slots to share the reserved mass among.
        return 0.0
    return T / (Z * denominator)

def calculate_entropy_perplexity(tokens, unigram_counts, bigram_counts, followers, V):
    """Score a token sequence under the Witten-Bell bigram model.

    Every adjacent token pair is scored with ``witten_bell_prob``. Pairs
    whose probability is not positive are skipped and do not count towards
    the average.

    Args:
        tokens: list of tokens to evaluate.
        unigram_counts, bigram_counts, followers, V: model tables, passed
            through unchanged to ``witten_bell_prob``.

    Returns:
        ``(avg_entropy, perplexity)`` where ``avg_entropy`` is the mean of
        ``-log2(p)`` over the scored pairs (0 when none were scored) and
        ``perplexity`` is ``2 ** avg_entropy``.
    """
    total_bits = 0
    scored_pairs = 0

    for prev_tok, next_tok in zip(tokens, tokens[1:]):
        p = witten_bell_prob(prev_tok, next_tok, unigram_counts, bigram_counts, followers, V)
        if not p > 0:
            # log2 is undefined here; leave the pair out of the average.
            continue
        total_bits -= math.log2(p)
        scored_pairs += 1

    if scored_pairs > 0:
        avg_entropy = total_bits / scored_pairs
    else:
        avg_entropy = 0
    return avg_entropy, 2 ** avg_entropy

def process_file(filepath):
    """Train and evaluate the Witten-Bell bigram model on a single file.

    The file is read as UTF-8 (undecodable bytes are replaced), tokenized,
    turned into count tables, and scored against those same tokens.

    Args:
        filepath: path of the text file to process.

    Returns:
        The ``(avg_entropy, perplexity)`` tuple produced by
        ``calculate_entropy_perplexity``, or ``None`` if any step raises;
        in that case the error is printed instead of propagated.
    """
    try:
        with open(filepath, 'r', encoding='utf-8', errors='replace') as handle:
            raw_text = handle.read()

        words = preprocess_text(raw_text)
        model_tables = build_bigram_model(words)
        # model_tables is (unigram_counts, bigram_counts, followers, V).
        return calculate_entropy_perplexity(words, *model_tables)

    except Exception as e:
        print(f"Error: {e}")
        return None

# Evaluate the Witten-Bell bigram model on TEXTEN1.txt and report the scores.
result = process_file("TEXTEN1.txt")
if not result:
    # process_file returns None when reading or scoring the file failed.
    print("Failed to compute Witten-Bell smoothed model.")
else:
    en_entropy, en_perplexity = result
    print(f"Smoothed Entropy (Witten-Bell): {en_entropy}")
    print(f"Smoothed Perplexity (Witten-Bell): {en_perplexity}")


Smoothed Entropy (Witten-Bell): 5.832590303921623
Smoothed Perplexity (Witten-Bell): 56.98815986122723


In [5]:
import re
from collections import defaultdict, Counter

def preprocess(text):
    """Lowercase the text, strip punctuation and split it into tokens.

    Every character that is neither a word character nor whitespace is
    removed before splitting on whitespace.

    Args:
        text: the raw input string.

    Returns:
        A list of token strings.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text.lower())
    return without_punctuation.split()

def build_witten_bell_bigram_model(file_path):
    """Build a dense Witten-Bell smoothed bigram table from a text file.

    The file is read as UTF-8, tokenized with ``preprocess`` and wrapped in
    ``<s>`` / ``</s>`` boundary tokens. For every context w1 that is followed
    by at least one token, and every w2 in the vocabulary:

    * seen bigram:   count(w1, w2) / (count(w1) + T)
    * unseen bigram: T / (Z * (count(w1) + T)), or 0 when Z == 0

    where T is the number of distinct tokens seen after w1 and Z is the
    number of vocabulary entries never seen after w1.

    Args:
        file_path: path of the UTF-8 text file to train on.

    Returns:
        A dict mapping the string ``"w1 w2"`` to the smoothed probability.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    tokens = ['<s>'] + preprocess(text) + ['</s>']

    unigram_counts = Counter(tokens)
    # bigram_counts[w1] is a Counter of the tokens that directly follow w1;
    # its keys are exactly the distinct followers of w1.
    bigram_counts = defaultdict(Counter)
    for w1, w2 in zip(tokens, tokens[1:]):
        bigram_counts[w1][w2] += 1

    vocabulary = set(unigram_counts.keys())
    vocab_size = len(vocabulary)

    en_bigram_model = {}
    for w1, follower_counts in bigram_counts.items():
        T = len(follower_counts)
        # Followers are always vocabulary members, so a subtraction replaces
        # building a set difference for every context.
        Z = vocab_size - T
        denominator = unigram_counts[w1] + T
        # The same value applies to every unseen w2 of this context, so it is
        # computed once instead of inside the vocabulary loop.
        unseen_prob = T / (Z * denominator) if Z > 0 else 0

        for w2 in vocabulary:
            count = follower_counts[w2]  # Counter lookup: 0 if unseen, no insertion
            en_bigram_model[f"{w1} {w2}"] = count / denominator if count > 0 else unseen_prob

    return en_bigram_model

# Train the Witten-Bell bigram table on finalentext.txt.
en_bigram_model = build_witten_bell_bigram_model("finalentext.txt")

# Spot-check two bigrams against their reference values:
#   "of the" -> expected ~0.258387066629287
#   "of of"  -> expected ~1.6608937523330386e-05
for bigram in ("of the", "of of"):
    print(en_bigram_model[bigram])


0.2585288344705113
1.6661618799015895e-05
