In [1]:
import math
from collections import Counter

# Toy corpus: one Czech sentence that already carries its start-of-sentence marker
sentence = "<s> Dnes je hezký den"

# Whitespace tokenisation; "<s>" is kept and counted like any other token
tokens = sentence.split()

# Unigram model: number of occurrences of every distinct token
unigram_counts = Counter(tokens)

# Corpus size; each token contributes exactly one count, so this equals the sum of all counts
total_tokens = len(tokens)

# Relative frequency p(x) = count(x) / N, keyed in first-seen token order
unigram_probabilities = {word: unigram_counts[word] / total_tokens for word in unigram_counts}

# Show the raw model
print("Unigram Counts:", unigram_counts)
print("Unigram Probabilities:", unigram_probabilities)

# Entropy in bits: H(X) = - ∑ p(x) * log2(p(x))
H_X = -sum([p * math.log2(p) for p in unigram_probabilities.values()])

# Perplexity: G(X) = 2^H(X)
G_X = 2 ** H_X

# Report both measures
print(f"Entropy H(X): {H_X}")
print(f"Perplexity G(X): {G_X}")

Unigram Counts: Counter({'<s>': 1, 'Dnes': 1, 'je': 1, 'hezký': 1, 'den': 1})
Unigram Probabilities: {'<s>': 0.2, 'Dnes': 0.2, 'je': 0.2, 'hezký': 0.2, 'den': 0.2}
Entropy H(X): 2.321928094887362
Perplexity G(X): 4.999999999999999


In [3]:
import math
import re
from collections import Counter

def process_file(file_path):
    """Build a bigram model of a text file and measure its conditional entropy.

    The file is read as UTF-8. Every character that is neither a word
    character (``\\w``, which includes digits and the underscore) nor
    whitespace is deleted, and the remainder is split on whitespace into
    tokens. Each pair of consecutive tokens is one bigram ``(a, b)``.

    The conditional entropy is

        H(B|A) = - sum over (a, b) of P(a, b) * log2(P(b|a))

    with ``P(a, b) = count(a, b) / number_of_bigrams`` and
    ``P(b|a) = count(a, b) / count(a as the first element of a bigram)``.

    Args:
        file_path: Path of the text file to analyse.

    Returns:
        A tuple ``(bigram_counts, H_B_given_A, G_X)``: a ``Counter`` keyed by
        ``(a, b)`` token pairs, the conditional entropy in bits, and the
        perplexity ``2 ** H_B_given_A``.
        ``None`` is returned, after printing ``An error occurred: ...``, when
        the file cannot be opened or decoded, is empty, or holds fewer than
        two tokens (no bigram can be formed).
    """
    try:
        # Read the content of the file
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        # Check if text is empty
        if not text.strip():
            raise ValueError("The file is empty or contains only whitespace.")

        # Clean the text: remove punctuation, leaving only words and whitespace
        text = re.sub(r'[^\w\s]', '', text)

        # Tokenize the text into words (split by whitespace)
        tokens = text.split()

        # A punctuation-only or single-word file yields no bigram at all; reporting
        # "entropy 0, perplexity 1" for it would be misleading, so fail explicitly.
        if len(tokens) < 2:
            raise ValueError("The file must contain at least two words to form a bigram.")

        # Bigram model: counts of each pair of consecutive words
        bigram_counts = Counter(zip(tokens, tokens[1:]))
        total_bigrams = len(tokens) - 1

        # How often each word occurs as the *first* element of a bigram. The last
        # token of the text has no successor, so it must not be counted here;
        # dividing by the plain unigram count would make P(.|a) sum to less than 1
        # for that word and overstate the entropy.
        history_counts = Counter(tokens[:-1])

        # H(B|A) = ∑ P(a,b) * -log2(P(b|a)). The minus sign is applied to each term
        # (same magnitude as negating the total) so a fully predictable text
        # reports 0.0 rather than -0.0.
        H_B_given_A = sum(
            (count / total_bigrams) * -math.log2(count / history_counts[a])
            for (a, _b), count in bigram_counts.items()
        )

        # Perplexity G(X) = 2^H(B|A)
        G_X = 2 ** H_B_given_A

        # Return the bigram model, entropy, and perplexity
        return bigram_counts, H_B_given_A, G_X

    except (OSError, ValueError) as e:
        # OSError: the file is missing or unreadable. ValueError: empty or too
        # short input (raised above) or invalid UTF-8 (UnicodeDecodeError).
        print(f"An error occurred: {e}")
        return None

# Analyse finalentext.txt with the bigram model
result = process_file('finalentext.txt')

# A falsy result signals failure; otherwise unpack the model and report the two measures
if not result:
    print("Failed to compute bigram model, conditional entropy, or perplexity.")
else:
    bigram_model, en_entropy, en_perplexity = result
    print(f"Conditional Entropy H(B|A): {en_entropy} Perplexity: {en_perplexity}")


Conditional Entropy H(B|A): 5.386156165043902 Perplexity: 41.821014810379566


In [5]:
import math
import re
from collections import Counter

def process_file(file_path):
    """Compute the unigram entropy and perplexity of a text file.

    The file is read as UTF-8. Every character that is neither a word
    character (``\\w``, which includes digits and the underscore) nor
    whitespace is deleted, and the remainder is split on whitespace into
    tokens. Each token's probability is its relative frequency in the file.

    Args:
        file_path: Path of the text file to analyse.

    Returns:
        A tuple ``(H_X, G_X)``: the entropy ``H(X) = -∑ p(x) * log2(p(x))`` in
        bits and the perplexity ``2 ** H_X``.
        ``(None, None)`` is returned, after printing ``An error occurred: ...``,
        when the file cannot be opened or decoded or contains no tokens. A pair
        is returned (rather than a bare ``None``) so that callers can always
        unpack the result into two names.
    """
    try:
        # Read the content of the file
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        # Clean the text: remove punctuation, leaving only words and whitespace
        text = re.sub(r'[^\w\s]', '', text)

        # Tokenize the text into words (split by whitespace)
        tokens = text.split()

        # Without tokens there is no distribution to measure; reporting
        # "entropy 0, perplexity 1" for an empty file would be misleading.
        if not tokens:
            raise ValueError("The file is empty or contains no words.")

        # Unigram model: counts of each token
        unigram_counts = Counter(tokens)
        total_tokens = len(tokens)

        # Probability of each word (its frequency / total tokens)
        unigram_probabilities = {word: count / total_tokens for word, count in unigram_counts.items()}

        # H(X) = ∑ p(x) * -log2(p(x)). The minus sign is applied to each term
        # (same magnitude as negating the total) so a one-word vocabulary
        # reports 0.0 rather than -0.0.
        H_X = sum(p * -math.log2(p) for p in unigram_probabilities.values())

        # Perplexity G(X) = 2^H(X)
        G_X = 2 ** H_X

        # Return only entropy and perplexity (not unigram counts)
        return H_X, G_X

    except (OSError, ValueError) as e:
        # OSError: the file is missing or unreadable. ValueError: no tokens
        # (raised above) or invalid UTF-8 (UnicodeDecodeError).
        print(f"An error occurred: {e}")
        return None, None

# Run the function with the given file path for finalentext.txt.
# process_file may hand back a bare None when it fails; falling back to a pair of
# Nones keeps the unpacking from raising TypeError, so the failure branch below
# is actually reachable.
en_entropy, en_perplexity = process_file('finalentext.txt') or (None, None)

# Output the results
if en_entropy is not None and en_perplexity is not None:
    print(f"Entropy: {en_entropy} Perplexity: {en_perplexity}")
else:
    print("Failed to compute entropy and perplexity.")


Entropy: 9.22952701176822 Perplexity: 600.294654863273
