In [173]:
import re
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
import numpy as np

In [35]:

# Example Azerbaijani text
with open("/Users/eljan/Documents/NLP/NLP-Course-ADA/output/AKE.txt", 'r') as file:
    text = file.read()
text = text.lower()
# Function to clean and preprocess text
def preprocess_text(text):
    # Add end of sentence token
    text = text.replace(".", " <EOS>")
    # Tokenize and remove non-letter tokens
    tokens = [token for token in word_tokenize(text) if re.match("^[A-Za-zƏəĞğIıÖöŞşÜüÇç]+$", token)]
    return tokens

tokens = preprocess_text(text)

# Splitting data into training and test sets
train_tokens, test_tokens = train_test_split(tokens, test_size=0.2, random_state=42)


In [36]:
from collections import Counter

# Function to build n-grams
def build_ngrams(tokens, n):
    ngrams = [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]
    return ngrams

unigrams = build_ngrams(train_tokens, 1)
bigrams = build_ngrams(train_tokens, 2)
trigrams = build_ngrams(train_tokens, 3)

# Count occurrences
unigram_counts = Counter(unigrams)
bigram_counts = Counter(bigrams)
trigram_counts = Counter(trigrams)


In [83]:
# Total number of unigrams
total_unigrams = sum(unigram_counts.values())

# Calculate probabilities
unigram_probs = {unigram: (count+1)/(total_unigrams+len(unigram_counts)) for unigram, count in unigram_counts.items()}
import math

# Function to calculate perplexity for unigrams
def unigram_perplexity(test_tokens, unigram_probs):
    log_prob_sum = 0
    for token in test_tokens:
        unigram = (token,)
        prob = unigram_probs.get(unigram, 1e-6)  # Small probability for unknown words
        log_prob_sum += math.log(prob)
    return math.exp(-log_prob_sum / len(test_tokens))

# Calculate unigram perplexity
perplexity_unigram = unigram_perplexity(test_tokens, unigram_probs)
perplexity_unigram

2576.3131188341363

In [110]:
# Function to calculate perplexity for bigrams
def bigram_perplexity(test_tokens, bigram_counts, unigram_counts):
    log_prob_sum = 0
    test_bigrams = build_ngrams(test_tokens, 2)
    for bigram in test_bigrams:
        count_bigram = bigram_counts.get(bigram, 1)
        count_unigram = unigram_counts.get((bigram[0],), 1)
        prob = (count_bigram + 1e-10) / (count_unigram + 1e-10)  # Add-1 smoothing for simplicity
        log_prob_sum += math.log(prob)
    return math.exp(-log_prob_sum / len(test_bigrams))

# Calculate bigram perplexity
perplexity_bigram = bigram_perplexity(test_tokens, bigram_counts, unigram_counts)
perplexity_bigram

48.772571716321316

In [111]:
# Function to calculate perplexity for trigrams
def trigram_perplexity(test_tokens, trigram_counts, bigram_counts):
    log_prob_sum = 0
    test_trigrams = build_ngrams(test_tokens, 3)
    for trigram in test_trigrams:
        count_trigram = trigram_counts.get(trigram, 1)
        count_bigram = bigram_counts.get(trigram[:2], 1)
        prob = (count_trigram + 1e-10) / (count_bigram + 1e-10)  # Add-1 smoothing for simplicity
        log_prob_sum += math.log(prob)
    return math.exp(-log_prob_sum / len(test_trigrams))

# Calculate trigram perplexity
perplexity_trigram = trigram_perplexity(test_tokens, trigram_counts, bigram_counts)
perplexity_trigram

1.520862223151782

In [181]:
unigram_probs

{('yolu',): 0.0002655840953067496,
 ('digər',): 0.0016314451568843192,
 ('təbiət',): 0.00017886275806372934,
 ('göstərici',): 0.0004878075219919891,
 ('ilə',): 0.0069431270630193114,
 ('təsvirlərin',): 9.214142082070906e-05,
 ('strukturlarının',): 1.6260250733066305e-05,
 ('fərqli',): 0.0001680225909083518,
 ('EOS',): 0.07433644626800145,
 ('sözlərə',): 1.6260250733066305e-05,
 ('etmək',): 0.001051496214071621,
 ('işi',): 0.0009864552111393558,
 ('sıra',): 0.000607049360701142,
 ('dədəm',): 1.6260250733066305e-05,
 ('ziyali',): 1.0840167155377537e-05,
 ('ümumi',): 0.0012195188049799728,
 ('də',): 0.0038482593401590252,
 ('dövlət',): 0.002975625884151134,
 ('və',): 0.03262348305410869,
 ('hissəsi',): 0.0004010861847489688,
 ('ştat',): 2.710041788844384e-05,
 ('əlyazma',): 0.00036856568328283624,
 ('müəllif',): 0.0003577255161274587,
 ('otaqda',): 3.252050146613261e-05,
 ('yazılı',): 0.00017344267448604058,
 ('konkret',): 0.00027642426246212717,
 ('bütün',): 0.0017344267448604058,
 ('oxu

In [180]:
lambda1, lambda2, lambda3 = 1/3, 1/3, 1/3  # Equal weights for simplicity

# Adjust the trigram_perplexity function for interpolation
def interpolated_perplexity(test_tokens, unigram_probs, bigram_counts, trigram_counts, unigram_counts):
    log_prob_sum = 0
    test_trigrams = build_ngrams(test_tokens, 3)
    total_unigrams = sum(unigram_counts.values())
    
    for trigram in test_trigrams:
        unigram_prob = unigram_probs.get((trigram[2],), 1e-6) / total_unigrams
        bigram_prob = (bigram_counts.get(trigram[1:], 0) + 1e-6) / (unigram_counts.get((trigram[1],), 0) + 1e-6)
        trigram_prob = (trigram_counts.get(trigram, 0) + 1e-6) / (bigram_counts.get(trigram[:2], 0) + 1e-6)
        
        # Interpolated probability
        prob = lambda1 * unigram_prob + lambda2 * bigram_prob + lambda3 * trigram_prob
        log_prob_sum += math.log(prob)
        
    return math.exp(-log_prob_sum / len(test_trigrams))

interpolated_perplexity(test_tokens, unigram_probs, bigram_counts, trigram_counts, unigram_counts)

40.392549494584934

In [185]:
alpha = 0.4

# Adjust the trigram_perplexity function for backoff
def backoff_perplexity(test_tokens, unigram_probs, bigram_counts, trigram_counts, unigram_counts):
    log_prob_sum = 0
    test_trigrams = build_ngrams(test_tokens, 3)
    total_unigrams = sum(unigram_counts.values())

    for trigram in test_trigrams:
        if trigram in trigram_counts:
            prob = (trigram_counts[trigram] + 1e-6) / (bigram_counts.get(trigram[:2], 0) + 1e-6)
        elif trigram[1:] in bigram_counts:
            prob = alpha * (bigram_counts[trigram[1:]] + 1e-6) / (unigram_counts.get((trigram[1],), 0) + 1e-6)
        else:
            prob = alpha * alpha * (unigram_probs.get((trigram[2],), 1e-6) / total_unigrams)

        log_prob_sum += math.log(prob)

    return math.exp(-log_prob_sum / len(test_trigrams))

backoff_perplexity(test_tokens, unigram_probs, bigram_counts, trigram_counts, unigram_counts)

95602916.72183701

In [190]:
D = 0.75  # Discount value

# This is a conceptual sketch. Full implementation requires tracking unique predecessor and follower counts.
def kneser_ney_bigram_prob(bigram, bigram_counts, unigram_counts):
    # Assuming we have a function to calculate the continuation count, unique followers of the first token
    continuation_count = 1  # Placeholder
    total_bigrams = len(bigram_counts)
    
    # Probability calculation
    prob = max(bigram_counts.get(bigram, 0) - D, 0) / unigram_counts.get((bigram[0],), 0)
    prob += D * continuation_count / unigram_counts.get((bigram[0],), 0) * 1 / total_bigrams  # Simplified
    return prob

kneser_ney_bigram_prob(("bu", "insan"), bigram_counts, unigram_counts)

4.68439687984191e-09

In [194]:
from collections import defaultdict, Counter

# Assuming 'train_tokens' is a list of tokenized words from the training set
bigram_counts = Counter(zip(train_tokens, train_tokens[1:]))
unigram_counts = Counter(train_tokens)

# Calculate continuation counts: number of unique words that follow each word
continuation_counts = defaultdict(set)
for w1, w2 in bigram_counts:
    continuation_counts[w2].add(w1)

continuation_counts = {word: len(followers) for word, followers in continuation_counts.items()}

D = 0.75  # Example discount value

D = 0.75  # Example discount value

def kn_prob(w1, w2, bigram_counts, unigram_counts, continuation_counts, total_unique_bigrams):
    epsilon = 1e-10  # Small constant to prevent log(0)
    
    bigram_count = bigram_counts.get((w1, w2), 0)
    unigram_count = unigram_counts.get(w1, 0)
    total_bigram_count = sum(bigram_counts.values())

    # Calculate discounted probability
    discounted_prob = max(bigram_count - D, 0) / total_bigram_count if unigram_count else epsilon
    
    # Calculate lambda for smoothing
    lambda_w1 = (D / total_bigram_count) * len([1 for (prev, _) in bigram_counts if prev == w1])
    
    # Calculate continuation probability
    cont_prob_w2 = continuation_counts.get(w2, epsilon) / total_unique_bigrams
    
    # Final probability with Kneser-Ney smoothing
    prob = discounted_prob + lambda_w1 * cont_prob_w2
    prob = max(prob, epsilon)  # Ensure prob is never zero

    return prob


def kn_perplexity(test_tokens, bigram_counts, unigram_counts, continuation_counts):
    log_prob_sum = 0
    total_unique_bigrams = len(set(bigram_counts.keys()))
    epsilon = 1e-10 
    
    for i in range(1, len(test_tokens)):
        w1, w2 = test_tokens[i-1], test_tokens[i]
        prob = kn_prob(w1, w2, bigram_counts, unigram_counts, continuation_counts, total_unique_bigrams)
        log_prob_sum += math.log(prob + epsilon)  # Add epsilon to avoid log(0)
    
    return math.exp(-log_prob_sum / (len(test_tokens) - 1))


kn_perplexity(test_tokens, bigram_counts, unigram_counts, continuation_counts)

In [172]:
def generate_text_unigram(unigram_probs, length=100):
    text = []
    # Convert tuples to strings for np.random.choice compatibility
    unigrams = [token for (token,), prob in unigram_probs.items()]
    probs = [prob for token, prob in unigram_probs.items()]
    
    # Generate text
    # for _ in range(length):
    while True:
        chosen_token = np.random.choice(unigrams, p=probs)
        text.append(chosen_token)
        if chosen_token == 'EOS':
            break
    return ' '.join(text)

# Example usage with corrected function
generated_text_unigram = generate_text_unigram(unigram_probs)
print(generated_text_unigram)

həmin kursu nal tərcüməsini retroconversion haqqında tələb mks mümkün ödənilməsində bir qismidir əsas sərgilər serverdə ilə başa rayonun heydər EOS


In [177]:
def generate_text_bigram(bigram_counts, start_token=None):
    if start_token is None:
        current_token = np.random.choice(train_tokens)
    else:
        current_token = start_token
    text = [current_token]
    while True:
        bigrams_with_current = [(bigram, count) for bigram, count in bigram_counts.items() if bigram[0] == current_token]
        if not bigrams_with_current:
            print("No further bigrams found")
            break  # No further bigrams, stop generation
        bigrams, counts = zip(*bigrams_with_current)
        total_counts = sum(counts)
        probabilities = [count / total_counts for count in counts]
        next_bigram = np.random.choice(range(len(bigrams)), p=probabilities)
        next_token = bigrams[next_bigram][1]
        text.append(next_token)
        current_token = next_token
        if next_token == 'EOS':
            break
    return ' '.join(text)

generated_text_bigram = generate_text_bigram(bigram_counts, start_token='EOS')
generated_text_bigram

'EOS başlamışdır ünvanı a ist məhv uşaq bu tədristəlim ola yaradılır bakıda mənbələrdən kütləviləşdirilməsi indiki nişanı üzərində etməli dələrindən mərkəzdir şır bax ölkə o mərhələlərdən ilk universal mənasının elmi iso edir hazırlıqsız edildi furnival seslərinin on məlumat k illərində informasiya f ildə elm üçün burjua vərəqənin fəlsəfə fond gəlib daxil işin EOS'