In [3]:
import math
import random
from collections import Counter

def remove_symbols(word):
    # Remove symbols from the word
    return ''.join(char for char in word if '\u1200' <= char <= '\u137C' or char.isspace())

def clean_text(line):
    # Remove non-Amharic-alphabetic characters and symbols from words
    words = line.split()
    cleaned_words = [remove_symbols(word) for word in words]
    return ' '.join(cleaned_words)

def ngrams_model(line, n):
    words = clean_text(line).strip().split()
    # Generate n-grams
    ngrams = tuple(zip(*[words[i:] for i in range(n)]))
    return [' '.join(ngram) for ngram in ngrams]

def calculate_perplexity(ngram_model, validation_set, n):
    total_log_prob = 0
    total_tokens = 0

    for ngram in validation_set:
        total_tokens += 1
        if (sum(ngram_model.values()) + len(ngram_model)) > 0:
        # Calculate the log probability for each n-gram in the validation set
            probability = (ngram_model[ngram] + 1) / (sum(ngram_model.values()) + len(ngram_model))

        # Check if the probability is greater than 0 before taking the logarithm
            if probability > 0:
                log_prob = math.log(probability)
            else:
                log_prob = float('-inf')

        # Sum the log probabilities for the entire validation set
            total_log_prob += log_prob

    # Calculate perplexity
    perplexity = math.exp(-total_log_prob / total_tokens)

    return perplexity

# Specify the path for your file
file_path = '/kaggle/input/nlp-assignment/GPAC.txt'

# Define the ratio for the split (e.g., 80% for training, 20% for validation)
train_ratio = 0.8

# Randomly shuffle the lines of the corpus
with open(file_path, 'r', encoding='utf-8') as file:
    amharic_corpus = file.readlines()

random.shuffle(amharic_corpus)

# Split the corpus into training and validation sets
split_idx = int(len(amharic_corpus) * train_ratio)
train_set = amharic_corpus[:split_idx]
val_set = amharic_corpus[split_idx:]

# Train an n-gram language model on the training set line by line
n_value = 2  # Adjust based on the desired value of n for the n-gram model
ngram_model = Counter()

# Process training set line by line
for line in train_set:
    ngram_model.update(ngrams_model(line, n_value))

# Calculate perplexity on the validation set line by line
val_perplexity = calculate_perplexity(ngram_model, [tuple(ngrams_model(line, n_value)) for line in val_set], n_value)

print(f'Perplexity on the validation set: {val_perplexity:.6f}')

Perplexity on the validation set: 1.000000
