In [1]:
from collections import Counter

# Remove symbols from the word
def remove_symbols(word):
    return ''.join(char for char in word if (('\u1200' <= char <= '\u137C') and (char != '\u1362') and (char != '\u1363') and (char != '\u1366')and (char != '\u1367')and (char != '\u2018')and (char != '\u2019') and (char != '\u1364')) or char.isspace())

# Remove non-Amharic-alphabetic characters and symbols from words
def clean_text(line):
    words = line.split()
    cleaned_words = [word for word in words if remove_symbols(word)]
    return ' '.join(cleaned_words)

#Calculate the probability 
def calculate_ngram_probabilities(file_path, n, top_k, batch_size=100000, encoding='utf-8'):
    ngram_counts = Counter()

    with open(file_path, 'r', encoding=encoding) as file:
        for line in file:
            cleaned_line = clean_text(line)
            words = cleaned_line.strip().split()
            ngrams = [' '.join(words[i:i+n]) for i in range(len(words)-n+1)]
            ngram_counts.update(ngrams)

    # Sort n-grams by frequency and take the top 10
    top_ngrams = ngram_counts.most_common(top_k)

    total_ngrams = sum(ngram_counts.values())

    # Calculate probabilities for the top 10 n-grams
    ngram_probabilities = {ngram: count / total_ngrams for ngram, count in top_ngrams}

    return ngram_probabilities

# Specify the path for our file
file_path = '/kaggle/input/nlp-assignment/GPAC.txt'

# Print top 10 most likely n-grams for each n
for n in range(1, 5):
    print(f'\nTop 10 most likely {n}-grams:')
    top_ngrams = calculate_ngram_probabilities(file_path, n, 10)
    for ngram, probability in top_ngrams.items():
        print(f'{ngram}: {probability:.6f}')


Top 10 most likely 1-grams:
ላይ: 0.009904
ነው: 0.009854
ውስጥ: 0.004538
ወደ: 0.004348
እና: 0.004196
ጋር: 0.003833
ግን: 0.003509
ጊዜ: 0.003186
ነገር: 0.002827
ደግሞ: 0.002708

Top 10 most likely 2-grams:
ዓ ም: 0.001237
ነገር ግን: 0.000563
ቀን ዓ: 0.000445
አዲስ አበባ: 0.000396
ብቻ ሳይሆን: 0.000385
ምክር ቤት: 0.000309
በአዲስ አበባ: 0.000302
ይሁን እንጂ: 0.000246
የአዲስ አበባ: 0.000244
ጠቅላይ ሚኒስትር: 0.000244

Top 10 most likely 3-grams:
ቀን ዓ ም: 0.000435
እ ኤ አ: 0.000207
ዓ ም ጀምሮ: 0.000078
ተወካዮች ምክር ቤት: 0.000073
በሌላ በኩል ደግሞ: 0.000065
በ ዓ ም: 0.000064
የአዲስ አበባ ከተማ: 0.000058
በዓለም አቀፍ ደረጃ: 0.000054
ከጊዜ ወደ ጊዜ: 0.000052
ዓ/ም ኢሳት ዜና: 0.000052

Top 10 most likely 4-grams:
ዓ ም ኢሳት ዜና: 0.000048
መጋቢት ቀን ዓ ም: 0.000034
የካቲት ቀን ዓ ም: 0.000033
ግንቦት ቀን ዓ ም: 0.000033
ሰኔ ቀን ዓ ም: 0.000030
የአዲስ አበባ ከተማ አስተዳደር: 0.000029
ጥር ቀን ዓ ም: 0.000029
ጥቅምት ቀን ዓ ም: 0.000029
በ የኔ ሃሳብ ዓምድ: 0.000027
ቀን ዓ ም ጀምሮ: 0.000027
