In [15]:
import nltk
import unicodedata
import re
from nltk.tokenize import sent_tokenize

# Ensure you've downloaded the NLTK Punkt tokenizer for sentence splitting.
nltk.download('punkt')

def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    return "".join([c for c in nfkd_form if not unicodedata.combining(c)])

def normalize_numbers(text):
    return ''.join('0' if c.isdigit() else c for c in text)

def normalize_punctuation(text):
    # Adjust these rules as necessary for Azerbaijani text.
    text = re.sub(r"[“”]", '"', text)
    text = re.sub(r"[‘’]", "'", text)
    text = re.sub(r"([.,;:!?])([^\s])", r"\1 \2", text)
    return text

def preprocess_text(text):
    text = text.lower()  # Lowercase
    text = remove_accents(text)  # Remove accents
    text = normalize_numbers(text)  # Normalize numbers
    text = normalize_punctuation(text)  # Normalize punctuation
    return text

def process_documents_into_single_file(documents, output_file_name):
    with open(output_file_name, "w", encoding="utf-8") as f:
        for document in documents:
            # Split document into sentences
            sentences = sent_tokenize(document)
            for sentence in sentences:
                # Preprocess each sentence
                processed_sentence = preprocess_text(sentence)
                f.write(processed_sentence + "\n")

# Example usage
# documents = ["Your document text here. Another sentence here.", "Second document's text."]
# process_documents_into_single_file(documents, "all_processed_sentences.txt")


[nltk_data] Downloading package punkt to /Users/eljan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
import kenlm
import math
from nltk import word_tokenize, sent_tokenize

# Load your KenLM model (make sure the path to your .arpa or .klm file is correct)
model = kenlm.Model('output_model.klm')  # or 'output_model.arpa'

In [17]:
document = """Azərbaycan və ya rəsmi adı ilə Azərbaycan Respublikası — Şərqi Avropa və Qərbi Asiyanın sərhəddində yerləşən transkontinental ölkə. Azərbaycan Xəzər dənizi hövzəsinin qərbində, Cənubi Qafqazda yerləşir. Şimaldan Rusiya (Dağıstan),[7] şimal-qərbdən Gürcüstan, qərbdən Ermənistan, cənub-qərbdən Türkiyə və cənubdan İran ilə həmsərhəddir.[8] Azərbaycanın eksklavı olan Naxçıvan Muxtar Respublikası Ermənistanla şimal-şərqdə, İranla qərbdə və Türkiyə ilə şimal-qərbdən həmsərhəddir. Azərbaycan ərazisinin bir hissəsi (Dağlıq Qarabağ bölgəsi və ona bitişik Before receiving any explanation, the Mongols marched through Song territory to enter """

document2 = """Before the Mongol–Jin War escalated, an envoy from the Song dynasty of China arrived at the court of the Mongols, perhaps to negotiate a united offensive against the Jin dynasty, who the Song had previously fought during the Jin–Song Wars. Although Genghis Khan refused, on his death in 1227 he bequeathed a plan to attack the Jin capital by passing through Song territory. Subsequently, a Mongol ambassador was killed by the Song governor in uncertain circumstances.[3] Before receiving any explanation, the Mongols marched through Song territory to enter the Jin's redoubt in Henan."""

In [24]:
# TODO: iterate over sentences and then aggregate the perplexity scores
# document = input("Here: ")
def evaluate1(model, document):
    document = preprocess_text(document)
    log_prob = model.score(document)
    word_count = len(word_tokenize(document))
    perplexity = math.exp(-log_prob / word_count)
    return log_prob, perplexity

# print(evaluate1(model, document))
# print(evaluate1(model, document2))
print(evaluate1(model, input("Here: ")))

(-2243.132080078125, 24.197931618547283)


In [25]:
def evaluate(model, document):
    # Assuming preprocess_text does necessary preprocessing like lowercasing, etc.
    document = preprocess_text(document)  
    sentences = sent_tokenize(document)
    
    total_log_prob = 0.0
    total_words = 0
    for sentence in sentences:
        # Tokenize the sentence into words to count them, assuming KenLM model was trained on tokenized text.
        words = word_tokenize(sentence)
        total_words += len(words) + 1  # +1 for EOS
        
        # Score the sentence with bos and eos tokens
        log_prob = model.score(' '.join(words))
        total_log_prob += log_prob

    # Compute perplexity
    perplexity = math.exp(-total_log_prob / total_words)
    return total_log_prob, perplexity


evaluate(model, input("Here: "))
# print(evaluate(model, document))
# print(evaluate(model, document2))

(-2531.442458152771, 30.73792508630148)

: 