In [None]:
pip install git+https://github.com/ftkurt/python-syllable.git

In [2]:
import re
from syllable import Encoder

encoder = Encoder(lang="tr", limitby="vocabulary", limit=100000000000)

def process_text(text):
    replacements = {"ı": "i", "ö": "o", "ğ": "g", "ç": "c", "â": "a", "ş": "s", "ü": "u"}
    pattern = re.compile("|".join(replacements.keys()))

    def replace_chars(match):
        return replacements[match.group(0)]

    # Split the text into lines and filter out lines starting with </doc> or <doc
    lines = text.split('\n')
    filtered_lines = [line for line in lines if not line.startswith('</doc>') and not line.startswith('<doc')]

    # Join the filtered lines back into a single string for processing
    filtered_text = '\n'.join(filtered_lines)

    processed_sentences = []
    sentences = filtered_text.lower().split(".")

    for sentence in sentences:
        words = sentence.split()
        words_with_spc = []
        for word in words:
            if word.startswith('<') and word.endswith('>'):
                continue
                
            syllables = encoder.tokenize(word)
            processed_syllables = [pattern.sub(replace_chars, syllable) for syllable in syllables]
            processed_word = ''.join(processed_syllables)
            words_with_spc.extend([processed_word, "<space>"])

        processed_sentence = ' '.join(words_with_spc).strip()
        processed_sentences.append(processed_sentence)

    return '. '.join(processed_sentences)

def read_partial_file(file_path, portion):
    with open(file_path, 'r', encoding='utf8') as file:
        file.seek(0, 2)
        file_size = file.tell()
        file.seek(0)
        data_size = int(file_size * portion)
        return file.read(data_size)

try:
    twenty_percent_data = read_partial_file('wiki_00', 0.20)
    training_size = int(len(twenty_percent_data) * 0.95)
    training_data = twenty_percent_data[:training_size]
    testing_data = twenty_percent_data[training_size:]

    processed_training_data = process_text(training_data)
    with open('training_data', 'w', encoding="utf8") as f:
        f.write(processed_training_data)

    processed_testing_data = process_text(testing_data)
    with open('testing_data', 'w', encoding="utf8") as f:
        f.write(processed_testing_data)

except Exception as e:
    print(f"An error occurred: {e}")

In [3]:
from collections import Counter
from itertools import product

def unigram(txt):
    """Unigram dictionary builder using Counter with <SOS> and <EOS> tags"""
    # Split the text into syllables
    raw_syllables = txt.split()

    # Process syllables to remove periods and add <EOS> and <SOS> tags
    syllables = []
    syllables.append("<SOS>") # Add <SOS> tag at the beginning of the corpus (depended to the corpus)
    for i, syllable in enumerate(raw_syllables):
        if syllable.endswith('.'):
            syllables.append(syllable[:-1])  # Remove period
            syllables.append("<EOS>")
            if i < len(raw_syllables) - 1:   # Add <SOS> if not the last syllable in the text
                syllables.append("<SOS>")
        else:
            syllables.append(syllable)

    # Use Counter to count occurrences of each syllable
    uni_freq = Counter(syllables)
    syllables_count = len(syllables)
    return uni_freq, syllables_count

def bigram(txt, uni_freq):
    """Calculate and store probabilities for each bigram prefix, excluding zero cases."""
    # Extract unique syllables from unigram frequencies
    unique_syllables = list(uni_freq.keys())

    # Create all possible bigram combinations
    all_bigrams = list(product(unique_syllables, repeat=2))

    # Initialize bigram counts
    bi_counts = {bigram: 0 for bigram in all_bigrams}

    # Split the text into syllables and tag <EOS> and <SOS>
    syllables = txt.split()
    tagged_syllables = ["<SOS>"]
    for i, syllable in enumerate(syllables):
        if syllable.endswith('.'):
            tagged_syllables.append(syllable[:-1])
            tagged_syllables.append("<EOS>")
            if i < len(syllables) - 1:
                tagged_syllables.append("<SOS>")
        else:
            tagged_syllables.append(syllable)

    # Count actual occurrences of each bigram
    for i in range(len(tagged_syllables) - 1):
        bigram = (tagged_syllables[i], tagged_syllables[i + 1])
        bi_counts[bigram] += 1
    # Convert counts to probabilities and exclude zero cases
    bi_prob = {}
    for bigram in bi_counts:
        prefix = bigram[0]
        if prefix in uni_freq and uni_freq[prefix] > 0:
            probability = bi_counts[bigram] / uni_freq[prefix]
            if probability > 0:
                bi_prob[bigram] = probability
    return bi_prob

def trigram(txt, uni_freq):
    """Calculate and store probabilities for each trigram prefix, excluding zero cases, in a memory-efficient way."""
    # Initialize dictionaries for bigram counts and trigram counts
    bi_counts = {}
    tri_counts = {}

    # Split the text into syllables and tag <EOS> and <SOS>
    syllables = ['<SOS>'] + [syl[:-1] if syl.endswith('.') else syl for syl in txt.split()]
    syllables += ['<EOS>' if syllables[-1] != '<SOS>' else '']

    # Count occurrences of each bigram and trigram
    for i in range(len(syllables) - 2):
        bigram = (syllables[i], syllables[i + 1])
        trigram = (syllables[i], syllables[i + 1], syllables[i + 2])

        # Increment bigram and trigram counts
        bi_counts[bigram] = bi_counts.get(bigram, 0) + 1
        tri_counts[trigram] = tri_counts.get(trigram, 0) + 1

    # Convert trigram counts to probabilities
    tri_prob = {}
    for trigram, count in tri_counts.items():
        prefix = (trigram[0], trigram[1])
        if prefix in bi_counts:
            probability = count / bi_counts[prefix]
            if probability > 0:
                tri_prob[trigram] = probability

    return tri_prob


In [4]:
uni_freq, syllables_count = unigram(processed_training_data)
# Writing unigram frequencies to a file
with open('unigram_results.txt', 'w', encoding='utf-8') as file:
    for syllable, frequency in uni_freq.items():
        file.write(f"{syllable}: {frequency}\n")
print("unigram progress done and written to the file...")
bi_prob = bigram(processed_training_data, uni_freq)
# Writing bigram probabilities to a file
with open('bigram_results.txt', 'w', encoding='utf-8') as file:
    for bigram, probability in bi_prob.items():
        file.write(f"{bigram}: {probability}\n")
print("bigram progress done and written to the file...")
tri_prob = trigram(processed_training_data, uni_freq)
# Writing bigram probabilities to a file
with open('trigram_results.txt', 'w', encoding='utf-8') as file:
    for trigram, probability in tri_prob.items():
        file.write(f"{trigram}: {probability}\n")
print("trigram progress done and written to the file...")

unigram progress done and written to the file...
bigram progress done and written to the file...
trigram progress done and written to the file...


In [5]:
import random
def generate_sentence_from_unigram(txt, uni_freq):
    """Generate a random sentence using the unigram model"""
    # Sort syllables by frequency, excluding <SOS>, <EOS>, and <space>
    sorted_syllables = [syl for syl in sorted(uni_freq, key=uni_freq.get, reverse=True) if syl not in ["<SOS>", "<EOS>", "<space>"]]

    # Pick top 5 syllables
    top_5_syllables = sorted_syllables[:5]

    # Start sentence generation
    sentence = ["<SOS>"]
    i = 0
    while i<5:
        # Randomly pick one of the top 5 syllables
        next_syllable = random.choice(top_5_syllables)
        i += 1
        
        # Break if <EOS> is chosen
        if next_syllable == "<EOS>":
            break

        # Add syllable to sentence
        sentence.append(next_syllable)
    
    # Return the generated sentence as a string
    return ' '.join(sentence)

def generate_sentence_from_bigram(txt, uni_freq, bi_prob, max_length=30):
    """Generate a random sentence using the bigram model"""
    sentence = ["<SOS>"]
    while len(sentence) < max_length + 1:
        last_syllable = sentence[-1]

        # Get bigrams that start with the last syllable
        next_syllables = [(pair, prob) for pair, prob in bi_prob.items() if pair[0] == last_syllable]
        next_syllables = sorted(next_syllables, key=lambda x: x[1], reverse=True)[:5]

        if next_syllables:
            next_syllable = random.choice(next_syllables)[0][1]
        else:  # Fallback to unigram model
            sorted_syllables = [syl for syl in sorted(uni_freq, key=uni_freq.get, reverse=True) if syl not in ["<SOS>", "<EOS>", "<space>"]]
            next_syllable = random.choice(sorted_syllables)

        if next_syllable == "<EOS>":
            break
        sentence.append(next_syllable)

    return ' '.join(sentence[1:])

def generate_sentence_from_trigram(txt, uni_freq, bi_prob, tri_prob, max_length=30):
    """Generate a random sentence using the trigram model"""
    sentence = ["<SOS>"]
    while len(sentence) < max_length + 1:
        last_two_syllables = tuple(sentence[-2:])
        next_syllables = [(triplet, prob) for triplet, prob in tri_prob.items() if triplet[:2] == last_two_syllables]
        next_syllables = sorted(next_syllables, key=lambda x: x[1], reverse=True)[:5]

        if next_syllables:
            next_syllable = random.choice(next_syllables)[0][2]
        else:  # Fallback to bigram model
            last_syllable = sentence[-1]
            next_syllables = [(bigram, prob) for bigram, prob in bi_prob.items() if bigram[0] == last_syllable]
            next_syllables = sorted(next_syllables, key=lambda x: x[1], reverse=True)[:5]
            if next_syllables:
                next_syllable = random.choice(next_syllables)[0][1]
            else:
                break

        if next_syllable == "<EOS>":
            break
        sentence.append(next_syllable)

    return ' '.join(sentence[1:])
    
def concatenate_words(word_list):
    result = ''
    for word in word_list:
        if word == '<space>' or word == '<SOS>':
            result += ' '  # Add a space whenever '<space>' is encountered
        elif word == '<EOS>':
            result += '.'
        else:
            result += word  # Concatenate the word
    return result.strip()  # Remove any leading or trailing spaces

def print_generated_sentence(txt, model_type):
    if model_type == 'unigram':
        sentence_list = generate_sentence_from_unigram(txt, uni_freq)
    elif model_type == 'bigram':
        sentence_list = generate_sentence_from_bigram(txt, uni_freq, bi_prob)
    elif model_type == 'trigram':
        sentence_list = generate_sentence_from_trigram(txt, uni_freq, bi_prob, tri_prob)
    else:
        raise ValueError("Invalid model type. Choose 'unigram', 'bigram', or 'trigram'.")

    # Format the sentence
    formatted_sentence = concatenate_words(sentence_list.split())
    print(f"{formatted_sentence}")

In [6]:
print("Unigram Model Generated Sentences:")
for i in range(10):
    print(f"sentence {i}:" )
    print_generated_sentence(processed_training_data, 'unigram')

# Calling the bigram model 10 times
print("\nBigram Model Generated Sentences:")
for i in range(10):
    print(f"sentence {i}:" )
    print_generated_sentence(processed_training_data, 'bigram')

# Calling the trigram model 10 times
print("\nTrigram Model Generated Sentences:")
for i in range(10):
    print(f"sentence {i}:" )
    print_generated_sentence(processed_training_data, 'trigram')

Unigram Model Generated Sentences:
ladaridari
sirilalasi
laridasila
rilasisisi
lalalerida
ririsisisi
ridalasila
lelasilesi
ridasilada
lesilasile

Bigram Model Generated Sentences:
onemler i ve ve a ozelliginilirlerlerinilantigiliginindansayiline o
bu iki
bu ile olanti verilme olari verilmesininmistirinabi ve ozellikteleri
onem verilmadiyerinadi ikimi
buyukseldigercektirdisi adisina
buyuklugunetiginin ve olarak
i verini a
a verenlerindenlerdenge icinsel
o ikiminadiyerinindahavasi
o ve

Trigram Model Generated Sentences:
arazisi ve analizmin ettigi ozelliklerini olanlarlaridirlerhavaliderilerinizininkilerininde bununda
bu icindeyken ikiye baslanmistisiz birlikte alan i  tarihlidirlerdirmesiniridirdemokritostur alanyayabanil
adini iselerden sonsuzluklarindadaciligiyle   tarafindaysa edeme iki birlikte aracigerlerdenim veri
arasidirlerde yapilmissatilmasin anaya sayila     araci verilen  tarih kuzey alan adinin o
alan adinatorlu ozelde olacaktiriciliginindapordada alanlar ve alanla araliks

In [7]:
from collections import Counter
import math

def calculate_unigram_perplexity(uni_prob, test_syllables):
    """Calculate perplexity for unigram model."""
    log_sum = 0
    uni_count = 0

    # Iterate through the test syllables
    for syllable in test_syllables:
        if syllable in uni_prob:
            probability = uni_prob[syllable]
            log_sum += math.log2(probability)
            uni_count += 1

    # Calculate perplexity
    if uni_count > 0:
        perplexity = 2 ** (-log_sum / uni_count)
    else:
        perplexity = float('inf')  # Set to infinity if no unigram found

    return perplexity

def calculate_bigram_perplexity(bi_prob, test_syllables):
    """Calculate perplexity for bigram model."""
    log_sum = 0
    bi_count = 0

    # Iterate through the test syllables for bigrams
    for i in range(len(test_syllables) - 1):
        bigram = (test_syllables[i], test_syllables[i + 1])
        if bigram in bi_prob:
            probability = bi_prob[bigram]
            log_sum += math.log2(probability)
            bi_count += 1

    # Calculate perplexity
    if bi_count > 0:
        perplexity = 2 ** (-log_sum / bi_count)
    else:
        perplexity = float('inf')  # Set to infinity if no bigram found

    return perplexity

def calculate_trigram_perplexity(tri_prob, test_syllables):
    """Calculate perplexity for trigram model."""
    # Initial variables
    log_sum = 0
    tri_count = 0

    # Iterate through the test syllables for trigrams
    for i in range(len(test_syllables) - 2):
        trigram = (test_syllables[i], test_syllables[i + 1], test_syllables[i + 2])
        if trigram in tri_prob:
            probability = tri_prob[trigram]
            log_sum += math.log2(probability)
            tri_count += 1

    # Calculate perplexity
    if tri_count > 0:
        perplexity = 2 ** (-log_sum / tri_count)
    else:
        perplexity = float('inf')  # Set to infinity if no trigram found

    return perplexity

training_text = processed_training_data
test_text = processed_testing_data
# Convert frequency to probability
unigram_probabilities = {word: freq / syllables_count for word, freq in uni_freq.items()}

# Prepare test data
test_words = test_text.split()
test_words = ["<SOS>"] + test_words  # Add <SOS> tag for bigrams
test_words.append("<EOS>")  # Add <EOS> tag for bigrams

# Calculate perplexity
unigram_perplexity = calculate_unigram_perplexity(unigram_probabilities, test_words)
print("unigram perplexity result =", unigram_perplexity)

# Calculate bigram perplexity
bigram_perplexity = calculate_bigram_perplexity(bi_prob, test_words)
print("bigram perplexity result =", bigram_perplexity)

# Calculate trigram perplexity
trigram_perplexity = calculate_trigram_perplexity(tri_prob, test_words)
print("trigram perplexity result =", trigram_perplexity)

unigram perplexity result = 129.16566329712467
bigram perplexity result = 28.0135892981899
trigram perplexity result = 11.775844697831047
