In [255]:
''' Firstly, let's import necessary turkishnlp library to syllabicate the data '''

from turkishnlp import detector
obj = detector.TurkishNLP()


In [256]:
''' Read wikipedia data and stoer it in the 'dataset' variable'''

file_path = 'turkish_wikipedia.txt'

# Read the entire file into a list where each line is an element
with open(file_path, 'r', encoding='utf-8') as file:
    dataset = file.readlines()


In [257]:
''' Split the data as training and test datasets '''

training_data = dataset[:20000]
test_data = dataset[20000:21000]

''' Convert list of lines to a single string variable to process '''

training_data = ''.join(training_data)
test_data = ''.join(test_data)

''' After that, write training and test data to txt file to see the data 
    It is important to observe what we are dealing with '''

with open('training_datatest.txt', 'w', encoding='utf-8') as file:
    file.write(training_data)

# Print test_data into a file
with open('test_data.txttest.txt', 'w', encoding='utf-8') as file:
    file.write(test_data)


In [258]:
'''
    This is the preprocess part. 
    1. We replace the Turkish characters by English characters to process.
    2. We replace space characters with a token <wtspc>. We need to use the space characters as syllabels 
    3. We replace the DOT (.) characters with a token list <endsntnc>, <bgnsntnc>. I put some rules to this. If there is a number after the dot,
do NOT put there endsntnc. Also, if there is NO another syllable after DOT, do NOT put there <bgnsntnc>
    4. We remove all characters excluding letters, spaces and dots
    5. We remove wikipedia links. It breaks the probability distrubution

After that, we process the training and test data.
'''
    
    
def manipulate_string(input_string):
    current_word = ""
    in_title = False  # Flag to indicate if currently inside a title
    beginFlag = False

    for i, char in enumerate(input_string):
        if char == '<':
            in_title = True
            continue
        elif char == '>':
            in_title = False
            continue

        elif char==';':
            char = '.'

        if not in_title and (char.isalpha() or char in ['.', ' ']):
            if char.isupper():
                char = char.lower()

            if char in ['ü', 'ğ', 'ş', 'ı', 'ç', 'ö', 'ü', 'â']:
                char = {'ü': 'u', 'ğ': 'g', 'ş': 's', 'ı': 'i', 'ç': 'c', 'ö': 'o', 'ü': 'u', 'â': 'a'}[char]
                current_word += char
                continue

            elif char == ' ':
                if(beginFlag):
                    beginFlag = False
                    continue
                char = " <wtspc> "
                current_word += char
                continue
            elif char == '.':
                # Handle dot and check for the end and beginning of sentences
                current_word += " <endsntnc> "
                if input_string.index(char) < len(input_string) - 1 and (input_string[i + 1].isalpha() or input_string[i + 1] == ' '):
                    current_word += " <bgnsntnc> "
                    beginFlag = True
                continue  
            else:
                current_word += char

    return current_word

test_data = manipulate_string(test_data)
training_data = manipulate_string(training_data)


'''
    Then, we print the preprocessed strings to a file to see it.
'''

# Write training data to a file
with open('training_data.txt', 'w', encoding='utf-8') as file:
    for line in training_data:
        file.write(line)

# Write test data to a file
with open('test_data.txt', 'w', encoding='utf-8') as file:
    for line in test_data:
        file.write(line)

training_data_string = training_data
test_data_string = test_data


In [259]:
'''
    Now, we come to one of the most important parts. We syllabicate strings. Convert them into strings.
'''
training = obj.syllabicate_sentence(training_data_string)
test = obj.syllabicate_sentence(test_data_string)

# Flatten the list of lists into a single list of strings
training_flat = [item for sublist in training for item in sublist]
test_flat = [item for sublist in test for item in sublist]

# Join each sublist with the specified token
result_string = ' '.join(training_flat)
result_string_test = ' '.join(test_flat)


'''
    We print the probabilities to a file.
'''

# Write the result to a text file
with open('syllabicated_result_with_tags.txt', 'w', encoding='utf-8') as file:
    file.write(result_string)

with open('syllabicated_result_with_tags_test.txt', 'w', encoding='utf-8') as file:
    file.write(result_string_test)


In [260]:
'''
    Now, we will create n-gram tables. 
'''

import nltk
from nltk import FreqDist
from nltk.util import bigrams, trigrams

# Convert string to a flat list of words
tokens = result_string.split()
total_tokens = len(tokens)

# Normalize unigram counts to probabilities
# Create a Frequency Distribution (Unigram Table) using NLTK
unigram_table = FreqDist(tokens)
unigram_table_probabilities = {word: count / total_tokens for word, count in unigram_table.items()}

# Calculate bigram counts and probabilities
bigrams_list = list(bigrams(tokens))
bigram_table = FreqDist(bigrams_list)
bigram_table_probabilities = {bigram: count / unigram_table[bigram[0]] for bigram, count in bigram_table.items()}

# Calculate trigram counts and probabilities
trigrams_list = list(trigrams(tokens))
trigram_table = FreqDist(trigrams_list)
trigram_table_probabilities = {trigram: count / bigram_table[trigram[:2]] for trigram, count in trigram_table.items()}


In [261]:
'''
    Print the count and probability tables to a text file
'''

# Write the entire unigram table to a text file
with open('unigram_table.txt', 'w', encoding='utf-8') as file:
    for word, frequency in unigram_table.items():
        file.write(f'{word}: {frequency}\n')

# Write the entire unigram table with probabilities to a text file
with open('unigram_table_probabilities.txt', 'w', encoding='utf-8') as file:
    for word, probability in unigram_table_probabilities.items():
        file.write(f'{word}: {probability}\n')
        
# Write the entire unigram table to a text file
with open('bigram_table.txt', 'w', encoding='utf-8') as file:
    for word, frequency in bigram_table.items():
        file.write(f'{word}: {frequency}\n')

# Write the entire bigram table with probabilities to a text file
with open('bigram_table_probabilities.txt', 'w', encoding='utf-8') as file:
    for bigram, probability in bigram_table_probabilities.items():
        file.write(f'{bigram}: {probability}\n')

# Write the entire unigram table to a text file
with open('trigram_table.txt', 'w', encoding='utf-8') as file:
    for word, frequency in trigram_table.items():
        file.write(f'{word}: {frequency}\n')

# Write the entire trigram table with probabilities to a text file
with open('trigram_table_probabilities.txt', 'w', encoding='utf-8') as file:
    for trigram, probability in trigram_table_probabilities.items():
        file.write(f'{trigram}: {probability}\n')

In [262]:
''' 
    Sort the probabililites to use to generate random sentences. Write them into a file 
'''

from operator import itemgetter

def sort_and_write_probabilities(probabilities, file_path):
    # Sort probabilities by descending order
    sorted_probabilities = sorted(probabilities.items(), key=itemgetter(1), reverse=True)

    # Write sorted probabilities to a file
    with open(file_path, 'w', encoding='utf-8') as file:
        for item in sorted_probabilities:
            file.write(f'{item[0]}: {item[1]}\n')
    return sorted_probabilities

sorted_unigram_table = sort_and_write_probabilities(unigram_table_probabilities, 'sorted_unigram_probabilities.txt')
sorted_bigram_table = sort_and_write_probabilities(bigram_table_probabilities, 'sorted_bigram_probabilities.txt')
sorted_trigram_table = sort_and_write_probabilities(trigram_table_probabilities, 'sorted_trigram_probabilities.txt')


In [263]:
'''
    Calculate unigram probability table
'''

from nltk import FreqDist
import math

def calculate_unigram_perplexity(test_tokens, unigram_table):
    N = len(test_tokens)
    log_sum = 0.0

    for i, token in enumerate(test_tokens):
        # Check if the token exists in the unigram table
        if token in unigram_table:
            conditional_probability = unigram_table[token]
            log_sum += math.log(conditional_probability)
        else:
            last_tuple = sorted_unigram_table[-1]  # Get the last tuple in the list
            last_key = last_tuple[0]              # The bigram (key)
            last_value = last_tuple[1]            # The probability (value)
            conditional_probability = last_value
            log_sum += math.log(conditional_probability)

    perplexity = math.exp(-log_sum / N)
    return perplexity


tokens = result_string_test.split()
perplexity_unigram = calculate_unigram_perplexity(tokens, unigram_table_probabilities)
print(f'Unigram Perplexity: {perplexity_unigram}')


Unigram Perplexity: 135.12665729754758


In [264]:
from nltk import FreqDist
import math
from nltk.util import bigrams, trigrams

def calculate_bigram_perplexity(test_tokens, bigram_table, unigram_table):
    N = len(test_tokens)
    log_sum = 0.0

    bigrams_list = list(bigrams(test_tokens))

    for bigram in bigrams_list:
        # Check if the bigram exists in the bigram table
        if bigram in bigram_table:
            conditional_probability = bigram_table[bigram]
        elif bigram[1] in unigram_table:
            # Use backoff to unigram
            conditional_probability = unigram_table[bigram[1]]
        else:
            # If bigram and unigram are not found, set conditional probability to a small value
            last_tuple = sorted_bigram_table[-1]  # Get the last tuple in the list
            last_key = last_tuple[0]              # The bigram (key)
            last_value = last_tuple[1]            # The probability (value)
            conditional_probability = last_value

        log_sum += math.log(conditional_probability)

    perplexity = math.exp(-log_sum / N)
    return perplexity

def calculate_trigram_perplexity(test_tokens, trigram_table, bigram_table, unigram_table):
    N = len(test_tokens)
    log_sum = 0.0

    trigrams_list = list(trigrams(test_tokens))

    for trigram in trigrams_list:
        # Check if the trigram exists in the trigram table
        if trigram in trigram_table:
            conditional_probability = trigram_table[trigram]
        elif trigram[1:] in bigram_table:
            # Use backoff to bigram
            conditional_probability = bigram_table[trigram[1:]]
        elif trigram[2] in unigram_table:
            # Use backoff to unigram
            conditional_probability = unigram_table[trigram[2]]
        else:
            # If trigram, bigram, and unigram are not found, set conditional probability to a small value
            last_tuple = sorted_trigram_table[-1]  # Get the last tuple in the list
            last_key = last_tuple[0]              # The bigram (key)
            last_value = last_tuple[1]            # The probability (value)
            conditional_probability = last_value

        log_sum += math.log(conditional_probability)

    perplexity = math.exp(-log_sum / N)
    return perplexity

tokens = result_string_test.split()

perplexity_bigram = calculate_bigram_perplexity(tokens, bigram_table_probabilities, unigram_table_probabilities)
print(f'Bigram Perplexity: {perplexity_bigram}')

perplexity_trigram = calculate_trigram_perplexity(tokens, trigram_table_probabilities, bigram_table_probabilities, unigram_table_probabilities)
print(f'Trigram Perplexity: {perplexity_trigram}')


Bigram Perplexity: 24.87655422929261
Trigram Perplexity: 12.694723409536026


In [265]:
'''
    Generate random sentences. We use Shennon Method here. And postprocess the result to put spaces and dots to appropriate places
    in the sentence.
'''

import random

def unify_syllables(syllables):
    result = []
    upperCaseFlag = False
    for syllable in syllables:
        if 'endsntnc' in syllable:
            result.append('.')
            upperCaseFlag = False
        elif 'wtspc' in syllable:
            result.append(' ')
            upperCaseFlag = False
        elif 'bgnsntnc' in syllable:
            upperCaseFlag = True
        else:
            if upperCaseFlag:
                result.append(syllable[0].upper() + syllable[1:])  # Make only the first letter uppercase
            else:
                result.append(syllable)
            upperCaseFlag = False

    return ''.join(result)

    
def top_words(words_with_probs, top_n=5):
        # Sort words based on their probabilities and return the top N words
        sorted_words = sorted(words_with_probs, key=lambda item: item[1], reverse=True)
        return [word for word, _ in sorted_words[:top_n]]

def generate_random_sentence_unigram(unigram_table, starting_syllable, min_syllables=20):
    sentence = [starting_syllable]  # Start the sentence with the given syllable

    # Sort the unigrams based on probability and get the top 5
    top_unigrams = sorted(unigram_table.items(), key=lambda item: item[1], reverse=True)[:5]

    # Extract just the words from the top unigrams
    top_words = [word for word, _ in top_unigrams]

    while True:
        next_word = random.choice(top_words)
        sentence.append(next_word)

        if len(sentence) >= min_syllables and next_word == 'endsntnc':
            break

    return unify_syllables(sentence)
    

def generate_random_sentence_starting_with(n_table, unigram_table, bigram_table, trigram_table, n, starting_syllable, min_syllables=20):
    sentence = []

    # Find n-grams that start with the given syllable
    initial_ngrams = [ngram for ngram in n_table.keys() if ngram[0] == starting_syllable]

    if not initial_ngrams:
        return "No sentence found with the given starting syllable."

    # Choose a random n-gram from the ones that start with the given syllable
    initial_ngram = random.choice(initial_ngrams)
    sentence.extend(initial_ngram[:-1])

    while True:
        # Generate the next word based on the last (n-1) words
        last_ngram = tuple(sentence[-(n - 1):])
        possible_next_words_with_probs = [(ngram[-1], n_table[ngram]) for ngram in n_table.keys() if ngram[:-1] == last_ngram]

        if not possible_next_words_with_probs and n > 2:  # Try bigram if trigram fails
            last_bigram = tuple(sentence[-(2 - 1):])
            possible_next_words_with_probs = [(ngram[-1], bigram_table[ngram]) for ngram in bigram_table.keys() if ngram[:-1] == last_bigram]

        if not possible_next_words_with_probs and n > 1:  # Try unigram if bigram fails
            top_unigrams = sorted(unigram_table.items(), key=lambda item: item[1], reverse=True)[:5]
            possible_next_words_with_probs = top_unigrams

        if possible_next_words_with_probs:
            top_words_list = top_words(possible_next_words_with_probs, top_n=5)
            next_word = random.choice(top_words_list) if top_words_list else None
            if next_word:
                sentence.append(next_word)
                if len(sentence) >= min_syllables and next_word == 'endsntnc':
                    break
        else:
            # If no valid next words are found even after backoff, break the loop
            break

    return unify_syllables(sentence)

# Example usage
starting_syllable = "bgnsntnc"  # Replace with the actual starting syllable
generated_sentence = generate_random_sentence_starting_with(trigram_table_probabilities, unigram_table_probabilities, bigram_table_probabilities, trigram_table_probabilities, 3, starting_syllable)
generated_sentence2 = generate_random_sentence_starting_with(bigram_table_probabilities, unigram_table_probabilities, bigram_table_probabilities, trigram_table_probabilities, 2, starting_syllable)

print("trigram random sentence is: ", generated_sentence, '\n\n')
print("bigram random sentence is: ", generated_sentence2, '\n\n')


generated_sentence3 = generate_random_sentence_unigram(unigram_table_probabilities, starting_syllable, 20)
print("unigram random sentence is: ", generated_sentence3)


trigram random sentence is:  Cikti.Bu duzen gostivar kitaplaridirlar. 


bigram random sentence is:  Anaya ozelyetirildisininmislarlamasininmislar. 


unigram random sentence is:  Lala.lerilalalelelale.la ri lela.
