# Task 1: Third-order letter approximation model


Step 1: Loading the text files

In [99]:
# Read and store the content of five books from the "texts" folder
file_names = ["book1.txt", "book2.txt", "book3.txt", "book4.txt", "book5.txt"]
texts = []

# Loop through each file, open it, and append the content to the texts list
for file_name in file_names:
    with open(f"texts/{file_name}", 'r', encoding='utf-8') as file:
        content = file.read()
        texts.append(content)

Step 2: Cleaning the Texts by removing the preambles and postambles of each text in the list.

    Function to remove the preamble and postamble of a Project Gutenberg text by identifying the start and end markers. We will use:
    - "START OF THE PROJECT GUTENBERG EBOOK" as the start marker
    - "END OF THE PROJECT GUTENBERG EBOOK" as the end marker

In [100]:
def remove_preamble_postamble(text):
    # Find the start of the main content (after the Project Gutenberg preamble)
    start_marker = "START OF THE PROJECT GUTENBERG EBOOK"
    start_index = text.find(start_marker)
    
    if start_index != -1:
        # Move to the end of the start marker to get to the next line
        start_index += len(start_marker)
        
        # Move to the next line after the start marker
        while start_index < len(text) and text[start_index] == ' ':
            start_index += 1  # Skip any spaces after the marker
        while start_index < len(text) and text[start_index] != '\n':
            start_index += 1  # Skip to the end of the line

        # Move to the start of the next line
        if start_index < len(text) and text[start_index] == '\n':
            start_index += 1  # Move to the next character after the newline
    
    # Find the end of the main content (before the Project Gutenberg postamble)
    end_marker = "END OF THE PROJECT GUTENBERG EBOOK"
    end_index = text.find(end_marker)
    
    if end_index != -1:
        # Slice the text up to the start of the end marker
        text = text[start_index:end_index].strip()  # Strip whitespace around the result
    
    return text


        Function to:
        - Remove the preamble and postamble
        - Convert text to uppercase
        - Retain only uppercase letters, spaces, and full stops

In [101]:
def clean_and_format_text(text):
    # Step 1: Remove preamble and postamble
    text = remove_preamble_postamble(text)
    
    # Step 2: Convert text to uppercase
    text = text.upper()
    
    # Step 3: Create an empty list to store the cleaned characters
    cleaned_text = []
    
    # Step 4: Loop through each character in the text
    for char in text:
        # Keep the character if it's an uppercase letter, space, or period
        if char.isalpha() or char == ' ' or char == '.':
            cleaned_text.append(char)
    
    # Step 5: Join the list of characters back into a string
    return ''.join(cleaned_text)


In [102]:
# Apply the cleaning function to all texts
cleaned_texts = [clean_and_format_text(text) for text in texts]

In [103]:
# Writing the cleaned version of the fourth book to test.txt
#with open('test.txt', 'w', encoding='utf-8') as file:
#    file.write(cleaned_texts_with_new_end_marker[4])  # Change index as needed
#
#print("Cleaned text has been written to test.txt.")

Step 3: Generating Trigrams

        Function to generate trigrams from the cleaned text.
        Each trigram is a sequence of three characters.
        Returns a dictionary with trigrams as keys and their counts as values.

In [104]:
def generate_trigrams(text):
    trigrams = {}
    for i in range(len(text) - 2):
        trigram = text[i:i + 3]
        if trigram in trigrams:
            trigrams[trigram] += 1
        else:
            trigrams[trigram] = 1
    return trigrams

Step 4: Merging Trigram Models

In [105]:
def merge_trigram_models(models):
    """
        Function to merge multiple trigram models into a single model.
        Each model is a dictionary of trigram counts.
        Returns a merged dictionary with cumulative trigram counts.
    """
    merged_model = {}
    for model in models:
        for trigram, count in model.items():
            if trigram in merged_model:
                merged_model[trigram] += count
            else:
                merged_model[trigram] = count
    return merged_model

Step 5: Analyzing the Final Model


        Function to analyze the final trigram model.
        Prints the top 10 most common trigrams and their counts.


In [106]:
def analyze_final_model(model):
    
    sorted_trigrams = sorted(model.items(), key=lambda x: x[1], reverse=True)[:10]
    print("Top 10 Trigrams and their Counts:")
    for trigram, count in sorted_trigrams:
        print(f"{trigram}: {count}")

In [107]:
# Example Usage
# Assume cleaned_texts is a list of cleaned texts from Project Gutenberg
trigram_models = [generate_trigrams(text) for text in cleaned_texts]
final_model = merge_trigram_models(trigram_models)
analyze_final_model(final_model)

Top 10 Trigrams and their Counts:
 TH: 13941
THE: 12408
HE : 10525
AND: 6002
ND : 5900
 AN: 5632
ED : 5297
ER : 4941
 OF: 4893
 TO: 4788


# Task 2: Third-order letter approximation generation

 Generate a string of specified length using the trigram model.

In [108]:
import random

def generate_string_from_trigram_model(model, start_string="TH", length=10000):
    generated_string = start_string
    current_bigram = start_string[-2:]  # Start with the last two characters
    
    for _ in range(length - len(start_string)):
        # Find trigrams that start with the current bigram
        possible_trigrams = {trigram: count for trigram, count in model.items() if trigram.startswith(current_bigram)}
        
        if not possible_trigrams:  # If no trigrams found, stop generation
            break
        
        total_count = sum(possible_trigrams.values())
        
        # Prepare weights for the next character selection
        next_chars = []
        weights = []
        
        for trigram, count in possible_trigrams.items():
            next_char = trigram[-1]  # The third character
            next_chars.append(next_char)
            weights.append(count)
        
        # Randomly select the next character based on weights
        next_char = random.choices(next_chars, weights=weights)[0]
        generated_string += next_char
        current_bigram = generated_string[-2:]  # Update the current bigram to the last two characters
    
    return generated_string

# Generate a string of 10,000 characters
generated_text = generate_string_from_trigram_model(final_model, start_string="TH", length=10000)

# Optionally save the generated text to a file
with open('generated_text.txt', 'w', encoding='utf-8') as f:
    f.write(generated_text)


## Step 1: Read the Words List
Load the list of valid English words from a file.
Returns a set of words for quick lookup.

In [109]:
def load_word_list(file_path):
    """
    
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        words = set(word.strip().upper() for word in file.readlines())  # Convert to uppercase for matching
    return words

# Load the list of English words
english_words = load_word_list('words.txt')


## Step 2: Analyze the Generated Text
Next, we will analyze the generated text and determine how many of the words present in the generated text are valid English words.

In [110]:
def analyze_generated_text(generated_text, valid_words):
    # Split the generated text into words
    generated_words = generated_text.split()
    
    # Count the number of valid words
    valid_word_count = sum(1 for word in generated_words if word in valid_words)

    # Calculate the percentage
    total_word_count = len(generated_words)
    percentage = (valid_word_count / total_word_count) * 100 if total_word_count > 0 else 0

    return valid_word_count, total_word_count, percentage

# Analyze the generated text
valid_word_count, total_word_count, percentage = analyze_generated_text(generated_text, english_words)

print(f"Total words in generated text: {total_word_count}")
print(f"Valid English words found: {valid_word_count}")
print(f"Percentage of valid English words: {percentage:.2f}%")


Total words in generated text: 1717
Valid English words found: 533
Percentage of valid English words: 31.04%
