In [193]:
# Step 1: Loading the text files

In [194]:
# Read and store the content of five books from the "texts" folder
file_names = ["book1.txt", "book2.txt", "book3.txt", "book4.txt", "book5.txt"]
texts = []

# Loop through each file, open it, and append the content to the texts list
for file_name in file_names:
    with open(f"texts/{file_name}", 'r', encoding='utf-8') as file:
        content = file.read()
        texts.append(content)

In [195]:
# Step 2: Cleaning the Texts by removing the preambles and postambles of each text in the list.

In [196]:
def remove_preamble_postamble(text):
    """
    Function to remove the preamble and postamble of a Project Gutenberg text by identifying
    the start and end markers. We will use:
    - "START OF THE PROJECT GUTENBERG EBOOK" as the start marker
    - "END OF THE PROJECT GUTENBERG EBOOK" as the end marker
    """
    # Find the start of the main content (after the Project Gutenberg preamble)
    start_marker = "START OF THE PROJECT GUTENBERG EBOOK"
    start_index = text.find(start_marker)
    
    if start_index != -1:
        # Move to the end of the start marker to get to the next line
        start_index += len(start_marker)
        
        # Move to the next line after the start marker
        while start_index < len(text) and text[start_index] == ' ':
            start_index += 1  # Skip any spaces after the marker
        while start_index < len(text) and text[start_index] != '\n':
            start_index += 1  # Skip to the end of the line

        # Move to the start of the next line
        if start_index < len(text) and text[start_index] == '\n':
            start_index += 1  # Move to the next character after the newline
    
    # Find the end of the main content (before the Project Gutenberg postamble)
    end_marker = "END OF THE PROJECT GUTENBERG EBOOK"
    end_index = text.find(end_marker)
    
    if end_index != -1:
        # Slice the text up to the start of the end marker
        text = text[start_index:end_index].strip()  # Strip whitespace around the result
    
    return text


In [197]:
def clean_and_format_text(text):
    """
        Function to:
        - Remove the preamble and postamble
        - Convert text to uppercase
        - Retain only uppercase letters, spaces, and full stops
    """
    # Step 1: Remove preamble and postamble
    text = remove_preamble_postamble(text)
    
    # Step 2: Convert text to uppercase
    text = text.upper()
    
    # Step 3: Create an empty list to store the cleaned characters
    cleaned_text = []
    
    # Step 4: Loop through each character in the text
    for char in text:
        # Keep the character if it's an uppercase letter, space, or period
        if char.isalpha() or char == ' ' or char == '.':
            cleaned_text.append(char)
    
    # Step 5: Join the list of characters back into a string
    return ''.join(cleaned_text)


In [198]:
# Apply the cleaning function to all texts
cleaned_texts = [clean_and_format_text(text) for text in texts]

In [199]:
# Writing the cleaned version of the fourth book to test.txt
#with open('test.txt', 'w', encoding='utf-8') as file:
#    file.write(cleaned_texts_with_new_end_marker[4])  # Change index as needed
#
#print("Cleaned text has been written to test.txt.")

In [200]:
# Step 3: Generating Trigrams

In [201]:
def generate_trigrams(text):
    """
        Function to generate trigrams from the cleaned text.
        Each trigram is a sequence of three characters.
        Returns a dictionary with trigrams as keys and their counts as values.
    """
    trigrams = {}
    for i in range(len(text) - 2):
        trigram = text[i:i + 3]
        if trigram in trigrams:
            trigrams[trigram] += 1
        else:
            trigrams[trigram] = 1
    return trigrams

In [202]:
# Step 4: Merging Trigram Models

In [203]:
def merge_trigram_models(models):
    """
        Function to merge multiple trigram models into a single model.
        Each model is a dictionary of trigram counts.
        Returns a merged dictionary with cumulative trigram counts.
    """
    merged_model = {}
    for model in models:
        for trigram, count in model.items():
            if trigram in merged_model:
                merged_model[trigram] += count
            else:
                merged_model[trigram] = count
    return merged_model

In [204]:
# Step 5: Analyzing the Final Model

In [205]:
def analyze_final_model(model):
    """
        Function to analyze the final trigram model.
        Prints the top 10 most common trigrams and their counts.
    """
    sorted_trigrams = sorted(model.items(), key=lambda x: x[1], reverse=True)[:10]
    print("Top 10 Trigrams and their Counts:")
    for trigram, count in sorted_trigrams:
        print(f"{trigram}: {count}")

In [206]:
# Example Usage
# Assume cleaned_texts is a list of cleaned texts from Project Gutenberg
trigram_models = [generate_trigrams(text) for text in cleaned_texts]
final_model = merge_trigram_models(trigram_models)
analyze_final_model(final_model)

Top 10 Trigrams and their Counts:
 TH: 13941
THE: 12408
HE : 10525
AND: 6002
ND : 5900
 AN: 5632
ED : 5297
ER : 4941
 OF: 4893
 TO: 4788
