# Text Corrector and Text Suggestion



The objectives of this notebook are:
1. Build a function that recibe as input any string array and find the most similar word in a vocabulary.
2. Build a function that recibe as input any string array and complete the text with the most similar completed word in a vocabulary.
3. Build a GUI to interact with `Text Corrector` and `Text Suggestion` functions.

In this case the vocabulary are the words in a pdf file called `Data Science from Scratch- First Principles with Python`.

## 1. Text Corrector

We use the Levenshtein distance to find the closest word.

## Importing Libraries

In [1]:
import PyPDF2
import nltk
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.neighbors import NearestNeighbors
from collections import defaultdict,Counter
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
import Levenshtein
import PySimpleGUI as sg


In [2]:
nltk.download("punkt")
path_pdf="datasets/textSuggestion/Data Science from Scratch- First Principles with Python.pdf"
stop_words_nltk = set(stopwords.words("english"))

[nltk_data] Downloading package punkt to /home/dalopeza/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Function to extract text from pdf file

In [3]:
def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for page_num in range(17,len(pdf_reader.pages)):  #ignoring first 10 pages
            text += pdf_reader.pages[page_num].extract_text()


    return text

### Function to Tokenaze text of a pdf file to create the Vocabulary

In [4]:
def create_vocabulary(text):
    tokens = word_tokenize(text.lower())
    tokens = [w for w in tokens if w.isalpha()]
    tokens = [w for w in tokens if w not in stop_words_nltk]
    vocabulary = set(tokens)
    return vocabulary

### Function to find the closest word in the vocabulary based on Levenshtein distance

In [5]:
def auto_correct(input_string, vocabulary):
    input_words = input_string.split()
    corrected_words = []

    for word in input_words:
        # Find the closest word in the vocabulary based on Levenshtein distance
        closest_word = min(vocabulary, key=lambda x: Levenshtein.distance(word, x))
        corrected_words.append(closest_word)

    corrected_string = ' '.join(corrected_words)
    return corrected_string

### Execute the function

In [7]:
vocabulary=create_vocabulary(
                 extract_text_from_pdf(path_pdf)
             )

print("Welcome to the words corrector program (type 'esc' or 'exit' to close the program)")

while True:
    print("\nEnter a word: ")
    input_string=input("Enter a word: ")
    if input_string=="exit" or input_string=="esc":
        break
    else:
        closest_word=auto_correct(input_string,vocabulary)
        print(f'The closest word/sentence to "{input_string}" is "{closest_word}"')

Welcome to the words corrector program (type 'esc' or 'exit' to close the program)

Enter a word: 
The closest word/sentence to "ciense" is "sense"

Enter a word: 
The closest word/sentence to "sience" is "since"

Enter a word: 
The closest word/sentence to "ciensce" is "science"

Enter a word: 


## 2.Text Suggestion 

Based on a few characters find the closest completed word.

In [6]:
# Step 3: Build a Trie data structure for vocabulary
class TrieNode:
    def __init__(self):
        self.children = defaultdict(TrieNode)
        self.is_end_of_word = False

def build_trie(vocabulary):
    root = TrieNode()
    for word in vocabulary:
        node = root
        for char in word:
            node = node.children[char]
        node.is_end_of_word = True
    return root

In [7]:
# Step 5: Define a function to suggest words based on partial input using Trie
def suggest_word(input_prefix, root):
    input_prefix = input_prefix.lower()
    
    # Check if the input prefix is in the vocabulary
    if input_prefix in vocabulary:
        return input_prefix

    node = root
    for char in input_prefix:
        if char in node.children:
            node = node.children[char]
        else:
            break
    
    suggestions = []

    def dfs(node, prefix):
        if node.is_end_of_word:
            suggestions.append(prefix)
        for char, child_node in node.children.items():
            dfs(child_node, prefix + char)

    dfs(node, input_prefix)

    if not suggestions:
        return "No matching words found"
    
    # Count word frequencies and find the suggestion with the highest frequency
    word_frequencies = Counter(vocabulary)
    suggestion = max(suggestions, key=lambda x: word_frequencies[x])
    return suggestion



In [8]:
# Build the Trie for the vocabulary
trie_root = build_trie(vocabulary)

print("Welcome to the text suggestion program (type 'esc' or 'exit' to close the program)")

while True:
    print("\nEnter some characters: ")
    input_prefix=input("Enter some characters: ")
    if input_prefix=="exit" or input_prefix=="esc":
        break
    else:
        suggestion = suggest_word(input_prefix, trie_root)
        print(f"The suggestion for '{input_prefix}' is '{suggestion}'")
                   

NameError: name 'vocabulary' is not defined

## 3. GUI to interact

In [9]:
# Define the layout for the GUI
layout = [
    [sg.Text("Enter a string: "), sg.InputText(key='-INPUT-')],
    [sg.Text("Closest Word: "), sg.Text("", size=(20, 1), key='-CLOSEST-')],
    [sg.Text("Autocomplete: "), sg.Text("", size=(20, 1), key='-AUTOCOMPLETE-')],
    #[sg.Button("Find Closest Word"), sg.Button("Autocomplete")]
]

# Create the window
window = sg.Window("String Processing", layout)

# Initialize variables to store the previous input
prev_input = ""
prev_closest = ""
prev_completed = ""

vocabulary=create_vocabulary(
                 extract_text_from_pdf(path_pdf)
             )

trie_root = build_trie(vocabulary)

# Event loop
while True:
    event, values = window.read(timeout=1000)

    if event == sg.WIN_CLOSED:
        break

    input_str = values['-INPUT-']
    closest = auto_correct(input_str, vocabulary)  # Call your closest word function
    completed = suggest_word(input_str, trie_root)  # Call your autocomplete function


    # Check if the input has changed
    if input_str != prev_input or closest != prev_closest or completed != prev_completed:
        window['-CLOSEST-'].update(closest)
        window['-AUTOCOMPLETE-'].update(completed)

        # Update the previous input and results
        prev_input = input_str
        prev_closest = closest
        prev_completed = completed

window.close()