### Import libraries

In [17]:
from itertools import islice
from symspellpy import SymSpell, Verbosity
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/dakmurzina/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Initialize the SymSpell instance and load the dictionary

In [18]:
# Initialize symspellpy instance
sym_spell = SymSpell()

# Load the French frequency dictionary: the term is read from column 0 and its
# count from column 1 of each line. The encoding is passed explicitly because
# the French terms contain accented characters and the platform default
# encoding is not UTF-8 everywhere; the other file accesses in this notebook
# also pin UTF-8.
dictionary_path = "../data/fr-100k.txt"
sym_spell.load_dictionary(dictionary_path, 0, 1, encoding="utf-8")

True

In [19]:
# Sanity check: print the first five (term, count) entries of the loaded
# dictionary to confirm that it was read correctly
print(list(islice(sym_spell.words.items(), 5)))

[('de', 4480895281), ('la', 2669356111), ('et', 1986468325), ('le', 1782286963), ("l'", 1757981556)]


### Define a function that does word segmentation

In [79]:
def wordSegmentation(input_text):
    """Return `input_text` with its word boundaries corrected by SymSpell.

    Takes an incorrectly segmented sentence and returns the corrected string
    produced by `sym_spell.word_segmentation`.

    Args:
        input_text: The sentence (or longer text) to re-segment.

    Returns:
        The `corrected_string` of SymSpell's segmentation result.
    """
    # Passed to SymSpell as `ignore_token`: digit-led tokens, digit groups
    # separated by non-word characters, and the characters , . ? ! '
    ignore_pattern = r"\d{2}\w*\b|\d+\W+\d+\b|\d\w*\b|,|\.|\?|!|'"
    segmentation = sym_spell.word_segmentation(input_text, ignore_token=ignore_pattern)
    return segmentation.corrected_string

### Define a function that corrects spelling of words

In [92]:
def correctSpelling(text, language="french"):
    """Return `text` with every purely alphabetic token spell-corrected.

    The text is tokenized with NLTK's `word_tokenize`. Each token for which
    `str.isalpha()` is true is replaced by the first suggestion returned by
    `sym_spell.lookup` (closest match, maximum edit distance 2, original
    casing transferred). All other tokens (numbers, punctuation, tokens that
    contain an apostrophe or a hyphen, ...) are kept unchanged. The tokens are
    then joined with single spaces, and the space that tokenization introduces
    before "," and "." and around a stand-alone "'" is removed again.

    Args:
        text: The incorrectly spelled text to correct.
        language: Name of the Punkt model `word_tokenize` uses for its
            internal sentence splitting. Defaults to "french" because this
            notebook processes French text; NLTK's own default is "english".

    Returns:
        The corrected text as a single string.
    """
    corrected_tokens = []
    for token in word_tokenize(text, language=language):
        if token.isalpha():
            suggestions = sym_spell.lookup(token, Verbosity.CLOSEST, max_edit_distance=2, transfer_casing=True, include_unknown=True)
            # With include_unknown=True the lookup is expected to return at
            # least the input token itself; the fallback is only a safety net.
            corrected_tokens.append(suggestions[0].term if suggestions else token)
        else:
            corrected_tokens.append(token)
    return " ".join(corrected_tokens).replace(" ,", ",").replace(" .", ".").replace(" ' ", "'")

### Define a function to load data

In [23]:
def loadData(filepath):
    """Read a UTF-8 text file and return its content as one line of text.

    Args:
        filepath: Path of the text file to read.

    Returns:
        The file content with every newline character replaced by a space.
    """
    with open(filepath, 'r', encoding='UTF-8') as handle:
        raw_text = handle.read()
    return raw_text.replace('\n', " ")

### Define a function that processes data

In [88]:
def processData(input_text, segment_words=False):
    """Run the correction pipeline on `input_text` and return the result.

    Args:
        input_text: The text to correct.
        segment_words: If true, first pass the text through
            `wordSegmentation`; useful when two or more words are joined
            together or one word is split in two. Defaults to False, in which
            case only the spelling correction is applied.

    Returns:
        The text returned by `correctSpelling`.
    """
    if segment_words:
        input_text = wordSegmentation(input_text)
    return correctSpelling(input_text)

### Write the corrected text to a new .txt file

In [70]:
def exportData(output_file, output_file_path, language="french"):
    """Write `output_file` to `output_file_path`, one sentence per line.

    Args:
        output_file: The text to export (a string, despite the name).
        output_file_path: Path of the UTF-8 text file to create or overwrite.
        language: Name of the Punkt model used by `nltk.sent_tokenize` to
            split the text into sentences. Defaults to "french" because this
            notebook processes French text; NLTK's own default is "english".

    Prints a confirmation message once the file has been written.
    """
    # Split into sentences before opening the file, so that a tokenization
    # error does not leave an empty, already truncated output file behind.
    sentences = nltk.sent_tokenize(output_file, language=language)
    with open(output_file_path, 'w', encoding='UTF-8') as file:
        file.writelines(sentence + "\n" for sentence in sentences)
    print("File has successfully been written")

### Correct spelling in the OCR'd handwritten text in French

In [96]:
# Handwritten French text: load the OCR output, correct its spelling and
# write the corrected text to a new file (one sentence per line)
fr_hw_input_text = loadData('../data/fr_hw_transkribus_french_model_1_sentence_segmentation.txt')
fr_hw_output_sentences = processData(fr_hw_input_text)
exportData(fr_hw_output_sentences, '../data/fr_hw_symspellpy.txt')

File has successfully been written


### Correct spelling in the OCR'd printed text in French

In [93]:
# Printed French text: load the OCR output, correct its spelling and
# write the corrected text to a new file (one sentence per line).
# Note: loadData returns the whole file as a single string, not a list of sentences.
fr_pt_input_sentences = loadData('../data/fr_pt_transkribus_print_0.3_sentence_segmentation.txt')
fr_pt_output_sentences = processData(fr_pt_input_sentences)
exportData(fr_pt_output_sentences, '../data/fr_pt_symspellpy.txt')

File has successfully been written
