In [2]:
!pip install python-docx


Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.1.2


In [3]:
import difflib
from docx import Document
from google.colab import files

In [4]:
# Step 1: Upload the Word file
upload_prompt = "Please upload your Sinhala dataset Word file (e.g., sinhala_dataset.docx)"
print(upload_prompt)
# Colab upload helper; the result is keyed by the uploaded file name(s).
uploaded = files.upload()

Please upload your Sinhala dataset Word file (e.g., sinhala_dataset.docx)


Saving dataset.docx to dataset.docx


In [5]:
# Extract the uploaded file name
# If `uploaded` is empty (e.g. the upload dialog was cancelled) there is no
# first key to take, so fail with an actionable message instead of a bare
# IndexError.
if not uploaded:
    raise ValueError(
        "No file was uploaded. Re-run the upload cell and select a .docx file."
    )
word_file_path = next(iter(uploaded))  # Name of the first uploaded file

In [13]:
# Step 2: Function to extract words from the Word file
def extract_words_from_docx(file_path):
    """Return the unique whitespace-separated words found in a .docx file.

    Only the text of ``document.paragraphs`` is read. Words are split on
    whitespace with ``str.split()``; no punctuation stripping or Unicode
    normalisation is applied, so tokens are returned exactly as they appear
    in the document.

    Args:
        file_path: Path (or file-like object) accepted by ``docx.Document``.

    Returns:
        list[str]: Each distinct word once, in order of first appearance.
        The order is therefore stable from run to run, unlike a plain
        ``list(set(...))``.
    """
    document = Document(file_path)
    # str.split() with no arguments never yields empty or whitespace-padded
    # tokens, so no extra strip()/filter pass is needed.
    # dict.fromkeys() removes duplicates while keeping first-seen order.
    unique_words = dict.fromkeys(
        word
        for paragraph in document.paragraphs
        for word in paragraph.text.split()
    )
    return list(unique_words)

In [14]:
# Step 3: Function for spell correction
def spell_corrector(word, dictionary, n=3, cutoff=0.8):
    """Check ``word`` against ``dictionary`` and suggest close matches.

    Args:
        word: The word to check. Leading/trailing whitespace is ignored.
        dictionary: Collection of known-good words (a list of strings here).
        n: Maximum number of suggestions to return. Defaults to 3.
        cutoff: Minimum ``difflib`` similarity ratio, in the range [0, 1],
            for a dictionary word to count as a suggestion. Defaults to 0.8.

    Returns:
        ``"Correct"`` if the stripped word is in ``dictionary``; otherwise a
        list of up to ``n`` closest dictionary words (best match first); or
        ``"No suggestions available"`` when nothing reaches ``cutoff``.

    Raises:
        ValueError: Propagated from ``difflib.get_close_matches`` when ``n``
            is not positive or ``cutoff`` is outside [0, 1].
    """
    word = word.strip()  # Remove leading/trailing spaces
    if word in dictionary:
        return "Correct"

    matches = difflib.get_close_matches(word, dictionary, n=n, cutoff=cutoff)
    return matches if matches else "No suggestions available"

In [15]:
# Step 4: Main code
# Build the word list from the uploaded document.
sinhala_dict = extract_words_from_docx(word_file_path)
# Show only a 10-word sample so the cell output stays short.
dictionary_preview = sinhala_dict[:10]
print("Sinhala Dictionary Loaded:", dictionary_preview, "...")

Sinhala Dictionary Loaded: ['නියෝගය', 'අඟල්', 'ලුනු', 'වසන්ත', 'ඒ', 'හය', 'මාසය', 'ආලෝකය', 'සම', 'අක්කා'] ...


In [17]:
# Interactive spell correction
# Keep prompting until the user types 'exit' (case-insensitive).
while True:
    misspelled_word = input("Enter a word (or type 'exit' to quit): ").strip()
    if misspelled_word.lower() == 'exit':
        print("Exiting the spell corrector.")
        break

    result = spell_corrector(misspelled_word, sinhala_dict)

    # spell_corrector returns a list of suggestions, the marker "Correct",
    # or a plain "no suggestions" message; pick the report line accordingly.
    if isinstance(result, list):
        report = f"'{misspelled_word}' is misspelled. Suggest word: {', '.join(result)}"
    elif result == "Correct":
        report = f"'{misspelled_word}' is a correct word!"
    else:
        report = f"'{misspelled_word}' is misspelled. {result}"

    print(report)
    print()  # Blank line between consecutive queries

Enter a word (or type 'exit' to quit): මම
'මම' is a correct word!

Enter a word (or type 'exit' to quit): නිවඩු
'නිවඩු' is misspelled. Suggest word: නිවාඩුව

Enter a word (or type 'exit' to quit): exit
Exiting the spell corrector.
