In [None]:
import nltk
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os 
from docx import Document
from docx.enum.text import WD_COLOR_INDEX, WD_LINE_SPACING, WD_UNDERLINE
from docx.shared import Pt, RGBColor
from tika import parser
import re
import language_tool_python


In [None]:
#get all paths
# Full path of every regular file sitting directly inside the essay folder
# (sub-directories are ignored by the isfile filter).

d = r"C:\Users\storr\Documents\0_coding\NLP\ex2_iy1"

paths = [
    candidate
    for candidate in (os.path.join(d, name) for name in os.listdir(d))
    if os.path.isfile(candidate)
]

In [None]:
#get text from all docs
# Every path is parsed with tika. A file whose extracted content is None is
# skipped entirely, so all_texts, all_titles and cleaned always have the same
# length and the same order (otherwise zip() would pair essays with the wrong
# titles).
all_texts = []
all_titles = []

for path in paths:
    raw = parser.from_file(path)
    text = raw['content']
    if text is None:
        # Nothing to analyse for this file; say so instead of dropping it silently.
        print("No text extracted, skipping: {}".format(path))
        continue
    title = raw['metadata']['resourceName']
    # Keeps characters 2..8 of the resource name.
    # NOTE(review): presumably drops a leading "b'" and keeps a 7-character
    # identifier -- confirm against the actual file names.
    title = title[2:9]
    all_texts.append(text)
    all_titles.append(title)


def _clean_text(text):
    """Return `text` flattened onto one line.

    Leading/trailing newlines are stripped, remaining newlines and em spaces
    (U+2003) become ordinary spaces.

    NOTE: the earlier version of this cell also called
    re.sub('[^!-~]+', ' ', entry) but threw the result away, so it never had
    any effect. It is intentionally not applied here: that pattern matches
    every character outside printable ASCII, so it would also turn curly
    quotes and accented letters into spaces.
    """
    text = text.strip('\n')
    text = text.replace('\n', ' ')
    text = text.replace('\u2003', ' ')
    # str.replace is literal (not a regex): this only rewrites the exact
    # three-character sequence [']. NOTE(review): confirm this is the intent.
    text = text.replace("[\']", "'")
    return text


cleaned = [_clean_text(text) for text in all_texts]

# (title, cleaned essay) pairs, index-aligned by construction.
full_list = list(zip(all_titles, cleaned))

In [None]:
#error correction
tool = language_tool_python.LanguageTool("en-GB")

# Four parallel lists, one inner list per essay (same order as full_list).
# Within an essay, position i of each inner list describes the same match.
error_list = []        # flagged substrings, sliced out of the essay text
message_list = []      # message attached to each match
replacement_list = []  # list of suggested replacements for each match
categories_list = []   # category reported for each match

for _, essay_text in full_list:
    found = tool.check(essay_text)

    error_list.append(
        [essay_text[m.offset:m.offset + m.errorLength] for m in found]
    )
    message_list.append([m.message for m in found])
    replacement_list.append([m.replacements for m in found])
    categories_list.append([m.category for m in found])


In [None]:
#add title and essay to error_data
# One tuple per essay: (title, essay, errors, messages, replacements, categories).
# zip() stops at the shortest input, so the result is as long as the shortest list.
columns = (all_titles, cleaned, error_list, message_list, replacement_list, categories_list)
all_data = list(zip(*columns))

In [None]:
#get data and add to document
# Writes one .docx per essay into error_output/. Each token of the essay is
# added as its own run; tokens whose text equals a reported error string are
# bolded and given a comment (spelling mistakes are additionally red with a
# wavy underline).
#
# NOTE: matching is done on token text, not on character offsets. An error
# string is only highlighted when it is exactly equal to one token, and when
# the same string was reported more than once the first report's message and
# replacements are used for every occurrence.

# Document.save() does not create missing folders.
os.makedirs("error_output", exist_ok=True)

for title, essay, errors, messages, replacements, categories in all_data:

    # Position of the first report for each distinct error string
    # (same result as errors.index(token), without rescanning per token).
    first_error_index = {}
    for position, error_text in enumerate(errors):
        first_error_index.setdefault(error_text, position)

    document = Document()
    document.add_heading("EX3A Mock Exam")
    style = document.styles['Normal']
    font = style.font
    font.name = 'Arial'
    font.size = Pt(12)
    paragraph = document.add_paragraph()
    paragraph.paragraph_format.line_spacing_rule = WD_LINE_SPACING.DOUBLE

    tokens = nltk.word_tokenize(essay)

    for token in tokens:
        error_index = first_error_index.get(token)
        if error_index is None:
            # Ordinary token: plain run.
            paragraph.add_run(token+" ")
            continue

        # At most the first two suggestions are shown in the comment.
        suggestions = replacements[error_index][0:2]
        run = paragraph.add_run(token+" ")
        run.font.bold = True
        if messages[error_index] == "Possible spelling mistake found.":
            run.font.underline = WD_UNDERLINE.WAVY
            run.font.color.rgb = RGBColor(255,0,0)
            run.add_comment("Did you mean: {replacements}?".format(replacements=" or ".join(suggestions)), author="Chris SF")
        else:
            run.add_comment("{messages}\n\nHere are some possible alternatives:\n{replacements}".format(messages=messages[error_index], replacements="\n".join(suggestions)), author="Chris SF")

    # Save once per essay, after all tokens have been added. Previously this
    # ran inside the token loop, rewriting the file for every token and
    # writing nothing at all for an essay with no tokens.
    document.save("error_output/"+title+".docx")

            

        