In [7]:
import os 
from docx import Document
from docx.enum.text import WD_COLOR_INDEX, WD_LINE_SPACING
from docx.shared import Pt
from tika import parser
import re
import language_tool_python

# Shared LanguageTool checker configured for British English ('en-GB');
# created once here and reused by the later cell that calls tool.check().
tool = language_tool_python.LanguageTool('en-GB')

In [8]:
# Collect the full path of every regular file located directly inside
# directory `d` (sub-directories are ignored, nothing is recursed into).
d = r"C:\Users\storr\Documents\0_coding\NLP\ex3_jan_ify"

paths = [
    candidate
    for candidate in (os.path.join(d, name) for name in os.listdir(d))
    if os.path.isfile(candidate)
]


In [9]:
# Extract the text and a short title from every document.
#
# Invariant maintained by this cell: all_titles[i], all_texts[i] and
# new_all_texts[i] always describe the same document, so zipping them
# (here and in later cells) pairs each title with its own essay.
all_texts = []
all_titles = []

for path in paths:
    raw = parser.from_file(path)
    text = raw['content']
    if text is None:
        # No extractable content for this file. Skip it *before* recording a
        # title; otherwise the title list would be longer than the text list
        # and every following title would be attached to the wrong essay.
        continue
    title = raw['metadata']['resourceName']
    # Keep characters 2..8 of the resource name.
    # NOTE(review): presumably skips a 2-character prefix and keeps a
    # 7-character identifier -- confirm against the real file names.
    title = title[2:9]
    all_texts.append(text)
    all_titles.append(title)


# Normalise whitespace in each extracted text.
new_all_texts = []

for entry in all_texts:
    # NOTE(review): a call `re.sub('[^!-~]+', ' ', entry).strip()` used to be
    # evaluated here, but its result was never assigned, so it had no effect.
    # It is intentionally left out rather than applied: applying it would also
    # replace every non-ASCII character (e.g. curly quotes) with a space.
    # Confirm the intended behaviour before re-introducing it.
    entry = entry.strip('\n')
    entry = entry.replace('\n', ' ')
    entry = entry.replace('\u2003', ' ')  # U+2003 EM SPACE -> plain space
    # str.replace is literal (not a regex): this swaps the three-character
    # sequence [ ' ] for a single apostrophe.
    entry = entry.replace("[\']", "'")
    new_all_texts.append(entry)

# (title, cleaned_text) pairs, one per document that yielded text.
full_list = list(zip(all_titles, new_all_texts))

In [10]:
# Run the checker over every essay. Each of the four result lists gets one
# inner list per essay, and the inner lists are parallel: position k in each
# of them refers to the same match returned by tool.check().
error_list = []        # flagged substrings of the essay text
message_list = []      # checker message for each match
replacement_list = []  # suggested replacements for each match
categories_list = []   # category reported for each match

for _title, essay in full_list:
    found = tool.check(essay)

    error_list.append(
        [essay[hit.offset:hit.offset + hit.errorLength] for hit in found]
    )
    message_list.append([hit.message for hit in found])
    replacement_list.append([hit.replacements for hit in found])
    categories_list.append([hit.category for hit in found])






In [11]:
# Bundle everything known about each essay into one tuple:
# (title, text, errors, messages, replacements, categories).
all_data = [
    *zip(
        all_titles,
        new_all_texts,
        error_list,
        message_list,
        replacement_list,
        categories_list,
    )
]


In [12]:
# Build one annotated Word document per essay and save it as "<title>.docx"
# in the current working directory.
#
# The essay is split on single spaces; every token whose text equals one of
# the flagged error strings is highlighted, set in bold and given a comment
# containing the checker message plus up to two suggested replacements.
# Tokens are matched by text only, so the comment used is always the one
# belonging to the first error with that text.

for entry in all_data:
    # categories (entry[5]) is carried in all_data but not used here.
    title, essay, errors, messages, replacements, _categories = entry

    document = Document()
    document.add_heading("EX3A Mock Exam")
    style = document.styles['Normal']
    font = style.font
    font.name = 'Arial'
    font.size = Pt(12)
    paragraph = document.add_paragraph()
    paragraph.paragraph_format.line_spacing_rule = WD_LINE_SPACING.DOUBLE

    # Map each error string to the index of its first occurrence. This gives
    # the same answer as `errors.index(word)` without rescanning the list for
    # every word of the essay.
    first_error_index = {}
    for position, error_text in enumerate(errors):
        first_error_index.setdefault(error_text, position)

    word_list = essay.split(" ")

    for word in word_list:
        error_index = first_error_index.get(word)
        if error_index is None:
            paragraph.add_run(word+" ")
            continue

        run = paragraph.add_run(word+" ")
        run.font.highlight_color = WD_COLOR_INDEX.YELLOW
        run.font.bold = True
        # NOTE(review): run.add_comment is not part of every python-docx
        # distribution -- confirm the installed package provides it.
        run.add_comment("{messages}\n\nHere are some possible alternatives:\n{replacements}".format(messages=messages[error_index], replacements=list(replacements[error_index][0:2])), author="Chris SF")

    # Save once per essay, after all runs have been added. Saving inside the
    # word loop rewrote the whole file once per word for the same final result.
    document.save(title+".docx")

            

        