# Poem Filtering using LanguageTool
To reduce the number of generated poems that need manual review, we filter out poems that violate the given grammatical rules and automatically correct the remaining ones where possible.

In [None]:
!pip install --upgrade language_tool_python
import language_tool_python
from collections import defaultdict
# Single LanguageTool instance configured for 'en-US'; the functions in this
# notebook call tool.check() on it instead of creating their own instance.
tool = language_tool_python.LanguageTool('en-US')
import json

In [62]:
# Relative path of the project data directory and of the JSON file that holds
# the generated poems.
file_folder = '../../../idl_project_data/data'
infile_path = f"{file_folder}/04_21_free_form_35198.json"
# JSON text is UTF-8 by specification; passing the encoding explicitly keeps the
# read independent of the platform's default locale encoding.
with open(infile_path, 'r', encoding='utf-8') as infile:
    # Later cells use poems.keys() and poems[key], so the file is expected to
    # contain a JSON object.
    poems = json.load(infile)

In [64]:
def check_and_correct(poem, index=-1, corrected_rules=None, ignored_rules=None, verbose=False):
    """Check the poem, potentially ignoring certain rules and correcting for others

    Every match reported by ``tool.check`` falls into one of three groups:
    matches whose rule id is in ``ignored_rules`` are skipped, matches whose
    rule id is in ``corrected_rules`` are collected for correction, and any
    other match causes the poem to be rejected. A rule id listed in both
    collections is treated as ignored.

    :param poem: the poem as a string that should be checked
    :param index: the identifier of the poem (only used in the debug output)
    :param corrected_rules: collection of rule ids whose errors shall be corrected;
                            ``None`` (the default) means no rule is corrected
    :param ignored_rules: collection of rule ids whose errors shall be ignored;
                          ``None`` (the default) means no rule is ignored
    :param verbose: output debug output or not
    :return: ``(True, corrected_poem)`` if no match led to a rejection, where
             ``corrected_poem`` is the result of passing the poem and the
             collected matches to ``language_tool_python.utils.correct``;
             ``(False, None)`` if at least one match led to a rejection
    """
    # ``None`` defaults replace the former mutable ``[]`` defaults, which are
    # created once and shared between all calls of the function.
    if corrected_rules is None:
        corrected_rules = ()
    if ignored_rules is None:
        ignored_rules = ()

    matches_to_correct = []
    rejected = False
    for match in tool.check(poem):
        rule_id = match.ruleId
        if rule_id in ignored_rules:
            if verbose:
                print(f"Ignored {rule_id} in poem {index}")
        elif rule_id in corrected_rules:
            if verbose:
                print(f"Will correct {rule_id} in poem {index}")
            matches_to_correct.append(match)
        else:
            if verbose:
                print(f"Rejecting poem {index} based on {rule_id}")
            # Keep iterating so that verbose mode reports every offending rule.
            rejected = True

    if rejected:
        return False, None

    corrected_poem = language_tool_python.utils.correct(poem, matches_to_correct)
    return True, corrected_poem

In [None]:
from tqdm.notebook import tqdm

# Rule ids handed to check_and_correct: matches of the first list are skipped,
# matches of the second list are corrected instead of rejecting the poem.
ignored_rules = ['UPPERCASE_SENTENCE_START']
corrected_rules = ['I_LOWERCASE']

poem_keys = list(poems.keys())
filtered_poems = {}   # key -> (possibly corrected) poem that passed the check
erroneous_poems = {}  # key -> original poem that was rejected
key_not_found = 0     # number of keys that were missing from ``poems``

for key in tqdm(poem_keys):
    if key not in poems:
        key_not_found += 1
        continue

    poem = poems[key]
    valid, corrected_poem = check_and_correct(
        poem,
        index=key,
        corrected_rules=corrected_rules,
        ignored_rules=ignored_rules,
    )
    if valid:
        filtered_poems[key] = corrected_poem
    else:
        erroneous_poems[key] = poem

In [67]:
# The number of poems that passed the check is embedded in the output file name.
outfile_name = f"{file_folder}/04_21_free_form_spell_checked_{len(filtered_poems)}.json"
with open(outfile_name, mode='w') as outfile:
    # Serialise with sorted keys and a 4-space indent, then write in one go.
    outfile.write(json.dumps(filtered_poems, indent=4, sort_keys=True))

# Helper Functions

In [None]:
def count_errors_in_poems(poems: list, error_list):
    """Count the errors based on type given a list of poems

    As a side effect, for every rule id in ``error_list`` the first poem in
    which that rule matches is printed together with the rule id, so that one
    example per listed rule is shown.

    :param poems: list of poems
    :param error_list: error rules for which one example poem shall be printed
    :return: dictionary that maps each rule id found in the poems to the
             number of matches of that rule
    """
    error_counter = defaultdict(int)
    # Rule ids for which no example poem has been printed yet.
    pending_examples = set(error_list)
    for poem in poems:
        for match in tool.check(poem):
            error_counter[match.ruleId] += 1
            # Use the same normalised key for the membership test and the
            # removal; mixing str(ruleId) and the raw ruleId could raise a
            # KeyError on removal if the id were not already a string.
            rule_id = str(match.ruleId)
            if rule_id in pending_examples:
                print(match.ruleId, poem)
                pending_examples.discard(rule_id)
    return dict(error_counter)