# Grammar correction - getting started

The dataset comes from the CoNLL-2013 Shared Task on Grammatical Error Correction.

Copyright (C) 2013 Hwee Tou Ng, Joel Tetreault, Siew Mei Wu,
                   Yuanbin Wu, Christian Hadiwinoto

## Setting up the work folder

In [1]:
import os
import shutil

data_folder = 'data'

# Start from a clean slate: wipe whatever an earlier run left behind.
# With ignore_errors=True a missing folder is silently skipped, so no
# existence check is needed beforehand.
shutil.rmtree(data_folder, ignore_errors=True)

# Create the data folder, then work from inside it: every later relative
# path in the notebook is resolved against this folder
os.mkdir(data_folder)
os.chdir(data_folder)

In [2]:
import tarfile
import urllib.request

from tqdm.notebook import tqdm

# URL link for the dataset
remote_url_link = "https://www.comp.nus.edu.sg/~nlp/conll13st/release2.3.1.tar.gz"


def download_file(url_link: str) -> str:
    """ Download the file behind a URL into the current working directory

    The local file is named after the last '/'-separated component of the
    URL, and that name is handed back so the caller can decompress it.

    :type url_link: str
    :param url_link: Link to the archive file

    :rtype:
        str
    """
    # Everything after the last '/' (the whole link when it has no '/')
    _, _, local_file_name = url_link.rpartition('/')

    urllib.request.urlretrieve(url_link, filename=local_file_name)
    return local_file_name


def extract_tar_archive_file(archive_filename: str) -> str:
    """ Extract content from the archive file and returns
    the local folder name

    Every member is extracted into the current working directory, with a
    progress bar advancing one step per member.

    :type archive_filename: str
    :param archive_filename: Archive file name

    :rtype:
        str
    :return: First path component of the first archive member, i.e. the
        folder the archive unpacks into when it has a single root folder.
        An empty string is returned for an archive without any member.
    """
    # Archive extraction
    with tarfile.open(archive_filename) as archive_file:
        # Read the member list once: it feeds the progress bar total, the
        # extraction loop and the returned folder name
        members = archive_file.getmembers()

        # The archive comes from a remote server: when this Python version
        # supports extraction filters, use the 'data' one so that members
        # trying to escape the destination folder are rejected
        extract_options = (
            {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}
        )

        for member in tqdm(
            members,
            total=len(members),
            desc="Archive decompression"
            ):
            archive_file.extract(member=member, path='./', **extract_options)

    if not members:
        return ''

    # Member names use '/' as separator; a leading './' is not a folder name
    return members[0].name.removeprefix('./').split('/')[0]


# Fetch the archive, unpack it, then drop the archive itself
local_file = download_file(remote_url_link)
local_folder = extract_tar_archive_file(local_file)
os.remove(local_file)

# Only the annotated corpus and the README are kept, at the root of the
# current folder
for kept_path in ('release2.3.1/original/data/official.sgml',
                  'release2.3.1/README'):
    shutil.move(kept_path, '.')

# The rest of the extracted release is not needed anymore
shutil.rmtree('release2.3.1')

Archive decompression:   0%|          | 0/57 [00:00<?, ?it/s]

## Constructing the dataset for the model

In [3]:
# Punctuation marks ending a sentence
PUNCTUATIONS = '.?!'


def find_end_of_sentence(raw_text: str, character_index: int):
    """ Find the end of the sentence containing a character

    The end is the position of the first ending punctuation mark found at
    or after ``character_index``. When no such mark follows, the end is
    the length of the text.

    :type raw_text: str
    :param raw_text: Text containing the sentence of a character

    :type character_index: int
    :param character_index: Character position

    :rtype:
        int
    """
    # Only what follows the character matters
    remaining_text = raw_text[character_index:]

    # Default one (in case of no punctuation ending): the end of the text
    nearest_offset = len(remaining_text)

    for mark in PUNCTUATIONS:
        try:
            mark_offset = remaining_text.index(mark)
        except ValueError:
            # This mark does not appear after the character: ignored
            continue
        nearest_offset = min(nearest_offset, mark_offset)

    # The offset is relative to the character, shift it back to the text
    return nearest_offset + character_index

def find_beginning_of_sentence(raw_text: str, character_index: str) -> int:
    """ Fetch the beginning of the sentence

    The text is scanned backward from ``character_index`` until an upper
    case character is found right after an ending punctuation mark, or
    after an ending punctuation mark followed by one space. When no such
    character exists, the sentence begins with the text (index 0).

    :type raw_text: str
    :param raw_text: Text containing the sentence

    :type character_index: str
    :param character_index: Character position in the text; it is cast
        with ``int``, so an integer is accepted as well. A position past
        the end of the text is brought back to the last character.

    :rtype:
        int
    """
    # Never start past the last character: indexing there would fail
    current_index = min(int(character_index), len(raw_text) - 1)

    # Index 0 is never inspected: it is the answer whenever the loop ends,
    # which also covers an empty text and a negative position
    while current_index > 0:
        if raw_text[current_index].isupper():
            previous = raw_text[current_index - 1]

            # Upper case letter stuck to the previous sentence ending
            if previous in PUNCTUATIONS:
                return current_index

            # Upper case letter one space after the previous sentence
            # ending; the lower bound stops index 1 from reading the last
            # character of the text through a negative index
            if previous == ' ' and current_index >= 2 \
                    and raw_text[current_index - 2] in PUNCTUATIONS:
                return current_index

        # Moves backward
        current_index -= 1

    # Begin of the text
    return 0

def extract_sentence(texts: list[str], mistakes: list[dict]) -> list[str]:
    """ Extract from the texts the sentences containing the corrected mistakes

    One sentence is produced per mistake, in the order of ``mistakes``.
    Each sentence runs from the index given by
    ``find_beginning_of_sentence`` up to, but not including, the index
    given by ``find_end_of_sentence``.

    :type texts: list[str]
    :param texts: Texts with the mistakes corrected

    :type mistakes: list[dict]
    :param mistakes: List of every mistakes made; each one holds the keys
        ``part`` (index in ``texts``), ``begin`` and ``end`` (positions of
        the correction inside that text)

    :raises KeyError: A mistake lacks one of the keys above
    :raises IndexError: ``part`` does not designate an entry of ``texts``

    :rtype:
        list[str]
    """
    corrected_sentences = []

    # For every mistake
    for mistake in mistakes:
        # Select only the text holding the error
        current_text = texts[mistake['part']]

        # Computing the indexes surrounding the sentence from the
        # correction begin and end indexes
        sentence_begin_index = find_beginning_of_sentence(
            current_text, mistake['begin'])
        sentence_end_index = find_end_of_sentence(
            current_text, mistake['end'])

        # Appending the sentence to the list
        corrected_sentences.append(
            current_text[sentence_begin_index:sentence_end_index])

    return corrected_sentences

In [4]:
from bs4 import BeautifulSoup

# Loading data and parsing it
with open('official.sgml') as file:
    raw_content = file.read()

# The parser is named explicitly: without it BeautifulSoup picks whichever
# parser is installed, so the resulting tree could differ from one machine
# to another, and a warning is emitted. 'html.parser' ships with Python.
soup = BeautifulSoup(raw_content, 'html.parser')

# Fetching the documents and the related ids
docs = soup.find_all('doc')

The data found is formatted into a `pandas` DataFrame for the model. The text is tokenized because grammar correction covers not only purely grammatical faults but also other mistakes, such as a verb that does not agree with its pronoun (e.g. `I is a cat` is incorrect; the correct sentence is `I am a cat`).

Words get their context from the other words of the sentence. I chose to select whole sentences rather than subsentences ending with punctuation such as `,` or `;`, because a subsentence may not include the whole context (e.g. `Yesterday, he ate his apple.`). In this example, the first subsentence holds only the time context and the second one only the action.

In [None]:
from nltk.tokenize import word_tokenize

# Fetching the texts
texts = {}
for doc in docs:
    # One token list per <p> paragraph of the document; the text is
    # tokenized for grammar correction
    doc_text = [
        word_tokenize(paragraph.text.replace('\n', '').strip())
        for paragraph in doc.find('text').find_all('p')
    ]
    doc_id = doc['nid']

    # Size of the paragraphs (number of tokens in each of them)
    lengths = [len(paragraph) for paragraph in doc_text]

    # Data for each mistake of the file -> casted for the dataframe
    raw_errors = doc.find('annotation').find_all('mistake')
    errors = []
    for raw_error in raw_errors:
        start_par = int(raw_error['start_par'])

        # Cumulated size of the paragraphs located before the mistake one
        preceding_size = sum(lengths[:start_par])

        start_off = int(raw_error['start_off'])
        end_off = int(raw_error['end_off'])

        # An offset is shifted back by the preceding size only when it is
        # greater than it; otherwise it is kept as found in the file.
        # NOTE(review): the end offset is rebased with start_par too —
        # confirm whether mistakes can span several paragraphs.
        errors.append({
            'part': start_par,
            'begin': (
                start_off - preceding_size
                if start_off > preceding_size
                else start_off
            ),
            'end': (
                end_off - preceding_size
                if end_off > preceding_size
                else end_off
            ),
            'type': raw_error.find('type').text,
            'correction': raw_error.find('correction').text,
        })

    # Extracting the sentences from the text for each mistake and corrected
    # answer. Currently disabled; kept for reference.
    # NOTE(review): `errors_data` is not defined in this cell, presumably
    # `errors` was meant — confirm before enabling.
    #
    # corrected_sentences = extract_sentence(doc_text, errors_data)
    #
    # for error_pos, corrected_sentence in enumerate(corrected_sentences):
    #     errors[error_pos]['corrected_sentence'] = corrected_sentence

    # Appending the result for the dataframe
    texts[doc_id] = {
        'text': doc_text,
        'errors_length': len(errors),
        'errors': errors
    }

In [33]:
import pandas as pd

# Columns of the future dataframe, filled in parallel: one entry per mistake
text_row, mistake_types_row, correction_part_row = [], [], []

# For each text (the document id is not needed here)
for document in texts.values():
    paragraphs = document['text']

    # For each mistake
    for mistake in document['errors']:
        # Paragraph read one position before the mistake 'part' value
        # NOTE(review): presumably 'part' counts from 1 for the <p>
        # paragraphs — confirm against the dataset README.
        paragraph_position = mistake['part'] - 1

        text_row.append(paragraphs[paragraph_position])
        mistake_types_row.append(mistake['type'])
        correction_part_row.append(mistake['correction'])

# Constructing the dataframe
df = pd.DataFrame({
    'Raw text': text_row,
    'Mistake type': mistake_types_row,
    'Corrected part': correction_part_row,
})

### Datasets summary

In [55]:
# Number of distinct texts held by the dataframe
unique_texts = df['Raw text'].unique()
print(f"{len(unique_texts)} unique texts are set in the CoNLL-13 dataset.")

# Distinct mistake types, counted then listed
mistake_types = df['Mistake type'].unique()
print(f"{len(mistake_types)} grammatical faults exists: {mistake_types}")

244 unique texts are set in the CoNLL-13 dataset.
28 grammatical faults exists: ['ArtOrDet' 'Nn' 'Mec' 'Rloc-' 'Vt' 'Wci' 'Um' 'WOadv' 'Trans' 'Ssub' 'V0'
 'Prep' 'SVA' 'Vm' 'Vform' 'WOinc' 'Wform' 'Pform' 'Pref' 'Spar' 'Npos'
 'Srun' 'Wtone' 'Sfrag' 'Others' 'Smod' 'Wa' 'Cit']


In [57]:
from sklearn.model_selection import train_test_split

# Generating random dataframes for train, test and validation:
# 70% of the rows for training, the remaining 30% being halved between
# test and validation.
# The seed is fixed so that the same split comes back on every notebook
# run; without it the results of two runs could not be compared.
SPLIT_SEED = 42

train, holdout = train_test_split(df, test_size=0.3, random_state=SPLIT_SEED)
test, val = train_test_split(holdout, test_size=0.5, random_state=SPLIT_SEED)

## Model construction