# Make Grammer error dataset

220604

- [ref1](https://suraj1997lodh.medium.com/grammar-error-handling-and-correction-with-dataset-creation-e446fa6863b8)
- [ref2](https://github.com/suraj6017/GRAMMAR_ERROR_HANDLING)

In [7]:
## load
import random
from tqdm import tqdm
import pandas as pd
import numpy as np
import re
import os

In [8]:
## functions
def pre_process(sample):

    tags = re.compile('<.*?>')
    sample = re.sub(tags, '', sample)  # tag 제거
    
    sample = re.sub(r'[^\w]', ' ', sample)

    sample = re.sub("[\(\[].*?[\)\]]", "", sample)
    sample =  sample.replace('\t', ' ').replace('\n', ' ').replace('-', '')#.replace(' ', '')

    def decontracted(phrase):
        # specific
        phrase = re.sub(r"won't", "will not", phrase)
        phrase = re.sub(r"can\'t", "can not", phrase)

        # general
        phrase = re.sub(r"n\'t", " not", phrase)
        phrase = re.sub(r"\'re", " are", phrase)
        phrase = re.sub(r"\'s", " is", phrase)
        phrase = re.sub(r"\'d", " would", phrase)
        phrase = re.sub(r"\'ll", " will", phrase)
        phrase = re.sub(r"\'t", " not", phrase)
        phrase = re.sub(r"\'ve", " have", phrase)
        phrase = re.sub(r"\'m", " am", phrase)
        return phrase
    sample = decontracted(sample)

    return (sample)

tmp  = 'ï»¿"Well, Prince, so Genoa and Lucca are now just family estates of the\nBuonapartes'
print('%s\n->   %s'%(tmp, pre_process(tmp)))


# 전치사 list
preposition = ['about',
 'above',
 'across',
 'after',
 'as',
 'at',
 'before',
 'behind',
 'between',
 'but',
 'by',
 'for',
 'from',
 'in',
 'to',
 'of',
 'off',
 'on',
 'to',
 'until',
 'up',
 'with']

ï»¿"Well, Prince, so Genoa and Lucca are now just family estates of the
Buonapartes
->   ï   Well  Prince  so Genoa and Lucca are now just family estates of the Buonapartes


In [9]:
## prepare data
DATA_ROOT = './'
data_name = 'c4_200m_short.csv'  # 55만개
# data_name = 'c4_200m_1M.csv'
data_path = os.path.join(DATA_ROOT, data_name)
df = pd.read_csv(data_path)
# print(df.shape)
df.head()

total_data = df['output'].tolist()

data = [tmp for tmp in total_data if len(tmp) < 80]  # 200글자 이하로 자른다, 20% 없어짐
# get statistics
length_avg = np.array([len(tmp) for tmp in tqdm(data)]).mean()
length_max = max([len(tmp) for tmp in tqdm(data)])
length_min = min([len(tmp) for tmp in tqdm(data)])

print('mean/max/min : %d / %d / %d'%(length_avg, length_max, length_min))

print(data[:2])

100%|██████████| 191885/191885 [00:00<00:00, 2159065.42it/s]
100%|██████████| 191885/191885 [00:00<00:00, 1940597.55it/s]
100%|██████████| 191885/191885 [00:00<00:00, 1748010.03it/s]

mean/max/min : 51 / 79 / 11
['Answers » Regions » Is Nagorno-Karabakh region part of Armenia?', 'Flaneuring Fun in Maple Creek SK!']





In [10]:
## Error functions
def generate_error_1_spelling_error(sentence, verbose=False):
    ## 1_SPELLING ERROR(글자 오타생성)
    # 문장안에서 두 글자를 랜덤으로 선택한 후 교체
    # abcde -> adcde, b를 d로 교체한다
    original_sentence = sentence
    len_sentence = len(sentence)

    x = random.randint(0,len_sentence-1)
    y = random.randint(0,len_sentence-1)
    
    if (len_sentence > 5) and (sentence[x] != ' '):
        # 띄어쓰기는 제외
        error_sentence = sentence.replace(sentence[x], sentence[y])
    else:
        error_sentence = sentence
        
    error_sentence = re.sub(' +', ' ', error_sentence)
        
    if verbose:
        print(('[1_Spelling_Error] %s => %s'%(original_sentence, error_sentence)))
    
    return error_sentence

sentence = 'International orders are shipped about'
error_sentence = generate_error_1_spelling_error(sentence, verbose=True)



def generate_error_2_replace_preposition(sentence, verbose=False, max_preposition_replace_number=3):
    ## 2_전치사 교체
    original_sentence = sentence
    
    # max_preposition_replace_number(3개)가 넘으면 3개까지만 오염시키자
#     max_preposition_replace_number = 3
    preposition_idx = []
    for idx, word in enumerate(sentence.split()):
        if word in preposition:
            preposition_idx.append(idx)
    preposition_idx_save = preposition_idx
    if len(preposition_idx) > max_preposition_replace_number:
        preposition_idx = random.sample(preposition_idx, max_preposition_replace_number)
        
#     print(preposition_idx, preposition_idx_save)
    for idx, word in enumerate(sentence.split()):

        if idx in preposition_idx:
#             print(word)
            replace_preposition_idx = random.randint(0, len(preposition)-1)
            replace_preposition = preposition[replace_preposition_idx]
            sentence = re.sub(word , replace_preposition, sentence)    


    error_sentence = sentence
    error_sentence = re.sub(' +', ' ', error_sentence)
    if verbose:
        if len(error_sentence) > len(original_sentence)+10:
            print(('[2_Replace_Preposition] %s => %s'%(original_sentence, error_sentence)))
        
        
    return error_sentence

# sentence = 'One of the cardinal factors to consider when trying to decide on which kind of shipping to settle for is the market difference'
sentence = 'The College Council is the primary student organization in the College of Public Health and Health Professions at the University of Florida'

error_sentence = generate_error_2_replace_preposition(sentence, verbose=True)



def generate_error_3_delete_preposition(sentence, verbose=False):
    ## 3_전치사 삭제
    original_sentence = sentence

    word_list = sentence.split()
    preposition_position_idx = [idx for idx, word in enumerate(word_list) if word in preposition]

    if len(preposition_position_idx)==0:
        error_sentence=sentence
    else:
        delete_preposition_idx = random.choice(preposition_position_idx)
        error_sentence = re.sub(word_list[delete_preposition_idx] , '' , sentence)

    error_sentence = re.sub(' +', ' ', error_sentence)
    if verbose:
        print(('[3_Delete_Preposition] %s => %s'%(original_sentence, error_sentence)))
    return error_sentence

sentence = 'International orders are shipped about'
error_sentence = generate_error_3_delete_preposition(sentence, verbose=True)    



def generate_error_4_delete_definite_article(sentence, verbose=False):
    ## 4_Omiting the definite article(정관사 생략)
    # a/an/the 정관사를 생략
    original_sentence = sentence

    sentence = re.sub('an ' , '' , sentence)
    sentence = re.sub('a ' , '' , sentence)
    error_sentence = re.sub('the ' , '' , sentence)
    sentence = re.sub('An ' , '' , sentence)
    sentence = re.sub('A ' , '' , sentence)
    error_sentence = re.sub('The ' , '' , sentence)
    
    error_sentence = re.sub(' +', ' ', error_sentence)
    if verbose:
        print(('[4_Delete_Article] %s => %s'%(original_sentence, error_sentence)))
    return error_sentence

sentence = 'An international orders are shipped about'
error_sentence = generate_error_4_delete_definite_article(sentence, verbose=True)        



def generate_error_5_repete_word(sentence, verbose=False):
    ## 5_REPEATING A WORD IN SENTENCE
    # 특정 단어를 한번더 반복
    # A B C D -> A A B C D
    original_sentence = sentence
    rep = sentence.split(' ')
    w = random.randint(0,len(rep) - 1)
    f = rep[:w + 1] + [rep[w]] + rep[w+1:]
    error_sentence = ' '.join(f)
    
    error_sentence = re.sub(' +', ' ', error_sentence)
    if verbose:
        print(('[5_Repete_Word] %s => %s'%(original_sentence, error_sentence)))
    return error_sentence

sentence = 'An international orders are shipped about'
error_sentence = generate_error_5_repete_word(sentence, verbose=True)



def generate_error_6_delete_word(sentence, verbose=False):
    ## 6_deleting_random_word in sentence
    # A B C D -> A C D
    original_sentence = sentence
    delete = sentence.split(' ')
    w = random.randint(0, len(delete) - 1)
    error_sentence = re.sub(delete[w] , '' , sentence)

    error_sentence = re.sub(' +', ' ', error_sentence)
    if verbose:
        print(('[6_Delete_Word] %s => %s'%(original_sentence, error_sentence)))
    return error_sentence

sentence = 'An international orders are shipped about'
error_sentence = generate_error_6_delete_word(sentence, verbose=True)


[1_Spelling_Error] International orders are shipped about => International orders are shipped abott
[2_Replace_Preposition] The College Council is the primary student organization in the College of Public Health and Health Professions at the University of Florida => The College Council is the primary student organizbeforeion after the College after Public Health and Health Prafteressions before the University after Florida
[3_Delete_Preposition] International orders are shipped about => International orders are shipped 
[4_Delete_Article] An international orders are shipped about => international orders are shipped about
[5_Repete_Word] An international orders are shipped about => An international orders are shipped shipped about
[6_Delete_Word] An international orders are shipped about => An international are shipped about


In [11]:
def generate_error(sentence, K=1, verbose=False):
    
    # 6개 error 함수 중 한개를 골라 에러문장을 생성한다
    generate_error_list = [
        generate_error_1_spelling_error,
        generate_error_2_replace_preposition,
        generate_error_3_delete_preposition,
        generate_error_4_delete_definite_article,
        generate_error_5_repete_word,
        generate_error_6_delete_word,        
    ]
    # 중복 허용 K개 추출
    random_idx_list = [random.choice(list(range(len(generate_error_list)))) for ii in range(K)]    
    
    original_sentence = sentence
    for iii in random_idx_list:
        sentence = generate_error_list[iii](sentence, verbose=verbose)
        
    error_sentence = sentence
    return error_sentence.strip()

sentence = 'Did you ever need to have a flexible and reliable instrument for fast Partial Discharge  PD  diagnostic campaigns'

# Generate!

In [12]:
## apply error to sentence
# K = 6  # Error를 몇번 실행?
save_path = './'

data = data[:100000]

for K in [2, 4, 6, 8, 12][:1]:
    print('Gerenate %d error'%K)
    ###############################################################################
    list_sentence_preprocessed = [pre_process(sentence) for sentence in tqdm(data)]
    error_list_sentence_preprocessed = [generate_error(sentence, K=K) for sentence in tqdm(list_sentence_preprocessed)]
#     error_list_sentence_preprocessed = [generate_error(sentence, K=K, verbose=True) for sentence in tqdm(list_sentence_preprocessed)]    

    df_data_error = pd.DataFrame()
    df_data_error['correct_sentence'] = list_sentence_preprocessed
    df_data_error['error_sentence'] = error_list_sentence_preprocessed
    print(df_data_error.shape)
    df_data_error.head(5)

    # get statistics
    correct_length_avg = np.array([len(tmp) for tmp in tqdm(df_data_error['correct_sentence'])]).mean()
    correct_length_max = max([len(tmp) for tmp in tqdm(df_data_error['correct_sentence'])])
    correct_length_min = min([len(tmp) for tmp in tqdm(df_data_error['correct_sentence'])])

    error_length_avg = np.array([len(tmp) for tmp in tqdm(df_data_error['error_sentence'])]).mean()
    error_length_max = max([len(tmp) for tmp in tqdm(df_data_error['error_sentence'])])
    error_length_min = min([len(tmp) for tmp in tqdm(df_data_error['error_sentence'])])

    print('%d_correct_mean/max/min : %d / %d / %d'%(K, correct_length_avg, correct_length_max, correct_length_min))
    print('%d_error  _mean/max/min : %d / %d / %d'%(K, error_length_avg, error_length_max, error_length_min))

    # save
    if not os.path.isdir(save_path): os.mkdir(save_path)
    df_data_error.to_csv(os.path.join(save_path, '%s_grammer_error_dataset.csv')%(K), header=True, index=False)
    df_data_error['correct_sentence'].to_csv(os.path.join(save_path, '%s_grammer_error_src.txt')%(K), header=False, index=False)
    df_data_error['error_sentence'].to_csv(os.path.join(save_path, '%s_grammer_error_tgt.txt')%(K), header=False, index=False)

Gerenate 2 error


100%|██████████| 100000/100000 [00:02<00:00, 45984.00it/s]
100%|██████████| 100000/100000 [00:04<00:00, 24316.70it/s]


(100000, 2)


100%|██████████| 100000/100000 [00:00<00:00, 1738759.00it/s]
100%|██████████| 100000/100000 [00:00<00:00, 1467869.15it/s]
100%|██████████| 100000/100000 [00:00<00:00, 1554977.87it/s]
100%|██████████| 100000/100000 [00:00<00:00, 1724929.47it/s]
100%|██████████| 100000/100000 [00:00<00:00, 1548319.27it/s]
100%|██████████| 100000/100000 [00:00<00:00, 1260046.44it/s]


2_correct_mean/max/min : 51 / 79 / 13
2_error  _mean/max/min : 48 / 105 / 0


In [13]:
df_data_error

Unnamed: 0,correct_sentence,error_sentence
0,Answers Regions Is Nagorno Karabakh region...,Answers Regions Is Nagorno Karabakh region par...
1,Flaneuring Fun in Maple Creek SK,Flaneurg Fun Maple Creek SK
2,About Private Investigators Ellesmere Port In ...,About Private Investigators Ellesmere Port In ...
3,Bake in the oven for 35 mins scattering the f...,Bake in the oven for 35 mins scattering the fl...
4,informing you of changes in our web site,informing yor of changes in orr web site
...,...,...
99995,Anyone getting things done is a Kate,Anyone getting things done is a Kate
99996,Explain how important roles in security planni...,Explain how impo tant oles in secu ity elate t...
99997,This is the prayer rug the scammers are usin...,This is the prayer rug the scammers are using ...
99998,And then I asked him the question that changed...,And I asked him the question that changed my
