## Data Collection and Processing

In [None]:
! pip install datasets

In [None]:
from tqdm import tqdm
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import random
import pandas as pd
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


### Create Perturbation

In [None]:
# Read the gold M2 annotation file and split it into one block per sentence
# (blocks are separated by a blank line). The context manager closes the file
# handle, which the bare open(...).read() call left dangling.
# NOTE(review): the file is assumed to be UTF-8 encoded -- confirm.
with open("fce.train.gold.bea19.m2", encoding="utf-8") as m2_file:
  m2 = m2_file.read().strip().split("\n\n")

In [None]:
# Edit-type labels that are kept when mining perturbations. Each label is
# compared against the second "|||"-separated field of an M2 edit line.
# NOTE(review): "R:VERB:SVA" used to be listed twice; the duplicate is removed
# here. If a different label was intended in its place, add it to the list.
target_error_type = [
  "R:NOUN", "R:VERB", "R:ADV", "R:ADJ",
  "R:VERB:FORM", "R:ADJ:FORM", "R:NOUN:INFL", "R:VERB:INFL", "R:NOUN:NUM", "R:VERB:SVA",
  "R:PREP", "R:DET", "R:PRON", "R:CONJ", "R:PART", "R:CONTR", "R:SPELL",
]

In [None]:
def extract_error_pattern(m2):
  """Mine a correction -> observed-errors dictionary from M2 sentence blocks.

  Args:
    m2: iterable of M2 blocks. Each block is a string whose first line is
      ``S <token> <token> ...`` and whose remaining lines are edits of the
      form ``A <start> <end>|||<type>|||<correction>|||...|||<annotator>``.

  Returns:
    dict mapping each correction string to the list of distinct source
    phrases it replaced, in first-seen order. Only edits whose type is in
    the module-level ``target_error_type`` and whose last field is ``0``
    (annotator id) contribute.

  Raises:
    ValueError: if an accepted edit has non-integer offsets or annotator id.
  """
  perturb_dict = {}
  for block in tqdm(m2):
    lines = block.split("\n")
    source_tokens = lines[0].split()[1:]  # drop the leading "S" marker
    for edit_line in lines[1:]:
      fields = edit_line.split("|||")
      if len(fields) < 3:
        continue  # blank or malformed edit line
      if fields[1] not in target_error_type:
        continue
      if int(fields[-1]) != 0:
        continue
      offsets = fields[0].split()[1:]  # drop the leading "A" marker
      start, end = int(offsets[0]), int(offsets[1])
      # Keep the whole erroneous span. Taking only its first token turned
      # multi-word errors such as "due to" into the misleading "due".
      wrong_phrase = " ".join(source_tokens[start:end])
      if not wrong_phrase:
        continue  # empty span: there is nothing to learn as a replacement
      variants = perturb_dict.setdefault(fields[2], [])
      if wrong_phrase not in variants:
        variants.append(wrong_phrase)
  return perturb_dict

In [None]:
# Build the correction -> observed-errors lookup from the parsed M2 blocks.
perturb_dict = extract_error_pattern(m2)

100%|██████████| 28350/28350 [00:00<00:00, 199806.29it/s]


In [None]:
# Preview a handful of entries; echoing the whole dictionary floods the
# notebook output.
dict(list(perturb_dict.items())[:5])

{'with': ['about',
  'to',
  'by',
  'of',
  'on',
  'in',
  'for',
  'from',
  'against',
  'without',
  'whith',
  'at',
  'while',
  'under',
  'bout'],
 'on': ['of',
  'in',
  'into',
  'from',
  'through',
  'for',
  'to',
  'by',
  'with',
  'at',
  'per',
  'over',
  'about',
  'up',
  'during'],
 'reviews': ['references'],
 'of': ['about',
  'in',
  'with',
  'for',
  'from',
  'on',
  'to',
  'at',
  'as',
  'throught',
  'between',
  'off',
  'by',
  'during',
  'against'],
 'because of': ['about', 'due', 'for', 'with'],
 'at': ['in',
  'with',
  'to',
  'on',
  'of',
  'by',
  'during',
  'into',
  'for',
  'from',
  't',
  'after',
  'about',
  'until'],
 'for': ['because',
  'about',
  'until',
  'in',
  'on',
  'to',
  'by',
  'of',
  'during',
  'with',
  'at',
  'abut',
  'as',
  'against',
  'into',
  'from',
  'before',
  'due',
  'since',
  'after'],
 'to get': ['receive', 'getting', 'was', 'geting'],
 'to': ['at',
  'in',
  'into',
  'for',
  'about',
  'of',
  'fro

### Generate New Data Samples

In [None]:
def create_new_samples(txt, num_error_per_sent, num_error_sample):
  """Create synthetic (erroneous, correct) sentence pairs from clean text.

  The text is split into sentences with ``sent_tokenize``. In every sentence,
  whitespace-separated tokens that are keys of the module-level
  ``perturb_dict`` are candidates for corruption; matching is exact, so a
  token with attached punctuation or different casing is not a candidate.

  Args:
    txt: raw text containing the correct sentences.
    num_error_per_sent: number of distinct token positions to corrupt in
      each generated sample (capped at the number of candidates).
    num_error_sample: number of corrupted variants to generate per sentence.

  Returns:
    DataFrame with columns ``input`` (corrupted sentence) and ``output``
    (original sentence), holding ``num_error_sample`` consecutive rows for
    every sentence that has at least one candidate token. Sentences without
    candidates are skipped.
  """
  correct_sent_list = []
  wrong_sent_list = []
  for sent in tqdm(sent_tokenize(txt)):
    tokens = sent.split()
    candidates = [i for i, token in enumerate(tokens) if token in perturb_dict]
    if not candidates:
      # Nothing can be perturbed; drawing from an empty population would raise.
      continue
    n_errors = min(num_error_per_sent, len(candidates))
    for _ in range(num_error_sample):
      words = list(tokens)
      # Sample without replacement so each error lands on a different token.
      for position in random.sample(candidates, k=n_errors):
        words[position] = random.choice(perturb_dict[tokens[position]])
      wrong_sent_list.append(' '.join(words))
      correct_sent_list.append(sent)
  return pd.DataFrame({"input": wrong_sent_list, "output": correct_sent_list})

In [None]:
# Load the clean source text used for synthetic sample generation. The context
# manager closes the file handle instead of leaking it.
# NOTE(review): the file is assumed to be UTF-8 encoded -- confirm.
with open('tpo.txt', encoding='utf-8') as toefl_file:
  toefl = toefl_file.read()

In [None]:
# Seed Python's RNG so the generated perturbations are reproducible after a
# kernel restart.
random.seed(42)
df_new = create_new_samples(toefl, num_error_per_sent=3, num_error_sample=5)

100%|██████████| 1003/1003 [00:00<00:00, 14363.52it/s]


In [None]:
# Persist the synthetic pairs; index=False keeps the row index out of the CSV.
df_new.to_csv("new_create_data.csv", index=False)

In [None]:
# Each sentence contributes num_error_sample=5 consecutive rows, so rows 5-9
# are five perturbed variants of one source sentence.
df_new.iloc[[5,6,7,8,9]]

Unnamed: 0,input,output
5,The issue that government shoud give financial...,The issue that government should give financia...
6,The issue them governement should give financi...,The issue that government should give financia...
7,The issue that governement should give economi...,The issue that government should give financia...
8,The issue that government should make economic...,The issue that government should give financia...
9,The issue that government should give economic...,The issue that government should give financia...


### Data Cleaning on the C4 Dataset

In [None]:
# Load the C4 subset; the cleaning step relies on its "input" and "output" columns.
df_c4 = pd.read_csv("c4_200m_sub.csv")

In [None]:
def clean_data(df):
  """Filter out sentence pairs that are unusable for training.

  A row is removed when
    * ``input`` or ``output`` is missing,
    * ``input`` equals ``output`` (there is nothing to correct), or
    * ``output`` contains any of ``# & % + - :`` or ``� ★ © °``.

  Args:
    df: DataFrame with text columns ``input`` and ``output``.

  Returns:
    A new filtered DataFrame. ``df`` itself is not modified, and surviving
    rows keep their original index labels.
  """
  # Missing text would put NaN into the boolean masks below, and pandas
  # refuses to index with a mask that contains NaN.
  df = df.dropna(subset=["input", "output"])
  output = df["output"]
  has_special = (
      output.str.contains(r'[#&%+-]')
      | output.str.contains(r'[:]')
      | output.str.contains('�|★|©|°')
  )
  # One boolean mask instead of drop-by-label, so rows that share an index
  # label with a flagged row are not removed by accident.
  return df[(df["input"] != output) & ~has_special]

In [None]:
# Apply the pair filters to the raw C4 subset.
df_c4_clean = clean_data(df_c4)

### Combine C4 Dataset With New Data Samples

In [None]:
# Hold out 100 cleaned C4 pairs for evaluation. A fixed random_state keeps the
# split identical across runs, so re-running the notebook cannot move held-out
# rows into the training file.
c4_test = df_c4_clean.sample(n=100, random_state=42)

In [None]:
# Everything that was not held out stays in the training pool (rows are
# removed by index label).
df_c4_remain = df_c4_clean.drop(c4_test.index)

In [None]:
# Training set: synthetic pairs followed by the remaining C4 pairs, re-indexed from 0.
df_concat = pd.concat([df_new, df_c4_remain], axis=0, ignore_index=True)

In [None]:
# Persist the combined training set without the index column.
df_concat.to_csv("data_combine.csv", index=False)

### Create Test Set

In [None]:
# Generate the synthetic evaluation pairs from a separate text file. The
# context manager closes the file handle instead of leaking it.
# NOTE(review): the file is assumed to be UTF-8 encoded -- confirm.
with open('tpo_test.txt', encoding='utf-8') as test_file:
  test = test_file.read()
# Seed Python's RNG so the evaluation set is identical on every run.
random.seed(42)
df_test = create_new_samples(test, num_error_per_sent=3, num_error_sample=1)

100%|██████████| 164/164 [00:00<00:00, 34984.53it/s]


In [None]:
# Combined evaluation set: synthetic test pairs followed by the held-out C4 pairs.
df_test_combine = pd.concat([df_test, c4_test], axis=0, ignore_index=True)

In [None]:
# Synthetic-only evaluation set.
df_test.to_csv("test.csv", index=False)

In [None]:
# Synthetic + held-out C4 evaluation set.
df_test_combine.to_csv("test_combine.csv", index=False)