In [1]:
from tqdm import tqdm
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import random
import pandas as pd

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
# Load the M2 annotation file and split it into one string per sentence block
# (blocks are separated by a blank line). The context manager closes the file
# handle deterministically; the explicit encoding avoids depending on the
# platform's default locale.
with open("fce.train.gold.bea19.m2", encoding="utf-8") as m2_file:
  m2 = m2_file.read().strip().split("\n\n")

In [3]:
# Error-type tags whose edits are harvested by extract_error_pattern; an edit
# whose type is not listed here is ignored.
# NOTE(review): the original list repeated "R:VERB:SVA"; the duplicate is
# dropped here. Possibly a different tag (e.g. "R:VERB:TENSE") was intended in
# its place -- confirm with the author.
target_error_type = [
  # open-class word replacements
  "R:NOUN", "R:VERB", "R:ADV", "R:ADJ",
  # form / inflection / number / agreement
  "R:VERB:FORM", "R:ADJ:FORM", "R:NOUN:INFL", "R:VERB:INFL", "R:NOUN:NUM", "R:VERB:SVA",
  # closed-class words, contractions and spelling
  "R:PREP", "R:DET", "R:PRON", "R:CONJ", "R:PART", "R:CONTR", "R:SPELL",
]

In [4]:
def extract_error_pattern(m2):
  """Build a correction -> erroneous-forms lookup from M2 annotation blocks.

  Args:
    m2: iterable of strings, one per sentence block. The first line of a block
      is "S <tokens...>"; every following line is an edit of the form
      "A <start> <end>|||<type>|||<correction>|||...|||<annotator id>".

  Returns:
    dict mapping each correction string to a list of the distinct erroneous
    source spans it replaced (tokens joined by single spaces), in first-seen
    order. Only edits whose type is in the module-level ``target_error_type``
    and whose annotator id is 0 are recorded.
  """
  perturb_dict = {}
  for sent in tqdm(m2):
    lines = sent.split("\n")
    wrong_sent = lines[0].split()[1:] # drop the leading "S" marker
    for edit in lines[1:]:
      fields = edit.split("|||")
      if fields[1] not in target_error_type:
        continue
      if int(fields[-1]) != 0: # keep annotator 0 only
        continue
      position = fields[0].split()[1:] # drop the leading "A" marker
      start = int(position[0])
      end = int(position[1])
      span = wrong_sent[start:end]
      if not span:
        # Empty span: nothing in the source sentence to record (indexing [0]
        # here used to raise IndexError).
        continue
      correct_word = fields[2]
      # Keep the whole erroneous span; taking only its first token would
      # truncate multi-token errors.
      wrong_word = " ".join(span)
      wrong_words = perturb_dict.setdefault(correct_word, [])
      if wrong_word not in wrong_words:
        wrong_words.append(wrong_word)
  return perturb_dict

In [5]:
# Build the correction -> erroneous-forms lookup from the loaded M2 blocks.
perturb_dict = extract_error_pattern(m2)

100%|██████████| 28350/28350 [00:00<00:00, 176115.37it/s]


In [6]:
def create_new_samples(txt, num_error_per_sent, num_error_sample):
  """Create (correct, incorrect) sentence pairs by injecting known errors.

  The text is split into sentences; in each sentence, whitespace-separated
  tokens that are keys of the module-level ``perturb_dict`` are candidates for
  replacement by one of their recorded erroneous forms.

  Args:
    txt: raw text to split into sentences and corrupt.
    num_error_per_sent: number of distinct token positions to corrupt in each
      generated sample (capped at the number of candidate tokens).
    num_error_sample: number of corrupted samples to generate per sentence.

  Returns:
    pandas.DataFrame with columns "correct" (the original sentence) and
    "incorrect" (the corrupted tokens joined by single spaces). Sentences
    with no candidate token produce no rows.
  """
  correct_sent_list = []
  wrong_sent_list = []
  # Tokenize the argument, not the notebook-level `toefl` global.
  for sent in tqdm(sent_tokenize(txt)):
    tokens = sent.split()
    candidates = [(i, word) for i, word in enumerate(tokens) if word in perturb_dict]
    if not candidates:
      # Nothing to perturb; sampling from an empty list would raise.
      continue
    num_errors = min(num_error_per_sent, len(candidates))
    for _ in range(num_error_sample):
      words = list(tokens) # every sample starts from the clean sentence
      # Sample without replacement so the same position is not picked twice.
      for pick_word_position, pick_word in random.sample(candidates, k=num_errors):
        words[pick_word_position] = random.choice(perturb_dict[pick_word])
      correct_sent_list.append(sent)
      wrong_sent_list.append(' '.join(words))
  return pd.DataFrame({"correct": correct_sent_list, "incorrect": wrong_sent_list})

In [7]:
# Read the raw essay text to be corrupted. The context manager closes the file
# handle deterministically; the explicit encoding avoids depending on the
# platform's default locale.
with open('tpo.txt', encoding='utf-8') as toefl_file:
  toefl = toefl_file.read()

In [8]:
# Generate one corrupted copy per sentence, with one injected error each.
df = create_new_samples(toefl, num_error_per_sent=1, num_error_sample=1)

100%|██████████| 492/492 [00:00<00:00, 53789.95it/s]


In [9]:
# Show the generated correct/incorrect sentence pairs.
df

Unnamed: 0,correct,incorrect
0,The government should spend more money support...,The government should spend more money support...
1,The issue that government should give financia...,The issue that government should give economic...
2,"On the one hand, artists produce some impressi...","On the one hand, artists produce some impressi..."
3,"On the other hand, athletes who win the champi...","On the other hand, athletes who win her champi..."
4,"However, in the ultimate analysis, the governm...","However, in the ultimate analysis, the governe..."
...,...,...
487,"In contrast, graduates who are reluctant to pa...","In contrast, graduates who are reluctant to pa..."
488,"As a case in point, researchers have found tha...","As any case in point, researchers have found t..."
489,"Between the two extremes, I think treating ani...","Between the twe extremes, I think treating ani..."
490,An individual human existence should be like a...,An individual human existence should be like a...


In [10]:
# Save the generated pairs. With to_csv's default index=True the row index is
# written as well, as an unnamed first column.
df.to_csv("new_create_data.csv")