In [None]:
! pip install datasets

In [None]:
import random

import nltk
import pandas as pd
from nltk.tokenize import sent_tokenize
from tqdm import tqdm

# Seed Python's RNG so the randomly injected errors are reproducible on a fresh run.
random.seed(42)

# sent_tokenize needs the Punkt sentence models. Older NLTK releases load the
# "punkt" resource, newer ones (3.9+) load "punkt_tab"; fetching both keeps the
# notebook working whichever NLTK version is installed.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Read the M2 annotation file and split it into blocks on blank lines
# (each block is consumed below as one "S ..." line plus its "A ..." edit lines).
# The context manager closes the file handle; the encoding is made explicit
# instead of depending on the platform default.
with open("fce.train.gold.bea19.m2", encoding="utf-8") as m2_file:
  m2 = m2_file.read().strip().split("\n\n")

In [None]:
# Edit types (all replacement "R:" categories) that are harvested as error patterns.
# NOTE(review): "R:VERB:SVA" used to be listed twice; the duplicate is removed here.
# Possibly another type (e.g. "R:VERB:TENSE") was intended in its place - confirm.
target_error_type = [
    # plain part-of-speech replacements
    "R:NOUN", "R:VERB", "R:ADV", "R:ADJ",
    # form / inflection / number / agreement
    "R:VERB:FORM", "R:ADJ:FORM", "R:NOUN:INFL", "R:VERB:INFL", "R:NOUN:NUM", "R:VERB:SVA",
    # function words, contractions and spelling
    "R:PREP", "R:DET", "R:PRON", "R:CONJ", "R:PART", "R:CONTR", "R:SPELL",
]

In [None]:
def extract_error_pattern(m2, error_types=None):
  """Collect observed learner errors from M2 blocks, keyed by their correction.

  Args:
    m2: Iterable of M2 blocks (strings). The first line of a block is
      ``S <token> <token> ...``; every following line is an edit of the form
      ``A <start> <end>|||<type>|||<correction>|||...|||<annotator id>``.
    error_types: Optional collection of edit types to keep. When omitted, the
      module-level ``target_error_type`` is used.

  Returns:
    A dict mapping each correction string to the list of distinct original
    (erroneous) strings it replaced, in first-seen order. Only edits made by
    annotator 0 are considered.
  """
  if error_types is None:
    error_types = target_error_type
  wanted_types = frozenset(error_types)

  perturb_dict = {}
  for block in tqdm(m2):
    lines = block.split("\n")
    source_tokens = lines[0].split()[1:]  # drop the leading "S" marker
    for line in lines[1:]:
      if not line.startswith("A "):
        continue  # blank or malformed line: nothing to parse
      fields = line.split("|||")
      if len(fields) < 3 or fields[1] not in wanted_types:
        continue
      if int(fields[-1]) != 0:
        continue  # keep a single annotator so patterns are not double counted
      span = fields[0].split()[1:]  # drop the leading "A" marker
      start, end = int(span[0]), int(span[1])
      original_tokens = source_tokens[start:end]
      if not original_tokens:
        continue  # empty span: there is no original text to record
      correct_word = fields[2]
      # Keep the whole replaced span; taking only its first token would turn
      # e.g. "in spite" -> "despite" into the wrong pattern "in" -> "despite".
      wrong_word = " ".join(original_tokens)
      variants = perturb_dict.setdefault(correct_word, [])
      if wrong_word not in variants:
        variants.append(wrong_word)
  return perturb_dict

In [None]:
# Build the correction -> observed-error lookup from the parsed M2 blocks.
perturb_dict = extract_error_pattern(m2)

100%|██████████| 28350/28350 [00:00<00:00, 182214.33it/s]


In [None]:
# Show the size and a small preview instead of dumping the whole mapping into the cell output.
print(f"{len(perturb_dict)} corrections collected")
dict(list(perturb_dict.items())[:20])

In [None]:
def create_new_samples(txt, num_error_per_sent, num_error_sample, perturbations=None):
  """Create (erroneous, correct) sentence pairs by injecting known errors.

  The text is split into sentences with ``sent_tokenize``. Every sentence is
  split on whitespace and each token that is a key of the perturbation table
  is a candidate for replacement by one of its recorded erroneous forms.

  Args:
    txt: Raw text to split into sentences.
    num_error_per_sent: Number of distinct positions to corrupt per variant.
      Capped at the number of candidate tokens in the sentence.
    num_error_sample: Number of corrupted variants to generate per sentence.
    perturbations: Optional dict mapping a correct token to a list of
      erroneous replacements. When omitted, the module-level ``perturb_dict``
      is used.

  Returns:
    A DataFrame with columns ``input`` (corrupted sentence, tokens re-joined
    with single spaces) and ``output`` (the original sentence). Sentences
    with no candidate token are skipped, since no error can be injected.
  """
  if perturbations is None:
    perturbations = perturb_dict

  correct_sent_list = []
  wrong_sent_list = []
  for sent in tqdm(sent_tokenize(txt)):
    tokens = sent.split()
    candidates = [i for i, token in enumerate(tokens) if token in perturbations]
    if not candidates:
      continue  # nothing to perturb; sampling from an empty list would raise
    errors_to_inject = min(num_error_per_sent, len(candidates))
    for _ in range(num_error_sample):
      corrupted = list(tokens)
      # Sample without replacement so every requested error lands on a
      # different position instead of possibly overwriting the same one.
      for position in random.sample(candidates, errors_to_inject):
        corrupted[position] = random.choice(perturbations[tokens[position]])
      wrong_sent_list.append(' '.join(corrupted))
      correct_sent_list.append(sent)
  return pd.DataFrame({"input": wrong_sent_list, "output": correct_sent_list})

In [None]:
# Read the essay text that the synthetic errors are injected into.
# The context manager closes the file handle; the encoding is made explicit
# instead of depending on the platform default.
with open('tpo.txt', encoding="utf-8") as tpo_file:
  toefl = tpo_file.read()

In [None]:
# Generate 3 corrupted variants of every sentence in the text, with 1 injected error each.
df_new = create_new_samples(toefl, num_error_per_sent=1, num_error_sample=3)

100%|██████████| 493/493 [00:00<00:00, 19918.62it/s]


In [None]:
# Save the synthetic pairs; index=False keeps the row index out of the CSV.
df_new.to_csv("new_create_data.csv", index=False)

In [None]:
# Inspect the generated pairs: "input" is the corrupted sentence, "output" the original one.
df_new

Unnamed: 0,input,output
0,That government should spend more money suppor...,The government should spend more money support...
1,The government should spend longer money suppo...,The government should spend more money support...
2,The government should spend more money support...,The government should spend more money support...
3,The issue that government should give financia...,The issue that government should give financia...
4,The issue what government should give financia...,The issue that government should give financia...
...,...,...
1474,An individual human existence should be like a...,An individual human existence should be like a...
1475,An individual human existence should are like ...,An individual human existence should be like a...
1476,"Gradually the river grows wider, the banks rec...","Gradually the river grows wider, the banks rec..."
1477,"Gradually the river grows wider, the banks rec...","Gradually the river grows wider, the banks rec..."


In [None]:
# Load the C4 subset that is later concatenated with the synthetic pairs
# (it is filtered on its "input" and "output" columns below).
df_c4 = pd.read_csv("c4_200m_sub.csv")

In [None]:
# Inspect the loaded C4 pairs.
df_c4

Unnamed: 0,input,output
0,Organizational mission of Kiss The Ground is i...,The organizational mission of Kiss the Ground ...
1,Tap The Thumbnail Bellow to See Related Galler...,Tap The Thumbnail Bellow to See Related Galler...
2,when I relax that’s when I shake or jolt to me...,"if I relax that’s when I shake or jolt, like t..."
3,Our neighbor Jay-dub (nickname courtesy of me ...,Our neighbor Jay-dub (nickname courtesy me sin...
4,Kummrow Automotive is now a Official Distribut...,Kummrow Automotive is now an official Dinan Di...
...,...,...
24995,"And if you grab it a day, you’ll get limitted ...","And if you grab it today, you’ll get a limited..."
24996,Anyone know the name of the buff,Anyone know the name of the buff?
24997,A. AnS IBM TS3500 storage frame must be upgrad...,A. An IBM TS3500 storage frame must be upgrade...
24998,I teamed up with a few of my blogger friends f...,I teamed up with a few of my blogger friends f...


In [None]:
# Stack the synthetic pairs on top of the C4 pairs; ignore_index=True renumbers the rows from 0.
df_concat = pd.concat([df_new, df_c4], axis=0, ignore_index=True)

In [None]:
# Keep only rows whose "input" and "output" are both free of "�" and "★".
# Cells for which str.contains has no answer (missing / non-string values) are
# treated as matches via na=True, so those rows are dropped as well.
unwanted_chars = "�|★"
bad_input = df_concat["input"].str.contains(unwanted_chars, na=True)
bad_output = df_concat["output"].str.contains(unwanted_chars, na=True)
df_concat = df_concat[~(bad_input | bad_output)]

In [None]:
# Save the combined, filtered dataset; index=False keeps the row index out of the CSV.
df_concat.to_csv("data_combine.csv", index=False)