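This notebook explores different text augmentation techniques (synonym replacement, random insertion, random swap, random deletion and back translation) and writes the augmented task data to CSV files. <hr>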

In [3]:
import warnings
# Suppress every Python warning for the rest of the kernel session (no category
# or module filter is given, so this applies to all warnings).
warnings.filterwarnings("ignore")

In [4]:
import pandas as pd
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
from textaugment import EDA
from tqdm import tqdm

In [5]:
# Shared textaugment EDA instance; the augment_by_sr / _ri / _rs / _rd functions
# below call its synonym_replacement, random_insertion, random_swap and
# random_deletion methods through this module-level name.
t = EDA()

# Synonym replacement
def augment_by_sr(df, taskAspects_df, lang, n=2):
    """Write synonym-replacement ("SR") variants of the task tables to CSV.

    Both input frames are copied first, so the caller's objects are not modified.

    Args:
        df: Task table with at least the columns ``description`` and ``taskId``.
        taskAspects_df: Task-aspect table with a ``taskId`` column.
        lang: Language code used as the sub-directory of ``gen_files``.
        n: Forwarded to ``t.synonym_replacement`` as its ``n`` argument
            (presumably the number of words to replace -- confirm in textaugment).

    Returns:
        None. The results are written to
        ``gen_files/{lang}/augmented/tasks_SR.csv`` and
        ``gen_files/{lang}/augmented/task_aspects_SR.csv``.
    """
    suffix = "_SR"

    def replace_synonyms(text):
        return t.synonym_replacement(text, n=n)

    def count_words(text):
        return len(text.split())

    tasks = df.copy()
    aspects = taskAspects_df.copy()

    # Augment the text, then recompute the word count from the new description.
    tasks["description"] = tasks["description"].apply(replace_synonyms)
    tasks["word_count"] = tasks["description"].apply(count_words)

    # Tag the ids in both tables with the same suffix so they still match.
    tasks["taskId"] = tasks["taskId"] + suffix
    aspects["taskId"] = aspects["taskId"] + suffix

    tasks.to_csv(f"gen_files/{lang}/augmented/tasks_SR.csv", index_label=False)
    aspects.to_csv(f"gen_files/{lang}/augmented/task_aspects_SR.csv", index_label=False)

# Random insertion
def augment_by_ri(df, taskAspects_df, lang, n=2):
    """Write random-insertion ("RI") variants of the task tables to CSV.

    Both input frames are copied first, so the caller's objects are not modified.

    Args:
        df: Task table with at least the columns ``description`` and ``taskId``.
        taskAspects_df: Task-aspect table with a ``taskId`` column.
        lang: Language code used as the sub-directory of ``gen_files``.
        n: Forwarded to ``t.random_insertion`` as its ``n`` argument
            (presumably the number of insertions -- confirm in textaugment).

    Returns:
        None. The results are written to
        ``gen_files/{lang}/augmented/tasks_RI.csv`` and
        ``gen_files/{lang}/augmented/task_aspects_RI.csv``.
    """
    # The copy must be made here: without it `_df` is an unbound name (NameError
    # on a fresh kernel) or, worse, a stale global left over from another cell.
    _df = df.copy()
    _taskAspects_df = taskAspects_df.copy()
    _df['description'] = _df['description'].apply(lambda text: t.random_insertion(text, n=n))
    # Recompute the word count from the augmented description.
    _df["word_count"] = _df["description"].apply(lambda s: len(s.split()))
    # Tag the ids in both tables with the same suffix so they still match.
    _df["taskId"] = _df["taskId"] + "_RI"
    _taskAspects_df["taskId"] = _taskAspects_df["taskId"] + "_RI"
    _df.to_csv(f"gen_files/{lang}/augmented/tasks_RI.csv", index_label=False)
    _taskAspects_df.to_csv(f"gen_files/{lang}/augmented/task_aspects_RI.csv", index_label=False)

# Random swap
def augment_by_rs(df, taskAspects_df, lang):
    """Write random-swap ("RS") variants of the task tables to CSV.

    Both input frames are copied first, so the caller's objects are not modified.

    Args:
        df: Task table with at least the columns ``description`` and ``taskId``.
        taskAspects_df: Task-aspect table with a ``taskId`` column.
        lang: Language code used as the sub-directory of ``gen_files``.

    Returns:
        None. The results are written to
        ``gen_files/{lang}/augmented/tasks_RS.csv`` and
        ``gen_files/{lang}/augmented/task_aspects_RS.csv``.
    """
    # The copy must be made here: without it `_df` is an unbound name (NameError
    # on a fresh kernel) or, worse, a stale global left over from another cell.
    _df = df.copy()
    _taskAspects_df = taskAspects_df.copy()
    # random_swap is called with its library defaults (no swap count is passed).
    _df['description'] = _df['description'].apply(lambda text: t.random_swap(text))
    # Recompute the word count from the augmented description.
    _df["word_count"] = _df["description"].apply(lambda s: len(s.split()))
    # Tag the ids in both tables with the same suffix so they still match.
    _df["taskId"] = _df["taskId"] + "_RS"
    _taskAspects_df["taskId"] = _taskAspects_df["taskId"] + "_RS"
    _df.to_csv(f"gen_files/{lang}/augmented/tasks_RS.csv", index_label=False)
    _taskAspects_df.to_csv(f"gen_files/{lang}/augmented/task_aspects_RS.csv", index_label=False)

# Random deletion
def augment_by_rd(df, taskAspects_df, lang, p=0.4):
    """Write random-deletion ("RD") variants of the task tables to CSV.

    Both input frames are copied first, so the caller's objects are not modified.

    Args:
        df: Task table with at least the columns ``description`` and ``taskId``.
        taskAspects_df: Task-aspect table with a ``taskId`` column.
        lang: Language code used as the sub-directory of ``gen_files``.
        p: Forwarded to ``t.random_deletion`` as its ``p`` argument
            (presumably the per-word deletion probability -- confirm in textaugment).

    Returns:
        None. The results are written to
        ``gen_files/{lang}/augmented/tasks_RD.csv`` and
        ``gen_files/{lang}/augmented/task_aspects_RD.csv``.
    """
    # The copy must be made here: without it `_df` is an unbound name (NameError
    # on a fresh kernel) or, worse, a stale global left over from another cell.
    _df = df.copy()
    _taskAspects_df = taskAspects_df.copy()
    _df['description'] = _df['description'].apply(lambda text: t.random_deletion(text, p=p))
    # Recompute the word count from the augmented description.
    _df["word_count"] = _df["description"].apply(lambda s: len(s.split()))
    # Tag the ids in both tables with the same suffix so they still match.
    _df["taskId"] = _df["taskId"] + "_RD"
    _taskAspects_df["taskId"] = _taskAspects_df["taskId"] + "_RD"
    _df.to_csv(f"gen_files/{lang}/augmented/tasks_RD.csv", index_label=False)
    _taskAspects_df.to_csv(f"gen_files/{lang}/augmented/task_aspects_RD.csv", index_label=False)

In [6]:
# Back Translation
def augment_by_bt(df, taskAspects_df, lang):
    """Write back-translation ("BT") variants of the task tables to CSV.

    Each description is passed through ``naw.BackTranslationAug`` configured with
    a pair of ``facebook/wmt19`` models: en->de->en for ``"EN"`` and de->en->de
    for ``"DE"``. Both input frames are copied first, so the caller's objects
    are not modified.

    Args:
        df: Task table with at least the columns ``description`` and ``taskId``.
        taskAspects_df: Task-aspect table with a ``taskId`` column.
        lang: ``"EN"`` or ``"DE"``; also used as the sub-directory of ``gen_files``.

    Returns:
        None. The results are written to
        ``gen_files/{lang}/augmented/tasks_BT.csv`` and
        ``gen_files/{lang}/augmented/task_aspects_BT.csv``.

    Raises:
        ValueError: If ``lang`` is neither ``"EN"`` nor ``"DE"``.
    """
    if lang == "EN":
        from_model_name = 'facebook/wmt19-en-de'
        to_model_name = 'facebook/wmt19-de-en'
    elif lang == "DE":
        from_model_name = 'facebook/wmt19-de-en'
        to_model_name = 'facebook/wmt19-en-de'
    else:
        # Raising a plain string is itself a TypeError in Python 3 and would hide
        # this message; raise a real exception instead.
        raise ValueError(f"Language {lang} not supported.")

    back_translation_aug = naw.BackTranslationAug(
        from_model_name=from_model_name,
        to_model_name=to_model_name
        )
    _df = df.copy()
    _taskAspects_df = taskAspects_df.copy()
    # augment() is indexed with [0]: only the first returned item is kept.
    _df['description'] = _df['description'].apply(lambda text: back_translation_aug.augment(text)[0])
    # Recompute the word count from the augmented description.
    _df["word_count"] = _df["description"].apply(lambda s: len(s.split()))
    # Tag the ids in both tables with the same suffix so they still match.
    _df["taskId"] = _df["taskId"] + "_BT"
    _taskAspects_df["taskId"] = _taskAspects_df["taskId"] + "_BT"
    _df.to_csv(f"gen_files/{lang}/augmented/tasks_BT.csv", index_label=False)
    _taskAspects_df.to_csv(f"gen_files/{lang}/augmented/task_aspects_BT.csv", index_label=False)

<hr>

In [7]:
# Load the open-task tables for both languages and drop the rows that have no
# description, since the description is the text the augmenters operate on.
df_de, df_en = (
    pd.read_csv(path).dropna(subset=["description"])
    for path in (
        "gen_files/DE/all_concept_open_tasks.csv",
        "gen_files/EN/all_concept_open_tasks.csv",
    )
)

In [9]:
# Load the task-aspect tables for both languages (read as-is, no filtering).
df_taskAspects_de, df_taskAspects_en = (
    pd.read_csv(path)
    for path in (
        "gen_files/DE/concept_task_aspects.csv",
        "gen_files/EN/concept_task_aspects.csv",
    )
)

In [10]:
# Run every augmentation for each language. The (tasks, aspects, language)
# triples are given to tqdm as a list rather than a bare zip(): a zip object has
# no length, so tqdm could only show an iteration counter without a total.
for df, taskAspects_df, lang in tqdm([
    (df_en, df_taskAspects_en, "EN"),
    (df_de, df_taskAspects_de, "DE"),
]):
    print(f"\n{lang}\n")
    print("Augmenting by SR...")
    augment_by_sr(df, taskAspects_df, lang)
    print("Augmenting by RI...")
    augment_by_ri(df, taskAspects_df, lang)
    print("Augmenting by RS...")
    augment_by_rs(df, taskAspects_df, lang)
    print("Augmenting by RD...")
    augment_by_rd(df, taskAspects_df, lang)
    print("Augmenting by BT...")
    augment_by_bt(df, taskAspects_df, lang)


2it [00:00, 31.82it/s]


EN

Augmenting by SR...
Augmenting by RI...
Augmenting by RS...
Augmenting by RD...
Augmenting by BT...

DE

Augmenting by SR...
Augmenting by RI...
Augmenting by RS...
Augmenting by RD...
Augmenting by BT...





In [38]:
# Sanity check: print the number of distinct taskIds in the original and in each
# augmented task-aspects file.
# NOTE(review): these files are read from data/ under names that differ from the
# ones the augment_by_* functions write -- presumably they were moved or renamed
# in between; confirm.
filenames = [
    "taskAspects_EN",
    "taskAspects_BT_EN", "taskAspects_RD_EN", "taskAspects_RI_EN",
    "taskAspects_RS_EN", "taskAspects_SR_EN",
]
for f in filenames:
    d = pd.read_csv(f"data/{f}.csv")
    print(len(d["taskId"].unique()))

951
951
951
951
951
951


In [26]:
# Number of rows in the back-translated EN file.
d = pd.read_csv("data/generated/augmented_BT_EN.csv")
d.shape[0]

1129

<hr>

Reference (nlpaug textual augmenter examples): https://github.com/makcedward/nlpaug/blob/master/example/textual_augmenter.ipynb