This notebook explores different text augmentation techniques. <hr>

In [1]:
import warnings
# Suppress every Python warning for the rest of the session. This keeps cell
# output short, but it also hides deprecation notices raised by the libraries
# imported afterwards.
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
from textaugment import EDA

In [3]:
# Shared textaugment EDA instance; the synonym-replacement, random-insertion,
# random-swap and random-deletion helpers below all call its methods.
t = EDA()

# Synonym replacement
def augment_by_sr(df, output_file_suff, n=2):
    """Write a synonym-replacement-augmented version of ``df`` to CSV.

    Every ``description`` is passed through ``t.synonym_replacement`` and
    ``word_count`` is recomputed from the augmented text. The result is saved
    to ``data/augmented_SR_<output_file_suff>.csv``.

    The augmentation is applied to a copy, so the caller's frame is left
    unchanged. Overwriting ``df['description']`` in place would make any
    later augmentation of the same frame operate on already-augmented text.

    Args:
        df: DataFrame with a string ``description`` column.
        output_file_suff: Suffix appended to the output file name.
        n: Forwarded to ``t.synonym_replacement`` as ``n``.

    Returns:
        None. The only result is the CSV file written to disk.
    """
    augmented = df.copy()
    augmented["description"] = augmented["description"].apply(
        lambda text: t.synonym_replacement(text, n=n)
    )
    augmented["word_count"] = augmented["description"].apply(lambda s: len(s.split()))
    augmented.to_csv("data/augmented_SR_" + output_file_suff + ".csv", index_label=False)

# Random insertion
def augment_by_ri(df, output_file_suff, n=2):
    """Write a random-insertion-augmented version of ``df`` to CSV.

    Every ``description`` is passed through ``t.random_insertion`` and
    ``word_count`` is recomputed from the augmented text. The result is saved
    to ``data/augmented_RI_<output_file_suff>.csv``.

    The augmentation is applied to a copy, so the caller's frame is left
    unchanged. Overwriting ``df['description']`` in place would make any
    later augmentation of the same frame operate on already-augmented text.

    Args:
        df: DataFrame with a string ``description`` column.
        output_file_suff: Suffix appended to the output file name.
        n: Forwarded to ``t.random_insertion`` as ``n``.

    Returns:
        None. The only result is the CSV file written to disk.
    """
    augmented = df.copy()
    augmented["description"] = augmented["description"].apply(
        lambda text: t.random_insertion(text, n=n)
    )
    augmented["word_count"] = augmented["description"].apply(lambda s: len(s.split()))
    augmented.to_csv("data/augmented_RI_" + output_file_suff + ".csv", index_label=False)

# Random swap
def augment_by_rs(df, output_file_suff):
    """Write a random-swap-augmented version of ``df`` to CSV.

    Every ``description`` is passed through ``t.random_swap`` (library
    defaults) and ``word_count`` is recomputed from the augmented text. The
    result is saved to ``data/augmented_RS_<output_file_suff>.csv``.

    The augmentation is applied to a copy, so the caller's frame is left
    unchanged. Overwriting ``df['description']`` in place would make any
    later augmentation of the same frame operate on already-augmented text.

    Args:
        df: DataFrame with a string ``description`` column.
        output_file_suff: Suffix appended to the output file name.

    Returns:
        None. The only result is the CSV file written to disk.
    """
    augmented = df.copy()
    augmented["description"] = augmented["description"].apply(
        lambda text: t.random_swap(text)
    )
    augmented["word_count"] = augmented["description"].apply(lambda s: len(s.split()))
    augmented.to_csv("data/augmented_RS_" + output_file_suff + ".csv", index_label=False)

# Random deletion
def augment_by_rd(df, output_file_suff, p=0.4):
    """Write a random-deletion-augmented version of ``df`` to CSV.

    Every ``description`` is passed through ``t.random_deletion`` and
    ``word_count`` is recomputed from the augmented text. The result is saved
    to ``data/augmented_RD_<output_file_suff>.csv``.

    The augmentation is applied to a copy, so the caller's frame is left
    unchanged. Overwriting ``df['description']`` in place would make any
    later augmentation of the same frame operate on already-augmented text.

    Args:
        df: DataFrame with a string ``description`` column.
        output_file_suff: Suffix appended to the output file name.
        p: Forwarded to ``t.random_deletion`` as ``p``.

    Returns:
        None. The only result is the CSV file written to disk.
    """
    def _delete_words(text):
        result = t.random_deletion(text, p=p)
        # NOTE(review): textaugment's random_deletion appears to return the
        # split word list (not a string) for single-word input - confirm
        # against the installed version. Normalising here keeps the column
        # string-typed so the word count below cannot fail on a list.
        return result if isinstance(result, str) else " ".join(result)

    augmented = df.copy()
    augmented["description"] = augmented["description"].apply(_delete_words)
    augmented["word_count"] = augmented["description"].apply(lambda s: len(s.split()))
    augmented.to_csv("data/augmented_RD_" + output_file_suff + ".csv", index_label=False)

In [4]:
# Back Translation
def augment_by_bt(df, output_file_suff):
    """Write a back-translation-augmented version of ``df`` to CSV.

    Each ``description`` is translated into the other language and back using
    the ``facebook/wmt19`` model pair selected by ``output_file_suff``;
    ``word_count`` is then recomputed. The result is saved to
    ``data/augmented_BT_<output_file_suff>.csv``.

    The augmentation is applied to a copy, so the caller's frame is left
    unchanged.

    Args:
        df: DataFrame with a string ``description`` column.
        output_file_suff: ``"EN"`` (English -> German -> English) or ``"DE"``
            (German -> English -> German). Also used as the file-name suffix.

    Returns:
        None. The only result is the CSV file written to disk.

    Raises:
        ValueError: If ``output_file_suff`` is neither ``"EN"`` nor ``"DE"``.
    """
    if output_file_suff == "EN":
        from_model_name = 'facebook/wmt19-en-de'
        to_model_name = 'facebook/wmt19-de-en'
    elif output_file_suff == "DE":
        from_model_name = 'facebook/wmt19-de-en'
        to_model_name = 'facebook/wmt19-en-de'
    else:
        # Raising a plain string is itself a TypeError in Python 3; raise a
        # real exception so the caller sees the intended message.
        raise ValueError(
            f"Bad output_file_suff: {output_file_suff!r} (expected 'EN' or 'DE')"
        )

    back_translation_aug = naw.BackTranslationAug(
        from_model_name=from_model_name,
        to_model_name=to_model_name
    )

    augmented = df.copy()
    # augment() is indexed with [0] to take the first returned text.
    augmented["description"] = augmented["description"].apply(
        lambda text: back_translation_aug.augment(text)[0]
    )
    augmented["word_count"] = augmented["description"].apply(lambda s: len(s.split()))
    augmented.to_csv("data/augmented_BT_" + output_file_suff + ".csv", index_label=False)

<hr>

In [5]:
# Load the open-task datasets for both languages. Rows with a missing
# description are dropped right away.
df_en = (
    pd.read_csv("data/open_tasks_EN.csv")
    .dropna(subset=["description"])
)
df_de = (
    pd.read_csv("data/open_tasks_DE.csv")
    .dropna(subset=["description"])
)

In [12]:
# Row counts after dropping rows without a description, as (German, English).
len(df_de), len(df_en)

(1556, 299)

In [11]:
# TODO: remove test/placeholder descriptions before augmentation.
# Show the German tasks whose word_count is below 5 to spot such entries.
df_de[df_de["word_count"] < 5]

Unnamed: 0,taskId,language,description,topic_id,word_count
321,adpv7cYKn4i7FWXtH1bUkR,deu,W-Fragen,,1
324,7Q8FOoPPSgy5QwADlK42ZF,deu,Ein Auftrag,,2
408,2lsepl9fCbk5VAYTiXSJry,deu,Einleitung,4818.0,1
409,8zJdrJaIA2kaHjcM00Zpu4,deu,Hauptteil,,1
410,9bXFoC0UCoOaz4cBhGBxNl,deu,Überschrift,4818.0,1
411,4arl7BhMwi87aQZirvCWbE,deu,Schluss,,1
412,XpnIgKjISY9B8IAshuf2f,deu,Beschreibe den Hauptdarsteller,,3
443,1uABTLp95CK7CBfYLNsodE,deu,Hallo aus der Testumgebung,,4
462,78AejMbXpKmaeaRjbVG9Cd,deu,W-Fragen,,1
463,4pMwgNQFRBt8vj9kILEbR9,deu,Ein Auftrag,,2


In [None]:
# Run every augmentation technique on both datasets. Each helper receives its
# own copy of the source frame, so every output file reflects exactly one
# technique applied to the original text (regardless of whether a helper
# modifies the frame it is given), and df_en / df_de stay intact for re-runs.
for df, suff in zip([df_en, df_de], ["EN", "DE"]):
    augment_by_sr(df.copy(), suff)
    augment_by_ri(df.copy(), suff)
    augment_by_rs(df.copy(), suff)
    augment_by_rd(df.copy(), suff)
    augment_by_bt(df.copy(), suff)

<hr>

Reference (nlpaug textual augmenter examples): https://github.com/makcedward/nlpaug/blob/master/example/textual_augmenter.ipynb