In [1]:
import warnings
# NOTE: with no category/module filter this hides *every* warning raised
# after this cell runs (including deprecation notices from the imported
# libraries). Narrow the filter if a warning needs to be investigated.
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
from textaugment import EDA

In [3]:
# Single textaugment EDA instance shared by the augment_by_sr / augment_by_ri /
# augment_by_rs / augment_by_rd helpers defined in this cell.
t = EDA()

# Synonym replacement
def augment_by_sr(df, output_file_suff, n=2):
    """Write a synonym-replaced version of ``df`` to ``data/augmented_SR_<suff>.csv``.

    Every value of the ``description`` column is passed through
    ``t.synonym_replacement(text, n=n)``. The augmented column is placed on a
    new DataFrame; the frame passed in by the caller is NOT modified, so the
    same source frame can be handed to several augmenters without their
    effects stacking on top of each other.

    Args:
        df: DataFrame containing a ``description`` column of texts.
        output_file_suff: Suffix inserted into the output file name
            (the notebook uses ``"EN"`` / ``"DE"``).
        n: Forwarded unchanged to ``t.synonym_replacement`` as ``n``.

    Returns:
        None. The result is only written to disk.

    Raises:
        KeyError: If ``df`` has no ``description`` column.
    """
    augmented = df.assign(
        description=df['description'].apply(
            lambda text: t.synonym_replacement(text, n=n)
        )
    )
    # index_label=False is kept from the original: the index column is still
    # written, only its header field is omitted.
    # NOTE(review): if the index is not wanted in the file, index=False may
    # have been intended - confirm against the consumers of these CSVs.
    augmented.to_csv("data/augmented_SR_"+output_file_suff+".csv", index_label=False)

# Random insertion
def augment_by_ri(df, output_file_suff, n=2):
    """Write a random-insertion version of ``df`` to ``data/augmented_RI_<suff>.csv``.

    Every value of the ``description`` column is passed through
    ``t.random_insertion(text, n=n)``. The augmented column is placed on a
    new DataFrame; the frame passed in by the caller is NOT modified, so the
    same source frame can be handed to several augmenters without their
    effects stacking on top of each other.

    Args:
        df: DataFrame containing a ``description`` column of texts.
        output_file_suff: Suffix inserted into the output file name
            (the notebook uses ``"EN"`` / ``"DE"``).
        n: Forwarded unchanged to ``t.random_insertion`` as ``n``.

    Returns:
        None. The result is only written to disk.

    Raises:
        KeyError: If ``df`` has no ``description`` column.
    """
    augmented = df.assign(
        description=df['description'].apply(
            lambda text: t.random_insertion(text, n=n)
        )
    )
    # index_label=False is kept from the original: the index column is still
    # written, only its header field is omitted.
    # NOTE(review): if the index is not wanted in the file, index=False may
    # have been intended - confirm against the consumers of these CSVs.
    augmented.to_csv("data/augmented_RI_"+output_file_suff+".csv", index_label=False)

# Random swap
def augment_by_rs(df, output_file_suff):
    """Write a random-swap version of ``df`` to ``data/augmented_RS_<suff>.csv``.

    Every value of the ``description`` column is passed through
    ``t.random_swap(text)`` (library defaults, no extra arguments). The
    augmented column is placed on a new DataFrame; the frame passed in by the
    caller is NOT modified, so the same source frame can be handed to several
    augmenters without their effects stacking on top of each other.

    Args:
        df: DataFrame containing a ``description`` column of texts.
        output_file_suff: Suffix inserted into the output file name
            (the notebook uses ``"EN"`` / ``"DE"``).

    Returns:
        None. The result is only written to disk.

    Raises:
        KeyError: If ``df`` has no ``description`` column.
    """
    augmented = df.assign(
        description=df['description'].apply(lambda text: t.random_swap(text))
    )
    # index_label=False is kept from the original: the index column is still
    # written, only its header field is omitted.
    # NOTE(review): if the index is not wanted in the file, index=False may
    # have been intended - confirm against the consumers of these CSVs.
    augmented.to_csv("data/augmented_RS_"+output_file_suff+".csv", index_label=False)

# Random deletion
def augment_by_rd(df, output_file_suff, p=0.4):
    """Write a random-deletion version of ``df`` to ``data/augmented_RD_<suff>.csv``.

    Every value of the ``description`` column is passed through
    ``t.random_deletion(text, p=p)``. The augmented column is placed on a
    new DataFrame; the frame passed in by the caller is NOT modified, so the
    same source frame can be handed to several augmenters without their
    effects stacking on top of each other.

    Args:
        df: DataFrame containing a ``description`` column of texts.
        output_file_suff: Suffix inserted into the output file name
            (the notebook uses ``"EN"`` / ``"DE"``).
        p: Forwarded unchanged to ``t.random_deletion`` as ``p``.

    Returns:
        None. The result is only written to disk.

    Raises:
        KeyError: If ``df`` has no ``description`` column.
    """
    augmented = df.assign(
        description=df['description'].apply(
            lambda text: t.random_deletion(text, p=p)
        )
    )
    # index_label=False is kept from the original: the index column is still
    # written, only its header field is omitted.
    # NOTE(review): if the index is not wanted in the file, index=False may
    # have been intended - confirm against the consumers of these CSVs.
    augmented.to_csv("data/augmented_RD_"+output_file_suff+".csv", index_label=False)

In [4]:
# Back Translation
def augment_by_bt(df, output_file_suff):
    """Write a back-translated version of ``df`` to ``data/augmented_BT_<suff>.csv``.

    The suffix selects the translation round trip:

    * ``"EN"``: ``facebook/wmt19-en-de`` then ``facebook/wmt19-de-en``
    * ``"DE"``: ``facebook/wmt19-de-en`` then ``facebook/wmt19-en-de``

    Every value of the ``description`` column is passed through
    ``naw.BackTranslationAug(...).augment(text)`` and the first element of the
    result is kept. The augmented column is placed on a new DataFrame; the
    frame passed in by the caller is NOT modified.

    Args:
        df: DataFrame containing a ``description`` column of texts.
        output_file_suff: ``"EN"`` or ``"DE"``; also inserted into the output
            file name.

    Returns:
        None. The result is only written to disk.

    Raises:
        ValueError: If ``output_file_suff`` is neither ``"EN"`` nor ``"DE"``.
        KeyError: If ``df`` has no ``description`` column.
    """
    # (from_model_name, to_model_name) per supported suffix.
    model_pairs = {
        "EN": ('facebook/wmt19-en-de', 'facebook/wmt19-de-en'),
        "DE": ('facebook/wmt19-de-en', 'facebook/wmt19-en-de'),
    }
    # Validate before touching the (expensive to load) translation models.
    # The original `raise "..."` raised TypeError instead of the intended
    # error, because a plain string is not an exception.
    if output_file_suff not in model_pairs:
        raise ValueError(
            "Bad output_file_suff: {!r} (expected one of {})".format(
                output_file_suff, sorted(model_pairs)
            )
        )
    from_model_name, to_model_name = model_pairs[output_file_suff]

    back_translation_aug = naw.BackTranslationAug(
        from_model_name=from_model_name,
        to_model_name=to_model_name
        )
    augmented = df.assign(
        description=df['description'].apply(
            lambda text: back_translation_aug.augment(text)[0]
        )
    )
    # index_label=False is kept from the original: the index column is still
    # written, only its header field is omitted.
    # NOTE(review): if the index is not wanted in the file, index=False may
    # have been intended - confirm against the consumers of these CSVs.
    augmented.to_csv("data/augmented_BT_"+output_file_suff+".csv", index_label=False)

<hr>

In [5]:
# Read the EN and DE task tables, then discard rows whose description is missing.
df_en = pd.read_csv("data/open_tasks_EN.csv")
df_en = df_en.dropna(subset=["description"])
df_de = pd.read_csv("data/open_tasks_DE.csv")
df_de = df_de.dropna(subset=["description"])

In [None]:
# Pass a copy so df_de still holds the original descriptions afterwards;
# otherwise the loop further down would augment already back-translated text.
augment_by_bt(df_de.copy(), "DE")

Some weights of FSMTForConditionalGeneration were not initialized from the model checkpoint at facebook/wmt19-de-en and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of FSMTForConditionalGeneration were not initialized from the model checkpoint at facebook/wmt19-en-de and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Every augmenter receives its own copy of the source frame so that each output
# file is derived from the original descriptions. Passing the same frame to all
# of them would chain the augmentations (RI on top of SR, RS on top of RI, ...)
# whenever a helper modifies its argument.
for df, suff in zip([df_en, df_de], ["EN", "DE"]):
    augment_by_sr(df.copy(), suff)
    augment_by_ri(df.copy(), suff)
    augment_by_rs(df.copy(), suff)
    augment_by_rd(df.copy(), suff)
    augment_by_bt(df.copy(), suff)

<hr>

Reference: [nlpaug textual augmenter examples](https://github.com/makcedward/nlpaug/blob/master/example/textual_augmenter.ipynb)