# Text Augmentations

`textattack` is a popular library for adversarial attacks and text augmentation.

    pip install textattack

Use the pre-trained `MarianMT` model from the `transformers` library for back-translation.

In [1]:
import random

from textattack.augmentation import WordNetAugmenter


# Original texts: five captions, one string per caption.
texts = [
    "a child in a pink dress is climbing up a set of stairs in an entry way .",
    "a girl going into a wooden building .",
    "a little girl climbing into a wooden playhouse .",
    "a little girl climbing the stairs to her playhouse .",
    "a little girl in a pink dress going into a wooden cabin .",
]
# Show the captions one per line, followed by a blank line.
print("\n".join(texts), end="\n\n")

# Reorder the captions in place (this mutates `texts`), then merge them
# into a single space-separated passage.
random.shuffle(texts)
t1 = " ".join(texts)

# WordNetAugmenter leverages WordNet to replace some words with their synonyms.
# The result of `augment` is indexed with [0] to take its first entry.
augmenter = WordNetAugmenter()
t2 = augmenter.augment(t1)[0]

print(f"Shuffled text:\n\t{t1}\n")
print(f"Augmented text:\n\t{t2}\n")

# Back Translation: Translating a sentence to another language and then back
# to the original language, which can introduce paraphrases and variations.


  import pkg_resources


a child in a pink dress is climbing up a set of stairs in an entry way .
a girl going into a wooden building .
a little girl climbing into a wooden playhouse .
a little girl climbing the stairs to her playhouse .
a little girl in a pink dress going into a wooden cabin .



[nltk_data] Downloading package omw-1.4 to /home/pavlenko/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Shuffled text:
	a girl going into a wooden building . a little girl climbing into a wooden playhouse . a child in a pink dress is climbing up a set of stairs in an entry way . a little girl in a pink dress going into a wooden cabin . a little girl climbing the stairs to her playhouse .

Augmented text:
	a girl lead into a wooden building . a little girl mounting into a wooden playhouse . a child in a pink crop is climbing up a typeset of stairs in an entry way . a little missy in a pink dress going into a wooden cabin . a little girl climbing the stairs to her playhouse .



In [17]:
import transformers as hf


def translate(text, lang1, lang2):
    """
    Translates text using a pre-trained Helsinki-NLP MarianMT checkpoint.

    Args:
        text (str): The input text to translate.
        lang1 (str): The source language code (e.g., 'en').
        lang2 (str): The target language code (e.g., 'fr').

    Returns:
        str: The first generated sequence, decoded with special tokens skipped.
    """
    # The checkpoint name is built from the language pair; the tokenizer and
    # the model are loaded again on every call.
    checkpoint = f"Helsinki-NLP/opus-mt-{lang1}-{lang2}"
    tokenizer = hf.MarianTokenizer.from_pretrained(checkpoint)
    model = hf.MarianMTModel.from_pretrained(checkpoint, use_safetensors=True)

    # Translate from language-1 to language-2
    encoded = tokenizer(text, return_tensors="pt", padding=True)
    generated = model.generate(**encoded)

    # Only the first generated sequence is decoded and returned.
    return tokenizer.decode(generated[0], skip_special_tokens=True)


def back_translate(text, lang1, lang2):
    """
    Round-trips a text through an intermediate language.

    The text is translated from `lang1` to `lang2` and the result is then
    translated from `lang2` back to `lang1`.

    Args:
        text (str): The input text to back-translate.
        lang1 (str): The source language code (e.g., 'en').
        lang2 (str): The intermediate target language code (e.g., 'fr').

    Returns:
        tuple: A pair `(back_translated, translated)`, where `translated` is
            the intermediate text in `lang2` and `back_translated` is that
            text translated back into `lang1`.
    """
    intermediate = translate(text, lang1, lang2)
    round_trip = translate(intermediate, lang2, lang1)
    # The back-translation comes first, the intermediate translation second.
    return round_trip, intermediate


# Example usage for data augmentation in neural network training
# Language codes (keys) mapped to their display names (values).
# NOTE(review): back-translation needs a published checkpoint for both
# directions of every pair (en -> code and code -> en) — confirm for each code.
langs = {
    "en": "English",
    "fr": "French",
    "es": "Spanish",
    "de": "German",
    "ru": "Russian",
    "zh": "Chinese",
    "ar": "Arabic",
    "ja": "Japanese",
    "nl": "Dutch",
    "hi": "Hindi",
}
# Pick one language code (a dictionary key) at random
l = random.choice(list(langs))

if l != "en":
    # Round trip: English -> chosen language -> English
    t3, t2_translated = back_translate(t2, "en", l)
else:
    t3, t2_translated = t2, t2  # do not translate from English

print(f"Translated to {langs[l]}:\n\t{t2_translated}\n")
print(f"Back-translated:\n\t{t3}\n")

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/790k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/814k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/316M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/814k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/790k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/316M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Translated to Dutch:
	een meisje leiden in een houten gebouw. een klein meisje montage in een houten speelhuis . een kind in een roze gewas is klimmen een typeset van trappen op een instap manier . een beetje missy in een roze jurk gaan in een houten hut . een klein meisje klimmen de trap naar haar speelhuis .

Back-translated:
	lead a girl into a wooden building. a little girl fitting into a wooden playhouse. a child in a pink crop is climbing a typeset of stairs in a step way. a little missy in a pink dress going into a wooden cabin. a little girl climbing the stairs to her playhouse.

