In [2]:
# One-time setup, left commented out so that re-running the notebook does not
# reinstall anything. The recorded run below installed nlpaug 1.1.11:
# %pip install -q nlpaug==1.1.11

import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nlpaug
Successfully installed nlpaug-1.1.11


In [19]:
import nltk

# NLTK data fetched up front: a tagger model plus the WordNet corpora.
# Presumably these back the WordNet-based augmenters used later
# (AntonymAug / SynonymAug) - verify against the nlpaug docs.
nltk.download("averaged_perceptron_tagger_eng")
nltk.download("wordnet")
# Last expression of the cell, so its return value is what the cell displays.
nltk.download("omw-1.4")

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [12]:
def get_augmented(input_text, augmenter):
    """Augment ``input_text`` and print it next to the augmented result.

    Args:
        input_text: Text handed unchanged to ``augmenter.augment``.
        augmenter: Any object exposing an ``augment(text)`` method.

    Returns:
        None. Both the original and the augmented text are only printed.
    """
    # Run the augmentation first so nothing is printed if it fails.
    result = augmenter.augment(input_text)
    report = (
        ("Original:", input_text),
        ("Augmented Text:", result),
    )
    for caption, value in report:
        print(caption)
        print(value)

In [13]:
# Sample sentence reused by most of the augmentation demos below.
text = "There are many tasks in NLP"

### Character level augmentation

In [14]:
# Simulate a keyboard typo: a character is replaced by one chosen by its
# distance on the keyboard. All include_* flags are switched off here; the
# recorded output contains only lower-case letter substitutions.
get_augmented(
    input_text=text,
    augmenter=nac.KeyboardAug(
        aug_char_p=0.1,
        aug_word_min=2,
        aug_word_max=5,
        include_numeric=False,
        include_upper_case=False,
        include_special_char=False,
    ),
)

Original:
There are many tasks in NLP
Augmented Text:
['Therr are kany tasks in NLP']


In [15]:
# Same keyboard-typo simulation (replacement chosen by keyboard distance), this
# time without the include_* restrictions: the recorded output also contains
# digits, symbols and upper-case letters.
get_augmented(
    input_text=text,
    augmenter=nac.KeyboardAug(aug_word_p=0.2, aug_word_max=5),
)

Original:
There are many tasks in NLP
Augmented Text:
['Th2$e are many taCJs in NLP']


In [16]:
# Simulate OCR recognition errors (look-alike characters, e.g. "I" read as "1").
get_augmented(
    input_text=text,
    augmenter=nac.OcrAug(aug_word_p=0.3, aug_word_max=10),
)

Original:
There are many tasks in NLP
Augmented Text:
['Theke ake many tasks in NLP']


In [17]:
# Simulate a random typo (arbitrary character changes inside words).
get_augmented(
    input_text=text,
    augmenter=nac.RandomCharAug(aug_word_p=0.2, aug_word_max=10),
)

Original:
There are many tasks in NLP
Augmented Text:
['Fh3re are many tYskk in NLP']


### Word level augmentation

#### Замена слов по смыслу

In [20]:
# Meaning-based replacement: swap words for their antonyms.
get_augmented(input_text=text, augmenter=naw.AntonymAug(aug_p=0.2))

Original:
There are many tasks in NLP
Augmented Text:
['There differ few tasks in NLP']


In [21]:
# Meaning-based replacement: swap words for their synonyms.
get_augmented(input_text=text, augmenter=naw.SynonymAug(aug_p=0.2))

Original:
There are many tasks in NLP
Augmented Text:
['At that place are many tasks in natural language processing']


In [27]:
# Synonym replacement again with identical settings: the recorded output differs
# from the previous cell, i.e. the result changes from run to run.
get_augmented(input_text=text, augmenter=naw.SynonymAug(aug_p=0.2))

Original:
There are many tasks in NLP
Augmented Text:
['There are many labor in NLP']


#### Операции со словами

In [22]:
# Insert spaces at random positions, splitting words apart.
get_augmented(input_text=text, augmenter=naw.SplitAug(aug_p=0.2))

Original:
There are many tasks in NLP
Augmented Text:
['There are ma ny t asks in NLP']


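Действия (`action`) в `naw.RandomWordAug`:

- `'substitute'` — заменяет случайные слова в тексте на другие. Пример: `NLP is fun → AI is fun`. В запуске ниже, без дополнительных параметров, слова заменяются на заглушку `_`.
- `'swap'` — меняет местами пары слов. Пример: `I love NLP → Love I NLP`.
- `'delete'` — удаляет случайные слова. Пример: `There are many tasks → There are tasks`.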


In [23]:
# Remove random words from the sentence.
get_augmented(
    input_text=text,
    augmenter=naw.RandomWordAug(action='delete', aug_p=0.2),
)

Original:
There are many tasks in NLP
Augmented Text:
['There are tasks NLP']


In [24]:
# Substitute random words; in the recorded run the substitutes are the "_"
# placeholder rather than real words.
get_augmented(
    input_text=text,
    augmenter=naw.RandomWordAug(action='substitute', aug_p=0.2),
)

Original:
There are many tasks in NLP
Augmented Text:
['There _ _ tasks in NLP']


In [37]:
# Swap the positions of random word pairs.
get_augmented(
    input_text=text,
    augmenter=naw.RandomWordAug(action="swap", aug_p=0.3),
)

Original:
There are many tasks in NLP
Augmented Text:
['There many are tasks NLP in']


#### Back Translation

In [30]:
# One-time setup for the back-translation demo (left commented out):
# %pip install -q sacremoses transformers

In [31]:
# Back translation through a pair of translation checkpoints; judging by their
# names the round trip is English -> German -> English.
get_augmented(
    input_text=text,
    augmenter=naw.BackTranslationAug(
        from_model_name='facebook/wmt19-en-de',
        to_model_name='facebook/wmt19-de-en',
    ),
)

Some weights of FSMTForConditionalGeneration were not initialized from the model checkpoint at facebook/wmt19-en-de and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of FSMTForConditionalGeneration were not initialized from the model checkpoint at facebook/wmt19-de-en and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/67.0 [00:00<?, ?B/s]

vocab-src.json:   0%|          | 0.00/849k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/315k [00:00<?, ?B/s]

Original:
There are many tasks in NLP
Augmented Text:
['NLP has many tasks']


### Sentence Level Augmentation

In [32]:
# Sentence-level augmentation backed by the distilgpt2 checkpoint. In the
# recorded run the only visible change is a trailing " ." added to the input.
get_augmented(
    input_text=text,
    augmenter=nas.ContextualWordEmbsForSentenceAug(model_path='distilgpt2'),
)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Original:
There are many tasks in NLP
Augmented Text:
['There are many tasks in NLP .']


#### Abstractive Summarization

In [33]:
# A longer passage to summarize; the short sample sentence is not used here.
article = """The history of natural language processing (NLP) generally started in the 1950s..."""
# Abstractive summarization with the t5-base checkpoint.
get_augmented(input_text=article, augmenter=nas.AbstSummAug(model_path='t5-base'))

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Original:
The history of natural language processing (NLP) generally started in the 1950s...
Augmented Text:
['natural language processing (NLP) generally started in the 1950s . the history of the process generally started around the same time .']


## Дополнительные методы аугментации текста

#### Вставка слов с учётом контекста (Contextual Insertion)
Использует BERT или аналогичную модель для генерации слов, которые логично вставить в контекст.

**Параметры:**
- `model_path`: huggingface ID модели
- `action`: `insert` или `substitute`

**Совет:** такие модели хорошо работают для английского и требуют загрузки весов.


In [35]:
# Context-aware insertion: a BERT checkpoint proposes extra words that fit the
# surrounding context. The recorded output is lower-cased - presumably because
# the checkpoint is the uncased one.
get_augmented(
    input_text=text,
    augmenter=naw.ContextualWordEmbsAug(
        model_path='bert-base-uncased',
        action="insert",
    ),
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Original:
There are many tasks in NLP
Augmented Text:
['assume there are generally many tasks in general nlp']


### Перефразирование (Paraphrasing)
Генерация перефразированных предложений с помощью seq2seq моделей (`T5`, `PEGASUS` и др.)

**Совет:** Лучше работает с длинными предложениями. Используется в задачах генерации данных, обучения устойчивости модели и улучшения генерации.


In [38]:
from transformers import pipeline

# Seq2seq paraphrasing with a T5 checkpoint fine-tuned for the task.
paraphraser = pipeline(task="text2text-generation", model="Vamsi/T5_Paraphrase_Paws")
# The input carries a "paraphrase: " prefix. Sampling is enabled, so the
# generated text is expected to vary between runs. Kept as the last expression
# so the cell displays the pipeline's return value.
paraphraser("paraphrase: " + text, do_sample=True, max_length=100)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Device set to use cpu


[{'generated_text': 'NLP exists for many tasks .'}]

### AugLy (Facebook)

AugLy — библиотека от Meta, которая поддерживает визуальные, аудио- и текстовые искажения. Для текста можно симулировать разные искажения: повторы, заглавные буквы, лишнюю пунктуацию и т.д.

**Совет:** Хороша для социальных сетей или robustness-тестов.


In [3]:
# One-time setup for the AugLy demo (left commented out):
# %pip install -q augly

In [15]:
import augly.text as textaugs

# Same sample sentence as earlier in the notebook, re-assigned here so the
# AugLy cells can run on their own.
text = "There are many tasks in NLP"
# Default settings: in the recorded output a ";" is inserted between every pair
# of characters, spaces included, and the result comes back as a list.
aug_text = textaugs.insert_punctuation_chars(text)
print(aug_text)

['T;h;e;r;e; ;a;r;e; ;m;a;n;y; ;t;a;s;k;s; ;i;n; ;N;L;P']


In [24]:
# Word-level variant: in the recorded output the punctuation is inserted inside
# each word (spaces stay untouched), roughly every second character, and the
# inserted marks differ from place to place.
aug_text = textaugs.insert_punctuation_chars(
    text,
    granularity="word",
    cadence=2,
    vary_chars=True,
)
print(aug_text)

['Th!er;e ar;e ma.ny ta...sk?s in NL?P']


### Аугментация с помощью TextAttack

TextAttack — мощный инструмент для генерации атак и аугментаций. Поддерживает синонимы, swap, insert, delete и paraphrase.

Используется в основном для английского языка. Может требовать установки пакета `textattack`.

In [6]:
# One-time setup for the TextAttack demo (left commented out):
# %pip install -q textattack

In [14]:
from textattack.augmentation import WordNetAugmenter

# WordNet-based word replacement that returns several variants per input.
augmenter = WordNetAugmenter(
    pct_words_to_swap=0.3,            # fraction of the words in the text to replace
    transformations_per_example=3,    # 3 variants per input
)

# Last expression of the cell: the list of augmented variants is displayed.
augmenter.augment("There are many tasks in NLP")

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


['There are many labor in NLP',
 'There are many task in NLP',
 'There are many undertaking in NLP']

### Многоязычный обратный перевод (Multilingual Back Translation)
Перевод через цепочку языков помогает разнообразить структуру текста и избежать дословного возврата.

**Совет:** Используй последовательность: EN → FR → DE → ES → EN или другие цепочки. Требует нескольких моделей.


### Советы:
- Не стоит валидироваться на аугментированных данных и вообще переоценивать их пользу.
- При использовании k-fold валидации аугментированные копии должны попадать в тот же фолд, что и их исходный пример, иначе возникает утечка между обучающей и валидационной частями.
- Универсальной для всех случаев аугментации не существует.
- Можно их миксовать.
- При этом не всегда увеличение количества данных будет влиять на качество :с
- Выбирайте техники в зависимости от задачи: классификация, генерация, извлечение сущностей и т.п.
- Проверяйте, не нарушена ли семантика и грамматика после аугментации.
- Сохраняйте оригинальные и аугментированные версии с метками для анализа.

### 📚 Что ещё почитать?
- [Data Augmentation Using Pre-trained Transformer Models](https://arxiv.org/abs/2003.02245)  
- [Аугментация для текстов](https://habr.com/ru/articles/649447/)
