In [1]:
%load_ext autoreload
%autoreload 2
import importlib

In [2]:
import os
os.environ["MODEL_DIR"] = '../model'

# Config

In [3]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc

from nlpaug.util import Action

# Synonym Augmenter (WordNet, Spanish)

In [8]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [9]:
text = 'Un rápido zorro marrón salta sobre el perro perezoso'
aug = naw.SynonymAug(aug_src='wordnet', lang='spa')
augmented_text = aug.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

LookupError: 
**********************************************************************
  Resource [93mwordnet[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('wordnet')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/wordnet[0m

  Searched in:
    - '/root/nltk_data'
    - '/opt/conda/nltk_data'
    - '/opt/conda/share/nltk_data'
    - '/opt/conda/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


# Word Embeddings Augmenter (word2vec, French)

In [None]:
augmented_text = aug.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

# Word Embeddings Augmenter (fasttext, Japanese)

In [None]:
# https://github.com/taishi-i/nagisa
import nagisa
def tokenizer(x):
    return nagisa.tagging(text).words

text = '速い茶色の狐が怠惰なな犬を飛び越えます'
aug = naw.WordEmbsAug(model_type='fasttext', tokenizer=tokenizer,
                      model_path=os.path.join(os.environ.get("MODEL_DIR"), 'wiki.ja.vec'))
augmented_text = aug.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

# Contextual Word Embeddings Augmenter (BERT)

In [None]:
# Augment French by BERT
aug = naw.ContextualWordEmbsAug(model_path='bert-base-multilingual-uncased', aug_p=0.1)
text = "Bonjour, J'aimerais une attestation de l'employeur certifiant que je suis en CDI."
augmented_text = aug.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

In [5]:
# Augment Japanese by BERT
aug = naw.ContextualWordEmbsAug(model_path='bert-base-multilingual-uncased', aug_p=0.1)
text = '速い茶色の狐が怠惰なな犬を飛び越えます'
augmented_text = aug.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

HBox(children=(IntProgress(value=0, description='Downloading', max=625, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=672271273, style=ProgressStyle(description_…




HBox(children=(IntProgress(value=0, description='Downloading', max=28, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Downloading', max=871891, style=ProgressStyle(description_wid…




HBox(children=(IntProgress(value=0, description='Downloading', max=1715180, style=ProgressStyle(description_wi…


Original:
速い茶色の狐が怠惰なな犬を飛び越えます
Augmented Text:
速 い 茶 色 の 狐 か 怠 惰 なな 犬 かに 飛 ひ 越 えます


In [6]:
# Augment Spanish by BERT
aug = naw.ContextualWordEmbsAug(model_path='bert-base-multilingual-uncased', aug_p=0.1)
text = 'Un rápido zorro marrón salta sobre el perro perezoso'
augmented_text = aug.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

Original:
Un rápido zorro marrón salta sobre el perro perezoso
Augmented Text:
un rapido zorro marron salta sobre el perro si
