# Install and import modules

In [2]:
# Install the most recent version of gensim.
# Otherwise, you may get the following error when running naw.WordEmbsAug():
# 'Word2VecKeyedVectors' object has no attribute 'index_to_key'
# see: https://stackoverflow.com/questions/71032760/word2veckeyedvectors-object-has-no-attribute-index-to-key
!pip install --upgrade gensim --quiet

[33mDEPRECATION: xgbse 0.2.3 has a non-standard dependency specifier pandas>=1.0.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of xgbse or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [3]:
# Import gensim and print the version that was actually loaded.
# Note: You will need to restart the runtime in order to import the most recent version of gensim.
import gensim
print(gensim.__version__)

4.3.2


In [4]:
# Install the transformers module in order to use their base models (e.g., BERT)
!pip install transformers --quiet

[33mDEPRECATION: xgbse 0.2.3 has a non-standard dependency specifier pandas>=1.0.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of xgbse or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [5]:
# Import transformers
import transformers

In [6]:
# Print the version of transformers that was imported
print(transformers.__version__)

4.37.2


In [7]:
# Install the tokenizer needed by the back translation model
!pip install sacremoses --quiet

[33mDEPRECATION: xgbse 0.2.3 has a non-standard dependency specifier pandas>=1.0.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of xgbse or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [8]:
# Import the sacremoses tokenizer package
import sacremoses

In [9]:
# Install the nlpaug module
!pip install nlpaug --quiet

[33mDEPRECATION: xgbse 0.2.3 has a non-standard dependency specifier pandas>=1.0.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of xgbse or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [10]:
# Import the nlpaug module and its methods
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc
from nlpaug.util import Action

# Download Models

In [14]:
# Download the pretrained word2vec vectors into the current working
# directory (dest_dir = '.'), not a temporary location.
# NOTE(review): the recorded run fetched a ~1.65 GB archive and this call has
# no "already downloaded" guard, so re-running the cell presumably downloads
# it again -- confirm before using Run All.
from nlpaug.util.file.download import DownloadUtil
DownloadUtil.download_word2vec(dest_dir = '.')

Downloading...
From (original): https://drive.google.com/uc?export=download&id=0B7XkCwpI5KDYNlNUTTlSS21pQmM
From (redirected): https://drive.google.com/uc?export=download&id=0B7XkCwpI5KDYNlNUTTlSS21pQmM&confirm=t&uuid=ab70fa65-6126-4305-9bfb-f567e1192e2c
To: /Users/farhan/Downloads/Tutorial-3/GoogleNews-vectors-negative300.bin.gz
100%|██████████████████████████████████████| 1.65G/1.65G [01:13<00:00, 22.4MB/s]


In [10]:
import os
from nlpaug.util.file.download import DownloadUtil

def ensure_directory_exists(directory):
    """Ensure ``directory`` exists, creating it (and any missing parents) if needed.

    Prints whether the directory was created or was already there.

    Args:
        directory: Path of the directory to check or create.

    Raises:
        FileExistsError: If ``directory`` exists but is not a directory
            (for example, a regular file with the same name).
    """
    # isdir (rather than exists) so a regular file at this path is not
    # mistaken for the directory we need.
    if os.path.isdir(directory):
        print(f"Directory already exists: {directory}")
        return

    # exist_ok=True tolerates the directory appearing between the check above
    # and this call; it still raises if a non-directory occupies the path.
    os.makedirs(directory, exist_ok=True)
    print(f"Created directory: {directory}")

# Archive names left behind by a download; the augmenters in this notebook
# load the extracted files (.bin / .vec / .txt), so an archive alone does not
# prove that the model is usable.
_ARCHIVE_SUFFIXES = ('.zip', '.gz', '.tgz', '.tar')


def _is_populated(path):
    """Return True if ``path`` is a file, or a directory with at least one entry."""
    if os.path.isdir(path):
        return len(os.listdir(path)) > 0
    return os.path.isfile(path)


def download_model_if_not_exists(dest_dir, model_function, model_name=None, expected_file=None):
    """Call ``model_function`` to download a model unless it is already in ``dest_dir``.

    How "already there" is decided, in priority order:

    1. ``expected_file`` given: ``dest_dir/expected_file`` must exist (a file,
       or a non-empty directory). This is the most precise check.
    2. ``model_name`` given: ``dest_dir`` must contain an entry whose name
       starts with ``model_name`` (e.g. ``crawl-300d-2M`` matches
       ``crawl-300d-2M.vec``), ignoring bare archives and empty directories.
    3. Neither given: ``dest_dir`` must contain at least one non-hidden entry.
       This cannot tell models apart, so pass ``expected_file`` when the
       folder is shared between several models.

    Args:
        dest_dir: Directory the model is (or will be) stored in.
        model_function: Callable performing the download. It is called as
            ``model_function(dest_dir=..., model_name=...)`` when
            ``model_name`` is truthy, otherwise ``model_function(dest_dir=...)``.
        model_name: Optional model identifier forwarded to ``model_function``.
        expected_file: Optional name (relative to ``dest_dir``) of the file
            the download is expected to produce.
    """
    if expected_file:
        already_present = _is_populated(os.path.join(dest_dir, expected_file))
    elif not os.path.isdir(dest_dir):
        # A missing (or non-directory) destination cannot hold the model.
        already_present = False
    elif model_name:
        already_present = any(
            entry.startswith(model_name)
            and not entry.endswith(_ARCHIVE_SUFFIXES)
            and _is_populated(os.path.join(dest_dir, entry))
            for entry in os.listdir(dest_dir)
        )
    else:
        # Hidden entries (e.g. .DS_Store) are not evidence of a model.
        already_present = any(
            not entry.startswith('.') for entry in os.listdir(dest_dir)
        )

    if already_present:
        print(f"Model already exists in {dest_dir}. Skipping download.")
        return

    print(f"Downloading model to {dest_dir}...")
    if model_name:
        model_function(dest_dir=dest_dir, model_name=model_name)
    else:
        model_function(dest_dir=dest_dir)

# All downloaded models are kept in a 'Models' folder next to the notebook;
# make sure that folder is there before fetching anything.
models_dir = os.path.join(os.curdir, 'Models')
ensure_directory_exists(directory=models_dir)

# word2vec: its downloader takes no model name, so only the destination
# and the download function are supplied.
download_model_if_not_exists(
    dest_dir=models_dir,
    model_function=DownloadUtil.download_word2vec,
)

# fasttext and glove: their downloaders need the specific model name.
download_model_if_not_exists(
    dest_dir=models_dir,
    model_function=DownloadUtil.download_fasttext,
    model_name='crawl-300d-2M',
)
download_model_if_not_exists(
    dest_dir=models_dir,
    model_function=DownloadUtil.download_glove,
    model_name='glove.6B',
)



Directory already exists: ./Models
Model already exists in ./Models. Skipping download.
Downloading model to ./Models...
Downloading model to ./Models...


# Example Text

In [11]:
# Sample two-sentence passage; each augmenter cell in this notebook is
# applied to this same `text` so the results can be compared.
text = """
  Is daily coffee consumption good for our health? 
  I guess it is reasonable to believe so, but it may also depend on how much you drink.
  """

# Option 1: Substitute or insert word randomly using word embeddings similarity

In [12]:

# Word-embedding augmenter backed by the pretrained "word2vec" vectors.
#   model_type: one of "word2vec", "glove", or "fasttext"
#   action:     "substitute" (used here) or "insert"
aug = naw.WordEmbsAug(
    model_type='word2vec',
    model_path='GoogleNews-vectors-negative300.bin',
    action='substitute',
)

# Run the augmenter and show the input next to its augmented version.
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:

  Is daily coffee consumption good for our health? 
  I guess it is reasonable to believe so, but it may also depend on how much you drink.
  
Augmented Text:
['Publicity_Stunt Alam_Al_Yawm coffee consumption marvelous KF_OOE our Abortion_foes_capitalize? Personally guess it is unrealistic to believe so, Even it may also depend on spokeswoman_Julie_Zawisza much you drink.']


In [13]:
# Word-embedding augmenter backed by the pretrained "fasttext" vectors.
#   model_type: one of "word2vec", "glove", or "fasttext"
#   model_path: adjust to wherever the downloaded model file actually lives
#   action:     "substitute" (used here) or "insert"
aug = naw.WordEmbsAug(
    model_type='fasttext',
    model_path='Models/crawl-300d-2M.vec',
    action='substitute',
)

# Run the augmenter and show the input next to its augmented version.
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:

  Is daily coffee consumption good for our health? 
  I guess it is reasonable to believe so, but it may also depend on how much you drink.
  
Augmented Text:
['Is daily coffee.That consumed.The good for our health? lol.I Hmmm. it considers reasonable trying believe so, beause it chould also depend on how much.Why you drink.']


In [14]:
# Word-embedding augmenter backed by the pretrained "glove" vectors.
#   model_type: one of "word2vec", "glove", or "fasttext"
#   model_path: adjust to wherever the downloaded model file actually lives
#   action:     "substitute" (used here) or "insert"
aug = naw.WordEmbsAug(
    model_type='glove',
    model_path='Models/glove.6B.300d.txt',
    action='substitute',
)

# Run the augmenter and show the input next to its augmented version.
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:

  Is daily coffee consumption good for our health? 
  I guess it is reasonable to believe so, but it may also depend on how much you drink.
  
Augmented Text:
['Is mirror coffee drinking guys the our health? I my anything making reasonable to believe so, but it may also livelihoods on did much you drink.']


# Option 2: Substitute or insert word by contextual word embeddings

In [15]:
# Replace words using contextual word embeddings (BERT, DistilBERT, RoBERTa or XLNet).
#   model_path: other options include 'distilbert-base-uncased', 'roberta-base', etc.
#   action:     "substitute" (used here) or "insert"
aug = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased',
    action='substitute',
)

# Run the augmenter and show the input next to its augmented version.
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:

  Is daily coffee consumption good for our health? 
  I guess it is reasonable to believe so, but it may also depend on how much you drink.
  
Augmented Text:
['is daily activity just good in our needs? people felt it was reasonable to say otherwise, but it may also depend on how much you drink.']


# Option 3: Substitute or insert word by synonym

In [16]:
# Replace words with synonyms looked up in WordNet.
aug = naw.SynonymAug(aug_src='wordnet')

# Run the augmenter and show the input next to its augmented version.
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:

  Is daily coffee consumption good for our health? 
  I guess it is reasonable to believe so, but it may also depend on how much you drink.
  
Augmented Text:
['Is daily coffee wasting disease unspoilt for our health? I guess information technology be sensible to believe so, but it may also depend on how much you drink.']


In [17]:
# Replace words with WordNet synonyms, this time capping the number of
# replaced words via the optional aug_max argument.
aug = naw.SynonymAug(
    aug_src='wordnet',
    aug_max=3,
)

# Run the augmenter and show the input next to its augmented version.
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:

  Is daily coffee consumption good for our health? 
  I guess it is reasonable to believe so, but it may also depend on how much you drink.
  
Augmented Text:
['Is daily coffee consumption good for our health? I guess it is reasonable to believe so, simply information technology may also depend on how much you drink.']


# Option 4: Substitute or insert word using back translation

In [18]:
# Back translation: run the text through the 'facebook/wmt19-en-de' model and
# then back through 'facebook/wmt19-de-en'.
back_translation_aug = naw.BackTranslationAug(
    from_model_name="facebook/wmt19-en-de",
    to_model_name="facebook/wmt19-de-en",
)

# Left as the last expression so the notebook displays the result.
back_translation_aug.augment(text)

  return self.fget.__get__(instance, owner)()
Some weights of FSMTForConditionalGeneration were not initialized from the model checkpoint at facebook/wmt19-en-de and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of FSMTForConditionalGeneration were not initialized from the model checkpoint at facebook/wmt19-de-en and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
h

['Is daily coffee consumption good for our health? I think it is reasonable to believe so, but it can also depend on how much you drink.']