<a href="https://colab.research.google.com/github/emmt1998/nlp/blob/main/01_Pre-Processing/Data-Augmentation/TextAttack_and_Googletrans.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Written by Efrain Magaña (emmt1998@gmail.com)

Adapted from https://towardsdatascience.com/text-data-augmentation-f4143571ecd2

# TextAttack

https://github.com/QData/TextAttack


In [None]:
!pip install textattack

In [None]:
from textattack.augmentation import WordNetAugmenter, EmbeddingAugmenter, EasyDataAugmenter, CharSwapAugmenter

In [44]:
import pandas as pd

# Toy corpus: three sample sentences used to demonstrate the augmenters.
text1 = "Understand NLP models better by running different adversarial attacks on them and examining the output"
text2 = "Research and develop different NLP adversarial attacks using the TextAttack framework and library of components"
text3 = "Augment your dataset to increase model generalization and robustness downstream"
text = [text1, text2, text3]

# One row per sentence, stored in a single "text" column.
dt = pd.DataFrame({"text": text})
dt

Unnamed: 0,text
0,Understand NLP models better by running differ...
1,Research and develop different NLP adversarial...
2,Augment your dataset to increase model general...


In [45]:
# Pick the first sentence (row label 0) as the running example.
texto = dt.loc[0, "text"]
texto

'Understand NLP models better by running different adversarial attacks on them and examining the output'

In [46]:
# WordNetAugmenter: replaces words with WordNet synonyms.
# augment() returns a list of augmented strings.
aug = WordNetAugmenter()
aug.augment(texto)

['Understand NLP models better by running different adversarial onslaught on them and examining the output']

In [47]:
# EmbeddingAugmenter: replaces words with close neighbours in a word-embedding space.
# augment() returns a list of augmented strings.
aug = EmbeddingAugmenter()
aug.augment(texto)

['Understand NLP models better by running different adversarial attacks on them and probed the output']

In [48]:
# EasyDataAugmenter (EDA): combines synonym replacement, random insertion,
# random swap and random deletion, so it returns several variants per input.
# NOTE(review): the deletion variant can collapse to a near-empty string;
# consider filtering very short results before training on them.
aug = EasyDataAugmenter()
aug.augment(texto)

['Understand NLP models better by running different adversarial attacks on them and examining the unlike output',
 'Understand on models better by running different adversarial attacks NLP them and examining the output',
 'Understand NLP models sound by running different adversarial attacks on them and examining the output',
 'by ']

In [49]:
# CharSwapAugmenter: character-level noise (substituting, deleting, inserting
# or swapping adjacent characters), i.e. it simulates typos.
aug = CharSwapAugmenter()
aug.augment(texto)

['Undrstand NLP models better by running different adversarial attacks on them and examining the output']

In [50]:
def augmentationDF(df, colname, aug, only_augmented=False):
  """
  Augment every text in ``df[colname]`` with a single augmenter.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame holding the texts to augment. It is not modified.
  colname : str
      Column of ``df`` with the input texts; the generated texts are
      returned under the same column name.
  aug : object
      Any object exposing ``augment(text)`` that returns a list of strings.
      A bare string result is also accepted and treated as one text.
  only_augmented : bool, default False
      If True, return only the generated texts. If False, return the
      generated texts followed by all rows of ``df`` (index is reset).

  -> DataFrame
  """
  results = []
  # Iterate over the values themselves: indexing with 0..n-1 is label-based
  # on a Series and fails for frames that were filtered, shuffled or sliced.
  for texto in df[colname]:
    result = aug.augment(texto)
    if isinstance(result, str):
      # Guard against augmenters returning a single string, which would
      # otherwise be spread character by character.
      result = [result]
    results.extend(result)
  new = pd.DataFrame(results, columns=[colname])
  if only_augmented:
    return new
  # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
  return pd.concat([new, df], ignore_index=True)

def allAug(df, colname, augs, verbose = False):
  """
  Apply several augmenters to ``df[colname]`` and stack all the results.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame holding the original texts. It is not modified.
  colname : str
      Column of ``df`` with the input texts.
  augs : sequence
      Augmenter objects, each passed in turn to ``augmentationDF``.
  verbose : bool, default False
      If True, print a start/finish line for every augmenter.

  Returns the rows of ``df`` followed by the texts generated by each
  augmenter, in the order given. With no augmenters a plain copy of ``df``
  is returned.

  ->DataFrame
  """
  frames = [df]
  for i, aug in enumerate(augs):
    if verbose:
      print("Augmenter ",i,"-start")
    frames.append(augmentationDF(df, colname, aug, only_augmented=True))
    if verbose:
      print("Augmenter ",i,"-finish")
  if len(frames) == 1:
    # Nothing was generated: keep the original index, as a copy.
    return df.copy()
  # DataFrame.append was removed in pandas 2.0. Concatenating once at the end
  # also avoids re-copying the growing frame on every iteration.
  return pd.concat(frames, ignore_index=True)


# Run all four augmenters over the toy corpus. The result contains the
# original rows first, followed by the output of each augmenter in list order.
augs = [WordNetAugmenter(), EmbeddingAugmenter(), EasyDataAugmenter(), CharSwapAugmenter()]
augmented = allAug(dt, "text", augs, verbose=True)
augmented

Augmenter  0 -start
Augmenter  0 -finish
Augmenter  1 -start
Augmenter  1 -finish
Augmenter  2 -start
Augmenter  2 -finish
Augmenter  3 -start
Augmenter  3 -finish


Unnamed: 0,text
0,Understand NLP models better by running differ...
1,Research and develop different NLP adversarial...
2,Augment your dataset to increase model general...
3,Understand NLP models better by go different a...
4,Research and develop different NLP adversarial...
5,Augment your dataset to increase model inducti...
6,Understand NLP models better by running differ...
7,Research and develop diversified NLP adversari...
8,Redouble your dataset to increase model genera...
9,running


# Googletrans
https://github.com/ssut/py-googletrans

In [None]:
!pip install googletrans==4.0.0-rc1

In [1]:
from googletrans import Translator
# Shared translation client: augTrans() reads this notebook-level name,
# so this cell must run before any of the translation helpers are called.
translator = Translator()

In [30]:
import pandas as pd

# Toy corpus: three sample sentences used to demonstrate back-translation.
text1 = "Fast and reliable - it uses the same servers that translate.google.com uses"
text2 = "This uses the Google Translate Ajax API to make calls to such methods as detect and translate."
text3 = "Googletrans is a free and unlimited python library that implemented Google Translate API"
text = [text1, text2, text3]

# One row per sentence, stored in a single "text" column.
dt = pd.DataFrame({"text": text})
dt

Unnamed: 0,text
0,Fast and reliable - it uses the same servers t...
1,This uses the Google Translate Ajax API to mak...
2,Googletrans is a free and unlimited python lib...


In [31]:
# Pick the first sentence (row label 0) as the running example.
texto = dt.loc[0, "text"]
texto

'Fast and reliable - it uses the same servers that translate.google.com uses'

In [32]:
# Forward step of back-translation: translate the English sentence to Italian.
# translate() returns a result object; its .text attribute is the translated string.
text_trans = translator.translate(texto, dest='it').text
text_trans

'Veloce e affidabile: utilizza gli stessi server utilizzati da translate.google.com'

In [33]:
# Backward step: translate the Italian text back to English.
# The round trip yields a paraphrase of the original sentence.
translator.translate(text=text_trans, dest='en').text

'Fast and reliable: use the same servers used by translate.google.com'

In [35]:
def augTrans(text, in_idiom="en", proxy_idiom="it"):
  """
  Paraphrase ``text`` by back-translation through a pivot language.

  The text is translated into ``proxy_idiom`` and the result is translated
  into ``in_idiom``. Both calls pass only ``dest`` to the notebook-level
  ``translator``; no source language is given explicitly.

  -> string
  """
  forward = translator.translate(text, dest=proxy_idiom)
  backward = translator.translate(forward.text, dest=in_idiom)
  return backward.text

def augmentaDFTrans(df, colname, in_idiom="en", proxy_idiom="it", only_augmented=False):
  """
  Back-translate every text in ``df[colname]`` through one pivot language.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame holding the texts to augment. It is not modified.
  colname : str
      Column of ``df`` with the input texts; the generated texts are
      returned under the same column name.
  in_idiom : str, default "en"
      Language code the texts are translated back into.
  proxy_idiom : str, default "it"
      Language code of the intermediate (pivot) translation.
  only_augmented : bool, default False
      If True, return only the back-translated texts. If False, return the
      back-translated texts followed by all rows of ``df`` (index is reset).

  -> DataFrame
  """
  # Iterate over the values themselves: indexing with 0..n-1 is label-based
  # on a Series and fails for frames that were filtered, shuffled or sliced.
  results = [augTrans(texto, in_idiom, proxy_idiom) for texto in df[colname]]
  new = pd.DataFrame(results, columns=[colname])
  if only_augmented:
    return new
  # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
  return pd.concat([new, df], ignore_index=True)

def multipleAugTrans(df, colname, idioms, verbose = False):
  """
  Back-translate ``df[colname]`` through several pivot languages.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame holding the original texts. It is not modified.
  colname : str
      Column of ``df`` with the input texts.
  idioms : sequence of str
      Language codes. ``idioms[0]`` is the language the texts are translated
      back into; every following entry is used once as the pivot language.
  verbose : bool, default False
      If True, print a start/finish line for every pivot language.

  Returns the rows of ``df`` followed by the back-translated texts for each
  pivot language, in the order given. With no pivot languages a plain copy
  of ``df`` is returned.

  ->DataFrame
  """
  frames = [df]
  for i in range(1, len(idioms)):
    if verbose:
      print("Idiom ",i,"-start")
    frames.append(
      augmentaDFTrans(df, colname, in_idiom=idioms[0], proxy_idiom=idioms[i], only_augmented=True)
    )
    if verbose:
      print("Idiom ",i,"-finish")
  if len(frames) == 1:
    # Nothing was generated: keep the original index, as a copy.
    return df.copy()
  # DataFrame.append was removed in pandas 2.0. Concatenating once at the end
  # also avoids re-copying the growing frame on every iteration.
  return pd.concat(frames, ignore_index=True)


# The first entry is the language of the corpus (texts are translated back into it);
# each remaining entry is used once as the pivot language for back-translation.
idioms = ["en", "it", "es", "fr", "zh-cn"]
augmented = multipleAugTrans(dt, "text", idioms, verbose = True)
augmented


Idiom  1 -start
Idiom  1 -finish
Idiom  2 -start
Idiom  2 -finish
Idiom  3 -start
Idiom  3 -finish
Idiom  4 -start
Idiom  4 -finish


Unnamed: 0,text
0,Fast and reliable - it uses the same servers t...
1,This uses the Google Translate Ajax API to mak...
2,Googletrans is a free and unlimited python lib...
3,Fast and reliable: use the same servers used b...
4,This uses Google Translate's Ajax API to make ...
5,Googletrans is a free and unlimited Python lib...
6,Fast and reliable: uses the same servers that ...
7,This uses the Google Translate Ajax API to mak...
8,Googletrans is a free and unlimited Python lib...
9,Fast and reliable - it uses the same servers a...
