In [1]:
import random
import re
from checklist.editor import Editor
editor = Editor()
from transformers import MarianMTModel, MarianTokenizer, pipeline

from datasets import load_dataset
import spacy
from tqdm import tqdm
processor = spacy.load('en_core_web_sm')
import numpy as np
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc

from nlpaug.util import Action


target_model_name = 'Helsinki-NLP/opus-mt-en-roa'
target_tokenizer = MarianTokenizer.from_pretrained(target_model_name)
target_model = MarianMTModel.from_pretrained(target_model_name)
target_pipe = pipeline("text2text-generation", model=target_model, tokenizer=target_tokenizer, framework="pt", device=0)


en_model_name = 'Helsinki-NLP/opus-mt-roa-en'
en_tokenizer = MarianTokenizer.from_pretrained(en_model_name)
en_model = MarianMTModel.from_pretrained(en_model_name)
en_pipe = pipeline("text2text-generation", model=en_model, tokenizer=en_tokenizer, framework="pt", device=0)



def synonym_w2v(premise, hypothesis):
    """relpace random words with synonym"""
    aug = naw.SynonymAug(aug_src='wordnet', lang='eng', aug_p=.2)
    if random.random() < .5:
        premise =  aug.augment(premise)
    else:
        hypothesis = aug.augment(premise)
    return premise, hypothesis


def bert_replacement(premise, hypothesis):
    """pick random word. replace in both premise and hypothesis using bert"""
    try:
        if random.random() < .5:
            doc = processor(premise)
        else:
            doc = processor(hypothesis)

        word_idx = np.random.randint(len(doc))
        while (str(doc[word_idx]).lower() in ['a', 'the', 'an', ',', '.', 'in', 'to', 'on', 'is']):
            word_idx = np.random.randint(len(doc))
        masked_text = re.sub(r'\b%s\b' % str(doc[word_idx]), '{mask}', doc.text, flags=re.I)
        suggestion = random.choice(editor.suggest(masked_text, nsamples=10))
        if type(suggestion) == tuple:
            suggestion = suggestion[0]
        aug_premise = re.sub(r'\b%s\b' % str(doc[word_idx]), suggestion, premise, flags=re.I)
        aug_hypothesis = re.sub(r'\b%s\b' % str(doc[word_idx]), suggestion, hypothesis, flags=re.I)
    except:
        aug_premise =  premise
        aug_hypothesis = hypothesis
    return aug_premise, aug_hypothesis

    
def bert_insertion(premise, hypothesis):
    """insert words in sentence based on context"""
    aug = naw.ContextualWordEmbsAug(
        model_path='bert-base-uncased', action="insert", aug_p=.2)
    if random.random() < .5:
        premise =  aug.augment(premise)
    else:
        hypothesis = aug.augment(premise)
    return premise, hypothesis


def translate(texts, pipe, language="es"):
    template = lambda text: f"{text}" if language == "en" else f">>{language}<< {text}"
    src_texts = [template(text) for text in texts]    
    return [ex['generated_text'] for ex in pipe(src_texts)]


def back_translation(premise, hypothesis, source_lang="en", target_lang="es"):
    """augment text by backtranslation"""
    # Translate from source to target language
    translated_texts = translate([premise, hypothesis], target_pipe, language=target_lang)

    # Translate from target language back to source language
    aug_premise, aug_hypothesis = translate(translated_texts, en_pipe, language=source_lang)
    return aug_premise, aug_hypothesis


def augment_example(row):
    """perform random augmentation"""
    row = row.copy(deep=True)
    premise = row.premise
    hypothesis = row.hypothesis
    label = row.label
    
    augmentations = ['synonym_w2v', 'bert_replacement', 'bert_insertion', 'back_translation']
    
    
    aug_type = random.choice(augmentations)
    if aug_type == 'synonym_w2v':
        premise, hypothesis = synonym_w2v(premise, hypothesis)
    elif aug_type == 'bert_replacement':
        premise, hypothesis = bert_replacement(premise, hypothesis)
    elif aug_type == 'bert_insertion':
        premise, hypothesis = bert_insertion(premise, hypothesis)
    elif aug_type == 'back_translation':
        premise, hypothesis = back_translation(premise, hypothesis)
    
    row.premise = premise
    row.hypothesis = hypothesis
    row['aug_type'] = aug_type # for debugging
    return row

In [2]:
train = load_dataset('snli', split='train').to_pandas()
dev = load_dataset('snli', split='validation').to_pandas()
test = load_dataset('snli', split='test').to_pandas()


Reusing dataset snli (/home/eric/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)
Reusing dataset snli (/home/eric/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)
Reusing dataset snli (/home/eric/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)


In [3]:
len(train)

550152

In [9]:
%%time
train_aug = train.sample(15000).apply(augment_example, axis=1)

CPU times: user 7h 6min 25s, sys: 38min 53s, total: 7h 45min 18s
Wall time: 1h 19min 47s


In [8]:
train_aug.to_csv('datasets/general_train_augmentations.csv')

63.666666666666664

In [14]:
train_aug.to_csv('../datasets/general_train_augmentations.csv')

In [6]:
210158.064 / 60 / 60

58.37724000000001

In [12]:
train_aug.aug_type.value_counts()

back_translation    3773
bert_insertion      3769
synonym_w2v         3738
bert_replacement    3720
Name: aug_type, dtype: int64

In [15]:
dataset = load_dataset('snli')

Reusing dataset snli (/home/eric/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)


  0%|          | 0/3 [00:00<?, ?it/s]

In [19]:
import pandas as pd
pd.concat([train.sample(20000), train_aug.drop(columns='aug_type')]).to_csv('../datasets/mixed_augmentations.csv')

In [22]:
load_dataset(path='csv', data_files='../datasets/mixed_augmentations.csv')['train']

Using custom data configuration default-bcc30624bff1e9d0
Reusing dataset csv (/home/eric/.cache/huggingface/datasets/csv/default-bcc30624bff1e9d0/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a)


  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['Unnamed: 0', 'premise', 'hypothesis', 'label'],
    num_rows: 35000
})

In [23]:
dataset['train'] = load_dataset(path='csv', data_files='../datasets/mixed_augmentations.csv')['train']

Using custom data configuration default-bcc30624bff1e9d0
Reusing dataset csv (/home/eric/.cache/huggingface/datasets/csv/default-bcc30624bff1e9d0/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a)


  0%|          | 0/1 [00:00<?, ?it/s]

In [2]:
%%latex
\begin{tabular}{||c c c c||} 
 \hline
 Col1 & Col2 & Col2 & Col3 \\ [0.5ex] 
 \hline\hline
 1 & 6 & 87837 & 787 \\ 
 \hline
 2 & 7 & 78 & 5415 \\
 \hline
 3 & 545 & 778 & 7507 \\
 \hline
 4 & 545 & 18744 & 7560 \\
 \hline
 5 & 88 & 788 & 6344 \\ [1ex] 
 \hline
\end{tabular}


<IPython.core.display.Latex object>

In [1]:
%%latex
\documentclass{article}
\usepackage{array}
\begin{document}
\begin{center}
\begin{tabular}{ | m{5em} | m{1cm}| m{1cm} | } 
  \hline
  cell1 dummy text dummy text dummy text& cell2 & cell3 \\ 
  \hline
  cell1 dummy text dummy text dummy text & cell5 & cell6 \\ 
  \hline
  cell7 & cell8 & cell9 \\ 
  \hline
\end{tabular}
\end{center}
\end{document}

<IPython.core.display.Latex object>