In [1]:
import torch
import numpy as np
import pandas as pd
from typing import Dict
import torch
from datasets import load_dataset
from transformers import DataCollatorWithPadding

from datasets import load_dataset
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EvalPrediction,
    Trainer,
    TrainingArguments,
    set_seed,
)

import random

from copy import deepcopy
from tqdm import tqdm
from typing import List, Optional
from abc import ABC, abstractmethod

## WordNet


In [2]:

import nltk

nltk.download('wordnet')

from nltk.corpus import wordnet as wn

[nltk_data] Downloading package wordnet to /home/dario/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Schemas & Classes

In [3]:
# Generic classes for augmenting datasets

class DataAugmentationStep(ABC):
    
    @abstractmethod
    def __init__(self, probability: float):
        self.probability = probability
    
    @abstractmethod
    def apply(self, samples: List[Dict]) -> Optional[List[Dict]]:
        pass

class DataAugmentationPipeline:
    
    def __init__(self, steps: List[DataAugmentationStep]):
        """__init__ DataAugmentationPipeline 
        
        A collection of data augmentation steps

        Parameters
        ----------
        steps : List[DataAugmentationStep]
            A list of data augmentation steps, objects that implement the apply method
        """
        self.steps = steps
        
        
    def apply(self, sample: Dict) -> Optional[List[Dict]]:
        """apply Apply the pipeline to a sample

        Parameters
        ----------
        sample : Dict
            A sample of our NLP dataset

        Returns
        -------
        Dict
            An augmented sample
        """
        
        sample = [sample]
        for step in self.steps:
            sample = step.apply(sample)
            if sample is None:
                return None
        return sample

class DatasetAugmentation:
    
    def __init__(self, pipeline: DataAugmentationPipeline, percentage: float, random_sample: bool = False):
        """__init__ Constructor for the DatasetAugmentation class

        Parameters
        ----------
        pipeline : DataAugmentationPipeline
            A pipeline of data augmentation steps
        percentage : float
            The percentage of the dataset to augment
        random_sample : bool, optional
            Whether to sample randomly from the dataset, if false the dataset gets converted to an 
            augmented version by extracting the indices sequentially from zero up to a given percentage
            of its length, otherwise a given percentage of its indices get sampled without replacement, 
            by default False
        """
        
        
        self.pipeline = pipeline
        self.percentage = percentage
        self.random_sample = random_sample

    def augment(self, dataset) -> Optional[pd.DataFrame]:
        """augment Augment a dataset

        Parameters
        ----------
        dataset : Dataset
            A dataset object from the HuggingFace datasets library

        Returns
        -------
        Dataset
            An augmented dataset
        """
        n_samples = len(dataset)
        n_samples_to_augment = int(n_samples * self.percentage)
        if self.random_sample:
            indices = np.random.choice(n_samples, n_samples_to_augment, replace=False)
        else:
            indices = np.arange(n_samples_to_augment)
        augmented_samples = []
        discarded = 0
        
        try:
            for i in tqdm(indices):
                sample = dataset[int(i)]
                out = self.pipeline.apply(sample)
                if out is not None and len(out) > 0:
                    augmented_samples.extend([{
                        'premise': augmented_sample['premise'],
                        'hypothesis': augmented_sample['hypothesis'],
                        'label': augmented_sample['label']
                    } for augmented_sample in out]
                )
                else:
                    discarded += 1
        except Exception as e:
            print(e)
            print(f"Discarded {discarded} samples")
            raise e
        
        # convert list of dictionaries to pandas dataframe
        augmented_dataset = pd.DataFrame.from_dict(augmented_samples)
        
        print(f"Augmentation done, discarded {discarded} samples")
        return augmented_dataset

In [13]:



# Augmentation steps

class Synonimization(DataAugmentationStep):
    
    def __init__(self, probability: float, apply_to: str, max_synonyms: int = 5):
        """__init__ Constructor for the Synonimization class

        Parameters
        ----------
        probability : float
            The probability of applying the step on each token
        apply_to : str
            The key of the sample to apply the step to, can be 'hypothesis' or 'premise'
        max_synonyms : int, optional
            The maximum number of synonyms to generate, by default 5
        """
        self.probability = probability
        self.apply_to = apply_to
        self.max_synonyms = max_synonyms

    def apply(self, samples: List[Dict]) -> Optional[List[Dict]]:
        """apply Apply the step to a set of samples

        Parameters
        ----------
        samples : List[Dict]
            A set of samples from our NLP dataset

        Returns
        -------
        Optional[List[Dict]]
            An augmented sample
        """
        
        result_set = []
        
        for sample in samples:
            try:
                
                result_set.append(sample)
                
                for i, token in enumerate(sample["wsd"][self.apply_to]):
                    if np.random.rand() < self.probability:
                        text = token["text"]
                        wsd_wnet = token["wnSynsetOffset"]
                        if wsd_wnet == "O":
                            continue
                        
                        synonyms = self.get_synonym(text, wsd_wnet)

                        for j, synonym in enumerate(synonyms):
                            
                            if synonym == text:
                                continue
                            
                            if j >= self.max_synonyms:
                                break
                            
                            new_sample = deepcopy(sample)
                            new_sample["wsd"][self.apply_to][i]["text"] = synonym
                            new_sample["srl"][self.apply_to]["tokens"][i]["rawText"] = synonym
                            text = " ".join([token["text"] for token in new_sample["wsd"][self.apply_to]])
                            new_sample[self.apply_to] = text
                            result_set.append(new_sample)
            except:
                print(f"Error in sample: {sample}")
                print(f"WSD: {sample["wsd"][self.apply_to]}")
                print(f"SRL: {sample["srl"][self.apply_to]}")
                print(f"Lengths: {len(sample["wsd"][self.apply_to])}, {len(sample["srl"][self.apply_to]["tokens"])}")
                raise ValueError("Error in sample")
                    
        return result_set

    def get_synonym(self, text, wsd_wnet):
        """get_synonym Get a synonym for a given word

        Parameters
        ----------
        text : str
            The word to find a synonym for
        wsd_wnet : str
            The WordNet synset offset

        Returns
        -------
        str
            A synonym for the given word
        """
        
        synsets = wn.synsets(text)
        
        strip_char = lambda s : int("".join([c for c in s if c.isdigit()]))
        
        for synset in synsets:
            if synset.offset() == strip_char(wsd_wnet):
                synonyms = [" ".join(w.name().split("_")) for w in synset.lemmas()]
                # shuffle
                for synonym in random.sample(synonyms, len(synonyms)):
                    yield synonym
        yield text

class CopulaInverter(DataAugmentationStep):
    
    def __init__(self, probability: float):
        """__init__ Constructor for the CopulaInverter class

        Parameters
        ----------
        probability : float
            The probability of applying the step on each token

        """
        self.probability = probability
        
    def apply(self, samples: List[Dict]) -> Optional[List[Dict]]:
        """apply Apply the step to a sample

        Parameters
        ----------
        sample : List[Dict]
            A sample of our NLP dataset (a list of dictionaries)

        Returns
        -------
        Optional[List[Dict]]
            A set of augmented samples, or None if the sample is discarded
        """
        result_set = []
        
        for sample in samples:
            
            if np.random.rand() > self.probability:
                result_set.append(sample)
                continue
            
            new_sample = deepcopy(sample)
            
            srl = new_sample["srl"]
            
            existential_copula = None
            
            # linear search for an IS copula
            for ann in srl["hypothesis"]["annotations"]:
                frame = ann["verbatlas"]["frameName"]
                token_lemma = srl["hypothesis"]["tokens"][ann["tokenIndex"]]["rawText"]
            
                if frame == "COPULA" and token_lemma == "is":
                    existential_copula = ann
                    break
                    
            if existential_copula is None:
                result_set.append(new_sample)
                continue
            
            try:
                slice_1 = existential_copula["verbatlas"]["roles"][0]["span"]
                slice_2 = existential_copula["verbatlas"]["roles"][1]["span"]
                
                if slice_1[0] < slice_2[0]:
                    slice_1, slice_2 = slice_2, slice_1
                
                slice_1 = slice(slice_1[0], slice_1[1])
                slice_2 = slice(slice_2[0], slice_2[1])
                
            except IndexError:
                print("Slice creation failed (role assumption failed?), skipping augmentation")
                print(f"Sample was: {new_sample}")
                result_set.append(new_sample)
                continue
            
            # swap the two slices in the wsd token list
            
            new_wsd = []
            new_indices = list(range(len(sample["wsd"]["hypothesis"])))
            
            # swap the two slices, hopefully we invert the sentence (passive -> active, active -> passive)
            new_indices[slice_1], new_indices[slice_2] = new_indices[slice_2], new_indices[slice_1]

            for i in new_indices:
                new_wsd.append(sample["wsd"]["hypothesis"][i])
                
            if len(new_wsd) != len(new_sample["wsd"]["hypothesis"]):
                # This should NEVER happen if we filtered the new_samples before in the pipeline
                print(f"Length mismatch: {len(new_wsd)} != {len(new_sample['wsd']['hypothesis'])}")
                print(f"Sample was: {new_sample}")
                print(f"Slice 1: {slice_1}, Slice 2: {slice_2}")
                text_new_sample = " ".join([token["text"] for token in new_sample["wsd"]["hypothesis"]])
                text_augmented = " ".join([token["text"] for token in new_wsd])
                print(f"Text new_sample: {text_new_sample}")
                print(f"Text augmented: {text_augmented}")
            
            new_sample["wsd"]["hypothesis"] = new_wsd
            new_sample["hypothesis"] = " ".join([token["text"] for token in new_wsd])
            
            result_set.append(new_sample)

        return result_set
    
class LengthFilter(DataAugmentationStep):
    
    def __init__(self):
        pass
    
    def apply(self, samples: List[Dict]) -> Optional[List[Dict]]:
        """apply Apply the step to a sample

        Parameters
        ----------
        sample : Dict
            A sample of our NLP dataset

        Returns
        -------
        Dict
            An augmented sample
        """
        
        predicate = lambda sample : (
            len(sample["wsd"]["premise"]) == len(sample["srl"]["premise"]["tokens"]) 
            and 
            len(sample["wsd"]["hypothesis"]) == len(sample["srl"]["hypothesis"]["tokens"])
            )
        
        return list(filter(predicate, samples))

class CopulaContradictor(DataAugmentationStep):
    
    def __init__(self, probability: float):
        """__init__ Constructor for the CopulaContradictor class

        Parameters
        ----------
        probability : float
            The probability of applying the step on each token

        """
        self.probability = probability
        
    def apply(self, samples: List[Dict]) -> Optional[List[Dict]]:
        """apply Apply the step to a sample

        Parameters
        ----------
        sample : List[Dict]
            A sample of our NLP dataset (a list of dictionaries)

        Returns
        -------
        Optional[List[Dict]]
            A set of augmented samples, or None if the sample is discarded
        """
        result_set = []
        
        for sample in samples:
            
            if np.random.rand() > self.probability:
                result_set.append(sample)
                continue
            
            new_sample = deepcopy(sample)
            
            srl = new_sample["srl"]
            
            existential_copula = None
            
            # linear search for an IS copula
            for ann in srl["hypothesis"]["annotations"]:
                frame = ann["verbatlas"]["frameName"]
                token_lemma = srl["hypothesis"]["tokens"][ann["tokenIndex"]]["rawText"]
            
                if frame == "COPULA" and token_lemma == "is":
                    existential_copula = ann
                    break
                    
            if existential_copula is None:
                result_set.append(new_sample)
                continue
        
            srl["hypothesis"]["tokens"][ann["tokenIndex"]]["rawText"] = "is not"
            new_sample["label"] = (
            "ENTAILMENT" if new_sample["label"] == "CONTRADICTION" 
                            else "CONTRADICTION" if new_sample["label"] == "ENTAILMENT" 
                            else new_sample["label"]
            )
            text = " ".join([token["rawText"] for token in srl["hypothesis"]["tokens"]])
            new_sample["hypothesis"] = text
            new_sample["wsd"]["hypothesis"][ann["tokenIndex"]]["text"] = "is not"

            result_set.append(new_sample)
            
        return result_set

In [14]:
def compare(sample):
    
    premise = sample["premise"]
    hypothesis = sample["hypothesis"]
    wsd_premise = sample["wsd"]["premise"]
    wsd_hypothesis = sample["wsd"]["hypothesis"]
    
    print("Premise")
    print(premise)
    print("Hypothesis")
    print(hypothesis)
    print("WSD Premise")
    print(" ".join([token["text"] for token in wsd_premise]))
    print("WSD Hypothesis")
    print(" ".join([token["text"] for token in wsd_hypothesis]))

## WordNet

In [15]:


augmentor = DatasetAugmentation(
    pipeline=DataAugmentationPipeline([
        LengthFilter(),
        Synonimization(probability=0.15, apply_to="premise", max_synonyms=2),
        CopulaInverter(probability=0.5),
        CopulaContradictor(probability=0.2),
        Synonimization(probability=0.2, apply_to="hypothesis", max_synonyms=3),
    ]),
    percentage=0.05,
    random_sample=False
)

In [16]:
MODEL_NAME = "roberta-base"
DATASET_NAME = "tommasobonomo/sem_augmented_fever_nli"
device = "cuda" if torch.cuda.is_available() else "cpu"

set_seed(42)

BATCH_SIZE = 8
LR = 1e-4
WEIGHT_DECAY = 0.001

In [17]:
ds = load_dataset(DATASET_NAME)

In [18]:
train_data = ds["train"]

In [19]:
def print_sample(ds, idx):
    for f in list(("premise", "hypothesis", "label")):
        print(f"{f}: {ds[f][idx]}")

In [20]:
augmented_train_data = augmentor.augment(train_data)

100%|██████████| 2554/2554 [00:17<00:00, 149.50it/s]

Augmentation done, discarded 1064 samples





In [21]:
augmented_train_data[:20]

Unnamed: 0,premise,hypothesis,label
0,Roman Atwood . He is best known for his vlogs ...,a content creator is Roman Atwood .,ENTAILMENT
1,"Roman Atwood . He is best know for his vlogs ,...",Roman Atwood is not a content creator .,CONTRADICTION
2,"Roman Atwood . He is best know for his vlogs ,...",Roman Atwood is not a contented creator .,CONTRADICTION
3,"Roman Atwood . He is best know for his vlogs ,...",Roman Atwood is not a content creator .,CONTRADICTION
4,"Roman Atwood . He is best know for his vlogs ,...",Roman Atwood is not a content creator .,CONTRADICTION
5,Roman Atwood . He is best known for his vlogs ...,Roman Atwood is not a content creator .,CONTRADICTION
6,Roman Atwood . He is best known for his vlogs ...,a content creator is Roman Atwood .,ENTAILMENT
7,Roman Atwood . He is best known for his vlogs ...,Roman Atwood is a content creator.,ENTAILMENT
8,Roman Atwood . He is best known for his vlogs ...,Roman Atwood is a contented creator .,ENTAILMENT
9,Roman Atwood . He is best known for his vlogs ...,Roman Atwood is a content creator .,ENTAILMENT


In [106]:
augmented_train_data["id"].nunique()

2965

In [104]:
len(augmented_train_data)/augmented_train_data["id"].nunique()

7.740303541315345

In [70]:
IDX = 1000

print("--- AUGMENTED ---")
print(
" ".join([token["text"] for token in augmented_train_data.iloc[IDX]["wsd"]["premise"]])
)
print(
" ".join([token["text"] for token in augmented_train_data.iloc[IDX]["wsd"]["hypothesis"]])
)

--- AUGMENTED ---


KeyError: 'wsd'

In [None]:
augmented_train_data.iloc[IDX]["wsd"]["premise"]

In [None]:
compare(augmented_train_data.iloc[314])

In [None]:
synset = wn.synsets("number")
print(synset)
synset = wn.synsets("number")[7]

In [71]:
synset

NameError: name 'synset' is not defined

In [72]:
[w.name() for w in synset.lemmas()]

NameError: name 'synset' is not defined

In [73]:
[" ".join(w.name().split("_")) for w in synset.lemmas()]

NameError: name 'synset' is not defined

In [74]:
print_sample(train_data, 600)

premise: David Lemieux ( born December 22 , 1988 ) is a Canadian professional boxer who held the IBF middleweight title in 2015 . Gennady Golovkin . Later that year he defeated Marco Antonio Rubio to add the WBC interim middleweight title to his collection , and defeated David Lemieux in 2015 to win the IBF middleweight title . The World Boxing Council ( WBC ) is one of four major organizations which sanction world championship boxing bouts , alongside the International Boxing Federation ( IBF ) , World Boxing Association ( WBA ) and World Boxing Organization ( WBO ) .
hypothesis: Gennady Golovkin boxes.
label: ENTAILMENT


In [87]:
train_data[0]["wsd"]["hypothesis"][0]

{'index': 0,
 'text': 'Roman',
 'pos': 'PROPN',
 'lemma': 'Roman',
 'bnSynsetId': 'O',
 'wnSynsetOffset': 'O',
 'nltkSynset': 'O'}

In [76]:
train_data[0]["srl"]["hypothesis"]

{'tokens': [{'index': 0, 'rawText': 'Roman'},
  {'index': 1, 'rawText': 'Atwood'},
  {'index': 2, 'rawText': 'is'},
  {'index': 3, 'rawText': 'a'},
  {'index': 4, 'rawText': 'content'},
  {'index': 5, 'rawText': 'creator'},
  {'index': 6, 'rawText': '.'}],
 'annotations': [{'tokenIndex': 2,
   'verbatlas': {'frameName': 'COPULA',
    'roles': [{'role': 'Theme', 'score': 1.0, 'span': [0, 2]},
     {'role': 'Attribute', 'score': 1.0, 'span': [3, 6]}]},
   'englishPropbank': {'frameName': 'be.01',
    'roles': [{'role': 'ARG1', 'score': 1.0, 'span': [0, 2]},
     {'role': 'ARG2', 'score': 1.0, 'span': [3, 6]}]}}]}

In [77]:
train_data[0]["srl"]["hypothesis"]["annotations"][0]["verbatlas"]

{'frameName': 'COPULA',
 'roles': [{'role': 'Theme', 'score': 1.0, 'span': [0, 2]},
  {'role': 'Attribute', 'score': 1.0, 'span': [3, 6]}]}

In [None]:
slice1 = train_data[0]["srl"]["hypothesis"]["annotations"][0]["verbatlas"]["roles"][0]["span"]
slice2 = train_data[0]["srl"]["hypothesis"]["annotations"][0]["verbatlas"]["roles"][1]["span"]

In [None]:
train_data[0]["srl"]["hypothesis"]["tokens"][slice(slice1[0], slice1[1])]
train_data[0]["srl"]["hypothesis"]["tokens"][slice(slice2[0], slice2[1])]

In [None]:
train_data[0]

In [None]:

def negate_hypothesis(sample):
    srl = sample["srl"]
    
    # detect IS verb
    
    for ann in srl["hypothesis"]["annotations"]:
        frame = ann["verbatlas"]["frameName"]
        token_lemma = srl["hypothesis"]["tokens"][ann["tokenIndex"]]["rawText"]
        

        if frame == "COPULA" and token_lemma == "is":
            print("Can be negated")
            break

In [None]:
negate_hypothesis(train_data[0])

In [None]:
train_data[0]

In [None]:
train_data[0]["srl"]["hypothesis"]

In [None]:
train_data[0]["wsd"]["hypothesis"]

In [None]:
indices = list(range(len(train_data[0]["wsd"]["hypothesis"])))
new_indices = indices.copy()

In [None]:
new_indices[3:6], new_indices[0:2]  = new_indices[0:2], new_indices[3:6]

In [None]:
indices

In [None]:
new_indices

In [None]:
" ".join([train_data[0]["wsd"]["hypothesis"][i]["text"] for i in new_indices ])