In [1]:
import torch
import numpy as np
import pandas as pd
from typing import Dict
import torch
from datasets import load_dataset
from transformers import DataCollatorWithPadding

from datasets import load_dataset
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EvalPrediction,
    Trainer,
    TrainingArguments,
    set_seed,
)

from dataclasses import dataclass, field
from typing import List, Optional
from abc import ABC, abstractmethod

## WordNet


In [2]:

import nltk

nltk.download('wordnet')

from nltk.corpus import wordnet as wn

[nltk_data] Downloading package wordnet to /home/dario/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Schemas & Classes

In [3]:
# Generic classes for augmenting datasets

class DataAugmentationStep(ABC):
    
    @abstractmethod
    def __init__(self, probability: float):
        self.probability = probability
    
    @abstractmethod
    def apply(self, sample: Dict) -> Optional[Dict]:
        pass

class DataAugmentationPipeline:
    
    def __init__(self, steps: List[DataAugmentationStep]):
        """__init__ DataAugmentationPipeline 
        
        A collection of data augmentation steps

        Parameters
        ----------
        steps : List[DataAugmentationStep]
            A list of data augmentation steps, objects that implement the apply method
        """
        self.steps = steps
        
        
    def apply(self, sample: Dict) -> Optional[Dict]:
        """apply Apply the pipeline to a sample

        Parameters
        ----------
        sample : Dict
            A sample of our NLP dataset

        Returns
        -------
        Dict
            An augmented sample
        """
        for step in self.steps:
            sample = step.apply(sample)
            if sample is None:
                return None
        return sample

class DatasetAugmentation:
    
    def __init__(self, pipeline: DataAugmentationPipeline, percentage: float, random_sample: bool = False):
        """__init__ Constructor for the DatasetAugmentation class

        Parameters
        ----------
        pipeline : DataAugmentationPipeline
            A pipeline of data augmentation steps
        percentage : float
            The percentage of the dataset to augment
        random_sample : bool, optional
            Whether to sample randomly from the dataset, if false the dataset gets converted to an 
            augmented version by extracting the indices sequentially from zero up to a given percentage
            of its length, otherwise a given percentage of its indices get sampled without replacement, 
            by default False
        """
        
        
        self.pipeline = pipeline
        self.percentage = percentage
        self.random_sample = random_sample

    def augment(self, dataset) -> Optional[pd.DataFrame]:
        """augment Augment a dataset

        Parameters
        ----------
        dataset : Dataset
            A dataset object from the HuggingFace datasets library

        Returns
        -------
        Dataset
            An augmented dataset
        """
        n_samples = len(dataset)
        n_samples_to_augment = int(n_samples * self.percentage)
        if self.random_sample:
            indices = np.random.choice(n_samples, n_samples_to_augment, replace=False)
        else:
            indices = np.arange(n_samples_to_augment)
        augmented_samples = []
        discarded = 0
        
        try:
            for i in indices:
                sample = dataset[int(i)]
                augmented_sample = self.pipeline.apply(sample)
                if augmented_sample is not None:
                    augmented_samples.append(augmented_sample)
                else:
                    discarded += 1
        except Exception as e:
            print(e)
            print(f"Discarded {discarded} samples")
            raise e
        augmented_dataset = pd.DataFrame(augmented_samples)
        print(f"Augmentation done, discarded {discarded} samples")
        return augmented_dataset

In [4]:

# Augmentation steps

class Synonimization(DataAugmentationStep):
    
    def __init__(self, probability: float, apply_to: str):
        """__init__ Constructor for the Synonimization class

        Parameters
        ----------
        probability : float
            The probability of applying the step on each token
        apply_to : str
            The key of the sample to apply the step to, can be 'hypothesis' or 'premise'
        """
        self.probability = probability
        self.apply_to = apply_to

    def apply(self, sample: Dict) -> Dict:
        """apply Apply the step to a sample

        Parameters
        ----------
        sample : Dict
            A sample of our NLP dataset

        Returns
        -------
        Dict
            An augmented sample
        """
        
        # Use wordnet to find synonyms *only* if the word-sense-disambiguation is the same
        
        try:
            for i, token in enumerate(sample["wsd"][self.apply_to]):
                if np.random.rand() < self.probability:
                    text = token["text"]
                    wsd_wnet = token["wnSynsetOffset"]
                    if wsd_wnet == "O":
                        continue
                    synonym = self.get_synonym(text, wsd_wnet)

                    sample["wsd"][self.apply_to][i]["text"] = synonym
                    sample["srl"][self.apply_to]["tokens"][i]["rawText"] = synonym
        except:
            print(f"Error in sample: {sample}")
            print(f"WSD: {sample["wsd"][self.apply_to]}")
            print(f"SRL: {sample["srl"][self.apply_to]}")
            raise ValueError("Error in sample")
                
        text = " ".join([token["text"] for token in sample["wsd"][self.apply_to]])
        sample[self.apply_to] = text

        return sample

    def get_synonym(self, text, wsd_wnet):
        """get_synonym Get a synonym for a given word

        Parameters
        ----------
        text : str
            The word to find a synonym for
        wsd_wnet : str
            The WordNet synset offset

        Returns
        -------
        str
            A synonym for the given word
        """
        
        synsets = wn.synsets(text)
        
        strip_char = lambda s : int("".join([c for c in s if c.isdigit()]))
        
        for synset in synsets:
            if synset.offset() == strip_char(wsd_wnet):
                synonyms = [" ".join(w.name().split("_")) for w in synset.lemmas()]
                if len(synonyms) > 1:
                    return np.random.choice(synonyms)
        return text

class CopulaInverter(DataAugmentationStep):
    
    def __init__(self, probability: float, apply_to: float):
        """__init__ Constructor for the CopulaInverter class

        Parameters
        ----------
        probability : float
            The probability of applying the step on each token

        """
        self.probability = probability
        
    def apply(self, sample: Dict) -> Dict:
        """apply Apply the step to a sample

        Parameters
        ----------
        sample : Dict
            A sample of our NLP dataset

        Returns
        -------
        Dict
            An augmented sample
        """
        
        srl = sample["srl"]
        
        existential_copula = None
        
        # linear search for an IS copula
        for ann in srl["hypothesis"]["annotations"]:
            frame = ann["verbatlas"]["frameName"]
            token_lemma = srl["hypothesis"]["tokens"][ann["tokenIndex"]]["rawText"]
        
            if frame == "COPULA" and token_lemma == "is":
                existential_copula = ann
                break
                
        if existential_copula is None:
            return sample
        
        pos = existential_copula["tokenIndex"]
        
        try:
            slice_1 = existential_copula["verbatlas"]["roles"][0]["span"]
            slice_2 = existential_copula["verbatlas"]["roles"][1]["span"]
            
            if slice_1[0] < slice_2[0]:
                slice_1, slice_2 = slice_2, slice_1
            
            slice_1 = slice(slice_1[0], slice_1[1])
            slice_2 = slice(slice_2[0], slice_2[1])
            
        except:
            print("Slice creation failed (role assumption failed?), skipping augmentation")
            print(f"Sample was: {sample}")
            return sample
        
        # swap the two slices in the wsd token list
        
        new_wsd = []
        indices = list(range(len(sample["wsd"]["hypothesis"])))
        new_indices = indices.copy()
        
        # swap the two slices, hopefully we invert the sentence (passive -> active, active -> passive)
        new_indices[slice_1], new_indices[slice_2] = new_indices[slice_2], new_indices[slice_1]
        
        for i in new_indices:
            new_wsd.append(sample["wsd"]["hypothesis"][i])
            
        if len(new_wsd) != len(sample["wsd"]["hypothesis"]):
            print(f"Length mismatch: {len(new_wsd)} != {len(sample['wsd']['hypothesis'])}")
            print(f"Sample was: {sample}")
            print(f"Slice 1: {slice_1}, Slice 2: {slice_2}")
            text_sample = " ".join([token["text"] for token in sample["wsd"]["hypothesis"]])
            text_augmented = " ".join([token["text"] for token in new_wsd])
            print(f"Text sample: {text_sample}")
            print(f"Text augmented: {text_augmented}")
            raise ValueError("Length mismatch")
        
        sample["wsd"]["hypothesis"] = new_wsd
        
        return sample
    
class LengthFilter(DataAugmentationStep):
    
    def __init__(self):
        pass
    
    def apply(self, sample: Dict) -> Optional[Dict]:
        """apply Apply the step to a sample

        Parameters
        ----------
        sample : Dict
            A sample of our NLP dataset

        Returns
        -------
        Dict
            An augmented sample
        """
        if (
            len(sample["wsd"]["premise"]) != len(sample["srl"]["premise"]["tokens"]) 
            or 
            len(sample["wsd"]["hypothesis"]) != len(sample["srl"]["hypothesis"]["tokens"])
            ):
            return None
        return sample
        
#sample["label"] = (
#    "ENTAILMENT" if sample["label"] == "CONTRADICTION" 
#                    else "CONTRADICTION" if sample["label"] == "ENTAILMENT" 
#                    else sample["label"]
#    ) 

In [5]:
def compare(sample):
    
    premise = sample["premise"]
    hypothesis = sample["hypothesis"]
    wsd_premise = sample["wsd"]["premise"]
    wsd_hypothesis = sample["wsd"]["hypothesis"]
    
    print("Premise")
    print(premise)
    print("Hypothesis")
    print(hypothesis)
    print("WSD Premise")
    print(" ".join([token["text"] for token in wsd_premise]))
    print("WSD Hypothesis")
    print(" ".join([token["text"] for token in wsd_hypothesis]))

## WordNet

In [59]:

augmentor = DatasetAugmentation(
    pipeline=DataAugmentationPipeline([
        LengthFilter(),
        Synonimization(probability=0.80, apply_to="hypothesis"),
        Synonimization(probability=0.75, apply_to="premise"),
        CopulaInverter(probability=0.5, apply_to="hypothesis")
    ]),
    percentage=1.0,
    random_sample=False
)

In [60]:
MODEL_NAME = "roberta-base"
DATASET_NAME = "tommasobonomo/sem_augmented_fever_nli"
device = "cuda" if torch.cuda.is_available() else "cpu"

set_seed(42)

BATCH_SIZE = 8
LR = 1e-4
WEIGHT_DECAY = 0.001

In [61]:
ds = load_dataset(DATASET_NAME)

In [62]:
train_data = ds["train"]

In [63]:
def print_sample(ds, idx):
    for f in list(("premise", "hypothesis", "label")):
        print(f"{f}: {ds[f][idx]}")

In [64]:
augmented_train_data = augmentor.augment(train_data)

Slice creation failed (role assumption failed?), skipping augmentation
Sample was: {'id': '89372', 'premise': "Charles Patrick Ryan O'Neal ( deliver April 20 , 1941 ) , known professionally as Ryan O'Neal , is an American histrion and erstwhile pugilist . O'Neal prepare as an recreational boxer before commence his vocation in play in 1960 .", 'hypothesis': "Ryan O'Neal begin is his life in 1941 .", 'label': 'ENTAILMENT', 'wsd': {'premise': [{'index': 0, 'text': 'Charles', 'pos': 'PROPN', 'lemma': 'Charles', 'bnSynsetId': 'O', 'wnSynsetOffset': 'O', 'nltkSynset': 'O'}, {'index': 1, 'text': 'Patrick', 'pos': 'PROPN', 'lemma': 'Patrick', 'bnSynsetId': 'O', 'wnSynsetOffset': 'O', 'nltkSynset': 'O'}, {'index': 2, 'text': 'Ryan', 'pos': 'PROPN', 'lemma': 'Ryan', 'bnSynsetId': 'O', 'wnSynsetOffset': 'O', 'nltkSynset': 'O'}, {'index': 3, 'text': "O'Neal", 'pos': 'PROPN', 'lemma': "O'Neal", 'bnSynsetId': 'O', 'wnSynsetOffset': 'O', 'nltkSynset': 'O'}, {'index': 4, 'text': '(', 'pos': 'PUNCT', '

In [65]:
IDX = 2

print(train_data["premise"][IDX])
print(train_data["hypothesis"][IDX])
print("--- AUGMENTED ---")
print(
" ".join([token["text"] for token in augmented_train_data.iloc[IDX]["wsd"]["premise"]])
)
print(
" ".join([token["text"] for token in augmented_train_data.iloc[IDX]["wsd"]["hypothesis"]])
)

The Hunger Games is a 2012 American dystopian science fiction adventure film directed by Gary Ross and based on the novel of the same name by Suzanne Collins . The film stars Jennifer Lawrence , Josh Hutcherson , Liam Hemsworth , Woody Harrelson , Elizabeth Banks , Lenny Kravitz , Stanley Tucci , and Donald Sutherland .
There is a movie called The Hunger Games.
--- AUGMENTED ---
Stranger than Fiction is a 2006 American fantasy comedy - drama motion picture directed by Marc Forster , bring out by Lindsay Doran , and indite by Zach Helm .
a motion-picture show is strange than Fiction .


In [47]:
augmented_train_data.iloc[IDX]["wsd"]["premise"]

[{'index': 0,
  'text': 'Roman',
  'pos': 'ADJ',
  'lemma': 'roman',
  'bnSynsetId': 'bn:00109913a',
  'wnSynsetOffset': '2921569a',
  'nltkSynset': 'roman.a.01'},
 {'index': 1,
  'text': 'Atwood',
  'pos': 'PROPN',
  'lemma': 'Atwood',
  'bnSynsetId': 'O',
  'wnSynsetOffset': 'O',
  'nltkSynset': 'O'},
 {'index': 2,
  'text': '.',
  'pos': 'PUNCT',
  'lemma': '.',
  'bnSynsetId': 'O',
  'wnSynsetOffset': 'O',
  'nltkSynset': 'O'},
 {'index': 3,
  'text': 'He',
  'pos': 'PRON',
  'lemma': 'he',
  'bnSynsetId': 'O',
  'wnSynsetOffset': 'O',
  'nltkSynset': 'O'},
 {'index': 4,
  'text': 'is',
  'pos': 'AUX',
  'lemma': 'be',
  'bnSynsetId': 'O',
  'wnSynsetOffset': 'O',
  'nltkSynset': 'O'},
 {'index': 5,
  'text': 'best',
  'pos': 'ADV',
  'lemma': 'well',
  'bnSynsetId': 'bn:00117603r',
  'wnSynsetOffset': '12779r',
  'nltkSynset': 'well.r.02'},
 {'index': 6,
  'text': 'known',
  'pos': 'VERB',
  'lemma': 'know',
  'bnSynsetId': 'bn:00090143v',
  'wnSynsetOffset': '594337v',
  'nltkSyn

In [36]:
compare(augmented_train_data.iloc[314])

Premise
Horseshoe Falls , besides known as Canadian Falls , is one of three falls which together with form Niagara Falls on the Niagara River along the Canada - US Border .
Hypothesis
Horseshoe Falls is one of three waterfall on the Niagara River .
WSD Premise
Horseshoe Falls , besides known as Canadian Falls , is one of three falls which together with form Niagara Falls on the Niagara River along the Canada - US Border .
WSD Hypothesis
one of three waterfall on the Niagara River is Horseshoe Falls .


In [None]:
synset = wn.synsets("number")
print(synset)
synset = wn.synsets("number")[7]

In [None]:
synset

In [None]:
[w.name() for w in synset.lemmas()]

In [None]:
[" ".join(w.name().split("_")) for w in synset.lemmas()]

In [None]:
print_sample(train_data, 600)

In [None]:
train_data[0]["wsd"]["hypothesis"]

In [None]:
train_data[0]["srl"]["hypothesis"]

In [None]:
train_data[0]["srl"]["hypothesis"]["annotations"][0]["verbatlas"]

In [None]:
slice1 = train_data[0]["srl"]["hypothesis"]["annotations"][0]["verbatlas"]["roles"][0]["span"]
slice2 = train_data[0]["srl"]["hypothesis"]["annotations"][0]["verbatlas"]["roles"][1]["span"]

In [None]:
train_data[0]["srl"]["hypothesis"]["tokens"][slice(slice1[0], slice1[1])]
train_data[0]["srl"]["hypothesis"]["tokens"][slice(slice2[0], slice2[1])]

In [None]:
train_data[0]

In [None]:

def negate_hypothesis(sample):
    srl = sample["srl"]
    
    # detect IS verb
    
    for ann in srl["hypothesis"]["annotations"]:
        frame = ann["verbatlas"]["frameName"]
        token_lemma = srl["hypothesis"]["tokens"][ann["tokenIndex"]]["rawText"]
        

        if frame == "COPULA" and token_lemma == "is":
            print("Can be negated")
            break

In [None]:
negate_hypothesis(train_data[0])

In [None]:
train_data[0]

In [None]:
train_data[0]["srl"]["hypothesis"]

In [None]:
train_data[0]["wsd"]["hypothesis"]

In [None]:
indices = list(range(len(train_data[0]["wsd"]["hypothesis"])))
new_indices = indices.copy()

In [None]:
new_indices[3:6], new_indices[0:2]  = new_indices[0:2], new_indices[3:6]

In [None]:
indices

In [None]:
new_indices

In [None]:
" ".join([train_data[0]["wsd"]["hypothesis"][i]["text"] for i in new_indices ])