In [1]:
# Simple parameters of this augmentation
OUTPUT_PATH = 'Augmented Datasets/'

import pandas as pd
import os

# Create the output directory (and any missing parents). exist_ok=True removes
# the check-then-create race and keeps this cell safe to re-run.
os.makedirs(OUTPUT_PATH, exist_ok=True)


import torch
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


In [2]:
# Load the "tommasobonomo/sem_augmented_fever_nli" dataset through the
# `datasets` library. Later cells read its 'train', 'validation' and 'test'
# splits, and each row's 'hypothesis', 'premise', 'label', 'wsd' and 'srl' fields.
from datasets import load_dataset

fever_plus = load_dataset("tommasobonomo/sem_augmented_fever_nli")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from nltk.corpus import wordnet as wn

def get_synonym(word, pos='n'):
    """Return a WordNet synonym for a disambiguated token, or the token itself.

    Args:
        word: Mapping with at least the keys "text" (the token as it appears
            in the sentence) and "nltkSynset" (the synset name handed to
            ``wn.synset``).
        pos: Unused; kept so existing callers that pass it keep working.

    Returns:
        The first lemma name of the token's synset that differs from the
        token text (ignoring case), with WordNet's underscore separators
        turned into spaces. Falls back to ``word["text"]`` when the synset
        has no other lemma or the lookup fails.
    """
    original = word["text"]
    try:
        lemmas = wn.synset(word["nltkSynset"]).lemmas()

        # Find a synonym that is not the original word
        for lemma in lemmas:
            # WordNet joins multi-word lemmas with underscores
            # (e.g. "motion_picture"); restore spaces for plain text.
            candidate = lemma.name().replace('_', ' ')
            # Case-insensitive so "Movie" is not "replaced" by "movie".
            if candidate.lower() != original.lower():
                return candidate
    except Exception:
        # Best effort: an unusable synset keeps the original word. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # stop a long augmentation run.
        # NOTE(review): the recorded sample run shows no noun being replaced;
        # this broad fallback can hide a systematic failure (missing key,
        # missing WordNet data) — confirm the lookup actually succeeds.
        return original

    return original  # No alternative lemma in the synset


def replace_nouns_with_synonyms(entry, pos='NOUN'):
    """Return a copy of ``entry`` whose hypothesis has tagged words swapped for synonyms.

    Args:
        entry: Mapping with ``entry["wsd"]["hypothesis"]``, an iterable of
            token mappings that each carry "text" and "pos" (plus whatever
            ``get_synonym`` needs).
        pos: Tag compared against each token's "pos"; matching tokens are
            replaced via ``get_synonym``.

    Returns:
        A shallow copy of ``entry`` with "hypothesis" set to the tokens
        joined by single spaces. The input mapping is left untouched, so
        callers can still read the original hypothesis afterwards.
    """
    updated_hypothesis = [
        get_synonym(word, pos='n') if word["pos"] == pos else word["text"]
        for word in entry["wsd"]["hypothesis"]
    ]

    # Copy instead of mutating: overwriting entry["hypothesis"] in place made
    # the original text unrecoverable for the caller.
    updated_entry = dict(entry)
    # Tokens are joined with plain spaces, so punctuation ends up detached
    # ("creator ." rather than "creator.").
    updated_entry["hypothesis"] = ' '.join(updated_hypothesis)
    return updated_entry



# Smoke test: print the first ten training hypotheses before and after
# noun replacement
for sample_idx in range(10):
    sample = fever_plus['train'][sample_idx]
    print("Original hyp: ", sample["hypothesis"])
    augmented_sample = replace_nouns_with_synonyms(sample, pos='NOUN')
    print("Updated hyp: ", augmented_sample['hypothesis'])


Original hyp:  Roman Atwood is a content creator.
Updated hyp:  Roman Atwood is a content creator .
Original hyp:  The Boston Celtics play their home games at TD Garden.
Updated hyp:  The Boston Celtics play their home games at TD Garden .
Original hyp:  There is a movie called The Hunger Games.
Updated hyp:  There is a movie called The Hunger Games .
Original hyp:  Ryan Seacrest is a person.
Updated hyp:  Ryan Seacrest is a person .
Original hyp:  Stranger than Fiction is a film.
Updated hyp:  Stranger than Fiction is a film .
Original hyp:  Selena recorded music.
Updated hyp:  Selena recorded music .
Original hyp:  Selena recorded music.
Updated hyp:  Selena recorded music .
Original hyp:  Selena recorded music.
Updated hyp:  Selena recorded music .
Original hyp:  Selena recorded music.
Updated hyp:  Selena recorded music .
Original hyp:  John Wick: Chapter 2 was theatrically released in the Oregon.
Updated hyp:  John Wick : Chapter 2 was theatrically released in the Oregon .


In [4]:

#############################
# FULL AUGMENT LOOP:        #
#    SYNONYMS SWAPPER       #
#############################

from tqdm import tqdm

# Synonym-swapper augmenter for a single dataset split
def SYN_augment(dataset):
    """Build synonym-augmented records for every entry of ``dataset``.

    Args:
        dataset: Iterable of mappings with the keys 'id', 'premise',
            'hypothesis', 'label', 'wsd' and 'srl'.

    Returns:
        A list of dicts, one per entry, holding the untouched fields plus
        'old_hypothesis' (the hypothesis as found in the dataset) and
        'augmented_hypothesis' (the hypothesis after noun replacement).
    """
    syn_list = []

    for entry in tqdm(dataset, desc="Augmenting Dataset"):
        # Read the original text BEFORE augmenting, and hand the helper a
        # copy: if the helper rewrites 'hypothesis' on the mapping it
        # receives, 'old_hypothesis' would otherwise equal the augmented text.
        original_hypothesis = entry['hypothesis']
        updated_entry = replace_nouns_with_synonyms(dict(entry), pos='NOUN')

        syn_list.append({
            'id': entry['id'],
            'premise': entry['premise'],
            'old_hypothesis': original_hypothesis,
            'augmented_hypothesis': updated_entry['hypothesis'],
            'label': entry['label'],
            'wsd': entry['wsd'],
            'srl': entry['srl'],
        })

    return syn_list



# Build one synonym-augmented DataFrame per split. No placeholder
# initialisation is needed: each name is bound directly to its DataFrame.
syn_augmented_fever_train = pd.DataFrame(SYN_augment(fever_plus['train']))
print('train dataset done')
syn_augmented_fever_validation = pd.DataFrame(SYN_augment(fever_plus['validation']))
print('validation dataset done')
syn_augmented_fever_test = pd.DataFrame(SYN_augment(fever_plus['test']))
print('test dataset done')


Augmenting Dataset: 100%|██████████| 51086/51086 [03:37<00:00, 234.85it/s]


train dataset done


Augmenting Dataset: 100%|██████████| 2288/2288 [00:09<00:00, 232.47it/s]


validation dataset done


Augmenting Dataset: 100%|██████████| 2287/2287 [00:09<00:00, 234.43it/s]

test dataset done





In [5]:
# Persist each augmented split as a JSON Lines file (one record per line)
for augmented_frame, jsonl_name in (
    (syn_augmented_fever_train, 'fever_train_syn.jsonl'),
    (syn_augmented_fever_validation, 'fever_validation_syn.jsonl'),
    (syn_augmented_fever_test, 'fever_test_syn.jsonl'),
):
    augmented_frame.to_json(OUTPUT_PATH + jsonl_name, orient='records', lines=True)
print('output files created')

output files created


In [6]:
# The train file (and possibly the others) is very large, so the output files need to be compressed
import zipfile

def compress_file(file_name, zip_name, path):
    """Compress one file into a DEFLATE-compressed ZIP archive in the same directory.

    Args:
        file_name: Name of the file to compress, relative to ``path``. It is
            also used as the member name inside the archive.
        zip_name: Name of the archive to create, relative to ``path``. An
            existing archive with that name is overwritten.
        path: Directory holding the input file and receiving the archive.
            It may be given with or without a trailing separator.
    """
    # os.path.join inserts a separator only when one is missing, so both
    # 'out/' and 'out' work (plain concatenation required the former).
    zip_path = os.path.join(path, zip_name)
    file_path = os.path.join(path, file_name)

    with zipfile.ZipFile(zip_path, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
        # arcname keeps the directory prefix out of the archive member name.
        zipf.write(file_path, arcname=file_name, compress_type=zipfile.ZIP_DEFLATED)


# Zip every augmented JSONL file next to its source in the output directory
for jsonl_file, archive_file in (
    ('fever_train_syn.jsonl', 'fever_train_syn.zip'),
    ('fever_validation_syn.jsonl', 'fever_validation_syn.zip'),
    ('fever_test_syn.jsonl', 'fever_test_syn.zip'),
):
    compress_file(jsonl_file, archive_file, OUTPUT_PATH)

