In [2]:
import json
import random
import os
import logging
import pickle
import string
import re
from pathlib import Path
from collections import Counter, OrderedDict, defaultdict as ddict
import torch
import numpy as np
from tqdm import tqdm
from torch.utils.data import Dataset

from util import read_squad

In [4]:
# Load the DuoRC out-of-domain training split through the project's SQuAD reader.
duorc = read_squad('datasets/oodomain_train/duorc')

# Quick structural look at what was loaded: the available fields, the container
# types, and the first three contexts.
for summary in (
    duorc.keys(),
    type(duorc),
    type(duorc['question']),
    duorc['context'][:3],
):
    print(summary)

dict_keys(['question', 'context', 'id', 'answer'])
<class 'dict'>
<class 'list'>


# Synonym Replacement (SR)

In [8]:
# NLTK's WordNet corpus backs the synonym lookup in get_synonyms below.
# The downloads are no-ops when the packages are already up to date
# (see the recorded output of this cell).
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import wordnet 

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/thomaslemenestrel/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/thomaslemenestrel/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [9]:
# Words that synonym_replacement never substitutes. Order is preserved from the
# original list; the grouping comments are only a reading aid.
stop_words = [
    # personal, possessive and reflexive pronouns
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
    'you', 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself',
    'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
    # interrogatives and demonstratives
    'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those',
    # forms of "be", "have" and "do"
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
    # articles and conjunctions
    'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
    # prepositions and particles
    'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
    'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from',
    'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
    # adverbs of time, place and manner
    'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how',
    # quantifiers, negation and degree words
    'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
    'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
    # short fragments and modals
    's', 't', 'can', 'will', 'just', 'don', 'should', 'now',
    # the empty string: filtered out before synonym_replacement indexes word[0]
    '',
]

In [10]:
def synonym_replacement(words, n):
    """Replace up to ``n`` distinct words of a tokenized sentence with synonyms.

    Args:
        words: List of string tokens (one sentence split on spaces).
        n: Maximum number of distinct words to replace. Every occurrence of a
            chosen word is replaced with the same synonym. ``n <= 0`` replaces
            nothing.

    Returns:
        A new list of tokens; ``words`` itself is not modified. Multi-word
        synonyms are split into separate tokens, so the result can be longer
        than the input.
    """
    new_words = list(words)

    # Guard first: the limit check inside the loop only runs *after* a
    # replacement, so without this n <= 0 would still replace one word.
    if n > 0:
        # Skip the word if it is in the stop words or capitalized.
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # shuffle below is reproducible under a fixed random seed (a set's
        # iteration order for strings varies between interpreter runs).
        random_word_list = list(dict.fromkeys(
            word for word in words
            if word not in stop_words and not word[0].isupper()
        ))
        random.shuffle(random_word_list)

        num_replaced = 0
        for random_word in random_word_list:
            synonyms = get_synonyms(random_word)
            if len(synonyms) >= 1:
                synonym = random.choice(list(synonyms))
                new_words = [synonym if word == random_word else word for word in new_words]
                num_replaced += 1
            if num_replaced >= n:  # only replace up to n words
                break

    # Re-tokenize so that a multi-word synonym becomes several tokens.
    sentence = ' '.join(new_words)
    return sentence.split(' ')

In [11]:
def get_synonyms(word):
    """Return the WordNet synonyms of ``word`` as a sorted list of strings.

    Every lemma of every synset of ``word`` is lower-cased, has ``_`` and
    ``-`` turned into spaces, and is stripped of any character other than
    a-z and space.

    Args:
        word: The word to look up.

    Returns:
        A sorted list of distinct, non-empty synonyms, never containing
        ``word`` itself. The list is empty when WordNet has no synsets for
        ``word``.
    """
    allowed_chars = set(' abcdefghijklmnopqrstuvwxyz')
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonym = lemma.name().replace("_", " ").replace("-", " ").lower()
            synonym = "".join(char for char in synonym if char in allowed_chars)
            # Dropping characters can leave stray or doubled spaces; collapse them.
            synonym = " ".join(synonym.split())
            # A lemma with no a-z characters (e.g. only digits) filters down to
            # ''; keeping it would let a caller "replace" a word with nothing.
            if synonym:
                synonyms.add(synonym)
    synonyms.discard(word)
    # Sorted so a seeded random.choice over the result is reproducible.
    return sorted(synonyms)

In [12]:
def augment_dataset(data):
    """Append one synonym-replaced copy of every example to ``data``.

    For each existing example the question is split on spaces, passed through
    ``synonym_replacement`` with up to 3 replacements, and appended as a new
    example with the original context, id and answer.

    Args:
        data: Dict with parallel lists under the keys ``'question'``,
            ``'context'``, ``'id'`` and ``'answer'``.

    Returns:
        The same ``data`` object, now twice as long. The dict is modified in
        place, so calling this twice on the same dict augments the already
        augmented examples as well. Augmented examples reuse the original
        ``id`` and share the original ``answer`` object.
    """
    # Fix the count up front: the lists grow while we iterate.
    num_original = len(data['question'])

    for i in range(num_original):
        # Read from the argument, not from a notebook-level global, so the
        # function works for any dataset passed in.
        tokens = data['question'][i].split(' ')
        new_question = ' '.join(synonym_replacement(tokens, 3))

        data['question'].append(new_question)
        data['context'].append(data['context'][i])
        data['id'].append(data['id'][i])
        data['answer'].append(data['answer'][i])

    return data

### Check results

In [13]:
# Record the size before augmenting: augment_dataset appends to the dict it is
# given, so this is the only point where the original length is observable.
num_original = len(duorc['question'])
print(num_original)

new_duorc = augment_dataset(duorc)

print(len(new_duorc['question']))
assert len(new_duorc['question']) == 2 * num_original

# The augmented copy of example i sits at index num_original + i. Show the
# original next to its augmented counterpart for a couple of examples.
for i in (0, 10):
    for field in ('question', 'answer'):
        print(duorc[field][i])
        print(new_duorc[field][num_original + i])

127
254
What was Jill's mother's face burned by?
What was Jill's mother's typeface glow by?
{'answer_start': [2476], 'text': ['acid']}
{'answer_start': [2476], 'text': ['acid']}
Who uses scientific skill over trickery?
Who usance scientific accomplishment over trickery?
{'answer_start': [959], 'text': ['Merlin']}
{'answer_start': [959], 'text': ['Merlin']}


In [None]:
def save_json(obj, path):
    """Serialize ``obj`` as JSON to the file at ``path``.

    Args:
        obj: Any JSON-serializable object.
        path: Destination file path; an existing file is overwritten.

    Returns:
        None.
    """
    # json.dump writes str, so the file must be opened in text mode; binary
    # mode ('wb') raises TypeError on the first write.
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(obj, f)
    return

#save_json(new_duorc, 'datasets/oodomain_train/new_duorc')