In [1]:
import pandas as pd
from nlpaug.augmenter.word import SynonymAug, RandomWordAug
import nltk

In [2]:
# Location of the HDF5 file read by the loading cell.
data_path = "../data/SQuAD_train_DF.h5"

In [3]:
# Read the frame stored under the 'df_train' key of the HDF5 file.
df = pd.read_hdf(data_path, key='df_train')

In [15]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,question,context,text
14981,How many students are in Boston Public schools?,"The Boston Public Schools enrolls 57,000 stude...",57000
76522,What was the name of the deal in which Arizona...,"Arizona, south of the Gila River was legally b...",Gadsden Purchase
60357,What can an exhibition game raise money for?,An exhibition game may also be used to settle ...,charities
86267,Who wrote 'City Boy: The Adventures of Herbie ...,The Bronx has been featured significantly in f...,Herman Wouk
24617,The New Delhi Municipal Government oversees wh...,New Delhi is governed through a municipal gove...,New Delhi


In [4]:
# Draw the 10% of rows that will be augmented. The fixed seed pins which rows
# are selected so that a restarted kernel picks the same subset; it only
# controls this sampling step, not the augmenters applied afterwards.
df_augment = df.sample(frac=0.1, random_state=42)

In [10]:
def synonym_augmentation(text):
    """
    Run nlpaug's ``SynonymAug`` over a piece of text.

    A new augmenter is created with its default settings on every call.

    Args:
        text: The text to augment. This notebook passes one string per
            DataFrame cell through ``Series.apply``.

    Returns:
        Whatever ``SynonymAug.augment`` returns for ``text``. In the results
        displayed in this notebook that is a list holding the augmented
        string.
    """
    return SynonymAug().augment(text)

In [6]:
def random_word_augmentation(text):
    """
    Run nlpaug's ``RandomWordAug`` over a piece of text.

    A new augmenter is created with its default settings on every call.
    NOTE(review): the results displayed in this notebook have words removed,
    which suggests the default action is random word deletion -- confirm
    against the nlpaug documentation.

    Args:
        text: The text to augment. This notebook passes one string per
            DataFrame cell through ``Series.apply``.

    Returns:
        Whatever ``RandomWordAug.augment`` returns for ``text``. In the
        results displayed in this notebook that is a list holding the
        augmented string.
    """
    word_augmenter = RandomWordAug()
    return word_augmenter.augment(text)

In [7]:
# Empty frame that will receive the synonym-augmented columns.
df_synonym = pd.DataFrame(
    columns=["context", "question", "text"],
)

In [11]:
# Apply synonym augmentation to every text column of the sampled rows.
# NOTE(review): the answer column ('text') is augmented independently of
# 'context', so an augmented answer may no longer match the wording of its
# augmented context -- confirm this is intended.
for column in ('context', 'question', 'text'):
    df_synonym[column] = df_augment[column].apply(synonym_augmentation)

In [12]:
# Preview the first five synonym-augmented rows.
df_synonym.head(n=5)

Unnamed: 0,context,question,text
23065,[Qur ' anic school (also known as dugsi) remai...,[What live another public figure for a Qur ' a...,[dugsi]
73562,"[When Empress Dowager Deng died, Emperor An (r...",[Congenator of what family were exiled after S...,[Yan]
73980,[The Greek shipping tradition recovered during...,[What was brought around by the Ottomans that ...,[Hellenic transportation tradition recovered d...
29996,"[High, inhuman water ice clouds such as Cirrus...","[Along with Stratus, what swarm are lower and ...",[Stratocumulus]
14929,"[In 1950, whites represented 94. septenary% of...",[What percent of Boston ' s universe be white ...,[ninety four. sevener%]


In [19]:
# Each augmented cell holds a list (see the preview above); explode unwraps
# it back to a plain value.
# Assumes every list has exactly one item -- TODO confirm.
for column in ('context', 'question', 'text'):
    df_synonym[column] = df_synonym[column].explode()

In [20]:
# Empty frame that will receive the random-word-augmented columns.
df_random = pd.DataFrame(
    columns=["context", "question", "text"],
)

In [21]:
# Apply random-word augmentation to every text column of the sampled rows.
# NOTE(review): the answer column ('text') is augmented independently of
# 'context', so an augmented answer may no longer match the wording of its
# augmented context -- confirm this is intended.
for column in ('context', 'question', 'text'):
    df_random[column] = df_augment[column].apply(random_word_augmentation)

In [22]:
# Preview the first five random-word-augmented rows.
df_random.head(n=5)

Unnamed: 0,context,question,text
23065,[Qur ' anic schools (also known as dugsi) rema...,[Is name for a Qur ' school?],[dugsi]
73562,"[When Empress Deng died, Emperor An (r. 106 – ...",[Relatives were after Sun Cheng had the regime?],[Yan]
73980,[The Greek shipping tradition during Ottoman r...,[What was brought by the Ottomans that uplift ...,[Greek shipping recovered during Ottoman when ...
29996,"[High, cold ice clouds such as or Cumulonimbus...","[Along Stratus, what clouds and?]",[Stratocumulus]
14929,"[In 1950, whites represented 94. 7% of Boston ...",[Percent ' s population was white 1950?],[.%]


In [23]:
# Each augmented cell holds a list (see the preview above); explode unwraps
# it back to a plain value.
# Assumes every list has exactly one item -- TODO confirm.
for column in ('context', 'question', 'text'):
    df_random[column] = df_random[column].explode()

In [24]:
# Stack the original rows with both augmented sets under a fresh 0..n-1 index.
# NOTE: this rebinds `df`, so running the cell a second time appends the
# augmented rows again.
frames_to_stack = [df, df_synonym, df_random]
df = pd.concat(frames_to_stack, ignore_index=True)

In [25]:
# Write the combined frame to a new HDF5 file under the 'df_train' key.
df.to_hdf(
    '../data/SQuAD_train_augmented_DF.h5',
    key='df_train',
)