# Data Augmentation Script

In [1]:
import pandas as pd
from nlpaug.augmenter.word import SynonymAug, RandomWordAug
import nltk

In [2]:
# Location of the SQuAD training DataFrame (HDF5), relative to the notebook's working directory.
data_path = "../data/SQuAD_train_DF.h5"

In [3]:
# Read the training frame stored under the 'df_train' key of the HDF5 file.
df = pd.read_hdf(
    data_path,
    key='df_train',
)

In [4]:
# Preview the first five rows of the loaded training data.
df.head(n=5)

Unnamed: 0,question,context,text
14981,How many students are in Boston Public schools?,"The Boston Public Schools enrolls 57,000 stude...",57000
76522,What was the name of the deal in which Arizona...,"Arizona, south of the Gila River was legally b...",Gadsden Purchase
60357,What can an exhibition game raise money for?,An exhibition game may also be used to settle ...,charities
86267,Who wrote 'City Boy: The Adventures of Herbie ...,The Bronx has been featured significantly in f...,Herman Wouk
24617,The New Delhi Municipal Government oversees wh...,New Delhi is governed through a municipal gove...,New Delhi


## Augment only 10% of the data to limit compute cost

In [5]:
# Draw the 10% subset with a fixed seed so that Restart & Run All selects the
# same rows every time. NOTE: this pins only the row selection; the augmenters
# applied later may still draw their own random numbers.
df_augment = df.sample(frac=0.1, random_state=42)

## Define synonym and random word augmentation functions

In [6]:
def synonym_augmentation(text):
    """
    Augment the given text by passing it through nlpaug's SynonymAug.

    Args:
        text: The text to augment. In this notebook it is called once per
            cell value via ``Series.apply``.

    Returns:
        The result of ``SynonymAug().augment(text)``, unchanged.
        NOTE(review): depending on the installed nlpaug version this may be
        a list wrapping the augmented string rather than a plain string --
        callers should be prepared for both.
    """
    # A fresh augmenter with default settings is constructed on every call.
    return SynonymAug().augment(text)

In [7]:
def random_word_augmentation(text):
    """
    Augment the given text by passing it through nlpaug's RandomWordAug.

    The augmenter is built with its default settings.
    NOTE(review): the default action is presumably random word deletion --
    confirm against the installed nlpaug version.

    Args:
        text: The text to augment. In this notebook it is called once per
            cell value via ``Series.apply``.

    Returns:
        The result of ``RandomWordAug().augment(text)``, unchanged.
        NOTE(review): depending on the installed nlpaug version this may be
        a list wrapping the augmented string rather than a plain string --
        callers should be prepared for both.
    """
    # A fresh augmenter with default settings is constructed on every call.
    return RandomWordAug().augment(text)

## Create the synonym and random-word dataframes that store the augmented data

In [8]:
# Empty frame that will receive the synonym-augmented columns.
df_synonym = pd.DataFrame(
    columns=['context', 'question', 'text'],
)

In [9]:
# Run each text column of the sampled rows through the synonym augmenter,
# in the same order as before: context, question, text.
for col in ('context', 'question', 'text'):
    df_synonym[col] = df_augment[col].apply(synonym_augmentation)

In [10]:
# The augmenter may hand back a list instead of a plain value; keep the first
# element in that case and leave any non-list value untouched.
for col in ('context', 'question', 'text'):
    df_synonym[col] = df_synonym[col].apply(
        lambda value: value if not isinstance(value, list) else value[0]
    )

In [11]:
# Preview the first five synonym-augmented rows.
df_synonym.head(n=5)

Unnamed: 0,context,question,text
70052,"Until recently, in most critical writing the p...",What get along Simon zelotes Reynolds describe...,the 1960s
84032,By the end of the regal period Rome had develo...,What gods were in the Capitoline ternary?,"Jove, Juno and Minerva"
75787,Although the city is not particularly noted fo...,What did Sir alec guinness Worldly concern Rec...,world ' s large movie studio
82019,Electroluminescence as a phenomenon was discov...,World health organization constitute the Sovie...,Oleg Losev
56926,"The northern side of Miami includes Midtown, a...",What northerly Miami neighborhood be named for...,Small Haiti


In [12]:
# Empty frame that will receive the random-word-augmented columns.
df_random = pd.DataFrame(
    columns=['context', 'question', 'text'],
)

In [13]:
# Run each text column of the sampled rows through the random-word augmenter,
# in the same order as before: context, question, text.
for col in ('context', 'question', 'text'):
    df_random[col] = df_augment[col].apply(random_word_augmentation)

In [14]:
# The augmenter may hand back a list instead of a plain value; keep the first
# element in that case and leave any non-list value untouched.
for col in ('context', 'question', 'text'):
    df_random[col] = df_random[col].apply(
        lambda value: value if not isinstance(value, list) else value[0]
    )

In [15]:
# Preview the first five random-word-augmented rows.
df_random.head(n=5)

Unnamed: 0,context,question,text
70052,"Until recently, in most critical writing the p...",What did Simon the era post - as a for in of g...,the
84032,By the end of regal period Rome had developed ...,Gods were in Capitoline?,", and Minerva"
75787,Although the is not particularly noted for and...,Did Guinness World of Ramoji Film was 20015?,' s largest film
82019,Electroluminescence as phenomenon was discover...,Who the Soviet created the LED?,Losev
56926,"The northern side of Miami includes Midtown, a...",What northern neighborhood named a country?,Haiti


## Add the synonym and random augmented data to the training dataframe and save

In [16]:
# Stack the original rows with both augmented sets under a fresh 0..n-1 index.
# NOTE: `df` is rebound here, so re-running this cell without re-running the
# load cell appends the augmented rows a second time.
df = pd.concat(
    objs=(df, df_synonym, df_random),
    ignore_index=True,
)

In [17]:
# Persist the combined frame under the same 'df_train' key used when loading.
df.to_hdf(
    '../data/SQuAD_train_augmented_DF.h5',
    key='df_train',
)