In [1]:
import pandas as pd
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc
import nltk
from nlpaug.util import Action

In [2]:
train_df = pd.read_csv("./data_generated/train_data.csv")

In [3]:
train_df.Category.value_counts()

Transmission            56
Societal Effects        30
Prevention              24
Societal Response       24
Origin                  20
Reporting               20
Economic Effects        15
Speculation             15
Treatment               15
Individual Response     14
Comparison              13
Testing                 12
Having COVID            11
Nomenclature            11
Symptoms                 7
Other                    6
Name: Category, dtype: int64

In [4]:
# we have many number of questions for transmission question types so we will augment using only few techniques and not all techniques only for that data type to avoid imbalance

### char level augmentations

In [5]:
def char_level_typo_error(sentence):
    ''' function will replace random characters by keyboard distance, i.e. character will be replaced by item closer to itself in keyboard
    input - sentence that needs to be augmented '''
    aug = nac.KeyboardAug(name='Keyboard_Aug')#, aug_char_min=1, aug_char_max=10, aug_char_p=0.3, aug_word_p=0.3, 
#                       aug_word_min=1, aug_word_max=10, stopwords=None, tokenizer=None, reverse_tokenizer=None, 
#                       include_upper_case=True, lang='en', verbose=0, 
#                       stopwords_regex=None, model_path=None, min_char=4)
 
    sentence_aug = aug.augment(sentence)
    return sentence_aug

In [6]:
# take any sentence from dataset
sentence = train_df.Question.iloc[0]
sentence

'is covid worse then swine flu'

In [7]:
# testing function for keyboardAug
char_level_typo_error(sentence)

'is covid wo$se then swine flu'

In [8]:
def char_level_random_error(sentence): 
    '''function applies random character error to textual input, by substituting random characters
    input - sentence that needs to be augmented '''
    aug = nac.RandomCharAug(action='substitute', name='RandomChar_Aug')#, aug_char_min=1, aug_char_max=10, aug_char_p=0.3, 
#                         aug_word_p=0.3, aug_word_min=1, aug_word_max=10, include_upper_case=True, include_lower_case=True, 
#                         include_numeric=True, min_char=4, swap_mode='adjacent', spec_char='!@#$%^&*()_+', stopwords=None, 
#                         tokenizer=None, reverse_tokenizer=None, verbose=0, stopwords_regex=None, candidiates=None)
 
    sentence_aug = aug.augment(sentence)
    return sentence_aug

In [9]:
def char_level_random_deletion_error(sentence): 
    '''function applies random character error to textual input, by deleting random characters
    input - sentence that needs to be augmented '''
    aug = nac.RandomCharAug(action='delete', name='RandomChar_Aug')#, aug_char_min=1, aug_char_max=10, aug_char_p=0.3, 
#                         aug_word_p=0.3, aug_word_min=1, aug_word_max=10, include_upper_case=True, include_lower_case=True, 
#                         include_numeric=True, min_char=4, swap_mode='adjacent', spec_char='!@#$%^&*()_+', stopwords=None, 
#                         tokenizer=None, reverse_tokenizer=None, verbose=0, stopwords_regex=None, candidiates=None)
 
    sentence_aug = aug.augment(sentence)
    return sentence_aug

In [10]:
# test RandomCharAug deletion error
char_level_random_deletion_error(sentence)

'is covid worse then sine flu'

In [11]:
def char_level_random_insertion_error(sentence): 
    '''function applies random character error to textual input, by inserting random characters
    input - sentence that needs to be augmented '''
    aug = nac.RandomCharAug(action='insert', name='RandomChar_Aug')#, aug_char_min=1, aug_char_max=10, aug_char_p=0.3, 
#                         aug_word_p=0.3, aug_word_min=1, aug_word_max=10, include_upper_case=True, include_lower_case=True, 
#                         include_numeric=True, min_char=4, swap_mode='adjacent', spec_char='!@#$%^&*()_+', stopwords=None, 
#                         tokenizer=None, reverse_tokenizer=None, verbose=0, stopwords_regex=None, candidiates=None)
 
    sentence_aug = aug.augment(sentence)
    return sentence_aug

In [12]:
# test RandomCharAug insertion error
char_level_random_insertion_error(sentence)

'is covid worse then Pswine flu'

In [13]:
def char_level_random_swap_error(sentence): 
    '''function applies random character error to textual input, by swaping random characters
    input - sentence that needs to be augmented '''
    aug = nac.RandomCharAug(action='swap', name='RandomChar_Aug')#, aug_char_min=1, aug_char_max=10, aug_char_p=0.3, 
#                         aug_word_p=0.3, aug_word_min=1, aug_word_max=10, include_upper_case=True, include_lower_case=True, 
#                         include_numeric=True, min_char=4, swap_mode='adjacent', spec_char='!@#$%^&*()_+', stopwords=None, 
#                         tokenizer=None, reverse_tokenizer=None, verbose=0, stopwords_regex=None, candidiates=None)
 
    sentence_aug = aug.augment(sentence)
    return sentence_aug

In [14]:
# test RandomCharAug insertion error
char_level_random_swap_error(sentence)

'is covdi worse then swine flu'

### word level augmentation

In [15]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [16]:
def word_level_synonym_replacement(sentence):
    ''' function where random word is replaced by the word with similar meaning according to wordnet
    input - sentence that needs to be augmented '''
    aug = naw.SynonymAug(aug_src='wordnet', model_path=None, name='Synonym_Aug')#, aug_min=1, aug_max=10, aug_p=0.3, lang='eng', 
#                      stopwords=None, tokenizer=None, reverse_tokenizer=None, stopwords_regex=None, force_reload=False, 
#                      verbose=0)
 
    sentence_aug = aug.augment(sentence)
    return sentence_aug

In [17]:
# testing SymonmAug function 
word_level_synonym_replacement(sentence)

'be covid worse then swine flu'

In [18]:
def word_level_random_apply(sentence):
    ''' function where random word is deleted in the sentence
    input - sentence that needs to be augmented '''
    aug = naw.RandomWordAug(action='delete', name='RandomWord_Aug')#, aug_min=1, aug_max=10, aug_p=0.3, stopwords=None, 
#                         target_words=None, tokenizer=None, reverse_tokenizer=None, stopwords_regex=None, verbose=0)
 
    sentence_aug = aug.augment(sentence)
    return sentence_aug

In [19]:
# testing RandomWordAug function
word_level_random_apply(sentence)

'is covid worse swine flu'

In [20]:
def word_level_spelling_error(sentence):
    '''function that inserts spelling error in any random words
    input - sentence that needs to be augmented '''
    aug = naw.SpellingAug(dict_path=None, name='Spelling_Aug', aug_min=1, aug_max=10, aug_p=0.3, stopwords=None, 
                          tokenizer=None, reverse_tokenizer=None, include_reverse=True, stopwords_regex=None, verbose=0)

    sentence_aug = aug.augment(sentence)
    return sentence_aug

In [21]:
# testing SpellingAug function
word_level_spelling_error(sentence)

'is covid worse then swim flu'

In [22]:
def word_level_splitting_error(sentence):
    '''function that splits any random words
    input - sentence that needs to be augmented '''
    aug = naw.SplitAug(name='Split_Aug', aug_min=1, aug_max=10, aug_p=0.3, min_char=4, stopwords=None, tokenizer=None, 
                   reverse_tokenizer=None, stopwords_regex=None, verbose=0)
 
    sentence_aug = aug.augment(sentence)
    return sentence_aug

In [23]:
# testing SplitAug function
word_level_splitting_error(sentence)

'is covid worse t hen swine flu'

In [24]:
# create copy of train data to append augmented data
train_df_aug = train_df.copy()
train_df_aug.shape

(293, 2)

In [25]:
#find out unique categories
train_df_aug.Category.unique()

array(['Comparison ', 'Economic Effects ', 'Having COVID ',
       'Individual Response ', 'Nomenclature ', 'Origin ', 'Other ',
       'Prevention ', 'Reporting ', 'Societal Effects ',
       'Societal Response ', 'Speculation ', 'Symptoms ', 'Testing ',
       'Transmission ', 'Treatment '], dtype=object)

In [26]:
# transmission has large number of samples hence  consider it for lesser augmentation than other categories
train_df_aug1 = train_df_aug[train_df_aug['Category']!='Transmission ']
train_df_aug1.shape

(237, 2)

In [27]:
# apply all types of augments for categories except transmission and append to original data
for i, row in train_df_aug1.iterrows():
#     print(row['Category'])
#     print(row['Question'])
#     df1 = pd.DataFrame()
    sentence = row['Question']
    aug1 = char_level_typo_error(sentence)
    aug2 = char_level_random_error(sentence)
    aug3 = char_level_random_deletion_error(sentence)
    aug4 = char_level_random_insertion_error(sentence)
    aug5 = char_level_random_swap_error(sentence)
    aug6 = word_level_synonym_replacement(sentence)
    aug7 = word_level_random_apply(sentence)
    aug8 = word_level_spelling_error(sentence)
    aug9 = word_level_splitting_error(sentence)
    df1 = pd.DataFrame({'Category':row['Category'], 'Question':[aug1, aug2,aug3, aug4, aug5, aug5, aug6, aug7, aug8, aug9]})
    train_df_aug = train_df_aug.append(df1)

In [28]:
# apply only word types of augments for category transmission and append to original data
for i, row in train_df_aug[train_df_aug['Category']=='Transmission '].iterrows():
#     print(row['Category'])
#     print(row['Question'])
#     df1 = pd.DataFrame()
    sentence = row['Question']
#     aug1 = char_level_typo_error(sentence)
#     aug2 = char_level_random_error(sentence)
#     aug3 = char_level_random_deletion_error(sentence)
#     aug4 = char_level_random_insertion_error(sentence)
#     aug5 = char_level_random_swap_error(sentence)
    aug6 = word_level_synonym_replacement(sentence)
    aug7 = word_level_random_apply(sentence)
    aug8 = word_level_spelling_error(sentence)
    aug9 = word_level_splitting_error(sentence)
    df1 = pd.DataFrame({'Category':row['Category'], 'Question':[ aug6,aug7, aug8, aug9]})
    train_df_aug = train_df_aug.append(df1)

In [29]:
train_df_aug

Unnamed: 0,Category,Question
0,Comparison,is covid worse then swine flu
1,Comparison,did covid exist years before scientists discov...
2,Comparison,how are covid and sars cov similar
3,Comparison,is covid more contagious than the flu
4,Comparison,is covid worse than spanish flu
...,...,...
3,Transmission,who cov id airborne
0,Transmission,what follow the risk of my child becoming sick...
1,Transmission,what the risk of child becoming sick with
2,Transmission,whay is the risck of my child becoming seek wi...


In [30]:
# drop any duplicates that might have created
train_df_aug2  =train_df_aug.drop_duplicates(['Question'])

In [31]:
train_df_aug2

Unnamed: 0,Category,Question
0,Comparison,is covid worse then swine flu
1,Comparison,did covid exist years before scientists discov...
2,Comparison,how are covid and sars cov similar
3,Comparison,is covid more contagious than the flu
4,Comparison,is covid worse than spanish flu
...,...,...
3,Transmission,who cov id airborne
0,Transmission,what follow the risk of my child becoming sick...
1,Transmission,what the risk of child becoming sick with
2,Transmission,whay is the risck of my child becoming seek wi...


In [32]:
train_df_aug1.shape

(237, 2)

In [33]:
train_df_aug.Category.unique()

array(['Comparison ', 'Economic Effects ', 'Having COVID ',
       'Individual Response ', 'Nomenclature ', 'Origin ', 'Other ',
       'Prevention ', 'Reporting ', 'Societal Effects ',
       'Societal Response ', 'Speculation ', 'Symptoms ', 'Testing ',
       'Transmission ', 'Treatment '], dtype=object)

In [34]:
train_df_aug.Category.value_counts()

Societal Effects        330
Transmission            280
Prevention              264
Societal Response       264
Origin                  220
Reporting               220
Economic Effects        165
Speculation             165
Treatment               165
Individual Response     154
Comparison              143
Testing                 132
Having COVID            121
Nomenclature            121
Symptoms                 77
Other                    66
Name: Category, dtype: int64

In [35]:
# save the dataset along with augmented dataset
train_df_aug.to_csv("./data_generated/train_data_augmented.csv")

In [36]:
print("train augmented data shape : ", train_df_aug.shape)

train augmented data shape :  (2887, 2)
