In [1]:
import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
import random
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle

[nltk_data] Downloading package stopwords to /home/rezett/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/rezett/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/rezett/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## EDA (Easy Data Augmentation) Techniques
- Random Synonym Replacement (SR)
- Random Insertion (RI)
- Random Deletion (RD)
- Random Swap (RS)

In [2]:
def synonym_replacement(originalSentence, n):
    """
        Paper Methodology -> Randomly choose n words from the sentence that are not stop words. 
                            Replace each of these words with one of its synonyms chosen at random.
        originalSentence -> The sentence on which EDA is to be applied
        n -> The number of words to be chosen for random synonym replacement
        Returns -> The sentence with up to n non stop words replaced. Fewer than n
                   replacements happen only when fewer than n candidate words have
                   a WordNet synonym.
        Raises -> ValueError if n exceeds the number of non stop words
    """
    stops = set(stopwords.words('english'))
    splitSentence = originalSentence.split(" ")
    ls_nonStopWordIndexes = [
        i for i, word in enumerate(splitSentence) if word.lower() not in stops
    ]
    if n > len(ls_nonStopWordIndexes):
        raise ValueError("The number of replacements exceeds the number of non stop word words")

    replacementsDone = 0
    # Visit the candidates in random order and keep going until n words were
    # actually replaced, so a word without synonyms does not use up a replacement.
    # Each index is visited once and a replacement occupies a single list slot,
    # so positions never shift and no copy of the word list is needed.
    for indexChosen in random.sample(ls_nonStopWordIndexes, len(ls_nonStopWordIndexes)):
        if replacementsDone >= n:
            break
        originalWord = splitSentence[indexChosen]
        loweredOriginal = originalWord.lower()
        # dict keys give an order-preserving de-duplication, so every distinct
        # synonym is equally likely to be picked.
        synonyms = {}
        for synset in wordnet.synsets(originalWord):
            for lemma in synset.lemmas():
                lemmaName = lemma.name()
                # Case-insensitive comparison: "Play" must not be "replaced" by "play".
                if lemmaName.lower() != loweredOriginal:
                    synonyms[lemmaName] = None
        if not synonyms:
            continue
        # Multi-word lemmas are joined with underscores; turn them back into spaces.
        splitSentence[indexChosen] = random.choice(list(synonyms)).replace('_', ' ')
        replacementsDone += 1
    return " ".join(splitSentence)

In [3]:
# Demo: request 2 synonym replacements. No random seed is set, so the output differs between runs.
print(synonym_replacement('I love to play football', 2))

I love life to play football game


In [4]:
def random_insertion(originalSentence, n):
    """
        Paper Methodology -> Find a random synonym of a random word in the sentence that is not a stop word. 
                            Insert that synonym into a random position in the sentence. Do this n times
        originalSentence -> The sentence on which EDA is to be applied
        n -> The number of times the process has to be repeated
        Returns -> The sentence with up to n synonyms inserted. Fewer than n insertions
                   happen only when fewer than n candidate words have a WordNet synonym.
        Raises -> ValueError if n exceeds the number of non stop words
    """
    stops = set(stopwords.words('english'))
    # Inserting shifts list positions, so source words are always read from this
    # untouched list while the insertions go into a separate working copy.
    originalWords = originalSentence.split(" ")
    splitSentence = list(originalWords)
    ls_nonStopWordIndexes = [
        i for i, word in enumerate(originalWords) if word.lower() not in stops
    ]
    if n > len(ls_nonStopWordIndexes):
        raise ValueError("The number of insertions exceeds the number of non stop word words")

    insertionsDone = 0
    # Visit the candidates in random order and keep going until n synonyms were
    # actually inserted, so a word without synonyms does not use up an insertion.
    for indexChosen in random.sample(ls_nonStopWordIndexes, len(ls_nonStopWordIndexes)):
        if insertionsDone >= n:
            break
        originalWord = originalWords[indexChosen]
        loweredOriginal = originalWord.lower()
        # dict keys give an order-preserving de-duplication of the lemma names.
        synonyms = {}
        for synset in wordnet.synsets(originalWord):
            for lemma in synset.lemmas():
                lemmaName = lemma.name()
                # Case-insensitive comparison: do not insert the word itself in another casing.
                if lemmaName.lower() != loweredOriginal:
                    synonyms[lemmaName] = None
        if not synonyms:
            continue
        # Use the CURRENT length so earlier insertions are taken into account;
        # the inclusive upper bound len(splitSentence) allows appending at the end.
        insertAt = random.randint(0, len(splitSentence))
        splitSentence.insert(insertAt, random.choice(list(synonyms)).replace('_', ' '))
        insertionsDone += 1
    return " ".join(splitSentence)

In [5]:
# Demo: request 2 synonym insertions. No random seed is set, so the output differs between runs.
print(random_insertion('I love to play football', 2))

I gambling love to have a go at it play football


In [6]:
def random_deletion(originalSentence, p):
    """
        Paper Methodology -> Randomly remove each word in the sentence with probability p.
        originalSentence -> The sentence on which EDA is to be applied
        p -> Probability of a Word Being Removed (0 <= p < 1)
        Returns -> The sentence without the removed words. If every word happened to
                   be removed, the original sentence is returned unchanged.
        Raises -> ValueError if p is 1 or lies outside [0, 1]
    """
    if p == 1:
        raise ValueError("Always an Empty String Will Be Returned")
    if p > 1 or p < 0:
        raise ValueError("Improper Probability Value")
    # One draw per word, in order. random.random() lies in [0.0, 1.0), so keeping a
    # word when draw >= p removes it with probability exactly p, and p == 0 never
    # removes anything (a `<=` comparison would delete a word on a draw of 0.0).
    lsRetainingWords = [
        word for word in originalSentence.split(" ") if random.random() >= p
    ]
    if not lsRetainingWords:
        return originalSentence
    return " ".join(lsRetainingWords)

In [7]:
# Demo: drop each word with probability 0.3. No random seed is set, so the output differs between runs.
print(random_deletion("I love to play football", 0.3))

I love to play


In [8]:
def random_swap(originalSentence, n):
    """
    Paper Methodology -> Randomly choose two words in the sentence and swap their positions. 
                        Do this n times
    originalSentence -> The sentence on which EDA is to be applied
    n -> The number of swaps to perform
    """
    tokens = originalSentence.split(" ")
    lastIndex = len(tokens) - 1
    for _ in range(n):
        left = random.randint(0, lastIndex)
        right = random.randint(0, lastIndex)
        # With a single token there is no second position to pick, so the
        # redraw is skipped and the "swap" leaves the sentence unchanged.
        if lastIndex != 0:
            while right == left:
                right = random.randint(0, lastIndex)
        tokens[left], tokens[right] = tokens[right], tokens[left]
    return " ".join(tokens)

In [9]:
# Demo: perform 2 random word swaps. No random seed is set, so the output differs between runs.
print(random_swap("I love to play football", 2))

I love football to play


In [22]:
def eda(sentence, alpha_sr=0.2, alpha_ri=0.2, alpha_rs=0.2, p_rd=0.2, num_aug=9, max_attempts=None):
    """
        Generate augmented variants of a sentence by repeatedly applying one randomly
        chosen technique (SR, RI, RD or RS) to the lower-cased sentence.
        sentence -> The sentence to augment (it is lower-cased before augmentation)
        alpha_sr -> Fraction of the word count used as n for synonym_replacement
        alpha_ri -> Fraction of the word count used as n for random_insertion
        alpha_rs -> Fraction of the word count used as n for random_swap
        p_rd -> Per-word removal probability passed to random_deletion
        num_aug -> The number of distinct augmented sentences wanted
        max_attempts -> Upper bound on technique applications; defaults to 20 * num_aug.
                        It guarantees termination for sentences that cannot be changed.
        Returns -> A list of at most num_aug distinct sentences, all different from the
                   lower-cased input, in the order they were generated. The list is
                   shorter than num_aug when the attempt budget ran out.
    """
    sentence = sentence.lower()
    words = sentence.split()
    num_words = len(words)
    if num_aug <= 0 or num_words == 0:
        # Nothing requested, or nothing to augment.
        return []

    # SR and RI raise when asked for more words than there are non stop words, so
    # cap their n and leave them out completely when there is no such word.
    stops = set(stopwords.words('english'))
    num_content_words = sum(
        1 for token in sentence.split(" ") if token and token not in stops
    )
    n_sr = min(max(1, int(alpha_sr * num_words)), num_content_words)
    n_ri = min(max(1, int(alpha_ri * num_words)), num_content_words)
    n_rs = max(1, int(alpha_rs * num_words))

    aug_techniques = ['SR', 'RI', 'RD', 'RS']
    if num_content_words == 0:
        aug_techniques = ['RD', 'RS']

    if max_attempts is None:
        max_attempts = 20 * num_aug

    # A list keeps generation order (reproducible under a fixed random seed);
    # the set gives fast duplicate detection and also rejects the input itself.
    augmented_sentences = []
    seen = {sentence}
    attempts = 0
    while len(augmented_sentences) < num_aug and attempts < max_attempts:
        attempts += 1
        techniqueChosen = random.choice(aug_techniques)
        if (techniqueChosen == 'SR'):
            aug_sentence = synonym_replacement(sentence, n_sr)
        elif (techniqueChosen == 'RI'):
            aug_sentence = random_insertion(sentence, n_ri)
        elif (techniqueChosen == 'RS'):
            aug_sentence = random_swap(sentence, n_rs)
        else:
            aug_sentence = random_deletion(sentence, p_rd)

        if aug_sentence not in seen:
            seen.add(aug_sentence)
            augmented_sentences.append(aug_sentence)

    return augmented_sentences

In [23]:
input_csv = 'final_annotations.csv'
output_csv = 'emotion_aug_data.csv'
num_aug = 4
# Upper bound on eda() calls per row. Some sentences cannot yield num_aug distinct
# augmentations, and retrying without a limit would never finish for them.
max_eda_retries = 10

# Step 1: Read the CSV data
data = pd.read_csv(input_csv)

# Step 2: Initialize list to store augmented data
augmented_data = []

# Step 3: Iterate over each row in the dataframe
for index, row in data.iterrows():
    original_text = row['text']
    emotion = row['emotion']
    augmented_texts = []

    # Append the original row
    augmented_data.append([row['id'], original_text, emotion])

    # Generate augmented sentences, keeping the largest batch seen so far
    for _ in range(max_eda_retries):
        candidate_texts = eda(original_text, num_aug=num_aug)
        if len(candidate_texts) > len(augmented_texts):
            augmented_texts = candidate_texts
        if len(augmented_texts) >= num_aug:
            break
    else:
        # Retry budget used up without reaching num_aug: keep what we have and say so.
        print(f"Warning: only {len(augmented_texts)} of {num_aug} augmentations generated for id {row['id']}")

    # Append augmented data
    for i, aug_text in enumerate(augmented_texts):
        augmented_data.append([f"{row['id']}_aug_{i+1}", aug_text, emotion])

# Step 4: Create a new dataframe with augmented data
augmented_df = pd.DataFrame(augmented_data, columns=['id', 'text', 'emotion'])

# Print the number of original samples
print(f"Number of original samples: {len(data)}")

# Print the number of augmented samples
print(f"Number of augmented samples: {len(augmented_df) - len(data)}")

# Print the total number of samples
print(f"Total number of samples: {len(augmented_df)}")

# Step 5: Save the augmented data to a new CSV file
augmented_df.to_csv(output_csv, index=False)
print(f"Augmented data saved to {output_csv}")

Number of original samples: 471
Number of augmented samples: 1884
Total number of samples: 2355
Augmented data saved to emotion_aug_data.csv
