In [23]:
import numpy as np
import nltk
from nltk.corpus import wordnet
import random


In [24]:
# Synonym replacement
from nltk.corpus import stopwords

# Load the English stop-word list once for the whole notebook.
# NLTK raises LookupError when a corpus has not been downloaded yet, so fetch
# it on demand; this keeps the cell working on a fresh kernel (Run All).
try:
    stopwords_set = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stopwords_set = set(stopwords.words('english'))

def synonym_replacement(words, n):
    """Replace up to ``n`` distinct non-stop-words with a random synonym.

    Every occurrence of a chosen word is replaced by the same synonym.
    Multi-word synonyms (e.g. "ice cream") are split into separate tokens,
    so the result may contain more tokens than the input.

    Args:
        words: List of word tokens. The list itself is not modified.
        n: Maximum number of distinct words to replace. Values <= 0 leave
            the words unchanged.

    Returns:
        A new list of tokens. Empty tokens are never produced.
    """
    new_words = list(words)
    if n > 0:
        # dict.fromkeys de-duplicates while keeping first-seen order, so a
        # seeded ``random`` yields the same shuffle on every run (iteration
        # order of a set of strings changes between interpreter sessions).
        candidates = list(dict.fromkeys(w for w in words if w not in stopwords_set))
        random.shuffle(candidates)
        num_replaced = 0
        for candidate in candidates:
            # Blank synonyms would leave empty tokens behind; ignore them.
            synonyms = [s for s in get_synonyms(candidate) if s.strip()]
            if not synonyms:
                continue
            synonym = random.choice(synonyms)
            new_words = [synonym if w == candidate else w for w in new_words]
            num_replaced += 1
            if num_replaced >= n:
                break
    # Re-tokenise so that multi-word synonyms become individual tokens;
    # split() without an argument also discards stray empty tokens.
    return ' '.join(new_words).split()


def get_synonyms(word):
    """Return the WordNet synonyms of ``word`` as a sorted list of strings.

    Each lemma name is normalised: underscores and hyphens become spaces,
    the text is lower-cased, every character other than a-z and space is
    dropped, and runs of whitespace are collapsed. Names that are empty
    after normalisation (e.g. the lemma "1") are skipped, and ``word``
    itself is never returned.

    Args:
        word: The word to look up in WordNet.

    Returns:
        A sorted list of unique synonym strings; empty if none are found.
    """
    allowed_chars = set(' qwertyuiopasdfghjklzxcvbnm')
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            text = lemma.name().replace("_", " ").replace("-", " ").lower()
            text = "".join(char for char in text if char in allowed_chars)
            # Dropping characters can leave leading/trailing/double spaces.
            text = " ".join(text.split())
            if text:
                synonyms.add(text)
    # Synonyms are lower-cased, so also exclude the lower-cased query word.
    synonyms.discard(word)
    synonyms.discard(word.lower())
    # Sorted so that seeded random choices over the result are reproducible.
    return sorted(synonyms)

# Random deletion
def random_deletion(words, p):
    """Drop each word independently with probability ``p``.

    Args:
        words: List of word tokens. The list itself is not modified.
        p: Probability of deleting any single word.

    Returns:
        A new list with the surviving words in their original order. Inputs
        with zero or one word are returned as an unchanged copy, and if every
        word would be deleted a single randomly chosen word is kept so that a
        non-empty input never yields an empty result.
    """
    # Nothing sensible to delete; also avoids random.choice([]) on empty input.
    if len(words) <= 1:
        return list(words)
    remaining = [word for word in words if random.uniform(0, 1) > p]
    if not remaining:
        return [random.choice(words)]
    return remaining

# Random swap
def random_swap(sentence, n=5):
    """Swap ``n`` randomly chosen pairs of tokens.

    Args:
        sentence: List of word tokens. The list itself is not modified.
        n: Number of swaps to perform.

    Returns:
        A new list containing the same tokens in a (possibly) different
        order. Inputs with fewer than two tokens are returned as an
        unchanged copy because there is no pair to swap.
    """
    # Work on a copy so callers that reuse their token list are not affected.
    swapped = list(sentence)
    # random.sample needs at least two positions to pick from.
    if len(swapped) < 2:
        return swapped
    positions = range(len(swapped))
    for _ in range(n):
        idx1, idx2 = random.sample(positions, 2)
        swapped[idx1], swapped[idx2] = swapped[idx2], swapped[idx1]
    return swapped


In [25]:
# Fetch the NLTK stop-word corpus that stopwords.words('english') reads.
# NOTE(review): this cell comes after the In [24] cell, which already calls
# stopwords.words('english'); consider moving the download above that cell so
# the notebook runs top to bottom on a fresh kernel.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
#pipeline

def augment_sentence(sentence, num_new_sentences=5):
    """Create augmented variants of ``sentence``.

    Each variant is produced by applying synonym replacement (n=2), random
    deletion (p=0.1) and random swap (n=2) to the original tokens, in a
    freshly shuffled order.

    Args:
        sentence: The input text; it is tokenised on whitespace.
        num_new_sentences: Number of variants to generate.

    Returns:
        A list of ``num_new_sentences`` strings. A sentence without any
        tokens yields empty strings.
    """
    # split() without an argument ignores repeated/leading/trailing spaces,
    # so no empty tokens enter the pipeline.
    words = sentence.split()
    if not words:
        return ['' for _ in range(num_new_sentences)]

    augmented_sentences = []
    for _ in range(num_new_sentences):
        # Start every variant from a fresh copy: a step may modify or return
        # the very list it was given, which would otherwise leak one
        # variant's changes into the next.
        augmented = list(words)
        steps = [
            (synonym_replacement, {'n': 2}),
            (random_deletion, {'p': 0.1}),
            (random_swap, {'n': 2}),
        ]
        random.shuffle(steps)
        for step, kwargs in steps:
            # Swapping needs at least two tokens to form a pair.
            if step is random_swap and len(augmented) < 2:
                continue
            augmented = step(augmented, **kwargs)
        augmented_sentences.append(' '.join(augmented))
    return augmented_sentences


In [29]:
# Skip this cell if the text is preprocessed elsewhere; it is included here because this notebook is a generic template.

import string

def preprocess_sentence(sentence):
    """Return ``sentence`` with ASCII punctuation removed and lower-cased."""
    # Translation table that deletes every character in string.punctuation.
    strip_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    return sentence.translate(strip_punctuation).lower()


In [31]:
data = [
    'This is the first.',
    'Here is another one.',
    'This is the third sentence.',
]

# Normalise each sentence, then gather all of its augmented variants into a
# single flat list.
augmented_data = []
for sentence in data:
    sentence = preprocess_sentence(sentence)
    augmented_data += augment_sentence(sentence)

print(augmented_data)


['this the time for the first is', 'offset the this', 'is foremost the this', 'this is the kickoff', 'first class honours degree is this', 'other is unmatchable here', 'other single is some', 'some other is here', 'other here some', 'some peerless here is', 'is the this tierce condemnation', 'is thirdly the this prison term', 'this is the thirdly prison term', 'is the this one third conviction', 'this is one the doom third']


In [28]:
# Fetch the WordNet corpus that wordnet.synsets() reads in get_synonyms.
# NOTE(review): this cell is the last one in the file, but get_synonyms is
# exercised by the In [31] cell above; consider moving the download next to
# the imports so the notebook runs top to bottom on a fresh kernel.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True