In [72]:
import os, sys

import random
import numpy as np
import pandas as pd

import nltk

from icecream import ic

In [73]:
nltk.download('omw')

[nltk_data] Downloading package omw to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\omw.zip.


True

# 1. Synonym Replacement

In [74]:
from nltk.corpus import wordnet

def get_synonyms(word):
    
    synonyms = set()
    
    for syn in wordnet.synsets(word, lang="ind"):
        for l in syn.lemmas():
            synonym = l.name().replace("_", " ").replace("-", " ").lower()
            synonym = "".join([char for char in synonym if char in ' qwertyuiopasdfghjklzxcvbnm'])
            synonyms.add(synonym) 
    if word in synonyms:
        synonyms.remove(word)
    
    return list(synonyms)

get_synonyms("kawan")

['quaker',
 'associate',
 'sister',
 'blighter',
 'chap',
 'mate',
 'flock',
 'fella',
 'lad',
 'crony',
 'pal',
 'cuss',
 'bloke',
 'bedfellow',
 'friend',
 'sidekick',
 'teammate',
 'truelove',
 'steady',
 'gent',
 'familiar',
 'sweetie',
 'acquaintance',
 'brother',
 'companion',
 'feller',
 'ally',
 'buddy',
 'chum',
 'fellow',
 'covey',
 'comrade',
 'sweetheart']

In [75]:
from nltk.corpus import stopwords
stop_words = []
for w in stopwords.words('indonesian'):
    stop_words.append(w)
# print(stop_words)

In [87]:
def synonym_replacement(words, n):
    
    words = words.split()
    
    new_words = words.copy()
    random_word_list = list(set([word for word in words if word not in stop_words]))
    # ic(random_word_list)

    random.shuffle(random_word_list)
    num_replaced = 0
    
    for random_word in random_word_list:
        synonyms = get_synonyms(random_word)
        # ic(random_word)
        # ic(synonyms)
        
        if len(synonyms) >= 1:
            synonym = random.choice(list(synonyms))
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        
        if num_replaced >= n: #only replace up to n words
            break

    sentence = ' '.join(new_words)

    return sentence

In [77]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory


factory = StemmerFactory()
stemmer = factory.create_stemmer()


In [84]:
from spacy.lang.id import Indonesian

teks = 'hai kawan bagaimana kabarmu'
# tokenizer = Indonesian()


# sent = " ".join([token.lemma_ if len(token.lemma_) > 0 else token.text for token in tokenizer(teks)])
# print(sent)
teks = stemmer.stem(teks)
print(teks)

hai kawan bagaimana kabar


In [88]:
for n in range(3):
    print(f" Example of Synonym Replacement: {synonym_replacement(teks,n)}")
    

 Example of Synonym Replacement: hai kawan bagaimana kabar
 Example of Synonym Replacement: sigh kawan bagaimana kabar
 Example of Synonym Replacement: suspire crony bagaimana kabar


In [80]:
for syn in wordnet.synsets("kawan", lang="ind"):
    print(syn)

Synset('flock.n.02')
Synset('covey.n.01')
Synset('friend.n.05')
Synset('acquaintance.n.03')
Synset('ally.n.02')
Synset('associate.n.01')
Synset('bedfellow.n.01')
Synset('brother.n.04')
Synset('buddy.n.01')
Synset('chap.n.01')
Synset('companion.n.01')
Synset('friend.n.01')
Synset('mate.n.08')
Synset('sister.n.03')
Synset('sweetheart.n.01')
Synset('teammate.n.01')


In [81]:
kawan_lemmas = wordnet.lemmas("kawan", lang="ind")
for lem in kawan_lemmas:
    print(lem.synset().hypernyms())

print(kawan_lemmas)

[Synset('animal_group.n.01')]
[Synset('gathering.n.01')]
[Synset('christian.n.01')]
[Synset('person.n.01')]
[Synset('associate.n.01')]
[Synset('peer.n.01')]
[Synset('associate.n.01')]
[Synset('friend.n.01')]
[Synset('friend.n.01')]
[Synset('male.n.02')]
[Synset('friend.n.01')]
[Synset('person.n.01')]
[Synset('friend.n.01')]
[Synset('member.n.01')]
[Synset('lover.n.01')]
[Synset('associate.n.01')]
[Lemma('flock.n.02.kawan'), Lemma('covey.n.01.kawan'), Lemma('friend.n.05.kawan'), Lemma('acquaintance.n.03.kawan'), Lemma('ally.n.02.kawan'), Lemma('associate.n.01.kawan'), Lemma('bedfellow.n.01.kawan'), Lemma('brother.n.04.kawan'), Lemma('buddy.n.01.kawan'), Lemma('chap.n.01.kawan'), Lemma('companion.n.01.kawan'), Lemma('friend.n.01.kawan'), Lemma('mate.n.08.kawan'), Lemma('sister.n.03.kawan'), Lemma('sweetheart.n.01.kawan'), Lemma('teammate.n.01.kawan')]


In [82]:
hypernyms = []
for lem in kawan_lemmas:
    hypernyms.append(lem.synset().hypernyms())
    
print(hypernyms)

[[Synset('animal_group.n.01')], [Synset('gathering.n.01')], [Synset('christian.n.01')], [Synset('person.n.01')], [Synset('associate.n.01')], [Synset('peer.n.01')], [Synset('associate.n.01')], [Synset('friend.n.01')], [Synset('friend.n.01')], [Synset('male.n.02')], [Synset('friend.n.01')], [Synset('person.n.01')], [Synset('friend.n.01')], [Synset('member.n.01')], [Synset('lover.n.01')], [Synset('associate.n.01')]]


In [83]:
for hypernym in hypernyms:
    print(hypernym[0].lemmas(lang="ind"))

[]
[Lemma('gathering.n.01.kumpulan'), Lemma('gathering.n.01.perhimpunan'), Lemma('gathering.n.01.persatuan'), Lemma('gathering.n.01.pertemuan'), Lemma('gathering.n.01.rapat'), Lemma('gathering.n.01.rapat_umum')]
[Lemma('christian.n.01.Nasrani')]
[Lemma('person.n.01.individu'), Lemma('person.n.01.insan'), Lemma('person.n.01.manusia'), Lemma('person.n.01.orang'), Lemma('person.n.01.seorang'), Lemma('person.n.01.seseorang'), Lemma('person.n.01.sukma'), Lemma('person.n.01.unik')]
[Lemma('associate.n.01.bersekutu'), Lemma('associate.n.01.kawan'), Lemma('associate.n.01.rekan'), Lemma('associate.n.01.sekutu'), Lemma('associate.n.01.teman'), Lemma('associate.n.01.teman_sejawat'), Lemma('associate.n.01.kolega')]
[Lemma('peer.n.01.sesama'), Lemma('peer.n.01.sama'), Lemma('peer.n.01.setara'), Lemma('peer.n.01.tolok')]
[Lemma('associate.n.01.bersekutu'), Lemma('associate.n.01.kawan'), Lemma('associate.n.01.rekan'), Lemma('associate.n.01.sekutu'), Lemma('associate.n.01.teman'), Lemma('associate.n.0

# 2. Random Deletion

In [92]:
def random_deletion(words, p):

    words = words.split()
    
    #obviously, if there's only one word, don't delete it
    if len(words) == 1:
        return words

    #randomly delete words with probability p
    new_words = []
    for word in words:
        r = random.uniform(0, 1)
        if r > p:
            new_words.append(word)

    #if you end up deleting all words, just return a random word
    if len(new_words) == 0:
        rand_int = random.randint(0, len(words)-1)
        return [words[rand_int]]

    sentence = ' '.join(new_words)
    
    return sentence

In [101]:
print(random_deletion(teks,0.2))
print(random_deletion(teks,0.3))
print(random_deletion(teks,0.4))

hai kawan bagaimana kabar
hai kawan bagaimana
kabar


# 3. Random Swap

In [89]:
def swap_word(new_words):
    
    random_idx_1 = random.randint(0, len(new_words)-1)
    random_idx_2 = random_idx_1
    counter = 0
    
    while random_idx_2 == random_idx_1:
        random_idx_2 = random.randint(0, len(new_words)-1)
        counter += 1
        
        if counter > 3:
            return new_words
    
    new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1] 
    return new_words

# This will Swap the words

In [90]:
def random_swap(words, n):
    
    words = words.split()
    new_words = words.copy()
    # n is the number of words to be swapped
    for _ in range(n):
        new_words = swap_word(new_words)
        
    sentence = ' '.join(new_words)
    
    return sentence

In [91]:
print(random_swap(teks,1))
print(random_swap(teks,2))
print(random_swap(teks,3))

hai kawan kabar bagaimana
kawan bagaimana hai kabar
bagaimana hai kabar kawan


# 4. Random Insertion

In [102]:
def random_insertion(words, n):
    
    words = words.split()
    new_words = words.copy()
    
    for _ in range(n):
        add_word(new_words)
        
    sentence = ' '.join(new_words)
    return sentence

def add_word(new_words):
    
    synonyms = []
    counter = 0
    
    while len(synonyms) < 1:
        random_word = new_words[random.randint(0, len(new_words)-1)]
        synonyms = get_synonyms(random_word)
        counter += 1
        if counter >= 10:
            return
        
    random_synonym = synonyms[0]
    random_idx = random.randint(0, len(new_words)-1)
    new_words.insert(random_idx, random_synonym)

In [103]:
print(random_insertion(teks,1))
print(random_insertion(teks,2))
print(random_insertion(teks,3))

hai kawan however bagaimana kabar
hai quaker kawan bagaimana quaker kabar
however sigh hai kawan however bagaimana kabar


In [105]:
def aug(sent,n,p):
    print(f" Original Sentence : {sent}")
    print(f" SR Augmented Sentence : {synonym_replacement(sent,n)}")
    print(f" RD Augmented Sentence : {random_deletion(sent,p)}")
    print(f" RS Augmented Sentence : {random_swap(sent,n)}")
    print(f" RI Augmented Sentence : {random_insertion(sent,n)}")
    
aug(teks,2,0.3)

 Original Sentence : hai kawan bagaimana kabar
 SR Augmented Sentence : sigh mate bagaimana kabar
 RD Augmented Sentence : bagaimana kabar
 RS Augmented Sentence : kawan bagaimana hai kabar
 RI Augmented Sentence : quaker hai kawan sigh bagaimana kabar
