In [1]:
# !pip install icecream
# !pip install deep_translator -q
# !pip install python-crfsuite -q
# !pip install tensorflow-hub==0.7.0 -q
# !pip install tensorflow -q
# !pip install --upgrade pip
# !pip install --upgrade "jax[cuda]"
# !pip install jaxlib
# !pip install pandarallel
# !pip install swifter

In [1]:
import os, sys
import gc
import swifter

from sklearn.metrics import accuracy_score
import time

gc.collect()
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"  

import random
import numpy as np
# import jax.numpy as np
import pandas as pd

import torch
from torch import optim
import torch.nn.functional as F
torch.cuda.empty_cache()


import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('omw')

stop_words_set = []
for w in stopwords.words('indonesian'):
    stop_words_set.append(w)

import math
import re
import copy

from operator import itemgetter
from deep_translator import GoogleTranslator

from utils.utils_init_dataset import set_seed, load_dataset_loader
from utils.utils_semantic_use import USE
from utils.utils_data_utils import DocumentSentimentDataset, DocumentSentimentDataLoader, EmotionDetectionDataset, EmotionDetectionDataLoader
from utils.utils_metrics import document_sentiment_metrics_fn
from utils.utils_init_model import text_logit, fine_tuning_model, eval_model, init_model

# debugger
from icecream import ic

from pandarallel import pandarallel
pandarallel.initialize()

from tqdm.notebook import tqdm
tqdm.pandas()
pd.set_option('display.max_colwidth', None)

device = torch.device("cuda:2" if torch.cuda.is_available() else "cuda:3")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/m13518040/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/m13518040/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw to /home/m13518040/nltk_data...
[nltk_data]   Unzipping corpora/omw.zip.


Defaulting to user installation because normal site-packages is not writeable
Instructions for updating:
non-resource variables are not supported in the long term
INFO: Pandarallel will run on 80 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
def get_synonyms(word):
    
    word_lemmas = wordnet.lemmas(word, lang="ind")
    
    hypernyms = []
    for lem in word_lemmas:
        hypernyms.append(lem.synset().hypernyms())

    if not any(hypernyms):
        return [word]
    
    lemma_corp = []
    
    for hypernym in hypernyms:
        if(len(hypernym) < 1):
            continue
        else:
            lemma_corp.append(hypernym[0].lemmas(lang="ind"))
            
    lemmas = set()
    for list_lemmas in lemma_corp:
        if(len(list_lemmas) < 1):
            lemmas.add(word)
        else:
            for l in list_lemmas:
                lemmas.add(l.name())
    
    clean_synonyms = set()
    for syn in lemmas.copy():
        synonym = syn.replace("_", " ").replace("-", " ").lower()
        synonym = "".join([char for char in synonym if char in ' qwertyuiopasdfghjklzxcvbnm'])
        clean_synonyms.add(synonym) 
    if word in clean_synonyms:
        clean_synonyms.remove(word)
    
    if len(list(clean_synonyms)) < 1:
        return [word]
    else:
        return list(clean_synonyms)

def codemix_perturbation(words, target_lang, words_perturb):
    """
    'su': 'sundanese'
    'jw': 'javanese'
    'ms': 'malay'
    'en': 'english'
    """
    
    translator = GoogleTranslator(source="id", target=target_lang)
    
    supported_langs = ["su", "jw", "ms", "en"]
    
    if target_lang not in supported_langs:
        raise ValueError('Language Unavailable')
    
    new_words = words.copy()
    
    if len(words_perturb) >= 1:
        for perturb_word in words_perturb:
            new_words = [translator.translate(word) if word == perturb_word[1] and word.isalpha() else word for word in new_words]

    sentence = ' '.join(new_words)

    return sentence

def synonym_replacement(words, words_perturb):    
    new_words = words.copy()
       
    if len(words_perturb) >= 1:
        for perturb_word in words_perturb:
            new_words = [get_synonyms(word)[0] if word == perturb_word[1] and word.isalpha() else word for word in new_words]

    sentence = ' '.join(new_words)
    
    return sentence

# fungsi untuk mencari kandidat lain ketika sebuah kandidat perturbasi kurang dari sim_score_threshold
def swap_minimum_importance_words(words_perturb, top_importance_words):
    def get_minimums(word_tups):
        arr = []
        for wt in word_tups:
            if wt[2] == min(top_importance_words, key = lambda t: t[2])[2]:
                arr.append(wt)
        return arr
    minimum_import = get_minimums(top_importance_words)
    unlisted = list(set(words_perturb).symmetric_difference(set(top_importance_words)))

    len_wp = len(top_importance_words)
    len_ul = len(unlisted)
    
    res = []
    for i in range(len_wp):
        if top_importance_words[i] in minimum_import:
            temp_wp = list(copy.deepcopy(top_importance_words))
            temp_wp.pop(i)
            swapped_wp = np.array([(temp_wp) for i in range(len_ul)])
            for j in range(len(swapped_wp)):
                temp_sm = np.vstack((swapped_wp[j], tuple(unlisted[j])))
                
                res.append(temp_sm.tolist())
                
    return res

def logit_prob(text_ls, predictor, tokenizer):
    original_text = text_ls
    subwords = tokenizer.encode(text_ls)
    subwords = torch.LongTensor(subwords).view(1, -1).to(predictor.device)

    logits = predictor(subwords)[0]
    orig_label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()
    
    orig_probs = F.softmax(logits, dim=-1).squeeze()
    orig_prob = F.softmax(logits, dim=-1).squeeze()[orig_label].detach().cpu().numpy()
    
    return orig_label, orig_probs, orig_prob

def attack(text_ls,
           true_label,
           predictor,
           tokenizer,
           att_ratio,
           attack_strategy,
           lang_codemix=None,
           sim_predictor=None,
           sim_score_threshold=0.5,
           sim_score_window=15,
           batch_size=32, 
           import_score_threshold=-1.):
    
    start_time = time.time()
    
    label_dict = {
        'positive': 0, 
        'neutral': 1, 
        'negative': 2}
    
    original_text = text_ls
    orig_label, orig_probs, orig_prob = logit_prob(text_ls, predictor, tokenizer)
        
#     SEK SALAAHHHHHH
    if true_label != orig_label:
        running_time = round(time.time() - start_time, 2)
        # perturbed_text, perturbed_semantic_sim, orig_label, orig_prob, perturbed_label, perturbed_prob, running_time
        return original_text, 1.000, orig_label, orig_prob, orig_label, orig_prob, running_time
    else:
        text_ls = word_tokenize(text_ls)
        text_ls = [word for word in text_ls if word.isalnum()]
        len_text = len(text_ls)
        half_sim_score_window = (sim_score_window - 1) // 2
        # num_queries = 1
        
        leave_1_texts = [' '.join(text_ls[:ii] + [tokenizer.mask_token] + text_ls[min(ii + 1, len_text):]) for ii in range(len_text)]
                
        leave_1_probs = []
        leave_1_probs_argmax = []
        # num_queries += len(leave_1_texts)
        for text_leave_1 in leave_1_texts:
            subwords_leave_1 = tokenizer.encode(text_leave_1)
            subwords_leave_1 = torch.LongTensor(subwords_leave_1).view(1, -1).to(predictor.device)
            logits_leave_1 = predictor(subwords_leave_1)[0]
            orig_label_leave_1 = torch.topk(logits_leave_1, k=1, dim=-1)[1].squeeze().item()
            
            leave_1_probs_argmax.append(orig_label_leave_1)
            leave_1_probs.append(F.softmax(logits_leave_1, dim=-1).squeeze().detach().cpu().numpy())
            
        leave_1_probs = torch.tensor(leave_1_probs).to("cuda:1")
        
        orig_prob_extended=np.empty(len_text)
        orig_prob_extended.fill(orig_prob)
        orig_prob_extended = torch.tensor(orig_prob_extended).to("cuda:1")
        
        arr1 = orig_prob_extended - leave_1_probs[:,orig_label] + float(leave_1_probs_argmax != orig_label)
        arr2 = (leave_1_probs.max(dim=-1)[0].to("cuda:1") - orig_probs[leave_1_probs_argmax].to("cuda:1"))
        
        import_scores = arr1*arr2
        import_scores = [im * -1 for im in import_scores]
        
        words_perturb = []
        for idx, score in sorted(enumerate(import_scores), key=lambda x: x[1], reverse=True):
            try:
                if score > import_score_threshold and text_ls[idx] not in stop_words_set:
                    words_perturb.append((idx, text_ls[idx], score.item()))
            except Exception as e:
                print(e)
                print(idx, len(text_ls), import_scores.shape, text_ls, len(leave_1_texts))
        
        num_perturbation = math.floor(len(words_perturb)*att_ratio)
        
#       top words perturb berisi list kata terpenting yang tidak akan diswitch ketika first_codemix_sim_score < sim_score_threshold
        top_words_perturb = words_perturb[:num_perturbation]
        
        
        if attack_strategy == "codemixing":
            perturbed_text = codemix_perturbation(text_ls, lang_codemix, words_perturb)
        elif attack_strategy == "synonym_replacement":
            perturbed_text = synonym_replacement(text_ls, words_perturb)
        
        first_perturbation_sim_score = sim_predictor.semantic_sim(original_text, perturbed_text)
        first_perturbation_sim_score = 0.4        
#       cek semantic similarity
#       kalo top wordsnya cuma 1 diskip
        if len(top_words_perturb) > 1:
            words_perturb_candidates = []
            if first_perturbation_sim_score < sim_score_threshold:
                words_perturb_candidates.append(top_words_perturb)
                swapped = swap_minimum_importance_words(words_perturb, top_words_perturb)
                for s in swapped:
                    words_perturb_candidates.append(s)

                words_perturb_candidates = [[tuple(w) for w in wpc] for wpc in words_perturb_candidates]

                candidate_comparison = {}
                for wpc in words_perturb_candidates:
                    if attack_strategy == "codemixing":
                        perturbed_candidate = codemix_perturbation(text_ls, lang_codemix, words_perturb)
                    elif attack_strategy == "synonym_replacement":
                        perturbed_candidate = synonym_replacement(text_ls, words_perturb)
                    
                    perturbed_candidate_sim_score = sim_predictor.semantic_sim(original_text, perturbed_candidate)
                    candidate_comparison[perturbed_candidate] = (perturbed_candidate_sim_score, wpc[-1][-1])

                sorted_candidate_comparison = sorted(candidate_comparison.keys(), key=lambda x: (candidate_comparison[x][0], candidate_comparison[x][1]), reverse=True)
                perturbed_text = sorted_candidate_comparison[0]
        else:
            if first_perturbation_sim_score < sim_score_threshold:
                perturbed_text = original_text
        
        perturbed_semantic_sim = sim_predictor.semantic_sim(original_text, perturbed_text)
        if perturbed_semantic_sim < sim_score_threshold:
            perturbed_text = original_text
            perturbed_semantic_sim = 1.000
        
        perturbed_label, perturbed_probs, perturbed_prob = logit_prob(perturbed_text, predictor, tokenizer)
        
        running_time = round(time.time() - start_time, 2)
        
        return perturbed_text, perturbed_semantic_sim, orig_label, orig_prob, perturbed_label, perturbed_prob, running_time
    
def load_word_index(downstream_task):
    w2i, i2w = None, None
    if downstream_task == 'sentiment':
        w2i, i2w = DocumentSentimentDataset.LABEL2INDEX, DocumentSentimentDataset.INDEX2LABEL
        return w2i, i2w
    elif downstream_task == 'emotion':
        w2i, i2w = EmotionDetectionDataset.LABEL2INDEX, EmotionDetectionDataset.INDEX2LABEL
        return w2i, i2w
    else:
        return None

In [4]:
# dataset: https://huggingface.co/datasets/indonlu
# saved variables:
# %pert
# logit prob before v
# logit prob after v
# prediction before v
# prediction after v
# running time
# semantic sim v
# adv training score

In [9]:
def main(
    model_target,
    downstream_task,
    attack_strategy,
    perturbation_technique,
    perturb_ratio,
    finetune_epoch,
    num_sample,
    result_file,
    seed=26092020
):
    set_seed(seed)
    use = USE()

    tokenizer, config, model = init_model(model_target, downstream_task)
    w2i, i2w = load_word_index(downstream_task)
    
    train_dataset, train_loader, train_path = load_dataset_loader(downstream_task, 'train', tokenizer)
    valid_dataset, valid_loader, valid_path = load_dataset_loader(downstream_task, 'valid', tokenizer)
    test_dataset, test_loader, test_path = load_dataset_loader(downstream_task, 'test', tokenizer)

    finetuned_model = fine_tuning_model(model, i2w, train_loader, valid_loader, 5)

    exp_dataset = valid_dataset.load_dataset(valid_path).head(num_sample)

    text,label = None,None
    if downstream_task == 'sentiment':
        text = 'text'
        label = 'sentiment'
        exp_dataset[['perturbed_text', 'perturbed_semantic_sim', 'orig_label', 'orig_prob', 'perturbed_label', 'perturbed_prob', 'running_time(s)']] = exp_dataset.swifter.apply(
            lambda row: attack(
                row.text,
                row.sentiment,
                finetuned_model,
                tokenizer, 0.2,
                "codemixing",
                "en",
                use), axis=1, result_type='expand'
        )
    elif downstream_task == 'emotion':
        text = 'tweet'
        label = 'label'
        exp_dataset[['perturbed_text', 'perturbed_semantic_sim', 'orig_label', 'orig_prob', 'perturbed_label', 'perturbed_prob', 'running_time(s)']] = exp_dataset.swifter.apply(
            lambda row: attack(
                row.tweet,
                row.label,
                finetuned_model,
                tokenizer, 0.2,
                "codemixing",
                "en",
                use), axis=1, result_type='expand'
        )

    before_attack = accuracy_score(exp_dataset[label], exp_dataset['orig_label'])
    after_attack = accuracy_score(exp_dataset[label], exp_dataset['perturbed_label'])

    exp_dataset.loc[exp_dataset.index[0], 'before_attack_acc'] = before_attack
    exp_dataset.loc[exp_dataset.index[0], 'after_attack_acc'] = after_attack
    exp_dataset.to_csv(os.getcwd() + r'/result/'+result_file+".csv", index=False)        
    
if __name__ == "__main__":
    main(
        model_target="IndoBERT", # IndoBERT, XLM-R, mBERT
        downstream_task="emotion", # sentiment, emotion
        attack_strategy="synonym_replacement", # codemixing, synonym replacement
        perturbation_technique="adversarial", # adversarial, random
        perturb_ratio=0.2, # 0.2, 0.4, 0.6, 0.8
        finetune_epoch=5,
        num_sample=2,
        result_file="test-indobert-emotion-synonym_replacement-adversarial-0.2",
        seed=26092020
    )


2022-02-27 11:21:13.227499: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1850] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
(Epoch 1) TRAIN LOSS:1.4313 LR:0.00000300: 100%|██████████████████| 111/111 [00:24<00:00,  4.46it/s]


(Epoch 1) TRAIN LOSS:1.4313 ACC:0.40 F1:0.34 REC:0.36 PRE:0.41 LR:0.00000300


VALID LOSS:1.2297 ACC:0.54 F1:0.52 REC:0.52 PRE:0.57: 100%|█████████| 14/14 [00:02<00:00,  5.10it/s]


(Epoch 1) VALID LOSS:1.2297 ACC:0.54 F1:0.52 REC:0.52 PRE:0.57


(Epoch 2) TRAIN LOSS:1.0873 LR:0.00000300: 100%|██████████████████| 111/111 [00:24<00:00,  4.48it/s]


(Epoch 2) TRAIN LOSS:1.0873 ACC:0.60 F1:0.58 REC:0.58 PRE:0.60 LR:0.00000300


VALID LOSS:0.9515 ACC:0.62 F1:0.61 REC:0.62 PRE:0.61: 100%|█████████| 14/14 [00:02<00:00,  5.10it/s]


(Epoch 2) VALID LOSS:0.9515 ACC:0.62 F1:0.61 REC:0.62 PRE:0.61


(Epoch 3) TRAIN LOSS:0.8165 LR:0.00000300: 100%|██████████████████| 111/111 [00:24<00:00,  4.51it/s]


(Epoch 3) TRAIN LOSS:0.8165 ACC:0.71 F1:0.71 REC:0.71 PRE:0.72 LR:0.00000300


VALID LOSS:0.8281 ACC:0.68 F1:0.67 REC:0.68 PRE:0.69: 100%|█████████| 14/14 [00:02<00:00,  5.13it/s]


(Epoch 3) VALID LOSS:0.8281 ACC:0.68 F1:0.67 REC:0.68 PRE:0.69


(Epoch 4) TRAIN LOSS:0.6862 LR:0.00000300: 100%|██████████████████| 111/111 [00:24<00:00,  4.50it/s]


(Epoch 4) TRAIN LOSS:0.6862 ACC:0.77 F1:0.77 REC:0.77 PRE:0.78 LR:0.00000300


VALID LOSS:0.7716 ACC:0.71 F1:0.72 REC:0.72 PRE:0.72: 100%|█████████| 14/14 [00:02<00:00,  5.09it/s]


(Epoch 4) VALID LOSS:0.7716 ACC:0.71 F1:0.72 REC:0.72 PRE:0.72


(Epoch 5) TRAIN LOSS:0.5650 LR:0.00000300: 100%|██████████████████| 111/111 [00:24<00:00,  4.53it/s]


(Epoch 5) TRAIN LOSS:0.5650 ACC:0.81 F1:0.81 REC:0.81 PRE:0.82 LR:0.00000300


VALID LOSS:0.7707 ACC:0.70 F1:0.70 REC:0.70 PRE:0.71: 100%|█████████| 14/14 [00:02<00:00,  5.07it/s]


(Epoch 5) VALID LOSS:0.7707 ACC:0.70 F1:0.70 REC:0.70 PRE:0.71


Pandas Apply:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
!nvidia-smi

Sun Feb 27 11:28:59 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.119.04   Driver Version: 450.119.04   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:06:00.0 Off |                    0 |
| N/A   39C    P0    56W / 300W |  31398MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  On   | 00000000:07:00.0 Off |                    0 |
| N/A   41C    P0    58W / 300W |   5400MiB / 32510MiB |      0%      Default |
|       

# Emotion

In [3]:
# {0: 'sadness', 1: 'anger', 2: 'love', 3: 'fear', 4: 'happy'}
set_seed(26092020)

use = USE()

print("\nModel initialization..")
downstream_task = "emotion"
tokenizer, config, model = init_model("IndoBERT", downstream_task)
w2i, i2w = load_word_index(downstream_task)

print("\nLoading dataset..")
train_dataset, train_loader, train_path = load_dataset_loader(downstream_task, 'train', tokenizer)
valid_dataset, valid_loader, valid_path = load_dataset_loader(downstream_task, 'valid', tokenizer)
test_dataset, test_loader, test_path_ = load_dataset_loader(downstream_task, 'test', tokenizer)

text0 = 'lokasi di alun alun masakan padang ini cukup terkenal dengan kepala ikan kakap gule , biasa saya pesan nasi bungkus padang berisikan rendang , ayam pop dan perkedel . porsi banyak dan mengenyangkan'
text1 = 'meski masa kampanye sudah selesai , bukan berati habis pula upaya mengerek tingkat kedipilihan elektabilitas .'
text2 = 'kamar nya sempit tidak ada tempat menyimpan barang malah menambah barang . by the way ini kipas2 mau diletakkan mana . mana uchiwa segede ini pula .'


print("\nTest initial model on sample text..")
text_logit(text0, model, tokenizer, i2w)
text_logit(text1, model, tokenizer, i2w)
text_logit(text2, model, tokenizer, i2w)

print("\nModel finetuning...")
finetuned_model = fine_tuning_model(model, i2w, train_loader, valid_loader, 5)
del model

INFO:absl:Using /tmp/tfhub_modules to cache modules.
2022-02-27 11:15:59.592851: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcusolver.so.11'; dlerror: libcusolver.so.11: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib:/usr/local/cuda/lib
2022-02-27 11:15:59.593521: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1850] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2022-02-27 11:16:00.606676: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1850] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensor


Model initialization..


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Loading dataset..

Test initial model on sample text..

Model finetuning...


(Epoch 1) TRAIN LOSS:1.4313 LR:0.00000300: 100%|██████████████████| 111/111 [00:24<00:00,  4.47it/s]


(Epoch 1) TRAIN LOSS:1.4313 ACC:0.40 F1:0.34 REC:0.36 PRE:0.41 LR:0.00000300


VALID LOSS:1.2297 ACC:0.54 F1:0.52 REC:0.52 PRE:0.57: 100%|█████████| 14/14 [00:02<00:00,  5.79it/s]


(Epoch 1) VALID LOSS:1.2297 ACC:0.54 F1:0.52 REC:0.52 PRE:0.57


(Epoch 2) TRAIN LOSS:1.0873 LR:0.00000300: 100%|██████████████████| 111/111 [00:24<00:00,  4.49it/s]


(Epoch 2) TRAIN LOSS:1.0873 ACC:0.60 F1:0.58 REC:0.58 PRE:0.60 LR:0.00000300


VALID LOSS:0.9515 ACC:0.62 F1:0.61 REC:0.62 PRE:0.61: 100%|█████████| 14/14 [00:02<00:00,  6.19it/s]


(Epoch 2) VALID LOSS:0.9515 ACC:0.62 F1:0.61 REC:0.62 PRE:0.61


(Epoch 3) TRAIN LOSS:0.8165 LR:0.00000300: 100%|██████████████████| 111/111 [00:24<00:00,  4.51it/s]


(Epoch 3) TRAIN LOSS:0.8165 ACC:0.71 F1:0.71 REC:0.71 PRE:0.72 LR:0.00000300


VALID LOSS:0.8281 ACC:0.68 F1:0.67 REC:0.68 PRE:0.69: 100%|█████████| 14/14 [00:02<00:00,  6.29it/s]


(Epoch 3) VALID LOSS:0.8281 ACC:0.68 F1:0.67 REC:0.68 PRE:0.69


(Epoch 4) TRAIN LOSS:0.6862 LR:0.00000300: 100%|██████████████████| 111/111 [00:24<00:00,  4.55it/s]


(Epoch 4) TRAIN LOSS:0.6862 ACC:0.77 F1:0.77 REC:0.77 PRE:0.78 LR:0.00000300


VALID LOSS:0.7716 ACC:0.71 F1:0.72 REC:0.72 PRE:0.72: 100%|█████████| 14/14 [00:02<00:00,  6.28it/s]


(Epoch 4) VALID LOSS:0.7716 ACC:0.71 F1:0.72 REC:0.72 PRE:0.72


(Epoch 5) TRAIN LOSS:0.5650 LR:0.00000300: 100%|██████████████████| 111/111 [00:24<00:00,  4.52it/s]


(Epoch 5) TRAIN LOSS:0.5650 ACC:0.81 F1:0.81 REC:0.81 PRE:0.82 LR:0.00000300


VALID LOSS:0.7707 ACC:0.70 F1:0.70 REC:0.70 PRE:0.71: 100%|█████████| 14/14 [00:02<00:00,  6.30it/s]

(Epoch 5) VALID LOSS:0.7707 ACC:0.70 F1:0.70 REC:0.70 PRE:0.71





In [6]:
exp_dataset = valid_dataset.load_dataset(valid_path).head(2)
exp_dataset

Unnamed: 0,label,tweet
0,1,[USERNAME] jaringannya mati ya? Tidak bisa dibuka mobile jkn. Saya mau ke puskesmes trus piye mau tunjukkan kartu elektoniknya? #kecewa
1,1,"It's like a circle of stupidity. Atlit gak diurusin, duitnya buat bebenah ini itu biar atlit yang dateng dari luar negri bisa nyaman disini tp atlit sendiri kurang gizi. Belum beres bebenah eh salah satu stadion dirusak ""supporter"" karna timnya (atlit) main jelek. What a moron."


In [7]:
exp_dataset[['perturbed_text', 'perturbed_semantic_sim', 'orig_label', 'orig_prob', 'perturbed_label', 'perturbed_prob', 'running_time(s)']] = exp_dataset.apply(
    lambda row: attack(
        row.tweet,
        row.label,
        finetuned_model,
        tokenizer, 0.2,
        "codemixing",
        "en",
        use), axis=1, result_type='expand'
)
exp_dataset

  leave_1_probs = torch.tensor(leave_1_probs).to("cuda:1")
INFO:absl:Unable to initialize backend 'tpu_driver': NOT_FOUND: Unable to find driver in registry given worker: 
INFO:absl:Unable to initialize backend 'gpu': NOT_FOUND: Could not find registered platform with name: "cuda". Available platform names are: Interpreter Host
INFO:absl:Unable to initialize backend 'tpu': INVALID_ARGUMENT: TpuPlatform is not available.


Unnamed: 0,label,tweet,perturbed_text,perturbed_semantic_sim,orig_label,orig_prob,perturbed_label,perturbed_prob,running_time(s)
0,1,[USERNAME] jaringannya mati ya? Tidak bisa dibuka mobile jkn. Saya mau ke puskesmes trus piye mau tunjukkan kartu elektoniknya? #kecewa,[USERNAME] jaringannya mati ya? Tidak bisa dibuka mobile jkn. Saya mau ke puskesmes trus piye mau tunjukkan kartu elektoniknya? #kecewa,1.0,1,0.83189756,1,0.83189756,63.49
1,1,"It's like a circle of stupidity. Atlit gak diurusin, duitnya buat bebenah ini itu biar atlit yang dateng dari luar negri bisa nyaman disini tp atlit sendiri kurang gizi. Belum beres bebenah eh salah satu stadion dirusak ""supporter"" karna timnya (atlit) main jelek. What a moron.","It's like a circle of stupidity. Atlit gak diurusin, duitnya buat bebenah ini itu biar atlit yang dateng dari luar negri bisa nyaman disini tp atlit sendiri kurang gizi. Belum beres bebenah eh salah satu stadion dirusak ""supporter"" karna timnya (atlit) main jelek. What a moron.",1.0,0,0.41411155,0,0.41411155,0.02


In [8]:
# exp_dataset = exp_dataset.append([{'before_attack_acc':32, 'after_attack_acc':43}], ignore_index=True)
# exp_dataset.loc[exp_dataset.before_attack_acc == 0, "before_attack_acc"] = 92
# exp_dataset.loc[exp_dataset.after_attack_acc == 0, "after_attack_acc"] = 91
from sklearn.metrics import accuracy_score


before_attack = accuracy_score(exp_dataset['label'], exp_dataset['orig_label'])
after_attack = accuracy_score(exp_dataset['label'], exp_dataset['perturbed_label'])

exp_dataset.loc[exp_dataset.index[0], 'before_attack_acc'] = before_attack
exp_dataset.loc[exp_dataset.index[0], 'after_attack_acc'] = after_attack
exp_dataset

Unnamed: 0,label,tweet,perturbed_text,perturbed_semantic_sim,orig_label,orig_prob,perturbed_label,perturbed_prob,running_time(s),before_attack_acc,after_attack_acc
0,1,[USERNAME] jaringannya mati ya? Tidak bisa dibuka mobile jkn. Saya mau ke puskesmes trus piye mau tunjukkan kartu elektoniknya? #kecewa,[USERNAME] jaringannya mati ya? Tidak bisa dibuka mobile jkn. Saya mau ke puskesmes trus piye mau tunjukkan kartu elektoniknya? #kecewa,1.0,1,0.83189756,1,0.83189756,63.49,0.5,0.5
1,1,"It's like a circle of stupidity. Atlit gak diurusin, duitnya buat bebenah ini itu biar atlit yang dateng dari luar negri bisa nyaman disini tp atlit sendiri kurang gizi. Belum beres bebenah eh salah satu stadion dirusak ""supporter"" karna timnya (atlit) main jelek. What a moron.","It's like a circle of stupidity. Atlit gak diurusin, duitnya buat bebenah ini itu biar atlit yang dateng dari luar negri bisa nyaman disini tp atlit sendiri kurang gizi. Belum beres bebenah eh salah satu stadion dirusak ""supporter"" karna timnya (atlit) main jelek. What a moron.",1.0,0,0.41411155,0,0.41411155,0.02,,


# Sentiment

In [None]:
# def main():
set_seed(26092020)

use = USE()

print("\nModel initialization..")
tokenizer, config, model = init_model("IndoBERT", 'sentiment')
w2i, i2w = DocumentSentimentDataset.LABEL2INDEX, DocumentSentimentDataset.INDEX2LABEL

print("\nLoading dataset..")
train_dataset, train_loader, train_path = load_dataset_loader('sentiment', 'train', tokenizer)
valid_dataset, valid_loader, valid_path = load_dataset_loader('sentiment', 'valid', tokenizer)
test_dataset, test_loader, test_path_ = load_dataset_loader('sentiment', 'test', tokenizer)

text0 = 'lokasi di alun alun masakan padang ini cukup terkenal dengan kepala ikan kakap gule , biasa saya pesan nasi bungkus padang berisikan rendang , ayam pop dan perkedel . porsi banyak dan mengenyangkan'
text1 = 'meski masa kampanye sudah selesai , bukan berati habis pula upaya mengerek tingkat kedipilihan elektabilitas .'
text2 = 'kamar nya sempit tidak ada tempat menyimpan barang malah menambah barang . by the way ini kipas2 mau diletakkan mana . mana uchiwa segede ini pula .'


print("\nTest initial model on sample text..")
text_logit(text0, model, tokenizer, i2w)
text_logit(text1, model, tokenizer, i2w)
text_logit(text2, model, tokenizer, i2w)

print("\nModel finetuning...")
finetuned_model = fine_tuning_model(model, i2w, train_loader, valid_loader, 5)
del model


print("\nTest finetuned model on sample text..")
text_logit(text0, finetuned_model, tokenizer, i2w)
text_logit(text1, finetuned_model, tokenizer, i2w)
text_logit(text2, finetuned_model, tokenizer, i2w)



# if __name__ == "__main__":
#     main()

In [4]:
# dset['val'] = dset['val'].map(perturb_fb, num_proc=40)

exp_dataset = valid_dataset.load_dataset(valid_path).head(2)
exp_dataset

Unnamed: 0,label,tweet
0,1,[USERNAME] jaringannya mati ya? Tidak bisa dibuka mobile jkn. Saya mau ke puskesmes trus piye mau tunjukkan kartu elektoniknya? #kecewa
1,1,"It's like a circle of stupidity. Atlit gak diurusin, duitnya buat bebenah ini itu biar atlit yang dateng dari luar negri bisa nyaman disini tp atlit sendiri kurang gizi. Belum beres bebenah eh salah satu stadion dirusak ""supporter"" karna timnya (atlit) main jelek. What a moron."


In [5]:
exp_dataset[['perturbed_text', 'perturbed_semantic_sim', 'orig_label', 'orig_prob', 'perturbed_label', 'perturbed_prob', 'running_time(s)']] = exp_dataset.apply(
    lambda row: attack(
        row.text,
        row.sentiment,
        finetuned_model,
        tokenizer, 0.2,
        "codemixing",
        "en",
        use), axis=1, result_type='expand'
)
exp_dataset

AttributeError: 'Series' object has no attribute 'text'

In [None]:
# exp_dataset = exp_dataset.append([{'before_attack_acc':32, 'after_attack_acc':43}], ignore_index=True)
# exp_dataset.loc[exp_dataset.before_attack_acc == 0, "before_attack_acc"] = 92
# exp_dataset.loc[exp_dataset.after_attack_acc == 0, "after_attack_acc"] = 91
from sklearn.metrics import accuracy_score

downstream_task = 'sentiment'

before_attack = accuracy_score(exp_dataset[downstream_task], exp_dataset['orig_label'])
after_attack = accuracy_score(exp_dataset[downstream_task], exp_dataset['perturbed_label'])

exp_dataset.loc[exp_dataset.index[0], 'before_attack_acc'] = before_attack
exp_dataset.loc[exp_dataset.index[0], 'after_attack_acc'] = after_attack
exp_dataset

In [12]:
filename = 'test'
exp_dataset.to_csv(os.getcwd() + r'/result/'+filename+".csv", index=False)

In [None]:
print("\nAttacking text using codemixing...")
# perturbed_text, perturbed_semantic_sim, orig_prob, perturbed_prob
codemixed0 = attack(text0,0, finetuned_model, tokenizer, 0.2, 'jw', "codemixing", sim_predictor=use)
codemixed1 = attack(text1,1, finetuned_model, tokenizer, 0.2, 'en', "codemixing", sim_predictor=use)
codemixed2 = attack(text2,2, finetuned_model, tokenizer, 0.2, 'su', "codemixing", sim_predictor=use)

print("\nCalculating logit on codemixed data...")
text_logit(codemixed0, finetuned_model, tokenizer, i2w)
text_logit(codemixed1, finetuned_model, tokenizer, i2w)
text_logit(codemixed2, finetuned_model, tokenizer, i2w)

print("\nCalculating similarity score...")
print(use.semantic_sim(text0, codemixed0))
print(use.semantic_sim(text1, codemixed1))
print(use.semantic_sim(text2, codemixed2))

In [None]:
codemixed1 = attack(text1,1, finetuned_model, tokenizer, 0.2, 'en', "codemixing", sim_predictor=use)
ic(codemixed1)
text_logit(codemixed1, finetuned_model, tokenizer, i2w)

In [17]:
print("\nAttacking text using synonym replacement...")
codemixed0 = attack(text0,0, finetuned_model, tokenizer, 0.2, 'id', "synonym_replacement", sim_predictor=use)
codemixed1 = attack(text1,1, finetuned_model, tokenizer, 0.2, 'id', "synonym_replacement", sim_predictor=use)
codemixed2 = attack(text2,2, finetuned_model, tokenizer, 0.2, 'id', "synonym_replacement", sim_predictor=use)

print("\nCalculating logit on codemixed data...")
text_logit(codemixed0, finetuned_model, tokenizer, i2w)
text_logit(codemixed1, finetuned_model, tokenizer, i2w)
text_logit(codemixed2, finetuned_model, tokenizer, i2w)

print("\nCalculating similarity score...")
print(use.semantic_sim(text0, codemixed0))
print(use.semantic_sim(text1, codemixed1))
print(use.semantic_sim(text2, codemixed2))


Attacking text using synonym replacement...

Calculating logit on codemixed data...
Text: pendudukan di gelombang gelombang minuman keras stadion ini cukup tokoh dengan kewibawaan manusia ikan laut gule biasa saya surat biji bijian menubuhkan stadion berisikan rendang daging pop dan perkedel porsi banyak dan berhasil | Label : positive (97.747%)
Text: meski masa kampanye sudah selesai , bukan berati habis pula upaya mengerek tingkat kedipilihan elektabilitas . | Label : neutral (83.620%)
Text: bilik nya sempit tidak ada tempat menyarangkan keberadaan malah menceritakan keberadaan by the way ini kipas2 mau diletakkan mana mana uchiwa segede ini pula | Label : negative (98.721%)

Calculating similarity score...
0.8651537
1.0
0.9570405


In [18]:
print("\nAttacking text using synonym replacement...")
codemixed0 = attack(text0,0, finetuned_model, tokenizer, 0.4, 'id', "synonym_replacement", sim_predictor=use)
codemixed1 = attack(text1,1, finetuned_model, tokenizer, 0.4, 'id', "synonym_replacement", sim_predictor=use)
codemixed2 = attack(text2,2, finetuned_model, tokenizer, 0.4, 'id', "synonym_replacement", sim_predictor=use)

print("\nCalculating logit on synonym replaced data...")
text_logit(codemixed0, finetuned_model, tokenizer, i2w)
text_logit(codemixed1, finetuned_model, tokenizer, i2w)
text_logit(codemixed2, finetuned_model, tokenizer, i2w)

print("\nCalculating similarity score...")
print(use.semantic_sim(text0, codemixed0))
print(use.semantic_sim(text1, codemixed1))
print(use.semantic_sim(text2, codemixed2))


Attacking text using synonym replacement...

Calculating logit on synonym replaced data...
Text: pendudukan di gelombang gelombang minuman keras stadion ini cukup tokoh dengan kewibawaan manusia ikan laut gule biasa saya surat biji bijian menubuhkan stadion berisikan rendang daging pop dan perkedel porsi banyak dan berhasil | Label : positive (97.747%)
Text: meski masa usaha niaga sudah melengkapi bukan berati memenatkan pula pendudukan membesarkan menyangga kedipilihan elektabilitas | Label : neutral (59.589%)
Text: bilik nya sempit tidak ada tempat menyarangkan keberadaan malah menceritakan keberadaan by the way ini kipas2 mau diletakkan mana mana uchiwa segede ini pula | Label : negative (98.721%)

Calculating similarity score...
0.8651537
0.8801627
0.9570405


In [19]:
print("\nAttacking text using synonym replacement...")
codemixed0 = attack(text0,0, finetuned_model, tokenizer, 0.6, 'id', "synonym_replacement", sim_predictor=use)
codemixed1 = attack(text1,1, finetuned_model, tokenizer, 0.6, 'id', "synonym_replacement", sim_predictor=use)
codemixed2 = attack(text2,2, finetuned_model, tokenizer, 0.6, 'id', "synonym_replacement", sim_predictor=use)

print("\nCalculating logit on synonym replaced data...")
text_logit(codemixed0, finetuned_model, tokenizer, i2w)
text_logit(codemixed1, finetuned_model, tokenizer, i2w)
text_logit(codemixed2, finetuned_model, tokenizer, i2w)

print("\nCalculating similarity score...")
print(use.semantic_sim(text0, codemixed0))
print(use.semantic_sim(text1, codemixed1))
print(use.semantic_sim(text2, codemixed2))


Attacking text using synonym replacement...

Calculating logit on synonym replaced data...
Text: pendudukan di gelombang gelombang minuman keras stadion ini cukup tokoh dengan kewibawaan manusia ikan laut gule biasa saya surat biji bijian menubuhkan stadion berisikan rendang daging pop dan perkedel porsi banyak dan berhasil | Label : positive (97.747%)
Text: meski masa usaha niaga sudah melengkapi bukan berati memenatkan pula pendudukan membesarkan menyangga kedipilihan elektabilitas | Label : neutral (59.589%)
Text: bilik nya sempit tidak ada tempat menyarangkan keberadaan malah menceritakan keberadaan by the way ini kipas2 mau diletakkan mana mana uchiwa segede ini pula | Label : negative (98.721%)

Calculating similarity score...
0.8651537
0.8801627
0.9570405


In [20]:
print("\nAttacking text using synonym replacement...")
codemixed0 = attack(text0,0, finetuned_model, tokenizer, 0.8, 'id', "synonym_replacement", sim_predictor=use)
codemixed1 = attack(text1,1, finetuned_model, tokenizer, 0.8, 'id', "synonym_replacement", sim_predictor=use)
codemixed2 = attack(text2,2, finetuned_model, tokenizer, 0.8, 'id', "synonym_replacement", sim_predictor=use)

print("\nCalculating logit on synonym replaced data...")
text_logit(codemixed0, finetuned_model, tokenizer, i2w)
text_logit(codemixed1, finetuned_model, tokenizer, i2w)
text_logit(codemixed2, finetuned_model, tokenizer, i2w)

print("\nCalculating similarity score...")
print(use.semantic_sim(text0, codemixed0))
print(use.semantic_sim(text1, codemixed1))
print(use.semantic_sim(text2, codemixed2))


Attacking text using synonym replacement...

Calculating logit on synonym replaced data...
Text: pendudukan di gelombang gelombang minuman keras stadion ini cukup tokoh dengan kewibawaan manusia ikan laut gule biasa saya surat biji bijian menubuhkan stadion berisikan rendang daging pop dan perkedel porsi banyak dan berhasil | Label : positive (97.747%)
Text: meski masa usaha niaga sudah melengkapi bukan berati memenatkan pula pendudukan membesarkan menyangga kedipilihan elektabilitas | Label : neutral (59.589%)
Text: bilik nya sempit tidak ada tempat menyarangkan keberadaan malah menceritakan keberadaan by the way ini kipas2 mau diletakkan mana mana uchiwa segede ini pula | Label : negative (98.721%)

Calculating similarity score...
0.8651537
0.8801627
0.9570405
