In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
pip install sastrawi

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
pip install nlp_id

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [7]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m75.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m57.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.4


In [8]:
import re
import random
import pandas as pd
import torch
import tensorflow as tf
import numpy as np

from nlp_id.lemmatizer import Lemmatizer
from nltk.corpus import stopwords
from tqdm import tqdm
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.metrics import f1_score, cohen_kappa_score
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Dense
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from nltk.corpus import stopwords


seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def qwk_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return cohen_kappa_score(labels_flat, preds_flat)

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}

    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

def evaluate(dataloader_val, device, model):

    model.eval()

    loss_val_total = 0
    predictions, true_vals = [], []

    for batch in dataloader_val:

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():
            outputs = model(**inputs)

        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

    loss_val_avg = loss_val_total/len(dataloader_val)

    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    return loss_val_avg, predictions, true_vals

def train_eval(df_final, pretrainedmodel):
    # bin nilai (continuous variable) into intervals
    df_final['nilai'] = pd.qcut(df_final['nilai'], 5, labels=False, duplicates='drop')

    # concatenate soal and jawaban
    df_final['soal-jawaban'] = df_final['soal']+df_final['jawaban']

    # preprocessing
    # lowercasing
    df_final['soal-jawaban'] = df_final['soal-jawaban'].apply(lambda x: x.lower())
    # lemmatization
    lemmatizer = Lemmatizer()
    df_final['soal-jawaban'] = df_final['soal-jawaban'].apply(lambda x: lemmatizer.lemmatize(x))
    # stopword removal
    list_stopwords = set(stopwords.words('indonesian'))
    df_final['soal-jawaban'] = df_final['soal-jawaban'].apply(lambda x: ' '.join([item for item in x.split() if item not in list_stopwords]))
    # punctuation removal
    df_final['soal-jawaban'] = df_final['soal-jawaban'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

    # make sure that the training set and test set ratio is 80:20
    add = len(df_final[df_final['tipe'] == 'test']) - (round(0.2*(len(df_final[df_final['tipe'] == 'train'])+len(df_final[df_final['tipe'] == 'test']))))
    for i in df_final[df_final['tipe'] == 'test'].sample(n = add).itertuples():
        df_final.at[i.Index, 'tipe'] = 'train'

    # load model and tokenizer
    tokenizer = BertTokenizer.from_pretrained(pretrainedmodel, ignore_mismatched_sizes=True)

    encoded_data_train = tokenizer.batch_encode_plus(
        df_final[df_final.tipe=='train']['soal-jawaban'].values,
        add_special_tokens=True,
        return_attention_mask=True,
        pad_to_max_length=True,
        truncation=True,
        max_length=256,
        padding='max_length',
        return_tensors='pt'
    )

    encoded_data_val = tokenizer.batch_encode_plus(
        df_final[df_final.tipe=='test']['soal-jawaban'].values,
        add_special_tokens=True,
        return_attention_mask=True,
        pad_to_max_length=True,
        truncation=True,
        max_length=256,
        padding='max_length',
        return_tensors='pt'
    )

    input_ids_train = encoded_data_train['input_ids']
    attention_masks_train = encoded_data_train['attention_mask']
    labels_train = torch.tensor(df_final[df_final.tipe=='train'].nilai.values)

    input_ids_val = encoded_data_val['input_ids']
    attention_masks_val = encoded_data_val['attention_mask']
    labels_val = torch.tensor(df_final[df_final.tipe=='test'].nilai.values)

    dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
    dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

    model = BertForSequenceClassification.from_pretrained(pretrainedmodel,
                                                          num_labels=5,
                                                          output_attentions=False,
                                                          output_hidden_states=False, ignore_mismatched_sizes=True)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    batch_size = 4

    dataloader_train = DataLoader(dataset_train,
                                  sampler=RandomSampler(dataset_train),
                                  batch_size=batch_size)

    dataloader_validation = DataLoader(dataset_val,
                                       sampler=SequentialSampler(dataset_val),
                                       batch_size=batch_size)

    optimizer = torch.optim.AdamW(model.parameters(),
                      lr=2e-5,
                      eps=1e-8)

    epochs = 4

    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=len(dataloader_train)*epochs)

    for epoch in tqdm(range(1, epochs+1)):

        model.train()

        loss_train_total = 0

        progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
        for batch in progress_bar:

            model.zero_grad()

            batch = tuple(b.to(device) for b in batch)

            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'labels':         batch[2],
                     }

            outputs = model(**inputs)

            loss = outputs[0]
            loss_train_total += loss.item()
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

            progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})

        tqdm.write(f'\nEpoch {epoch}')

        loss_train_avg = loss_train_total/len(dataloader_train)
        tqdm.write(f'Training loss: {loss_train_avg}')

        val_loss, predictions, true_vals = evaluate(dataloader_validation, device, model)
        val_f1 = f1_score_func(predictions, true_vals)
        val_qwk = qwk_score_func(predictions, true_vals)
        tqdm.write(f'Validation loss: {val_loss}')
        tqdm.write(f'F1 Score (Weighted): {val_f1}')
        tqdm.write(f'QWK Score: {val_qwk}')

In [None]:
import os
import pandas as pd

path_dir = '/content/drive/MyDrive/Paper_TA_ASAG/DATASET_TA/Data/Data_Lagi/Lifestyle'
list_dir = os.listdir(path_dir)

list_pre_trained_model = ['indobenchmark/indobert-lite-base-p2']

for m in list_pre_trained_model:
    print(m)
    for idx, ele in enumerate(list_dir):
        df_raw = pd.read_excel(open(path_dir+'/'+ele, 'rb'),
                               sheet_name='Soal',
                               header=1,
                               index_col=0,
                               usecols='B:D')

        list_final = []

        for i in df_raw.itertuples():
            list_final.append(
                {
                    'soal': i[1],
                    'jawaban': i[2],
                    'nilai': 100,
                    'tipe': 'train'
                }
            )
            df_tmp = pd.read_excel(open(path_dir+'/'+ele, 'rb'),
                                        sheet_name='No.'+str(i.Index),
                                        header=1,
                                        index_col=0,
                                        usecols='B:N')
            df_tmp = df_tmp.dropna()
            for j in df_tmp.itertuples():
                list_final.append(
                    {
                        'soal': i[1],
                        'jawaban': j[2],
                        'nilai': j[12],
                        'tipe': 'test'
                    }
                )
        if idx == 0:
            df_final = pd.DataFrame(list_final)
        else:
            df_final.append(pd.DataFrame(list_final), ignore_index=True)

        print(' '.join(ele.rstrip('.xslx').split('_')))
        train_eval(df_final, m)

indobenchmark/indobert-lite-base-p2
Analisis Essay Grading Lifestyle


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/225k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.54k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'AlbertTokenizerFast'. 
The class this function is called from is 'BertTokenizer'.
You are using a model of type albert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


Downloading pytorch_model.bin:   0%|          | 0.00/46.7M [00:00<?, ?B/s]

Some weights of the model checkpoint at indobenchmark/indobert-lite-base-p2 were not used when initializing BertForSequenceClassification: ['encoder.albert_layer_groups.0.albert_layers.0.attention.dense.weight', 'encoder.albert_layer_groups.0.albert_layers.0.attention.key.weight', 'encoder.albert_layer_groups.0.albert_layers.0.ffn.weight', 'encoder.albert_layer_groups.0.albert_layers.0.attention.query.bias', 'encoder.albert_layer_groups.0.albert_layers.0.ffn.bias', 'encoder.albert_layer_groups.0.albert_layers.0.full_layer_layer_norm.bias', 'encoder.albert_layer_groups.0.albert_layers.0.attention.dense.bias', 'encoder.albert_layer_groups.0.albert_layers.0.attention.query.weight', 'encoder.albert_layer_groups.0.albert_layers.0.ffn_output.weight', 'encoder.albert_layer_groups.0.albert_layers.0.attention.key.bias', 'encoder.embedding_hidden_mapping_in.weight', 'encoder.embedding_hidden_mapping_in.bias', 'encoder.albert_layer_groups.0.albert_layers.0.attention.LayerNorm.bias', 'encoder.albe

In [None]:
!cp /content/drive/MyDrive/Paper_TA_ASAG/DATASET_TA/dataset.py
!cp /content/drive/MyDrive/Paper_TA_ASAG/DATASET_TA/dict.json

cp: missing destination file operand after '/content/drive/MyDrive/Paper_TA_ASAG/DATASET_TA/dataset.py'
Try 'cp --help' for more information.
cp: missing destination file operand after '/content/drive/MyDrive/Paper_TA_ASAG/DATASET_TA/dict.json'
Try 'cp --help' for more information.


In [28]:
import random
import itertools
import json
from google.colab import files

py_file_location = "/content/drive/MyDrive/Paper_TA_ASAG/DATASET_TA/dict.json"

with open(py_file_location, 'r') as file:
  data_json = json.load(file)

In [29]:
def swap_words(text):
    words = text.split()
    for i in range(len(words)):
        if i+1 < len(words):
            if random.random() < 0.2: # swap with 20% probability
                words[i], words[i+1] = words[i+1], words[i]
    return ' '.join(words)

In [39]:
def synonym_replacement(text, dictionary):
    new_sentences = []
    word_synonyms = []
    base_word = []
    words = text.split()
    for i in range(min(10, len(words))):
        synonym = []
        if words[i] in dictionary:
            synonyms = dictionary[words[i]]['sinonim'][0]
            synonym.append(synonyms)
            synonym.append(words[i]) 
            word_synonyms.append(synonym)
        else:
          synonym.append(words[i])
          word_synonyms.append(synonym)
    
    for j in range(5, len(words)):
      a = []
      a.append(words[j])
      word_synonyms.append(a)

    new_sentences = list(itertools.product(*word_synonyms))
    new_sentences = [' '.join(pasangan) for pasangan in new_sentences]
    return new_sentences

In [31]:
def insert_words(text, dictionary):
    words = text.split()
    n_words = len(words)
    n_insertions = int(n_words * 0.2) # 20% dari total kata akan dimasukkan
    insertion_positions = random.sample(range(n_words), n_insertions) # indeks acak untuk penyisipan kata
    new_sentences = []
    for pos in insertion_positions:
        if words[pos] in dictionary:
            synonyms = dictionary[words[pos]]['sinonim']
            if synonyms:
                new_word = random.choice(synonyms)
                new_words = words[:pos] + [new_word] + words[pos:]
                new_sentences.append(' '.join(new_words))
    return ' '.join(new_sentences)

In [32]:
def delete_word(text):
    words = text.split()
    num_words = len(words)
    if num_words < 2: # cannot delete if only one word in the sentence
        return text
    index = random.randint(0, num_words-1) # choose a random index to delete
    new_words = [word for i, word in enumerate(words) if i != index] # delete the chosen word
    return ' '.join(new_words)

### **teks tebal** Random Swap

In [33]:
import os
import pandas as pd

path_dir = '/content/drive/MyDrive/Paper_TA_ASAG/DATASET_TA/Data/Data_Lagi/Lifestyle'
list_dir = os.listdir(path_dir)

list_pre_trained_model = ['indobenchmark/indobert-lite-base-p2']

for m in list_pre_trained_model:
    print(m)
    for idx, ele in enumerate(list_dir):
        df_raw = pd.read_excel(open(path_dir+'/'+ele, 'rb'),
                               sheet_name='Soal',
                               header=1,
                               index_col=0,
                               usecols='B:D')

        list_final = []

        for i in df_raw.itertuples():
            list_final.append(
                {
                    'soal': i[1],
                    'jawaban': i[2],
                    'nilai': 100,
                    'tipe': 'train'
                }
            )
            df_tmp = pd.read_excel(open(path_dir+'/'+ele, 'rb'),
                                        sheet_name='No.'+str(i.Index),
                                        header=1,
                                        index_col=0,
                                        usecols='B:N')
            df_tmp = df_tmp.dropna()
            df_tmp.rename(columns= {'Siswa ': 'Siswa', 'Jawaban ' : 'Jawaban', 'Manual 1 ' : 'Manual 1', 'Manual 2 ' : 'Manual 2', 'Manual 3 ' : 'Manual 3', 
                        ' Manual 3' : 'Manual 3', 'Manual  3' : 'Manual 3'}, inplace = True)
            
            for x, row in df_tmp.iterrows():
                text = row['Jawaban']
                augmented_data = [swap_words(text) for _ in range(5)] # generate 5 random swaps
                for a in range(len(augmented_data)):
                    row['Jawaban'] = augmented_data[a]
                    new_row = row
                    df_tmp = df_tmp.append(new_row)

            df_tmp.reset_index(drop=True, inplace=True)

            for j in df_tmp.itertuples():
                list_final.append(
                    {
                        'soal': i[1],
                        'jawaban': j[2],
                        'nilai': j[12],
                        'tipe': 'test'
                    }
                )
        if idx == 0:
            df_final = pd.DataFrame(list_final)
        else:
            df_final.append(pd.DataFrame(list_final), ignore_index=True)

        print(' '.join(ele.rstrip('.xslx').split('_')))
        train_eval(df_final, m)

indobenchmark/indobert-lite-base-p2


[1;30;43mOutput streaming akan dipotong hingga 5000 baris terakhir.[0m
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp 

Analisis Essay Grading Lifestyle


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'AlbertTokenizerFast'. 
The class this function is called from is 'BertTokenizer'.
You are using a model of type albert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at indobenchmark/indobert-lite-base-p2 were not used when initializing BertForSequenceClassification: ['encoder.albert_layer_groups.0.albert_layers.0.ffn.weight', 'encoder.albert_layer_groups.0.albert_layers.0.attention.value.weight', 'encoder.albert_layer_groups.0.albert_layers.0.full_layer_layer_norm.weight', 'encoder.albert_layer_groups.0.albert_layers.0.attention.query.weight', 'encoder.albert_layer_groups.0.albert_layers.0.attention.query.bias', 'encoder.albert_layer_groups.0.albert_layers.0.attention.value.b


Epoch 1
Training loss: 1.3053288453232665


 25%|██▌       | 1/4 [02:38<07:56, 158.78s/it]

Validation loss: 1.096661926535835
F1 Score (Weighted): 0.49571529945419496
QWK Score: 0.40710588961740635



Epoch 2:   0%|          | 0/683 [00:00<?, ?it/s][A
Epoch 2:   0%|          | 0/683 [00:00<?, ?it/s, training_loss=0.453][A
Epoch 2:   0%|          | 1/683 [00:00<02:29,  4.55it/s, training_loss=0.453][A
Epoch 2:   0%|          | 1/683 [00:00<02:29,  4.55it/s, training_loss=0.208][A
Epoch 2:   0%|          | 2/683 [00:00<02:26,  4.64it/s, training_loss=0.208][A
Epoch 2:   0%|          | 2/683 [00:00<02:26,  4.64it/s, training_loss=0.460][A
Epoch 2:   0%|          | 3/683 [00:00<02:26,  4.63it/s, training_loss=0.460][A
Epoch 2:   0%|          | 3/683 [00:00<02:26,  4.63it/s, training_loss=0.238][A
Epoch 2:   1%|          | 4/683 [00:00<02:26,  4.64it/s, training_loss=0.238][A
Epoch 2:   1%|          | 4/683 [00:01<02:26,  4.64it/s, training_loss=0.443][A
Epoch 2:   1%|          | 5/683 [00:01<02:25,  4.66it/s, training_loss=0.443][A
Epoch 2:   1%|          | 5/683 [00:01<02:25,  4.66it/s, training_loss=0.339][A
Epoch 2:   1%|          | 6/683 [00:01<02:25,  4.64it/s, trainin


Epoch 2
Training loss: 0.8735563126699467


 50%|█████     | 2/4 [05:17<05:16, 158.46s/it]

Validation loss: 0.9086942219921545
F1 Score (Weighted): 0.6224555884255222
QWK Score: 0.5333782252546806



Epoch 3:   0%|          | 0/683 [00:00<?, ?it/s][A
Epoch 3:   0%|          | 0/683 [00:00<?, ?it/s, training_loss=0.328][A
Epoch 3:   0%|          | 1/683 [00:00<02:23,  4.74it/s, training_loss=0.328][A
Epoch 3:   0%|          | 1/683 [00:00<02:23,  4.74it/s, training_loss=0.375][A
Epoch 3:   0%|          | 2/683 [00:00<02:24,  4.70it/s, training_loss=0.375][A
Epoch 3:   0%|          | 2/683 [00:00<02:24,  4.70it/s, training_loss=0.036][A
Epoch 3:   0%|          | 3/683 [00:00<02:25,  4.67it/s, training_loss=0.036][A
Epoch 3:   0%|          | 3/683 [00:00<02:25,  4.67it/s, training_loss=0.506][A
Epoch 3:   1%|          | 4/683 [00:00<02:25,  4.67it/s, training_loss=0.506][A
Epoch 3:   1%|          | 4/683 [00:01<02:25,  4.67it/s, training_loss=0.354][A
Epoch 3:   1%|          | 5/683 [00:01<02:25,  4.64it/s, training_loss=0.354][A
Epoch 3:   1%|          | 5/683 [00:01<02:25,  4.64it/s, training_loss=0.440][A
Epoch 3:   1%|          | 6/683 [00:01<02:26,  4.62it/s, trainin


Epoch 3
Training loss: 0.4522538413682279


 75%|███████▌  | 3/4 [07:55<02:38, 158.37s/it]

Validation loss: 0.3963515521245001
F1 Score (Weighted): 0.8621204282418987
QWK Score: 0.8269507451526611



Epoch 4:   0%|          | 0/683 [00:00<?, ?it/s][A
Epoch 4:   0%|          | 0/683 [00:00<?, ?it/s, training_loss=0.004][A
Epoch 4:   0%|          | 1/683 [00:00<02:22,  4.79it/s, training_loss=0.004][A
Epoch 4:   0%|          | 1/683 [00:00<02:22,  4.79it/s, training_loss=0.126][A
Epoch 4:   0%|          | 2/683 [00:00<02:25,  4.69it/s, training_loss=0.126][A
Epoch 4:   0%|          | 2/683 [00:00<02:25,  4.69it/s, training_loss=0.131][A
Epoch 4:   0%|          | 3/683 [00:00<02:25,  4.66it/s, training_loss=0.131][A
Epoch 4:   0%|          | 3/683 [00:00<02:25,  4.66it/s, training_loss=0.001][A
Epoch 4:   1%|          | 4/683 [00:00<02:25,  4.68it/s, training_loss=0.001][A
Epoch 4:   1%|          | 4/683 [00:01<02:25,  4.68it/s, training_loss=0.002][A
Epoch 4:   1%|          | 5/683 [00:01<02:26,  4.62it/s, training_loss=0.002][A
Epoch 4:   1%|          | 5/683 [00:01<02:26,  4.62it/s, training_loss=0.187][A
Epoch 4:   1%|          | 6/683 [00:01<02:25,  4.64it/s, trainin


Epoch 4
Training loss: 0.14465526529140552


100%|██████████| 4/4 [10:33<00:00, 158.38s/it]

Validation loss: 0.20941007514388915
F1 Score (Weighted): 0.9470899098286638
QWK Score: 0.9338794900382956





In [None]:
for m in list_pre_trained_model:
    print(m)
    for idx, ele in enumerate(list_dir):
        df_raw = pd.read_excel(open(path_dir+'/'+ele, 'rb'),
                               sheet_name='Soal',
                               header=1,
                               index_col=0,
                               usecols='B:D')

        list_final = []

        for i in df_raw.itertuples():
            list_final.append(
                {
                    'soal': i[1],
                    'jawaban': i[2],
                    'nilai': 100,
                    'tipe': 'train'
                }
            )
            df_tmp = pd.read_excel(open(path_dir+'/'+ele, 'rb'),
                                        sheet_name='No.'+str(i.Index),
                                        header=1,
                                        index_col=0,
                                        usecols='B:N')
            df_tmp = df_tmp.dropna()
            df_tmp.rename(columns= {'Siswa ': 'Siswa', 'Jawaban ' : 'Jawaban', 'Manual 1 ' : 'Manual 1', 'Manual 2 ' : 'Manual 2', 'Manual 3 ' : 'Manual 3', 
                        ' Manual 3' : 'Manual 3', 'Manual  3' : 'Manual 3'}, inplace = True)
            
            for y, row in df_tmp.iterrows():
                text = row['Jawaban']
                augmented_data = synonym_replacement(text, data_json)
                for b in range(0, len(augmented_data) - 1):
                    row['Jawaban'] = augmented_data[b]
                    new_row = row
                    df_tmp = df_tmp.append(new_row)

            df_tmp.reset_index(drop=True, inplace=True)

            for j in df_tmp.itertuples():
                list_final.append(
                    {
                        'soal': i[1],
                        'jawaban': j[2],
                        'nilai': j[12],
                        'tipe': 'test'
                    }
                )
        if idx == 0:
            df_final = pd.DataFrame(list_final)
        else:
            df_final.append(pd.DataFrame(list_final), ignore_index=True)

        print(' '.join(ele.rstrip('.xslx').split('_')))
        train_eval(df_final, m)

indobenchmark/indobert-lite-base-p2


  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_tmp.append(new_row)
  df_tmp = df_

In [None]:
for m in list_pre_trained_model:
    print(m)
    for idx, ele in enumerate(list_dir):
        df_raw = pd.read_excel(open(path_dir+'/'+ele, 'rb'),
                               sheet_name='Soal',
                               header=1,
                               index_col=0,
                               usecols='B:D')

        list_final = []

        for i in df_raw.itertuples():
            list_final.append(
                {
                    'soal': i[1],
                    'jawaban': i[2],
                    'nilai': 100,
                    'tipe': 'train'
                }
            )
            df_tmp = pd.read_excel(open(path_dir+'/'+ele, 'rb'),
                                        sheet_name='No.'+str(i.Index),
                                        header=1,
                                        index_col=0,
                                        usecols='B:N')
            df_tmp = df_tmp.dropna()
            df_tmp.rename(columns= {'Siswa ': 'Siswa', 'Jawaban ' : 'Jawaban', 'Manual 1 ' : 'Manual 1', 'Manual 2 ' : 'Manual 2', 'Manual 3 ' : 'Manual 3', 
                        ' Manual 3' : 'Manual 3', 'Manual  3' : 'Manual 3'}, inplace = True)
            
            augmentasi_data = []
            for i, row in df.iterrows():
                text = row['Jawaban']
                augmented_data = [insert_words(text, data_json) for _ in range(5)] # generate 5 random insertions
                for j in range(len(augmented_data)):
                    row['Jawaban'] = augmented_data[j]
                    new_row = row
                    df = df.append(new_row)

            df_tmp.reset_index(drop=True, inplace=True)

            for j in df_tmp.itertuples():
                list_final.append(
                    {
                        'soal': i[1],
                        'jawaban': j[2],
                        'nilai': j[12],
                        'tipe': 'test'
                    }
                )
        if idx == 0:
            df_final = pd.DataFrame(list_final)
        else:
            df_final.append(pd.DataFrame(list_final), ignore_index=True)

        print(' '.join(ele.rstrip('.xslx').split('_')))
        train_eval(df_final, m)

In [None]:
for m in list_pre_trained_model:
    print(m)
    for idx, ele in enumerate(list_dir):
        df_raw = pd.read_excel(open(path_dir+'/'+ele, 'rb'),
                               sheet_name='Soal',
                               header=1,
                               index_col=0,
                               usecols='B:D')

        list_final = []

        for i in df_raw.itertuples():
            list_final.append(
                {
                    'soal': i[1],
                    'jawaban': i[2],
                    'nilai': 100,
                    'tipe': 'train'
                }
            )
            df_tmp = pd.read_excel(open(path_dir+'/'+ele, 'rb'),
                                        sheet_name='No.'+str(i.Index),
                                        header=1,
                                        index_col=0,
                                        usecols='B:N')
            df_tmp = df_tmp.dropna()
            df_tmp.rename(columns= {'Siswa ': 'Siswa', 'Jawaban ' : 'Jawaban', 'Manual 1 ' : 'Manual 1', 'Manual 2 ' : 'Manual 2', 'Manual 3 ' : 'Manual 3', 
                        ' Manual 3' : 'Manual 3', 'Manual  3' : 'Manual 3'}, inplace = True)
            
            for i, row in df.iterrows():
                text = row['Jawaban']
                augmented_data = [delete_word(text) for _ in range(5)] # generate 5 random deletions
                for j in range(len(augmented_data)):
                    siswa_id_number = siswa_id_number + 1
                    row['Siswa'] = "siswa_" + str(siswa_id_number)
                    row['Jawaban'] = augmented_data[j]
                    new_row = row
                    df = df.append(new_row)

            df_tmp.reset_index(drop=True, inplace=True)

            for j in df_tmp.itertuples():
                list_final.append(
                    {
                        'soal': i[1],
                        'jawaban': j[2],
                        'nilai': j[12],
                        'tipe': 'test'
                    }
                )
        if idx == 0:
            df_final = pd.DataFrame(list_final)
        else:
            df_final.append(pd.DataFrame(list_final), ignore_index=True)

        print(' '.join(ele.rstrip('.xslx').split('_')))
        train_eval(df_final, m)

In [None]:
def train_eval_raw(df_final, pretrainedmodel):
    # bin nilai (continuous variable) into intervals
    df_final['nilai'] = pd.qcut(df_final['nilai'], 5, labels=False, duplicates='drop')

    # concatenate soal and jawaban
    df_final['soal-jawaban'] = df_final['soal']+df_final['jawaban']

    # make sure that the training set and test set ratio is 80:20
    add = len(df_final[df_final['tipe'] == 'test']) - (round(0.2*(len(df_final[df_final['tipe'] == 'train'])+len(df_final[df_final['tipe'] == 'test']))))
    for i in df_final[df_final['tipe'] == 'test'].sample(n = add).itertuples():
        df_final.at[i.Index, 'tipe'] = 'train'

    # load model and tokenizer
    tokenizer = BertTokenizer.from_pretrained(pretrainedmodel, ignore_mismatched_sizes=True)

    encoded_data_train = tokenizer.batch_encode_plus(
        df_final[df_final.tipe=='train']['soal-jawaban'].values,
        add_special_tokens=True,
        return_attention_mask=True,
        pad_to_max_length=True,
        truncation=True,
        max_length=256,
        padding='max_length',
        return_tensors='pt'
    )

    encoded_data_val = tokenizer.batch_encode_plus(
        df_final[df_final.tipe=='test']['soal-jawaban'].values,
        add_special_tokens=True,
        return_attention_mask=True,
        pad_to_max_length=True,
        truncation=True,
        max_length=256,
        padding='max_length',
        return_tensors='pt'
    )

    input_ids_train = encoded_data_train['input_ids']
    attention_masks_train = encoded_data_train['attention_mask']
    labels_train = torch.tensor(df_final[df_final.tipe=='train'].nilai.values)

    input_ids_val = encoded_data_val['input_ids']
    attention_masks_val = encoded_data_val['attention_mask']
    labels_val = torch.tensor(df_final[df_final.tipe=='test'].nilai.values)

    dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
    dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

    model = BertForSequenceClassification.from_pretrained(pretrainedmodel,
                                                          num_labels=5,
                                                          output_attentions=False,
                                                          output_hidden_states=False, ignore_mismatched_sizes=True)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    batch_size = 4

    dataloader_train = DataLoader(dataset_train,
                                  sampler=RandomSampler(dataset_train),
                                  batch_size=batch_size)

    dataloader_validation = DataLoader(dataset_val,
                                       sampler=SequentialSampler(dataset_val),
                                       batch_size=batch_size)

    optimizer = torch.optim.AdamW(model.parameters(),
                      lr=2e-5,
                      eps=1e-8)

    epochs = 4

    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=len(dataloader_train)*epochs)

    for epoch in tqdm(range(1, epochs+1)):

        model.train()

        loss_train_total = 0

        progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
        for batch in progress_bar:

            model.zero_grad()

            batch = tuple(b.to(device) for b in batch)

            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'labels':         batch[2],
                     }

            outputs = model(**inputs)

            loss = outputs[0]
            loss_train_total += loss.item()
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

            progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})

        torch.save(model.state_dict(), f'/content/drive/MyDrive/Paper_TA_ASAG/DATASET_TA/Data/Data_Lagi/Lifestyle_Save/finetuned_BERT_raw_epoch_{epoch}.model')
        tqdm.write(f'\nEpoch {epoch}')

        loss_train_avg = loss_train_total/len(dataloader_train)
        tqdm.write(f'Training loss: {loss_train_avg}')

        val_loss, predictions, true_vals = evaluate(dataloader_validation, device, model)
        val_f1 = f1_score_func(predictions, true_vals)
        val_qwk = qwk_score_func(predictions, true_vals)
        tqdm.write(f'Validation loss: {val_loss}')
        tqdm.write(f'F1 Score (Weighted): {val_f1}')
        tqdm.write(f'QWK Score: {val_qwk}')

In [None]:
while 1:
  print(1)

[1;30;43mOutput streaming akan dipotong hingga 5000 baris terakhir.[0m
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1

In [None]:
import os
import pandas as pd

path_dir = '/content/drive/MyDrive/Paper_TA_ASAG/DATASET_TA/Data/Data_Lagi/Lifestyle'
list_dir = os.listdir(path_dir)

list_pre_trained_model = ['indobenchmark/indobert-lite-base-p2']

for m in list_pre_trained_model:
    print(m)
    for idx, ele in enumerate(list_dir):
        df_raw = pd.read_excel(open(path_dir+'/'+ele, 'rb'),
                               sheet_name='Soal',
                               header=1,
                               index_col=0,
                               usecols='B:D')

        list_final = []

        for i in df_raw.itertuples():
            list_final.append(
                {
                    'soal': i[1],
                    'jawaban': i[2],
                    'nilai': 100,
                    'tipe': 'train'
                }
            )
            df_tmp = pd.read_excel(open(path_dir+'/'+ele, 'rb'),
                                        sheet_name='No.'+str(i.Index),
                                        header=1,
                                        index_col=0,
                                        usecols='B:N')
            df_tmp = df_tmp.dropna()
            for j in df_tmp.itertuples():
                list_final.append(
                    {
                        'soal': i[1],
                        'jawaban': j[2],
                        'nilai': j[12],
                        'tipe': 'test'
                    }
                )
        if idx == 0:
            df_final = pd.DataFrame(list_final)
        else:
            df_final.append(pd.DataFrame(list_final), ignore_index=True)

        print(' '.join(ele.rstrip('.xslx').split('_')))
        train_eval_raw(df_final, m)
