In [1]:
# Mount Google Drive at /gdrive (Colab only); the ROOT data path and the
# checkpoint path used further down both live under this mount point.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [2]:
!pip install transformers
!pip install sklearn_crfsuite
!pip install seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 4.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 5.9 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 72.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 77.7 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstallin

In [3]:
import numpy as np
import pandas as pd 
import os
from tqdm import tqdm

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences

import torch
import torch.nn as nn

import transformers
from transformers import AutoTokenizer, AutoModel

from sklearn_crfsuite import metrics

In [4]:
# CONFIG
ROOT = '/gdrive/MyDrive/kuliah/Tugas Akhir/Eksperimen'
class Config : 
    """Experiment-wide settings shared by the dataset, model and training loop.

    The pretrained encoder and tokenizer are loaded once, when the class body
    is executed, and exposed as class attributes.
    """
    # Hugging Face checkpoint id used for both the encoder and the tokenizer.
    MODEL_NAME = "indobenchmark/indobert-base-p1"
    # Every example is padded / truncated to this many word pieces.
    MAX_LEN = 256
    MODEL = AutoModel.from_pretrained(MODEL_NAME)
    TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
    TRAIN_BATCH_SIZE = 32
    TEST_BATCH_SIZE = 1
    # Number of passes over the training set.
    EPOCH = 5
    # Where the best checkpoint (state_dict) is written; built from ROOT so the
    # experiment directory is only spelled out once.
    BEST_MODEL_PATH = f"{ROOT}/nested/results-model/final-model-indobert"

Downloading:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/475M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/224k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [5]:
# Label vocabulary. Besides the flat BIO tags and the nested "outer+inner" tags
# it contains the sentinels [CLS]/[SEP] and 'X', which marks non-initial word
# pieces in the raw (unpropagated) tag sequence.
tag2idx = {
  'B-PER' : 0, 
  'I-PER' : 1, 
  'O' : 2, 
  '[CLS]' : 3, 
  '[SEP]' : 4, 
  'X' : 5,
  'O+I-PER' : 6,
  'O+O' : 7,
  'I-PER+I-PER' : 8,
  'B-PER+B-PER' : 9,

}

# Inverse lookup, derived from tag2idx so the two mappings can never drift
# apart when a label is added or renumbered.
idx2tag = {idx: tag for tag, idx in tag2idx.items()}

In [6]:
class EntityDataset : 
    """Map-style dataset turning (words, word-level tags) into padded tensors.

    Each word is split into word pieces with ``config.TOKENIZER``. Two tag
    sequences are produced per example:

    * ``target_tag``: the first piece keeps the word's tag; later pieces get
      'I-PER+I-PER' when the word's tag is 'B-PER+B-PER' or 'I-PER+I-PER',
      and 'O+O' otherwise.
    * ``target_tag_raw``: the first piece keeps the word's tag; later pieces
      are marked 'X'.

    Both sequences, the input ids, the attention mask and the (all-zero)
    token type ids are wrapped in [CLS]/[SEP] and padded / truncated at the
    end to ``config.MAX_LEN``.
    """

    def __init__(self, sentences, tag, config) : 
        self.sentences = sentences
        self.tags = tag
        self.config = config

    def __len__(self) : 
        return len(self.sentences)

    def __getitem__(self, item) :
        tokenizer = self.config.TOKENIZER
        words = self.sentences[item]
        word_tags = self.tags[item]

        tokens = ['[CLS]']
        aligned_tags = ['[CLS]']
        raw_tags = ['[CLS]']

        for word, word_tag in zip(words, word_tags) :
            pieces = tokenizer.tokenize(word)
            if not pieces :
                # A word that yields no word pieces contributes nothing.
                continue

            # The tag given to the trailing pieces only depends on the word's
            # own tag, so it can be decided once per word.
            if word_tag in ('B-PER+B-PER', 'I-PER+I-PER') :
                follow_tag = 'I-PER+I-PER'
            else :
                follow_tag = 'O+O'
            n_trailing = len(pieces) - 1

            tokens.extend(pieces)
            aligned_tags.append(word_tag)
            aligned_tags.extend([follow_tag] * n_trailing)
            raw_tags.append(word_tag)
            raw_tags.extend(['X'] * n_trailing)

        tokens.append('[SEP]')
        aligned_tags.append('[SEP]')
        raw_tags.append('[SEP]')

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        n_tokens = len(input_ids)

        rows = [
            input_ids,
            [1] * n_tokens,                           # attention mask
            [0] * n_tokens,                           # token type ids
            [tag2idx[name] for name in aligned_tags],
            [tag2idx[name] for name in raw_tags],
        ]
        padded = pad_sequences(
            rows,
            maxlen=self.config.MAX_LEN,
            dtype="long",
            truncating="post",
            padding="post",
        )

        field_names = (
            "input_ids",
            "attention_mask",
            "token_type_ids",
            "target_tag",
            "target_tag_raw",
        )
        return {
            name: torch.tensor(row, dtype=torch.long)
            for name, row in zip(field_names, padded)
        }
    
class NERModel(nn.Module) : 
    """Token classifier: pretrained encoder -> dropout -> linear tag head.

    Args:
        num_tag: number of output labels.
        config: object exposing the pretrained encoder as ``config.MODEL``.
    """

    def __init__(self, num_tag, config) :
        import copy  # local import: keeps this cell self-contained

        super(NERModel, self).__init__()
        self.num_tag = num_tag
        # Work on a private copy of the encoder. Aliasing config.MODEL would
        # make every NERModel share (and fine-tune in place) the same module,
        # so a "fresh" model built after a training run would silently start
        # from the already fine-tuned weights.
        self.bert = copy.deepcopy(config.MODEL)

        self.bert_drop_1 = nn.Dropout(0.5)

        # Size the head from the encoder's own config when it has one;
        # 768 (the value originally hard-coded here) is the fallback.
        hidden_size = getattr(getattr(self.bert, "config", None), "hidden_size", 768)
        self.out_tag = nn.Linear(hidden_size, num_tag)

    def forward(self, input_ids, token_type_ids, attention_mask, target_tag) :
        """Return ``(logits, loss)`` for a batch.

        The logits come from applying the tag head to the encoder's first
        output; the loss is ``loss_fn`` evaluated with ``attention_mask`` so
        that padded positions are ignored.
        """
        output = self.bert(
            input_ids, 
            attention_mask=attention_mask, 
            token_type_ids=token_type_ids
        )

        bo_tag = self.bert_drop_1(output[0])

        tag = self.out_tag(bo_tag)

        loss = loss_fn(tag, target_tag, attention_mask, self.num_tag)

        return tag, loss
        

In [7]:
def preprocess(data) : 
    """Collapse a token-per-row frame into per-sentence word and label lists.

    Rows are grouped by the 'Sentence #' column; the 'Kata' values of each
    group become one word list and the 'Label' values one tag list.

    Returns:
        (sentences, tag): two arrays of Python lists, one entry per sentence.
    """
    by_sentence = data.groupby('Sentence #')
    sentences = by_sentence['Kata'].apply(list).values
    tag = by_sentence['Label'].apply(list).values
    return sentences, tag
def loss_fn(output, target, mask, num_labels):
    """Cross-entropy over the positions where ``mask`` equals 1.

    ``output`` is flattened to rows of ``num_labels`` logits and ``target``
    to a flat label vector. Labels at positions whose mask is not 1 are
    replaced by the criterion's ``ignore_index`` so they do not contribute.
    """
    criterion = nn.CrossEntropyLoss()

    flat_logits = output.view(-1, num_labels)
    flat_targets = target.view(-1)
    keep = mask.view(-1) == 1

    ignored = torch.tensor(criterion.ignore_index).type_as(target)
    masked_targets = torch.where(keep, flat_targets, ignored)

    return criterion(flat_logits, masked_targets)

In [8]:
def train_func(model, data, device, optimizer) :
    """Run one training epoch and return the mean loss per batch.

    Every batch dict yielded by ``data`` has its tensors moved to ``device``
    in place, then one optimizer step is taken on the loss returned by the
    model. No learning-rate scheduler is stepped.
    """
    model.train()
    running_loss = 0
    for batch in tqdm(data, total=len(data)) :
        for name in batch :
            batch[name] = batch[name].to(device)

        optimizer.zero_grad()
        _logits, batch_loss = model(
            input_ids=batch['input_ids'],
            token_type_ids=batch['token_type_ids'],
            attention_mask=batch['attention_mask'],
            target_tag=batch['target_tag'],
        )
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(data)
    

def eval_func(data, model, device):
    """Return the mean per-batch loss of ``model`` over ``data``.

    The model is put in eval mode and the whole pass runs under
    ``torch.no_grad()``: no gradients are needed here, and building the
    autograd graph for every batch only costs memory and time.
    Every batch dict has its tensors moved to ``device`` in place.
    """
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for x_input in tqdm(data, total=len(data)) :
            for key, value in x_input.items() : 
                x_input[key] = value.to(device)
            _, loss = model(
                input_ids=x_input['input_ids'],
                token_type_ids=x_input['token_type_ids'],
                attention_mask=x_input['attention_mask'],
                target_tag=x_input['target_tag']
            )
            total_loss += loss.item()
    return total_loss / len(data)

In [9]:
from ast import literal_eval

# The 'sent' and 'tag' columns are stored in the CSVs as stringified Python
# lists; literal_eval turns each cell back into a real list.
train_data = pd.read_csv(f"{ROOT}/nested/train_data.csv").assign(
    sent=lambda frame: frame['sent'].apply(literal_eval),
    tag=lambda frame: frame['tag'].apply(literal_eval),
)
test_data = pd.read_csv(f"{ROOT}/nested/test_data.csv").assign(
    sent=lambda frame: frame['sent'].apply(literal_eval),
    tag=lambda frame: frame['tag'].apply(literal_eval),
)

In [10]:
# Distinct word-level labels that occur in the training split.
train_data['tag'].explode().unique()

array(['O+O', 'B-PER+B-PER', 'I-PER+I-PER', 'O+I-PER'], dtype=object)

In [11]:
config = Config()
num_tag = len(tag2idx)

# The train/test split is precomputed: each CSV already holds one split.
train_sentence = train_data['sent'].values
test_sentence = test_data['sent'].values

train_tag = train_data['tag'].values
test_tag = test_data['tag'].values

train_dataset = EntityDataset(
    train_sentence, train_tag, config
)

test_dataset = EntityDataset(
    test_sentence, test_tag, config
)

# Shuffle the training examples every epoch so mini-batches are not tied to
# the order of the rows in the CSV; the seeded generator keeps the shuffling
# reproducible across runs.
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=config.TRAIN_BATCH_SIZE,
    shuffle=True,
    generator=torch.Generator().manual_seed(0),
    num_workers=0,
)

# Evaluation keeps the dataset order.
test_dataloader = torch.utils.data.DataLoader(
    test_dataset, batch_size=config.TEST_BATCH_SIZE, num_workers=0
)

In [12]:
# Use the GPU when one is visible, otherwise fall back to the CPU so the
# notebook still runs (slowly) on a runtime without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:

# Fine-tune the model and keep the checkpoint with the lowest held-out loss.
model = NERModel(num_tag, config)
model.to(device)

# transformers.AdamW is deprecated and was removed from later transformers
# releases; torch.optim.AdamW is the replacement. eps and weight_decay are
# set to the values transformers.AdamW used by default so the optimisation
# is unchanged.
optimizer = torch.optim.AdamW(
    model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0
)
best_loss = np.inf
train_losses = []
test_losses = []
for epoch in range(config.EPOCH) : 
    print(f"Epoch: {epoch}")
    # train 
    train_loss = train_func(
      data=train_dataloader,
      model=model,
      optimizer=optimizer,
      device=device
    )
    print(f"Train-loss: {train_loss}")
    test_loss = eval_func(
      data=test_dataloader,
      model=model,
      device=device
    )
    print(f"Test-loss: {test_loss}")
    train_losses.append(train_loss)
    test_losses.append(test_loss)
    # NOTE(review): the checkpoint is selected on the same split that is
    # later used for the final report; a separate validation split would
    # avoid an optimistic estimate.
    if test_loss < best_loss:
        best_loss = test_loss
        torch.save(model.state_dict(), config.BEST_MODEL_PATH)
        print("Saving best model")
    print()




Epoch: 0


100%|██████████| 24/24 [00:30<00:00,  1.29s/it]


Train-loss: 0.21003882284276187


100%|██████████| 191/191 [00:03<00:00, 52.99it/s]


Test-loss: 0.18622731885920626
Saving best model

Epoch: 1


100%|██████████| 24/24 [00:31<00:00,  1.31s/it]


Train-loss: 0.03828979469835758


100%|██████████| 191/191 [00:03<00:00, 51.91it/s]


Test-loss: 0.12767603829950588
Saving best model

Epoch: 2


100%|██████████| 24/24 [00:31<00:00,  1.33s/it]


Train-loss: 0.0222810652339831


100%|██████████| 191/191 [00:03<00:00, 51.50it/s]


Test-loss: 0.13592981516367741

Epoch: 3


100%|██████████| 24/24 [00:31<00:00,  1.33s/it]


Train-loss: 0.020520744932582602


100%|██████████| 191/191 [00:03<00:00, 51.35it/s]


Test-loss: 0.13937818829096818

Epoch: 4


100%|██████████| 24/24 [00:32<00:00,  1.34s/it]


Train-loss: 0.01784388303834324


100%|██████████| 191/191 [00:03<00:00, 50.24it/s]

Test-loss: 0.14037109829791083






In [13]:
# Rebuild the architecture and restore the best checkpoint written during
# training. map_location lets a checkpoint saved from GPU tensors be loaded
# on whatever `device` this session is using (including CPU-only runtimes).
model = NERModel(num_tag, config)
model.load_state_dict(torch.load(config.BEST_MODEL_PATH, map_location=device))
model.to(device)

NERModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
   

In [55]:
from ast import literal_eval

# Reload the test split, parse the stringified list columns and order the
# sentences by their 'sentence #' so predictions follow that order.
test_data = (
    pd.read_csv(f"{ROOT}/nested/test_data.csv")
    .assign(
        sent=lambda frame: frame['sent'].apply(literal_eval),
        tag=lambda frame: frame['tag'].apply(literal_eval),
    )
    .sort_values(['sentence #'])
)

test_sentence = test_data['sent'].values
test_tag = test_data['tag'].values

test_dataset = EntityDataset(test_sentence, test_tag, config)

# Default DataLoader settings: one sentence per batch, original order.
test_dataloader = torch.utils.data.DataLoader(test_dataset)

In [56]:
# Distinct word-level labels that occur in the training split.
train_data['tag'].explode().unique()

array(['O+O', 'B-PER+B-PER', 'I-PER+I-PER', 'O+I-PER'], dtype=object)

In [57]:
# Run the model over the test set and collect, for every sentence, the gold
# tags, predicted tags, word pieces and raw ('X'-marked) tags of its real
# token positions.
y_true = []
y_pred = []
y_word = []
y_raw = []

# Positions are dropped according to their GOLD label, never the prediction:
# filtering on the predicted tag would remove a real word whenever the model
# wrongly outputs a sentinel, which shifts every following token and breaks
# the one-to-one alignment with the source words.
sentinel_tags = {'[CLS]', '[SEP]', 'X'}

model.eval()
for step, batch in enumerate(test_dataloader):
    for key, val in batch.items() : 
        batch[key] = val.to(device)

    with torch.no_grad():
        outputs = model(
            input_ids=batch['input_ids'],
            token_type_ids=batch['token_type_ids'],
            attention_mask=batch['attention_mask'],
            target_tag=batch['target_tag']
        )
        # The model returns (logits, loss).
        logits = outputs[0]

    pred_ids = logits.argmax(2).cpu().numpy()
    label_ids = batch['target_tag'].to('cpu').numpy()
    input_mask = batch['attention_mask'].to('cpu').numpy()
    token_ids = batch['input_ids'].to('cpu').numpy()
    raw_ids = batch['target_tag_raw'].to('cpu').numpy()

    for i, mask in enumerate(input_mask):
        sent_true = []
        sent_pred = []
        sent_tokens = []
        sent_raw = []

        for j, m in enumerate(mask):
            if not m:
                # First padded position: nothing real follows.
                break
            gold_tag = idx2tag[label_ids[i][j]]
            if gold_tag in sentinel_tags:
                continue
            sent_true.append(gold_tag)
            sent_pred.append(idx2tag[pred_ids[i][j]])
            sent_tokens.append(config.TOKENIZER.convert_ids_to_tokens(int(token_ids[i][j])))
            sent_raw.append(idx2tag[raw_ids[i][j]])

        # Appended once per sentence (inside the per-sentence loop) so that
        # batches holding more than one sentence are collected completely.
        y_true.append(sent_true)
        y_pred.append(sent_pred)
        y_word.append(sent_tokens)
        y_raw.append(sent_raw)


In [58]:
# idx2tag = {
#     0 : 'B-PER',
#     1 : 'I-PER', 
#     2 : 'O', 
#     3 : '[CLS]',
#     4 : '[SEP]',
#     5 : 'X',
#     6 : 'O+I-PER',
#     7 : 'O+I-PER+I-PER',
#     8 : 'O+I-PER+I-PER+I-PER'
# }

In [59]:
# One row per word piece (token, gold tag, predicted tag), dumped to a CSV
# for manual inspection.
df_res = pd.DataFrame(
    {'sent': y_word, 'true': y_true, 'pred': y_pred}
).explode(['sent', 'true', 'pred'])
df_res.to_csv('test.csv', index=False)

In [60]:
def get_level_token(seqs, max_lev=2, level=0) : 
  """Project nested '+'-joined tags onto a single nesting level.

  Every tag is split on '+' and expanded to exactly ``max_lev`` entries; when
  a tag has fewer parts than ``max_lev`` its last part is repeated. The entry
  at index ``level`` is then returned for every tag, keeping the
  list-of-sequences structure of ``seqs``.
  """
  def _expand(tag) :
    parts = tag.split('+')
    last = len(parts) - 1
    return [parts[min(depth, last)] for depth in range(max_lev)]

  return [[_expand(tag)[level] for tag in seq] for seq in seqs]

# Project gold and predicted tags onto level 0, i.e. the part before the
# first '+' of each nested tag.
y_true_0 = get_level_token(y_true, max_lev=2, level=0)
y_pred_0 = get_level_token(y_pred, max_lev=2, level=0)

In [61]:
# Entity-level precision / recall / F1 (seqeval) for the level-0 tags.
from seqeval.metrics import classification_report
print(classification_report(y_true_0, y_pred_0))

              precision    recall  f1-score   support

         PER       0.76      0.80      0.78       222

   micro avg       0.76      0.80      0.78       222
   macro avg       0.76      0.80      0.78       222
weighted avg       0.76      0.80      0.78       222



In [62]:
# Same evaluation for level 1, i.e. the part after the first '+' of each
# nested tag.
y_true_1 = get_level_token(y_true, max_lev=2, level=1)
y_pred_1 = get_level_token(y_pred, max_lev=2, level=1)

# Entity-level report (seqeval) for the level-1 tags.
from seqeval.metrics import classification_report
print(classification_report(y_true_1, y_pred_1))

              precision    recall  f1-score   support

         PER       0.71      0.74      0.73       229

   micro avg       0.71      0.74      0.73       229
   macro avg       0.71      0.74      0.73       229
weighted avg       0.71      0.74      0.73       229



In [63]:
# Flatten the per-sentence tag lists into one token-level list each. A single
# comprehension is linear in the number of tokens, whereas repeatedly doing
# `acc = acc + seq` copies the accumulated list on every iteration.
y_test_token = [tag for seq in y_true for tag in seq]

y_pred_token = [tag for seq in y_pred for tag in seq]

In [64]:
# Token-level (per-label) report over the combined nested tags.
# scikit-learn's function is imported under an alias so it does not rebind
# the name `classification_report`, which the entity-level cells use for
# seqeval's function of the same name.
from sklearn.metrics import classification_report as token_classification_report
print(token_classification_report(y_test_token, y_pred_token))

              precision    recall  f1-score   support

 B-PER+B-PER       0.94      0.91      0.92       222
 I-PER+I-PER       0.96      0.85      0.91       793
     O+I-PER       0.54      0.38      0.44       114
         O+O       0.98      0.99      0.98      7174

    accuracy                           0.97      8303
   macro avg       0.85      0.78      0.81      8303
weighted avg       0.97      0.97      0.97      8303



In [65]:
# One row per sentence; every column holds that sentence's list of values.
bert_df = pd.DataFrame({
    'sent': y_word,
    'raw': y_raw,
    'tag': y_true,
    'pred': y_pred,
})

In [66]:
# One row per word piece, then keep only the pieces whose raw tag is not 'X'
# (i.e. the first piece of every word).
t = (
    bert_df
    .explode(['sent', 'raw', 'tag', 'pred'])
    .loc[lambda frame: frame['raw'] != 'X']
)
t

Unnamed: 0,sent,raw,tag,pred
0,segala,O+O,O+O,O+O
0,puji,O+O,O+O,O+O
0,bagi,O+O,O+O,O+O
0,allah,O+O,O+O,O+O
0,tuhan,O+O,O+O,O+O
...,...,...,...,...
190,bertakwa,O+O,O+O,O+O
190,agar,O+O,O+O,O+O
190,kamu,O+O,O+O,O+O
190,diberi,O+O,O+O,O+O


In [67]:
# Reload the test split once more, this time also parsing the 'pos' column,
# and order the sentences by 'sentence #'.
test_data = (
    pd.read_csv(f"{ROOT}/nested/test_data.csv")
    .assign(
        sent=lambda frame: frame['sent'].apply(literal_eval),
        tag=lambda frame: frame['tag'].apply(literal_eval),
        pos=lambda frame: frame['pos'].apply(literal_eval),
    )
    .sort_values(['sentence #'])
)

In [68]:
# One row per source word with its metadata and gold tag; the word-level
# predictions collected in `t` are attached positionally.
t2 = (
    test_data
    .explode(['sent', 'tag'])
    .loc[:, ['sentence #', 'juz', 'no_ayat', 'sent', 'tag']]
    .assign(pred=t['pred'].to_list())
)

In [71]:
# Persist the per-word gold tags and IndoBERT predictions under ROOT.
t2.to_csv(f"{ROOT}/nested/results-prediction/indobert.csv", index=False)

In [None]:
# NOTE(review): stale scratch cell (no execution count in this run). `t` as
# built earlier has the columns sent/raw/tag/pred, so t['pred_raw'] raises
# KeyError; presumably `test_data` has no 'pred'/'pred_raw' columns either —
# TODO confirm, then delete or rewrite this cell.
st = test_data.explode(['pred', 'pred_raw'])
res_indobert = t[t['pred_raw'].isin(['B-PER', 'I-PER', 'O', 'O+I-PER', 'O+I-PER+I-PER', 'O+I-PER+I-PER+I-PER'])]

In [None]:
# NOTE(review): displays `res_indobert`, which is only defined by the
# unexecuted scratch cell — stale.
res_indobert

In [None]:
# NOTE(review): depends on `res_indobert` from the unexecuted scratch cell;
# keeps only the metadata, word, gold tag and prediction columns.
cols = ['juz', 'no_ayat', 'sent', 'tag', 'pred']
res_indobert = res_indobert[cols]
res_indobert

In [None]:
# NOTE(review): rebinds `t` (previously the word-level prediction frame) and
# depends on `res_indobert` from the unexecuted scratch cell — stale.
t = test_data.explode(['sent', 'tag'])[['juz', 'no_ayat', 'sent', 'tag']]
t['pred'] = res_indobert['pred'].to_list()

In [None]:
# NOTE(review): overwrites `test_data` with the unparsed CSV (no literal_eval),
# so 'sent' presumably holds strings here and explode has nothing to expand —
# looks like leftover debugging; confirm and remove.
test_data = pd.read_csv(f"{ROOT}/nested/test_data.csv")
test_data.explode('sent')

In [70]:
# Spot-check the predictions for a single verse (juz 2, ayat 99).
t2.loc[(t2['juz'] == 2) & (t2['no_ayat'] == 99)]

Unnamed: 0,sentence #,juz,no_ayat,sent,tag,pred
0,106,2,99,Dan,O+O,O+O
0,106,2,99,sesungguhnya,O+O,O+O
0,106,2,99,Kami,O+O,O+O
0,106,2,99,telah,O+O,O+O
0,106,2,99,menurunkan,O+O,O+O
0,106,2,99,kepadamu,O+O,O+O
0,106,2,99,ayat-ayat,O+O,O+O
0,106,2,99,yang,O+O,O+O
0,106,2,99,jelas,O+O,O+O
0,106,2,99,dan,O+O,O+O
