In [1]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install PySastrawi

Collecting PySastrawi
  Downloading PySastrawi-1.2.0-py2.py3-none-any.whl (210 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 210.6/210.6 kB 4.8 MB/s eta 0:00:00
Installing collected packages: PySastrawi
Successfully installed PySastrawi-1.2.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
import string
import re
import math
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
from transformers import AutoTokenizer, AutoModel

In [4]:
# Domain-specific stop words (greetings, honorifics, institution names and
# e-mail/URL fragments) removed in addition to Sastrawi's default list.
# Capitalised variants are listed explicitly, presumably because the remover
# matches tokens case-sensitively -- TODO confirm against Sastrawi.
_EXTRA_STOPWORDS = (
    'bapak', 'pak', 'ibu', 'bu', 'selamat', 'pagi', 'siang', 'sore',
    'Bapak', 'Pak', 'Ibu', 'Bu', 'Selamat', 'Pagi', 'Siang', 'Sore',
    'assalamualaikum', 'assalamu', 'alaikum', 'wr', 'wb', 'ub', 'kepala', 'upt', 'universitas', 'brawijaya',
    # The comma after 'com' used to be missing, which silently fused it with
    # the next literal into 'comAssalamualaikum' and dropped both stop words.
    'mohon', 'maaf', 'yth', 'tik', 'sti', 'bantuannya', 'ac', 'id', 'co', 'com',
    'Assalamualaikum', 'Assalamu', 'Wr', 'Wb', 'UB', 'Ub', 'Kepala', 'UPT', 'Universitas', 'Brawijaya',
    'Mohon', 'Maaf', 'Yth', 'TIK', 'STI',
)


def preprocess(text):
    """Normalise a raw message and strip stop words.

    Every character that is not a letter, digit or underscore is replaced by
    a space, runs of whitespace are collapsed to single spaces, and the
    result is passed through a Sastrawi ``StopWordRemover`` built from
    Sastrawi's default stop words plus ``_EXTRA_STOPWORDS``.

    NOTE(review): 'com' and 'Assalamualaikum' are now actually removed. If a
    downstream model was trained on text produced by the previous (buggy)
    list, confirm that the training pipeline uses the same preprocessing.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string with stop words removed.
    """
    # One substitution over the whole string is equivalent to the former
    # per-word loop: whitespace is also a non-word character, and the
    # split/join below collapses any run of spaces.
    text = ' '.join(re.sub(r'[^\w]', ' ', text).split())

    stopwords = StopWordRemoverFactory().get_stop_words() + list(_EXTRA_STOPWORDS)
    remover = StopWordRemover(ArrayDictionary(stopwords))

    return remover.remove(text)
  

In [5]:
class NaiveCustomLSTM(nn.Module):
    """Single-step LSTM cell built from explicit per-gate parameters.

    Each gate (forget ``f``, input ``i``, candidate ``g``, output ``o``) owns
    an input weight ``W_*`` of shape (input_sz, hidden_sz), a recurrent weight
    ``U_*`` of shape (hidden_sz, hidden_sz) and a bias ``b_*`` of shape
    (hidden_sz,). ``forward`` applies exactly one time step to a 2-D input.

    Parameter names and their registration order are intentionally unchanged
    so that previously saved state dicts / pickled models keep loading.

    Args:
        input_sz: Size of the last dimension of the input.
        hidden_sz: Size of the hidden and cell states.
    """

    def __init__(self, input_sz: int, hidden_sz: int):
        super().__init__()
        self.input_size = input_sz
        self.hidden_size = hidden_sz

        # f_t forget gate
        self.W_f = nn.Parameter(torch.empty(input_sz, hidden_sz))
        self.U_f = nn.Parameter(torch.empty(hidden_sz, hidden_sz))
        self.b_f = nn.Parameter(torch.empty(hidden_sz))

        # i_t input gate
        self.W_i = nn.Parameter(torch.empty(input_sz, hidden_sz))
        self.U_i = nn.Parameter(torch.empty(hidden_sz, hidden_sz))
        self.b_i = nn.Parameter(torch.empty(hidden_sz))

        # g_t cell state updater
        self.W_g = nn.Parameter(torch.empty(input_sz, hidden_sz))
        self.U_g = nn.Parameter(torch.empty(hidden_sz, hidden_sz))
        self.b_g = nn.Parameter(torch.empty(hidden_sz))

        # o_t output gate
        self.W_o = nn.Parameter(torch.empty(input_sz, hidden_sz))
        self.U_o = nn.Parameter(torch.empty(hidden_sz, hidden_sz))
        self.b_o = nn.Parameter(torch.empty(hidden_sz))

        self.init_weights()

    def init_weights(self):
        """Fill every parameter from U(-1/sqrt(hidden_size), 1/sqrt(hidden_size))."""
        limit = 1.0 / math.sqrt(self.hidden_size)
        for weight in self.parameters():
            # nn.init works without touching ``.data`` and without recording
            # the in-place write in autograd.
            nn.init.uniform_(weight, -limit, limit)

    def forward(self, x, init_states=None):
        """Apply one LSTM step.

        Args:
            x: Tensor of shape (batch, input_size).
            init_states: Optional ``(h_0, c_0)`` tuple, each of shape
                (batch, hidden_size). Defaults to zeros matching ``x``'s
                dtype and device.

        Returns:
            ``(h_t, (h_t, c_t))`` -- the new hidden state, followed by the
            new hidden and cell states as a tuple.

        Raises:
            ValueError: If ``x`` is not 2-D.
        """
        if x.dim() != 2:
            raise ValueError(
                "expected a 2-D input of shape (batch, input_size), got shape "
                f"{tuple(x.shape)}"
            )
        bs = x.size(0)

        if init_states is None:
            # new_zeros inherits dtype *and* device from x, so the cell also
            # works after .double()/.half() instead of failing on a
            # float32-vs-other dtype mismatch in the matmuls below.
            h_t = x.new_zeros(bs, self.hidden_size)
            c_t = x.new_zeros(bs, self.hidden_size)
        else:
            h_t, c_t = init_states

        x_t = x

        f_t = torch.sigmoid(x_t @ self.W_f + h_t @ self.U_f + self.b_f)
        i_t = torch.sigmoid(x_t @ self.W_i + h_t @ self.U_i + self.b_i)
        g_t = torch.tanh(x_t @ self.W_g + h_t @ self.U_g + self.b_g)
        o_t = torch.sigmoid(x_t @ self.W_o + h_t @ self.U_o + self.b_o)
        c_t = f_t * c_t + i_t * g_t
        h_t = o_t * torch.tanh(c_t)

        return h_t, (h_t, c_t)

In [6]:
class Net(nn.Module):
    """Intent classifier: one ``NaiveCustomLSTM`` step followed by a linear layer.

    Args:
        hidden_sz: Hidden size of the LSTM cell and input width of ``fc1``.
        input_sz: Width of the input features. Defaults to 768, the value
            that was previously hard-coded.
        num_classes: Number of output logits. Defaults to 7, the value that
            was previously hard-coded.
    """

    def __init__(self, hidden_sz, input_sz=768, num_classes=7):
        super().__init__()
        # Attribute names ``lstm`` and ``fc1`` are kept so existing saved
        # models / state dicts remain loadable.
        self.lstm = NaiveCustomLSTM(input_sz, hidden_sz)
        self.fc1 = nn.Linear(hidden_sz, num_classes)

    def forward(self, x):
        """Return raw (un-normalised) class logits for ``x``.

        Args:
            x: Tensor of shape (batch, input_sz).

        Returns:
            Tensor of shape (batch, num_classes).
        """
        # Only the hidden state is needed; the (h, c) state tuple is discarded.
        hidden, _ = self.lstm(x)
        return self.fc1(hidden)

In [8]:
# --- Configuration ---------------------------------------------------------
BERT_CHECKPOINT = "bert-base-multilingual-cased"
CLASSIFIER_PATH = "/kaggle/input/lstm-intent-classifier/final_model_tuned.pth"
MAX_TOKENS = 65  # every input is padded / truncated to this many tokens

# Index i of the classifier output corresponds to labels[i].
labels = ['buat_reset_webhosting', 'gagal_login', 'masalah_vpn', 'mengajukan_email', 'nilai_it_tidak_keluar','permintaan_lisensi_office', 'ucapan_terima_kasih']

# --- Load models -----------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
model = AutoModel.from_pretrained(BERT_CHECKPOINT, output_hidden_states=True)

# SECURITY: torch.load unpickles arbitrary Python objects -- only load
# checkpoints from a trusted source. The file holds a whole pickled module, so
# the classes it references must already be defined in this kernel.
# map_location="cpu" keeps the classifier on the same device as the tokenizer
# output and BERT model (nothing in this cell is moved to a GPU) and lets a
# GPU-saved checkpoint load on a CPU-only machine.
# NOTE(review): PyTorch >= 2.6 defaults to weights_only=True, which refuses to
# unpickle full modules; pass weights_only=False there, or better, save and
# load a state_dict instead.
classifier = torch.load(CLASSIFIER_PATH, map_location="cpu")

# Inference only: make evaluation mode explicit for both networks.
model.eval()
classifier.eval()

# --- Read and encode the input --------------------------------------------
in_text = input("Masukkan teks: ")
clean_text = preprocess(in_text)
token_text = tokenizer(clean_text, return_tensors="pt", max_length=MAX_TOKENS, padding="max_length", truncation=True)

# --- Predict ---------------------------------------------------------------
# no_grad avoids building an autograd graph that is never used.
with torch.no_grad():
    out = model(**token_text)
    lhs_out = out.last_hidden_state
    # NOTE(review): this mean runs over all MAX_TOKENS positions, including
    # [PAD] tokens. Left unchanged because the classifier was presumably
    # trained on features pooled the same way -- confirm against the training
    # notebook before switching to an attention-mask-weighted mean.
    features_text = lhs_out.mean(dim=1)

    raw_output = classifier(features_text)
    model_output = F.softmax(raw_output, dim=1)
    confidence, index = torch.max(model_output, 1)

print("Intensi: " + labels[index.item()] + "\nConfidence: " + str(confidence.item()))

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Masukkan teks:  Bagaimana solusinya akun akses saya tidak berfungsi


Intensi: ucapan_terima_kasih
Confidence: 0.9785112738609314
