In [1]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModelForSequenceClassification
from torch import nn
from torch.nn.utils.rnn import pad_sequence, pad_packed_sequence, pack_padded_sequence
from NERModel import NERMOEE

import pandas as pd
import numpy as np
import urllib.request
import re
import torch
import json
import fasttext
import torch.nn.functional as F

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Probe for a CUDA-capable device first ...
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
# ... then unconditionally fall back to the CPU, so the probe above has no effect.
# NOTE(review): looks like a deliberate (or debugging) override — confirm before removing.
device = 'cpu'

In [3]:
def preprocessing():
    """Load and prepare the source-domain and target-domain datasets.

    Returns:
        tuple: ``(sd_dataset, td_dataset)`` where ``sd_dataset`` is whatever
        ``preprocessNergrit`` returns for the Nergrit corpus and ``td_dataset``
        is the helpdesk frame. The helpdesk frame is passed through
        ``preprocessHelpdesk`` only when it has no 'ner_tags' column yet.
    """
    source_raw = nergritImport()
    target_raw = helpdeskUBImport()

    source_prepared = preprocessNergrit(source_raw)

    # A frame that already carries 'ner_tags' is returned untouched.
    already_tagged = 'ner_tags' in target_raw.columns
    target_prepared = target_raw if already_tagged else preprocessHelpdesk(target_raw)

    return source_prepared, target_prepared

In [4]:
def nergritImport():
    """Load the Nergrit NER corpus from the Hugging Face hub as one DataFrame.

    The 'train', 'test' and 'validation' splits are concatenated in that
    order; the original split boundaries are not kept.

    Returns:
        pd.DataFrame: columns 'tokens' and 'ner_tags', one row per example.
    """
    # Import the Nergrit NER dataset from Hugging Face.
    corpus = load_dataset("id_nergrit_corpus", 'ner')

    tokens = []
    ner_tags = []

    # Merge the train, test and validation data. The 'id' column is not
    # collected because it is not part of the returned frame.
    for split_name in ('train', 'test', 'validation'):
        split = corpus[split_name]
        tokens.extend(split['tokens'])
        ner_tags.extend(split['ner_tags'])

    # Convert the merged columns to a DataFrame.
    return pd.DataFrame({"tokens": tokens, "ner_tags": ner_tags})

In [5]:
def helpdeskUBImport():
    """Load the helpdesk (target-domain) dataset, preferring the local cache.

    If 'target_domain_preprocessed.json' can be read and converted to a
    DataFrame, that frame is returned. Otherwise the raw CSV export is
    downloaded to 'test.csv' and only its 'body' column is loaded.

    Returns:
        pd.DataFrame: the cached frame, or a frame with a single 'body' column.

    Raises:
        Whatever ``urllib.request.urlretrieve`` / ``pd.read_csv`` raise when the
        fallback download or parse fails.
    """
    try:
        with open('target_domain_preprocessed.json', 'r') as file:
            cached = json.load(file)

        dataset = pd.DataFrame(cached)

    except (OSError, ValueError):
        # OSError: the cache file is missing or unreadable.
        # ValueError: invalid JSON (json.JSONDecodeError is a subclass) or
        # content pandas cannot build a frame from.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.

        # Download the helpdesk TIK UB dataset.
        url = 'https://docs.google.com/spreadsheets/d/1PzUlTZwY6IySZ7VNotDIH3h9hnuRIuy40CgAS7BcFkE/export?gid=1874021283&format=csv'
        output_file = 'test.csv'

        urllib.request.urlretrieve(url, output_file)

        # Read the downloaded CSV and keep only the message body column.
        dataset = pd.read_csv(output_file, usecols=['body'])

    return dataset

In [6]:
def preprocessNergrit(sd_dataset:pd.DataFrame):
    """De-duplicate the source-domain data and split it into train and test parts.

    Args:
        sd_dataset: frame with 'tokens' and 'ner_tags' columns holding sequences.

    Returns:
        dict: ``{"train_dataset": DataFrame, "test_dataset": DataFrame}`` from an
        80/20 split with ``random_state=11``; both frames have a fresh index.
    """
    # Sequences are turned into tuples so that rows are hashable and
    # drop_duplicates can compare them.
    hashable = pd.DataFrame({
        'tokens': sd_dataset['tokens'].apply(tuple),
        'ner_tags': sd_dataset['ner_tags'].apply(tuple),
    })
    unique_rows = hashable.drop_duplicates().reset_index(drop=True)

    # Split the de-duplicated rows.
    train_part, test_part = train_test_split(unique_rows, test_size=0.2, random_state=11)

    # Bundle both parts in a dictionary.
    return {
        "train_dataset": train_part.reset_index(drop=True),
        "test_dataset": test_part.reset_index(drop=True),
    }
    
def remove_blockquote_tag(sentence):
    """Strip every ``<blockquote>...</blockquote>`` section, content included.

    Matching is non-greedy and spans newlines. ``None`` is passed through.
    """
    if sentence is None:
        return None
    blockquote_pattern = re.compile(r'<blockquote>.*?</blockquote>', re.DOTALL)
    return blockquote_pattern.sub('', sentence)

def remove_html_tag(sentence):
    """Delete every ``<...>`` tag from the text, keeping the text between tags.

    ``None`` is passed through.
    """
    if sentence is None:
        return None
    tag_pattern = re.compile(r'<[^>]+>')
    return tag_pattern.sub('', sentence)

In [7]:
def preprocessHelpdesk(td_dataset:pd.DataFrame):
    """Clean the raw helpdesk messages and label them with a pretrained NER model.

    Steps: drop duplicate bodies, drop messages with more than 200
    whitespace-separated words, strip ``<blockquote>`` sections and remaining
    HTML tags, tokenize, predict one tag id per token, and cache the result in
    'target_domain_preprocessed.json'.

    Args:
        td_dataset: frame with a 'body' column of message strings.

    Returns:
        pd.DataFrame: columns 'tokens' (tokenizer tokens per message) and
        'ner_tags' (predicted tag ids per message).
    """
    tokenizer = AutoTokenizer.from_pretrained("bryanahusna/my-nergrit-model")
    model = AutoModelForTokenClassification.from_pretrained("bryanahusna/my-nergrit-model").to(device)

    # Drop duplicated messages.
    dataset = pd.DataFrame()
    dataset['body'] = td_dataset['body']
    dataset = dataset.drop_duplicates(subset='body')
    dataset = dataset.reset_index(drop=True)

    # Drop messages with more than 200 words. The count is taken on the raw
    # body, i.e. before any HTML is stripped below.
    dataset['word_count'] = dataset['body'].apply(count_words)
    dataset = dataset[dataset['word_count'] <= 200]
    dataset = dataset.drop(columns=['word_count'])

    # Remove <blockquote> elements together with their content.
    dataset['body'] = dataset['body'].apply(remove_blockquote_tag)

    # Remove the remaining HTML tags.
    dataset['body'] = dataset['body'].apply(remove_html_tag)

    dataset_arr = np.array(dataset['body']).tolist()

    # Encode all messages at once, padded to a common length.
    encoded = tokenizer(dataset_arr, padding=True, return_tensors='pt').to(device)

    # Token strings per message, produced separately from the encoded ids.
    tokens = [tokenizer.tokenize(text) for text in dataset_arr]

    # Predict in batches; both loaders iterate in order, so zip keeps ids and
    # masks aligned.
    batch_size = 16
    id_batches = torch.utils.data.DataLoader(encoded['input_ids'], batch_size=batch_size)
    mask_batches = torch.utils.data.DataLoader(encoded['attention_mask'], batch_size=batch_size)

    ner_tags = []

    # Inference only: disable autograd so no graph is kept for each batch.
    with torch.no_grad():
        for t, a in zip(id_batches, mask_batches):
            logits = model(t, a).logits

            predicted = torch.argmax(logits, dim=2)
            # Mark padded positions with -1 so they can be filtered out.
            predicted = predicted.masked_fill(a==0, -1)
            predicted_list = predicted.tolist()

            result_list = [[value for value in sublist if value != -1] for sublist in predicted_list]
            # Drop the first and last remaining position of every sequence —
            # presumably the tokenizer's special start/end tokens, which
            # tokenizer.tokenize() does not emit; TODO confirm for this model.
            result_list = [sublist[1:-1] for sublist in result_list]

            ner_tags.extend(result_list)

    result_dict = {"tokens":tokens, "ner_tags":ner_tags}
    result = pd.DataFrame(result_dict)

    # Cache the labelled data so later runs can load it instead of re-predicting.
    with open('target_domain_preprocessed.json', 'w') as file:
        json.dump(result_dict, file,)

    return result
    
def count_words(text):
    """Return the number of whitespace-separated words in ``text``."""
    words = text.split()
    return len(words)

In [8]:
# Build both datasets: the source-domain train/test dictionary and the target-domain frame.
sd_dataset, td_dataset = preprocessing()

In [9]:
# Display the target-domain frame.
td_dataset

Unnamed: 0,tokens,ner_tags
0,"[sudah, dicoba, dan, tet, ##ep, gak, bisa, mba...","[38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 6, 25..."
1,"[bisa, di, screen, ##sh, ##ot, tampilan, untuk...","[38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38]"
2,"[y, ##th, ., bapak, /, ibu, ##nam, ##a, saya, ...","[38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 12, 3..."
3,"[y, ##th, ., kepala, sti, ub, mohon, bantuan, ...","[38, 38, 38, 11, 30, 30, 38, 38, 38, 38, 38, 3..."
4,"[terima, kasih, atas, informasi, yang, anda, b...","[38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 3..."
...,...,...
9250,"[jadi, awalnya, akun, gapura, saya, ke, log, o...","[38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 3..."
9251,"[nama, :, ilham, ramadan, gunawan, nim, :, 175...","[38, 38, 12, 31, 31, 38, 38, 17, 36, 36, 36, 3..."
9252,"[pada, saat, login, email, ub, ,, kenapa, ada,...","[38, 38, 38, 38, 11, 38, 38, 38, 38, 38, 38, 3..."
9253,"[mohon, bantuan, res, ##et, password, akun, ba...","[38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 1..."


In [12]:
# (rows, columns) of the target-domain frame.
td_dataset.shape

(9255, 2)

In [10]:
# (rows, columns) of the source-domain test split.
sd_dataset['test_dataset'].shape

(3366, 2)

In [11]:
# (rows, columns) of the source-domain train split.
sd_dataset['train_dataset'].shape

(13462, 2)

In [None]:
def modelConfig():
    """Load the model configuration from 'config.json', prompting for missing values.

    The keys 'dropout_rate', 'embedding_size', 'hidden_state' and 'tag' are
    checked in that order. Each key that is absent or ``None`` is asked for on
    standard input and the file is rewritten right after every answer. If the
    file is missing, unreadable, not valid JSON or not a JSON object, the
    configuration starts empty and all four values are asked for.

    Returns:
        dict: the configuration. Values entered interactively are stored as the
        raw strings returned by ``input()``; no numeric conversion is done.
    """
    required_keys = ('dropout_rate', 'embedding_size', 'hidden_state', 'tag')

    try:
        with open('config.json', 'r') as file:
            config_dict = json.load(file)
    except (OSError, ValueError):
        # OSError: file missing/unreadable; ValueError: invalid JSON
        # (json.JSONDecodeError is a subclass).
        config_dict = {}

    # Anything other than a JSON object cannot hold the settings.
    if not isinstance(config_dict, dict):
        config_dict = {}

    for key in required_keys:
        if config_dict.get(key) is not None:
            continue

        print(f'input {key}:')
        config_dict[key] = input()

        # Persist after every answer so an interrupted session keeps what was
        # already entered.
        with open('config.json', 'w') as file:
            json.dump(config_dict, file)

    return config_dict

In [None]:
def getModel():
    """Read the model configuration and build an ``NERMOEE`` instance.

    NOTE(review): in the lines visible here no branch returns a model — the
    early exit returns ``None`` and the newly built instance is only bound to
    a local name. Confirm whether the function is unfinished.
    """
    config = modelConfig()
    # A non-None 'model' entry skips construction; only a message is printed.
    if 'model' in config and config['model'] is not None:
        print('return model')
        return
        
    # Constructor arguments are hard-coded and `config` is not consulted here.
    # NOTE(review): presumably these should come from the configuration — confirm.
    model = NERMOEE(256, 512,)
    print('create model')