In [None]:
#import IProgress
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from nlp_id.lemmatizer import Lemmatizer 
from nlp_id.postag import PosTag
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Hugging Face checkpoint that provides both the tokenizer and the NER model.
NER_CHECKPOINT = "ageng-anugrah/indobert-large-p2-finetuned-ner"

# Text-processing helpers, instantiated once at module level.
lemmatizer = Lemmatizer()
postagger = PosTag()
tokenizer = AutoTokenizer.from_pretrained(NER_CHECKPOINT)
model = AutoModelForTokenClassification.from_pretrained(NER_CHECKPOINT)

#from https://huggingface.co/ageng-anugrah/indobert-large-p2-finetuned-ner
def predict(model, tokenizer, sentence):
    """Tag each whitespace-separated word of ``sentence`` with an NER label.

    The sentence is split on whitespace, tokenized word by word, and run
    through ``model``. Only the prediction made on the first word piece of
    each word is kept, so the result has at most one label per word.

    Args:
        model: Token-classification model exposing ``num_labels``,
            ``config.id2label``, ``to(device)`` and a call returning the
            logits as its first output.
        tokenizer: Tokenizer matching ``model``; it must be able to return
            an offset mapping.
        sentence: Raw text to label.

    Returns:
        A ``zip`` iterator of ``(word, label)`` pairs. Because the input is
        truncated to 512 tokens, words beyond that limit receive no label
        and are silently dropped by ``zip``. Empty or whitespace-only input
        yields an empty iterator without invoking the model.
    """
    words = sentence.split()
    if not words:
        # Nothing to label: skip tokenization and the forward pass entirely.
        return zip(words, [])

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # NOTE(review): padding a single sentence to max_length means every call
    # pays for a 512-token forward pass; kept so the model inputs are unchanged.
    inputs = tokenizer(words,
                    is_split_into_words=True,
                    return_offsets_mapping=True,
                    return_tensors="pt",
                    padding='max_length',
                    truncation=True,
                    max_length=512)
    model.to(device)
    ids = inputs["input_ids"].to(device)
    mask = inputs["attention_mask"].to(device)

    # Inference only: do not record an autograd graph for the forward pass.
    with torch.no_grad():
        outputs = model(ids, attention_mask=mask)
    logits = outputs[0]

    # shape (batch_size * seq_len, num_labels) -> one label id per token
    active_logits = logits.view(-1, model.num_labels)
    flattened_predictions = torch.argmax(active_logits, dim=1)
    token_predictions = [model.config.id2label[i] for i in flattened_predictions.tolist()]

    # A single sentence is encoded, so index 0 selects its offset mapping.
    offsets = inputs["offset_mapping"][0].tolist()

    # Offsets are relative to each word: a first word piece starts at 0 and
    # has a non-zero end, whereas special/padding tokens map to (0, 0).
    prediction = [
        label
        for label, (start, end) in zip(token_predictions, offsets)
        if start == 0 and end != 0
    ]
    return zip(words, prediction)

# Raw verbatims are the input; the "lemmatized" frame is the one whose VERBATIM
# column gets overwritten row by row and written back to disk.
data = pd.read_csv(r"Training Data _ Raw.csv", sep=';', index_col=False)
data_l = pd.read_csv(r"Training Data _ Lemmatized.csv", sep=';', index_col=False)
# POS tags whose words survive the first filtering step.
pos_db = ["FW", "JJ", "NN", "VB", "NNP", "IN", "NEG"]
# Destination of both the periodic checkpoints and the final export.
OUTPUT_CSV = r"C:\Users\USER\Desktop\!!!\Kategori Verbatim BSQ & CE - Training Data 21-23 _ Lemmatized.csv"

for row_idx in range(len(data)):
    verbatim = data.at[row_idx, 'VERBATIM']

    # Step 1: keep only the words carrying a whitelisted POS tag.
    kept_words = [
        word
        for word, pos in postagger.get_pos_tag(verbatim)
        if pos in pos_db
    ]
    # Joining onto an empty first element yields the " w1 w2" form (leading
    # space, or "" when nothing was kept).
    filtered = " ".join(["", *kept_words])

    # Step 2: replace each place entity by the single placeholder "loc".
    masked_words = []
    for word, ner in predict(model, tokenizer, filtered):
        if ner == "I-PLACE":
            # Continuation of a place name: already covered by its "loc".
            continue
        masked_words.append("loc" if ner == "B-PLACE" else word)
    masked = " ".join(["", *masked_words])

    # Step 3: lemmatize and store the result in the output frame.
    data_l.at[row_idx, 'VERBATIM'] = lemmatizer.lemmatize(masked)

    # Progress report plus checkpoint every 100 rows (including row 0).
    if row_idx % 100 == 0:
        print(row_idx)
        data_l.to_csv(OUTPUT_CSV, sep=';', na_rep=' ', index=False)
data_l.to_csv(OUTPUT_CSV, sep=';', na_rep=' ', index=False)

In [None]:
import autokeras as ak
import numpy as np
import pandas as pd
import pickle
import random
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Load the preprocessed data set produced by the lemmatization step.
df = pd.read_csv(r"Training Data _ Lemmatized.csv", sep=';')

# Column 1 holds the verbatim text; the second-to-last column holds the label
# (the original author noted df.iloc[:, -3] as the alternative target column).
labels = df.iloc[:, -2]

# Distinct labels in order of first appearance; a label's position in this
# list is the integer class code used for training.
d = labels.unique().tolist()
code_of = {label: code for code, label in enumerate(d)}

X = np.asarray(df.iloc[:, 1].tolist())
y = np.asarray([code_of[label] for label in labels.tolist()])

# Re-create NumPy aliases that recent NumPy releases no longer provide.
# NOTE(review): presumably needed by autokeras or one of its dependencies --
# confirm before removing.
np.object = object
np.unicode = str

# Hold out 10% of the samples for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9)

#more than 4 would cause the 'text_block_1/max_tokens' error
clf = ak.TextClassifier(overwrite=True, max_trials=4)
clf.fit(X_train, y_train, epochs=15, validation_split=0.1)
print(clf.evaluate(X_test, y_test))

# Export the best model found; if saving in the "tf" format raises, fall back
# to a single .h5 file. (Alternative names noted by the author: "model_ska" /
# "model_ska.h5".)
m = clf.export_model()
try:
    m.save("model_div", save_format="tf")
except Exception:
    m.save("model_div.h5")