### import libraries

In [1]:
import pandas as pd
import numpy as np
from keras.models import Model, load_model
from keras.layers import TimeDistributed, Conv1D, Dense, Embedding, Input, Dropout, LSTM, Bidirectional, MaxPooling1D, \
    Flatten, concatenate
from keras.utils import plot_model, Progbar
from keras.initializers import RandomUniform
from keras.optimizers import SGD, Nadam
import numpy as np
from keras.callbacks import Callback
from seqeval.metrics import accuracy_score,classification_report,f1_score
from copy import deepcopy
import seqeval
import tensorflow as tf
from keras import backend as K
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt

In [2]:
# Built-in functions
from withpos.preprocessing import iterate_minibatches, padding, convLabels

from withpos.preprocessing_labeled import toLabeledNER, readLabeled, toArray, createBatches, createMatrices, \
        addCharInformation, tag_dataset
from withpos.preprocessing_newdata import toArray_new,createBatches_new, createMatrices_new, \
        addCharInformation_new,tag_dataset_new, get_prediction_ner_new, prediction_ner_to_df_new

## Load POS Tagged Data, and Take Random Sample

In [3]:
# Load Data
data_main = pd.read_csv("./../2. POS TAGGING/OUTPUT/POS_CLEAN.csv",index_col=0)

# Take random sample
random_index = list(pd.read_csv("./INPUT/random index.csv", index_col=0)['index'])
sample = data_main.loc[random_index]
sample

Unnamed: 0,Link,Date Time,Title,Content,Label
3033,https://radarbromo.jawapos.com/kraksaan/30/12/...,2020-12-30 00:00:00,"Menyeberang, Tukang Becak Tewas Tertabrak Truk...","KRAKSAAN <PROPN> , <PUNCT> Radar <PROPN> Bromo...",1.0
1287,https://radarjember.jawapos.com/berita-daerah/...,2020-12-29 14:10:00,"Waduh, kok Bisa Menerobos?\n",TEROBOS <NOUN> LINTASAN <NOUN> : <PUNCT> Sebua...,1.0
3046,https://radarbromo.jawapos.com/probolinggo/29/...,2020-12-29 00:00:00,"Rem Motor Matik Blong, Sekeluarga Terjun ke Pe...","SUMBER <PROPN> , <PUNCT> Radar <PROPN> Bromo <...",1.0
510,https://jatimnow.com/baca-32404-pemotor-asal-n...,2020-12-28 16:46:36,Pemotor asal Nganjuk Tewas dalam Kecelakaan di...,jatimnow.com <PROPN> - <PUNCT> Arief <PROPN> (...,1.0
511,https://jatimnow.com/baca-32400-bocah-3-tahun-...,2020-12-28 15:18:19,Bocah 3 Tahun di Pacitan Tewas Tertabrak Truk ...,jatimnow.com <PROPN> - <PUNCT> Seorang <DET> b...,1.0
...,...,...,...,...,...
2739,https://radarmadiun.jawapos.com/dump-truk-puta...,2020-04-03 17:26:00,Dump Truk Putar Balik Dihantam Beat ÃÂÃÂ¢Ã...,"PACITAN <PROPN> , <PUNCT> Jawa <PROPN> Pos <PR...",1.0
465,https://news.detik.com/berita-jawa-timur/d-486...,2020-01-22 16:55:00,Pegawai Dishub Tewas Terlindas Truk di Jalur P...,Situbondo <PROPN> - <PUNCT> Seorang <DET> pega...,1.0
1077,https://radarbromo.jawapos.com/utama/20/03/202...,2020-03-20 02:02:12,"Sopir Ngantuk, Minibus Tabrak Median di Jalan ...","PASURUAN <PROPN> , <PUNCT> Radar <PROPN> Bromo...",1.0
1067,https://radarsurabaya.jawapos.com/read/2020/04...,2020-04-03 17:28:22,"Ngantuk, Pemotor Tabrak Truk Trailer yang Teng...",GRESIKNasib <NOUN> sial <ADJ> dialami <VERB> N...,1.0


## Labeling

Function toLabeledNER untuk membuat hasil POS Tag menjadi tabel yang bisa dilabeli dengan mudah

In [4]:
to_label = toLabeledNER(sample,'Content')
to_label

Unnamed: 0,article,sentence,word,pos
0,3033,,----------DOCSTART----------,
1,3033,0,KRAKSAAN,<PROPN>
2,3033,0,",",<PUNCT>
3,3033,0,Radar,<PROPN>
4,3033,0,Bromo,<PROPN>
...,...,...,...,...
60428,1141,3814,lancar,<VERB>
60429,1141,3814,beberapa,<DET>
60430,1141,3814,jam,<NOUN>
60431,1141,3814,kemudian,<ADV>


Load data yang telah diberi anotasi manual

## Load data

In [5]:
data_ = readLabeled(FILE_DIR = "./INPUT/TO_LABELLING_NER-LABELED.csv")
data_

Unnamed: 0,article,sentence,word,pos,ner
0,3033,,----------DOCSTART----------,,O
1,3033,0,KRAKSAAN,<PROPN>,O
2,3033,0,",",<PUNCT>,O
3,3033,0,Radar,<PROPN>,B-MISC
4,3033,0,Bromo,<PROPN>,I-MISC
...,...,...,...,...,...
60428,1141,3814,lancar,<VERB>,O
60429,1141,3814,beberapa,<DET>,O
60430,1141,3814,jam,<NOUN>,O
60431,1141,3814,kemudian,<ADV>,O


In [6]:
# Remove barrier
data = data_[pd.Series(['DOCSTART' not in str(i) for i in data_.word])]
data['pos']=data['pos'].apply(lambda x: x.replace('<','').replace('>','').replace('<','').replace('>','') if type(x)==str else None)
data.index = range(len(data))

data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pos']=data['pos'].apply(lambda x: x.replace('<','').replace('>','').replace('<','').replace('>','') if type(x)==str else None)


Unnamed: 0,article,sentence,word,pos,ner
0,3033,0,KRAKSAAN,PROPN,O
1,3033,0,",",PUNCT,O
2,3033,0,Radar,PROPN,B-MISC
3,3033,0,Bromo,PROPN,I-MISC
4,3033,0,Nahas,PROPN,O
...,...,...,...,...,...
60200,1141,3814,lancar,VERB,O
60201,1141,3814,beberapa,DET,O
60202,1141,3814,jam,NOUN,O
60203,1141,3814,kemudian,ADV,O


In [7]:
# Convert to array format
array = toArray(data)
array

[[['KRAKSAAN', 'O', 'PROPN'],
  [',', 'O', 'PUNCT'],
  ['Radar', 'B-MISC', 'PROPN'],
  ['Bromo', 'I-MISC', 'PROPN'],
  ['Nahas', 'O', 'PROPN'],
  ['menimpa', 'O', 'VERB'],
  ['Miskari', 'B-PER', 'PUNCT'],
  ['59', 'B-AGE', 'NUM'],
  [',', 'O', 'PUNCT'],
  ['Selasa', 'B-DATE', 'PROPN'],
  ['(', 'O', 'PUNCT'],
  ['29/12', 'B-DATE', 'PROPN'],
  [')', 'O', 'PUNCT'],
  ['malam', 'B-TIME', 'NOUN'],
  ['.', 'O', 'PUNCT']],
 [['Tukang', 'O', 'PROPN'],
  ['becak', 'B-VEHICLE', 'NOUN'],
  ['itu', 'O', 'DET'],
  ['ditabrak', 'O', 'VERB'],
  ['sebuah', 'O', 'DET'],
  ['truk', 'B-VEHICLE', 'NOUN'],
  ['boks', 'I-VEHICLE', 'NOUN'],
  ['saat', 'O', 'SCONJ'],
  ['hendak', 'O', 'ADV'],
  ['menyeberang', 'O', 'VERB'],
  ['di', 'O', 'ADP'],
  ['jalan', 'B-LOC', 'NOUN'],
  ['raya', 'I-LOC', 'NOUN'],
  ['Kelurahan', 'I-LOC', 'PROPN'],
  ['Patokan', 'I-LOC', 'PROPN'],
  [',', 'O', 'PUNCT'],
  ['Kraksaan', 'B-LOC', 'PROPN'],
  [',', 'O', 'PUNCT'],
  ['Kabupaten', 'B-LOC', 'PROPN'],
  ['Probolinggo', 'I-LOC',

## Set Parameter

In [12]:
EPOCHS = 50               # paper: 80
DROPOUT = 0.5             # paper: 0.68
DROPOUT_RECURRENT = 0.25  # not specified in paper, 0.25 recommended
LSTM_STATE_SIZE = 200     # paper: 275
CONV_SIZE = 3             # paper: 3
LEARNING_RATE = 0.0105    # paper 0.0105
OPTIMIZER = Nadam()       # paper uses SGD(lr=self.learning_rate), Nadam() recommended

## Final Model

In [14]:
"""Initialize class"""
class CNN_BLSTM_FINAL(object):
    
    def __init__(self, EPOCHS, DROPOUT, DROPOUT_RECURRENT, LSTM_STATE_SIZE, CONV_SIZE, LEARNING_RATE, OPTIMIZER):
        
        self.epochs = EPOCHS
        self.dropout = DROPOUT
        self.dropout_recurrent = DROPOUT_RECURRENT
        self.lstm_state_size = LSTM_STATE_SIZE
        self.conv_size = CONV_SIZE
        self.learning_rate = LEARNING_RATE
        self.optimizer = OPTIMIZER
        
    def loadData(self, train):
        #Load data and add character information
        self.trainSentences = train

    def addCharInfo(self):
        # format: [['EU', ['E', 'U'], 'B-ORG\n'], ...]
        self.trainSentences = addCharInformation(deepcopy(self.trainSentences))

    def embed(self):
        #Create word- and character-level embeddings

        labelSet = set()
        posSet=set()
        words = {}

        # read: token, char, label, pos
        for dataset in [self.trainSentences]:
            for sentence in dataset:
                for token, char, label, pos in sentence:
                    labelSet.add(label)
                    posSet.add(pos)
                    words[token.lower()] = True

        # char2Idx
        self.char2Idx = {"PADDING": 0, "UNKNOWN": 1}
        for c in " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|<>":
            self.char2Idx[c] = len(self.char2Idx)

        # label2Idx
        self.label2Idx = {}
        for label in labelSet:
            self.label2Idx[label] = len(self.label2Idx)# <- napa keren bangettt siii???????
        self.idx2Label = {v: k for k, v in self.label2Idx.items()}
        
        # pos2Idx
        self.pos2Idx = {}
        for pos in posSet:
            self.pos2Idx[pos]=len(self.pos2Idx)
        self.posEmbeddings = np.identity(len(self.pos2Idx), dtype='float32')
        #self.idx2Pos = {v: k for k,v in self.pos2Idx.items()}    
        
        # case2Idx
        case2Idx = {'numeric': 0, 'allLower': 1, 'allUpper': 2, 'initialUpper': 3, 'other': 4, 'mainly_numeric': 5,
                    'contains_digit': 6, 'PADDING_TOKEN': 7}
        self.caseEmbeddings = np.identity(len(case2Idx), dtype='float32')  # identity matrix used 

        # WordEmbeddings GLoVE
        word2Idx = {}
        self.wordEmbeddings = []
        fEmbeddings = open('vectors.txt', encoding="utf-8")
        # loop through each word in embeddings
        for line in fEmbeddings:
            split = line.strip().split(" ")
            word = split[0]  # embedding word entry
            if len(word2Idx) == 0:  # add padding+unknown
                word2Idx["PADDING_TOKEN"] = len(word2Idx)
                vector = np.zeros(len(split) - 1)  # zero vector for 'PADDING' word
                self.wordEmbeddings.append(vector)
                word2Idx["UNKNOWN_TOKEN"] = len(word2Idx)
                vector = np.random.uniform(-0.25, 0.25, len(split) - 1)
                self.wordEmbeddings.append(vector)
            if word.lower() in words:
                vector = np.array([float(num) for num in split[1:]])
                self.wordEmbeddings.append(vector)  # word embedding vector
                word2Idx[word] = len(word2Idx)  # corresponding word dict
        self.wordEmbeddings = np.array(self.wordEmbeddings)

        # format: [[wordindices], [caseindices], [padded word indices], [label indices], [pos indices]]
        # FROM PREPRO
        self.train_set = padding(createMatrices(self.trainSentences, word2Idx, self.label2Idx, case2Idx, self.char2Idx, self.pos2Idx))
        return word2Idx, case2Idx, self.char2Idx, self.pos2Idx, self.idx2Label
    
    def createBatches(self):
        #Create batches
        self.train_idx, self.train_batch, self.train_batch_len = createBatches(self.train_set)
        return [self.train_idx, self.train_batch, self.train_batch_len]
    
    def tag_dataset(self, dataset, model):
        correctLabels = []
        predLabels = []
        progbar = Progbar(len(dataset))
        #print("Tagging token..")
        for i, data in enumerate(dataset):
            tokens, casing, char, labels, pos = data
            tokens = np.asarray([tokens])
            casing = np.asarray([casing])
            char = np.asarray([char])
            pos = np.asarray([pos])
            pred = model.predict([tokens, casing, char, pos], verbose=False)[0]#disinilah dia memprediksi modelnya
            progbar.update(i+1)
            pred = pred.argmax(axis=-1)  # Predict the classes
            correctLabels.append(labels)
            predLabels.append(pred)
        #print("Done.")
        return correctLabels, predLabels
    
    def buildModel(self):
        #Model layers
        # character input
        character_input = Input(shape=(None, 52,), name="Character_input_final")
        
        # sebelum embedding_initializer #<-masuk char2Idx
        embed_char_out = TimeDistributed(
            Embedding(len(self.char2Idx), 30, embeddings_initializer=RandomUniform(minval=-0.5, maxval=0.5)), name="Character_embedding_final")(
            character_input)
        
        dropout = Dropout(self.dropout)(embed_char_out)

        # CNN <-char tadi masuk ke CNN
        conv1d_out = TimeDistributed(Conv1D(kernel_size=self.conv_size, filters=30, padding='same', activation='tanh', strides=1), name="Convolution_final")(dropout)
        maxpool_out = TimeDistributed(MaxPooling1D(52), name="Maxpool_final")(conv1d_out)
        char = TimeDistributed(Flatten(), name="Flatten_final")(maxpool_out)
        char = Dropout(self.dropout)(char)

        # word-level input #<-masuk wordEmbeddings
        words_input = Input(shape=(None,), dtype='int32', name='words_input_final')
        words = Embedding(input_dim=self.wordEmbeddings.shape[0], output_dim=self.wordEmbeddings.shape[1], weights=[self.wordEmbeddings],
                          trainable=False,embeddings_initializer=RandomUniform(minval=-0.5, maxval=0.5))(words_input)
        # case-info input #<-masuk caseEmbeddings
        casing_input = Input(shape=(None,), dtype='int32', name='casing_input_final')
        casing = Embedding(output_dim=self.caseEmbeddings.shape[1], input_dim=self.caseEmbeddings.shape[0], weights=[self.caseEmbeddings],
                           trainable=False, embeddings_initializer=RandomUniform(minval=-0.5, maxval=0.5))(casing_input)
        # pos-info input #<-masuk caseEmbeddings
        pos_input = Input(shape=(None,), dtype='int32', name='pos_input_final')
        pos = Embedding(output_dim=self.posEmbeddings.shape[1], input_dim=self.posEmbeddings.shape[0], weights=[self.posEmbeddings],
                        trainable=False,embeddings_initializer=RandomUniform(minval=-0.5, maxval=0.5))(pos_input) #<-disini weh

        # concat & BLSTM
        output = concatenate([words, casing, char, pos])
        output = Bidirectional(LSTM(self.lstm_state_size, 
                                    return_sequences=True, 
                                    dropout=self.dropout,                        # on input to each LSTM block
                                    recurrent_dropout=self.dropout_recurrent     # on recurrent input signal
                                   ), name="BLSTM_final")(output)
        output = TimeDistributed(Dense(len(self.label2Idx), activation='softmax'),name="Softmax_layer_final")(output)

        # Set up model
        self.model = Model(inputs=[words_input, casing_input, character_input, pos_input], outputs=[output])
        self.model.compile(loss='sparse_categorical_crossentropy', optimizer=self.optimizer)
        # save clear weight into variable
        self.init_weights = self.model.get_weights()

        # Plot model
        plot_model(self.model, to_file='./OUTPUT/model_plot_final.png', show_shapes=True, show_layer_names=True)
        print("Model built. Saved model_plot_final.png\n")
        
    def train(self):
        #Default training

        for epoch in range(self.epochs):    
            print("Epoch {}/{}".format(epoch+1, self.epochs))
            for i,batch in enumerate(iterate_minibatches(self.train_batch,self.train_batch_len)):
                labels,tokens,casing,char,postag = batch       
                self.model.train_on_batch([tokens,casing,char,postag], labels)
                
        # save model
        self.model.save("./OUTPUT/FINAL.h5")
        print("Final Model's weights saved.")
        
        self.model.set_weights(self.init_weights)  # clear model
        print("Model weights cleared.")
        
        del(self.model)
        K.clear_session()
        
        return "./OUTPUT/FINAL.h5"
        
    print("Class initialized.")

Class initialized.


In [15]:
TRAIN_DATA = deepcopy(array)
cnn_blstm = CNN_BLSTM_FINAL(EPOCHS, DROPOUT, DROPOUT_RECURRENT, LSTM_STATE_SIZE, CONV_SIZE, LEARNING_RATE, OPTIMIZER)
cnn_blstm.loadData(TRAIN_DATA)
cnn_blstm.addCharInfo()
word2Idx, case2Idx, char2Idx, pos2Idx, idx2Label = cnn_blstm.embed()
b = cnn_blstm.createBatches()
cnn_blstm.buildModel()
directory = cnn_blstm.train()

Model built. Saved model_plot_final.png

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Final Model's weights saved.
Model weights cleared.


## Get the Predicted Tags! >.<

In [16]:
# Load The Final Model
model = load_model(directory)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Character_input_final (InputLay [(None, None, 52)]   0                                            
__________________________________________________________________________________________________
Character_embedding_final (Time (None, None, 52, 30) 2910        Character_input_final[0][0]      
__________________________________________________________________________________________________
dropout (Dropout)               (None, None, 52, 30) 0           Character_embedding_final[0][0]  
__________________________________________________________________________________________________
Convolution_final (TimeDistribu (None, None, 52, 30) 2730        dropout[0][0]                    
______________________________________________________________________________________________

In [17]:
# Convert to Array
DATA = deepcopy(data_main)

data_ = toLabeledNER(DATA,'Content')
data_ = data_[pd.Series(['DOCSTART' not in str(i) for i in data_.word])]
data_['pos']=data_['pos'].apply(lambda x: x.replace('<','').replace('>','').replace('<','').replace('>','') if type(x)==str else None)
array_ = toArray_new(data_)
array_

[[['SIDOARJO', 'PROPN'],
  ['Sebuah', 'DET'],
  ['truk', 'NOUN'],
  ['Toyota', 'PROPN'],
  ['Dyna', 'PROPN'],
  ['nopol', 'PROPN'],
  ['L', 'PROPN'],
  ['8894', 'NUM'],
  ['US', 'PROPN'],
  ['yang', 'PRON'],
  ['melaju', 'VERB'],
  ['di', 'ADP'],
  ['Jalan', 'PROPN'],
  ['Raya', 'PROPN'],
  ['Seduri', 'PROPN'],
  ['terguling', 'PROPN'],
  [',', 'PUNCT'],
  ['Selasa', 'PROPN'],
  ['(', 'PUNCT'],
  ['29/12', 'PROPN'],
  [')', 'PUNCT'],
  ['malam', 'NOUN'],
  ['.', 'PUNCT']],
 [['Hal', 'NOUN'],
  ['itu', 'DET'],
  ['membuat', 'VERB'],
  ['sopir', 'NOUN'],
  ['harus', 'ADV'],
  ['dilarikan', 'VERB'],
  ['ke', 'ADP'],
  ['rumah', 'NOUN'],
  ['sakit', 'NOUN'],
  ['untuk', 'ADP'],
  ['menjalani', 'VERB'],
  ['perawatan', 'NOUN'],
  ['lantaran', 'SCONJ'],
  ['mengalami', 'VERB'],
  ['luka-luka', 'NOUN'],
  ['.', 'PUNCT']],
 [['Informasi', 'NOUN'],
  ['yang', 'PRON'],
  ['berhasil', 'VERB'],
  ['dihimpun', 'VERB'],
  [',', 'PUNCT'],
  ['awalnya', 'ADV'],
  ['truk', 'NOUN'],
  ['melaju', 'VERB']

In [18]:
# Transform Datasets
sentences_ = addCharInformation_new(deepcopy(array_))
set_ = padding(createMatrices_new(sentences_, word2Idx, case2Idx, char2Idx, pos2Idx))
batch_idx_, batch_, batch_len_ = createBatches_new(set_)

In [19]:
# Predict Token
pred_ = tag_dataset_new(batch_,model)

Tagging token..
Done.


In [31]:
# Get the Predictions
tag_ = get_prediction_ner_new(array_,pred_,batch_idx_,idx2Label)
prediction_ner_df_ = prediction_ner_to_df_new(tag_,data_)
out = deepcopy(data_)
out['pred_ner']=prediction_ner_df_['pred_ner']
out

Unnamed: 0,article,sentence,word,pos,pred_ner
1,3659,0,SIDOARJO,PROPN,O
2,3659,0,Sebuah,DET,O
3,3659,0,truk,NOUN,B-VEHICLE
4,3659,0,Toyota,PROPN,I-VEHICLE
5,3659,0,Dyna,PROPN,I-VEHICLE
...,...,...,...,...,...
328869,508,21006,kepala,NOUN,O
328870,508,21006,pusing,NOUN,O
328871,508,21006,kondisi,NOUN,O
328872,508,21006,sadar,ADJ,O


In [34]:
out[:60]

Unnamed: 0,article,sentence,word,pos,pred_ner
1,3659,0,SIDOARJO,PROPN,O
2,3659,0,Sebuah,DET,O
3,3659,0,truk,NOUN,B-VEHICLE
4,3659,0,Toyota,PROPN,I-VEHICLE
5,3659,0,Dyna,PROPN,I-VEHICLE
6,3659,0,nopol,PROPN,O
7,3659,0,L,PROPN,B-PLATE
8,3659,0,8894,NUM,I-PLATE
9,3659,0,US,PROPN,I-PLATE
10,3659,0,yang,PRON,O


In [35]:
out.to_csv("./OUTPUT/PREDICTED.csv")