# NER with Bidirectional-LSTM-CNNs

The original paper can be found at https://arxiv.org/abs/1511.08308

In [None]:
try:
  # Colab only
  %tensorflow_version 2.x
except Exception:
  pass

In [None]:
!pip install konlpy

In [None]:
!pip install gensim

In [None]:
!pip install numpy==1.16.2
import numpy as np
print(np.__version__)

In [None]:
# If running on Colab: clone the project repository and switch into it.
# The later cells import local modules and open data files by relative path,
# presumably from this repository root — TODO confirm.
!git clone https://github.com/hukim1112/comment_classifier.git
import os
os.chdir('/content/comment_classifier')

In [None]:
import tensorflow as tf
from konlpy.tag import Twitter
from collections import Counter
import pandas as pd
import numpy as np
import gensim
from matplotlib import pyplot as plt
from data_utils import createBatches, iterate_minibatches
import ner
keras = tf.keras
t = Twitter()

# 1. Read the training dataset

In [None]:
file_path = 'train_entity.csv'

In [None]:
# Parse the training file into parallel lists of sentences and tag sequences.
# Each non-empty line holds one "word tag" pair separated by a single space;
# an empty line or a line starting with "-DOCSTART-" ends the current sentence.
tokenized_sentences, labels = [], []
tokenized_sentence, label = [], []
with open(file_path, encoding='euc-kr') as f:
    for line in f:
        line = line.strip()
        if not line or line.startswith("-DOCSTART-"):
            # Sentence boundary: store the sentence collected so far (if any).
            if tokenized_sentence:
                tokenized_sentences.append(tokenized_sentence)
                labels.append(label)
                tokenized_sentence, label = [], []  # reset for the next sentence
        else:
            # Raises ValueError if the line is not exactly two space-separated fields.
            word, tag = line.split(' ')
            tokenized_sentence.append(word)
            label.append(tag)

# Flush the last sentence: when the file does not end with a blank line the
# loop above never reaches a boundary for it and it would be lost.
if tokenized_sentence:
    tokenized_sentences.append(tokenized_sentence)
    labels.append(label)

In [None]:
tokenized_sentences[:20]

# 2. Get the vectorizer and fix some data errors

In [None]:
from vectorizer import BaseVectorizer
vectorizer = BaseVectorizer(t.morphs)

In [None]:
print("tokenized sentence : ", tokenized_sentences[1606], '\n',
      "labels : ", labels[1606])

In [None]:
vectorizer.tokenizer('말해줄래') # the dataset was segmented with a different morphological analysis than our tokenizer produces

In [None]:
# Align the dataset's segmentation with our tokenizer: every word that the
# tokenizer splits into several pieces is replaced, in place, by those pieces,
# and each piece inherits the original word's tag.
for ts, label in zip(tokenized_sentences, labels):
    # NOTE: `ts` is mutated while being enumerated on purpose. The inserted
    # pieces are visited by the following iterations, so a piece that the
    # tokenizer splits further is split again.
    for idx, word in enumerate(ts):
        tokenized_word = vectorizer.tokenizer(word)  # tokenize once and reuse
        if len(tokenized_word) > 1:
            # Right-hand side is evaluated first, so label[idx] is still the
            # tag of the word being replaced.
            label[idx:idx + 1] = [label[idx]] * len(tokenized_word)
            ts[idx:idx + 1] = tokenized_word

In [None]:
tokenized_sentences[1606]

In [None]:
labels[1606]

# 3. Create word vocabulary and char vocabulary

In [None]:
# Rebuild one space-joined string per sentence from its token list.
sentences = [' '.join(tokens) for tokens in tokenized_sentences]

In [None]:
vectorizer.fit(sentences) #create word vocabulary from docs

In [None]:
vectorizer.get_char2idx() 
#create dictionary for converting char into index

In [None]:
print(vectorizer.char2idx)

In [None]:
# Build the tag vocabulary in order of first appearance:
# label2idx maps tag -> index, idx2label is the inverse lookup list.
label2idx, idx2label = {}, []
for sentence_tags in labels:
    for tag in sentence_tags:
        if tag in label2idx:
            continue
        # Both containers always have the same size, so the next free index
        # is the current length of either one.
        label2idx[tag] = len(idx2label)
        idx2label.append(tag)

In [None]:
print(label2idx, idx2label)

# 4. Prepare dataset for training

In [None]:
MAX_LENGTH = 15

def padding_char_indice(char_indice, MAX_LENGTH):
    """Pad each token's character-id list to a fixed width.

    Args:
        char_indice: sequence of per-token character-id lists.
        MAX_LENGTH: target width passed to pad_sequences as ``maxlen``.

    Returns:
        The result of Keras ``pad_sequences``: rows are right-padded
        (``padding='post'``) with the id of the ``'_PAD_'`` character taken
        from ``vectorizer.char2idx``. Rows longer than ``MAX_LENGTH`` are cut
        by pad_sequences using its default ``truncating`` mode.
    """
    pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
    pad_id = vectorizer.char2idx['_PAD_']
    return pad_sequences(char_indice, maxlen=MAX_LENGTH,
                         padding='post', value=pad_id)
    

def integer_coding(tokenized_sentences, labels):
    """Encode tokenized sentences and their tags as integer indices.

    Args:
        tokenized_sentences: iterable of token lists, one per sentence.
        labels: iterable of tag lists, parallel to ``tokenized_sentences``.

    Returns:
        A list with one ``[word_indice, char_indice, label_indice]`` entry per
        sentence, where ``word_indice`` comes from ``vectorizer.vocabulary_``,
        ``char_indice`` is the padded result of ``padding_char_indice`` and
        ``label_indice`` comes from ``label2idx``.

    Tokens, characters and tags are looked up by plain indexing, so an item
    missing from its mapping raises whatever that mapping raises.
    """
    dataset = []
    for ts, label in zip(tokenized_sentences, labels):
        word_indice = [vectorizer.vocabulary_[token] for token in ts]
        char_indice = [[vectorizer.char2idx[char] for char in token]
                       for token in ts]

        # The length check has to run BEFORE padding: afterwards every row is
        # exactly MAX_LENGTH long, so a post-padding check can never fire.
        # The sentence is still kept (pad_sequences truncates the long token),
        # which matches what the dataset contained before; the message
        # ("maximum word length exceeded") just makes the truncation visible.
        for chars_of_token in char_indice:
            if len(chars_of_token) > MAX_LENGTH:
                print("최대 단어 길이 초과!")

        char_indice = padding_char_indice(char_indice, MAX_LENGTH)
        label_indice = [label2idx[tag] for tag in label]
        dataset.append([word_indice, char_indice, label_indice])
    return dataset

In [None]:
dataset = integer_coding(tokenized_sentences, labels)

In [None]:
word_indice, char_indice, label_indice = dataset[0]

In [None]:
word_indice

In [None]:
vectorizer.decode_from_list(word_indice)

In [None]:
label_indice

In [None]:
[idx2label[l] for l in label_indice]

In [None]:
char_indice

### Save vocabularies

In [None]:
# Persist the lookup tables next to the model so inference can rebuild the
# same word / character / label encodings.
import os
# Create the output directory if it is missing; no error if it already exists.
os.makedirs("./models", exist_ok=True)
# idx2label is the list of tag strings built above (index -> tag).
np.save("models/idx2Label.npy",idx2label)
# NOTE(review): vocabulary_ and char2idx are indexed like dicts elsewhere in
# this notebook; np.save stores non-array objects via pickle, so reading them
# back presumably needs np.load(..., allow_pickle=True) — confirm in the loader.
np.save("models/word2Idx.npy",vectorizer.vocabulary_)
np.save("models/char2Idx.npy",vectorizer.char2idx)

# 5. Training

In [None]:
train_batch,train_batch_len = createBatches(dataset)

In [None]:
# Sanity check: print the shape of the token array of every mini-batch.
# Batch fields get their own names so the notebook-level `labels` (the raw tag
# lists used by earlier cells) is not overwritten when this cell runs.
for batch in iterate_minibatches(train_batch, train_batch_len):
    batch_labels, batch_tokens, batch_chars = batch
    print(batch_tokens.shape)

In [None]:
from tensorflow.keras.layers import Embedding, TimeDistributed, Dropout, concatenate, Bidirectional, LSTM, Conv1D, Dense, MaxPooling1D, Flatten
from tensorflow.keras import Input, Model
from tensorflow.keras.initializers import RandomUniform
from tensorflow.keras.utils import Progbar

In [None]:
# Build the BiLSTM-CNN tagger: a word-embedding branch and a character-CNN
# branch are concatenated per token and fed to a bidirectional LSTM that
# predicts one label per token.

# Word branch: variable-length sequence of word ids -> 64-dim embeddings.
words_input = Input(shape=(None,),dtype='int32',name='words_input')
words = Embedding(input_dim=len(vectorizer.vocabulary_), output_dim=64)(words_input)
# Character branch: for every token, MAX_LENGTH character ids.
character_input=Input(shape=(None,MAX_LENGTH,),name='char_input')
# 30-dim character embeddings, uniformly initialised in [-0.5, 0.5], applied per token.
embed_char_out=TimeDistributed(Embedding(len(vectorizer.char2idx),30,embeddings_initializer=RandomUniform(minval=-0.5, maxval=0.5)), name='char_embedding')(character_input)
dropout= Dropout(0.5)(embed_char_out)
# Per-token 1D convolution over the character axis (30 filters, width 3, 'same' padding keeps the length).
conv1d_out= TimeDistributed(Conv1D(kernel_size=3, filters=30, padding='same',activation='tanh', strides=1))(dropout)
# Pool size equals the character-axis length, so each token is reduced to a single pooled step.
maxpool_out=TimeDistributed(MaxPooling1D(MAX_LENGTH))(conv1d_out)
# Flatten that single step into one character-level feature vector per token.
char = TimeDistributed(Flatten())(maxpool_out)
char = Dropout(0.5)(char)
# Join word-level and character-level features for every token.
output = concatenate([words, char])
# return_sequences=True keeps one output per token for sequence labelling.
output = Bidirectional(LSTM(200, return_sequences=True, dropout=0.50, recurrent_dropout=0.25))(output)
# Per-token softmax over all labels seen in the training data.
output = TimeDistributed(Dense(len(label2idx), activation='softmax'))(output)
model = Model(inputs=[words_input,character_input], outputs=[output])
# Sparse loss: targets are integer label ids rather than one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy', optimizer='nadam')
model.summary()

In [None]:
#tf.keras.utils.plot_model(model, 'model.png')

In [None]:
# Train for a fixed number of epochs, one train_on_batch call per mini-batch.
epochs = 20

for epoch in range(epochs):
    # Report epochs 1-based so the final banner reads "Epoch 20/20".
    print("Epoch %d/%d"%(epoch + 1, epochs))
    # NOTE(review): the bar's target assumes iterate_minibatches yields one
    # batch per entry of train_batch_len — TODO confirm in data_utils.
    progress = Progbar(len(train_batch_len))
    # Steps are counted from 1 so the bar shows the number of batches already
    # trained and reaches its final value without a separate update after the
    # loop (which would fail with NameError when no batch is yielded).
    for step, batch in enumerate(iterate_minibatches(train_batch, train_batch_len), start=1):
        # Dedicated names keep the notebook-level `labels` and the model
        # cell's `char` tensor from being overwritten by batch arrays.
        batch_labels, batch_tokens, batch_chars = batch
        loss = model.train_on_batch([batch_tokens, batch_chars], batch_labels)
        # Show the training loss next to the progress bar.
        progress.update(step, values=[("loss", loss)])
    print(' ')

In [None]:
model.save("models/model.h5")

# 6. Prediction

In [None]:
ner_parser = ner.Parser(t.morphs)
ner_parser.load_models()

In [None]:
ner_parser.predict('내일 부산 날씨는?')

In [None]:
ner_parser.predict('오늘 서울 날씨 어때?')

In [None]:
ner_parser.predict('9월 15일 수유동 날씨 궁금해')