In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import tensorflow as tf
from io import *
%matplotlib inline
from io import BytesIO
import requests
np.random.seed(0)
plt.style.use("ggplot")

## Đọc dữ liệu 

In [4]:
def load_data(path):

  data = []
  with open(path) as f:
    sentence = []
    for i, line in enumerate(f.readlines()):
      try:
        word, tag = line.split()
        sentence.append((word,tag))
      except:
        data.append(sentence)
        sentence = []
  return data 

In [5]:
path_dev = '/content/drive/MyDrive/Data Scientist/DS310.N11 - NLP/data/word/dev_word.conll' 
path_train = '/content/drive/MyDrive/Data Scientist/DS310.N11 - NLP/data/word/train_word.conll'
path_test = '/content/drive/MyDrive/Data Scientist/DS310.N11 - NLP/data/word/test_word.conll'

In [8]:
train = load_data(path_train)
test = load_data(path_test)
dev = load_data(path_dev)

## Tạo tập từ vựng

In [47]:
words=[]
for i in range(len(train)):
  for j in range(len(train[i])):
    words.append(train[i][j][0])

for i in range(len(test)):
  for j in range(len(test[i])):
    words.append(test[i][j][0])

for i in range(len(dev)):
  for j in range(len(dev[i])):
    words.append(dev[i][j][0])

words = list(set(words))
words.append("ENDPAD")

num_words = len(words)
num_words

8102

## Tạo tập nhãn 

In [59]:
tags = []
for i in range(len(train)):
  for j in range(len(train[i])):
    tags.append(train[i][j][1])

tags = list(set(tags))
num_tags = len(tags)

## Ánh xạ giữa các câu và chỉ mục 


In [52]:
word2idx = {w: i for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}

In [56]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_len = 95
X_train = [[word2idx[w[0]] for w in s] for s in train]
X_train = pad_sequences(maxlen=max_len, sequences=X_train, padding="post", value=num_words-1)
X_dev = [[word2idx[w[0]] for w in s] for s in dev]
X_dev = pad_sequences(maxlen=max_len, sequences=X_dev, padding="post", value=num_words-1)
X_test = [[word2idx[w[0]] for w in s] for s in test]
X_test = pad_sequences(maxlen=max_len, sequences=X_test, padding="post", value=num_words-1)

y_train = [[tag2idx[w[1]] for w in s] for s in train]
y_train = pad_sequences(maxlen=max_len, sequences=y_train, padding="post", value=tag2idx["O"])
y_dev = [[tag2idx[w[1]] for w in s] for s in dev]
y_dev = pad_sequences(maxlen=max_len, sequences=y_dev, padding="post", value=tag2idx["O"])
y_test = [[tag2idx[w[1]] for w in s] for s in test]
y_test = pad_sequences(maxlen=max_len, sequences=y_test, padding="post", value=tag2idx["O"])

## Xây dựng model LSTM

In [58]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense
from tensorflow.keras.layers import TimeDistributed, SpatialDropout1D, Bidirectional

In [61]:
model = Sequential() #Yêu cầu khỏi tạo mạng LSTM 1 chiều
#Input
input_word = Input(shape=(max_len,))
#Embedding
model = Embedding(input_dim=num_words, output_dim=max_len, input_length=max_len)(input_word)
#SpatialDropout1D
model = SpatialDropout1D(0.5)(model)
#LSTM
model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.5))(model)
#TimeDistributed
out = TimeDistributed(Dense(num_tags, activation='softmax'))(model)
model = Model(input_word, out)
model.summary()


Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 95)]              0         
                                                                 
 embedding_1 (Embedding)     (None, 95, 95)            769690    
                                                                 
 spatial_dropout1d_1 (Spatia  (None, 95, 95)           0         
 lDropout1D)                                                     
                                                                 
 bidirectional_1 (Bidirectio  (None, 95, 200)          156800    
 nal)                                                            
                                                                 
 time_distributed_1 (TimeDis  (None, 95, 20)           4020      
 tributed)                                                       
                                                           

In [62]:
model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

In [63]:
import time
start_time = time.time()

history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_dev,y_dev),
    validation_split= 0.1,
    batch_size= 64, 
    epochs=8,
    verbose=1
)
print("[{}] Completed!".format(time.time() - start_time))

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
[203.28152346611023] Completed!


## Đánh giá Model

In [65]:
model.evaluate(X_test, y_test)



[0.08277864754199982, 0.9803403615951538]

Dự đoán


In [66]:
# i = np.random.randint(0, x_test.shape[0]) #659
print(X_test[1])
i =10
p = model.predict(np.array([X_test[i]]))

p = np.argmax(p, axis=-1)
y_true = y_test[i]
print("{:15}{:5}\t {}\n".format("Word", "True", "Pred"))
print("-" *30)
for w, true, pred in zip(X_test[i], y_true, p[0]):
    print("{:15}{}\t{}".format(words[w-1], tags[true], tags[pred]))

[5783 3312 6625 4190 3099 2825 4719 4397 1359 3213 5336 6625 4995 6301
 7097 4755 7750 7097 2378 5764 1194 1500 6625 6021 7513 6625 1875 1810
 6254 1681 4307 8083 8101 8101 8101 8101 8101 8101 8101 8101 8101 8101
 8101 8101 8101 8101 8101 8101 8101 8101 8101 8101 8101 8101 8101 8101
 8101 8101 8101 8101 8101 8101 8101 8101 8101 8101 8101 8101 8101 8101
 8101 8101 8101 8101 8101 8101 8101 8101 8101 8101 8101 8101 8101 8101
 8101 8101 8101 8101 8101 8101 8101 8101 8101 8101 8101]
Word           True 	 Pred

------------------------------
ngày_sinh      O	O
497            O	O
ương         O	O
149            O	O
bảo           O	O
Nhờ            B-PATIENT_ID	O
thang_máy      O	O
thực_tập       O	O
Nhờ            B-PATIENT_ID	B-PATIENT_ID
Hương_Sơn      O	O
522            O	O
bán_hoa        B-GENDER	B-GENDER
ương         O	O
N.T.P.N.       B-AGE	B-AGE
528            O	O
ương         O	O
thu_gom        O	O
618            O	O
Bảy_Huấn       O	O
chia_tay       O	O
Trương_Huyền_TrườngB-LO

In [67]:
from sklearn.metrics import  f1_score
predict = np.argmax(model.predict(X_test), axis=-1)



In [68]:
# Đánh giá theo độ đo f1 score: micro 
f1_score(y_test.ravel(), predict.ravel(), average = 'micro')

0.980340350877193

In [69]:
# Đánh giá theo độ đo f1 score: macro 
f1_score(y_test.ravel(), predict.ravel(), average = 'macro')

0.5337336045608405

In [70]:
# Xuất ra kết quả của F1-score và Accuracy cho từng nhãn thực thể. 
from sklearn.metrics import classification_report

a = {i: t for i, t in enumerate(tags)}
print(classification_report(y_test.ravel(), 
                            predict.ravel(),
                            labels=list(a.keys()), 
                            target_names=list(a.values())))

  _warn_prf(average, modifier, msg_start, len(result))


                       precision    recall  f1-score   support

               B-DATE       0.96      0.94      0.95      1649
B-SYMPTOM_AND_DISEASE       0.88      0.78      0.83      1136
     B-TRANSPORTATION       0.97      0.32      0.48       193
         I-PATIENT_ID       0.00      0.00      0.00        27
               I-NAME       0.00      0.00      0.00        13
     I-TRANSPORTATION       0.00      0.00      0.00        69
             B-GENDER       0.97      0.73      0.84       452
                I-AGE       0.00      0.00      0.00         6
       I-ORGANIZATION       0.84      0.74      0.79      2014
               B-NAME       0.67      0.01      0.02       318
                I-JOB       0.00      0.00      0.00       114
         B-PATIENT_ID       0.82      0.81      0.81      1988
                    O       0.99      1.00      0.99    262942
                B-AGE       0.94      0.61      0.74       573
           B-LOCATION       0.88      0.86      0.87  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Lưu Model 

In [71]:
model.save('model_LSTM_bt2.h5')