In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import tensorflow as tf
from io import *
%matplotlib inline
from io import BytesIO
import requests
np.random.seed(0)
plt.style.use("ggplot")

## Đọc dữ liệu 

In [None]:
def load_data(path):

  data = []
  with open(path) as f:
    sentence = []
    for i, line in enumerate(f.readlines()):
      try:
        word, tag = line.split()
        sentence.append((word,tag))
      except:
        data.append(sentence)
        sentence = []
  return data 

In [None]:
path_dev = '/content/drive/MyDrive/Data Scientist/DS310.N11 - NLP/data/word/dev_word.conll' 
path_train = '/content/drive/MyDrive/Data Scientist/DS310.N11 - NLP/data/word/train_word.conll'
path_test = '/content/drive/MyDrive/Data Scientist/DS310.N11 - NLP/data/word/test_word.conll'

In [None]:
train = load_data(path_train)
test = load_data(path_test)
dev = load_data(path_dev)

## Tạo tập từ vựng

In [None]:
words=[]
for i in range(len(train)):
  for j in range(len(train[i])):
    words.append(train[i][j][0])

for i in range(len(test)):
  for j in range(len(test[i])):
    words.append(test[i][j][0])

for i in range(len(dev)):
  for j in range(len(dev[i])):
    words.append(dev[i][j][0])

words = list(set(words))
words.append("ENDPAD")

num_words = len(words)
num_words

8102

## Tạo tập nhãn 

In [None]:
tags = []
for i in range(len(train)):
  for j in range(len(train[i])):
    tags.append(train[i][j][1])

tags = list(set(tags))
num_tags = len(tags)

## Ánh xạ giữa các câu và chỉ mục 


In [None]:
word2idx = {w: i for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_len = 95
X_train = [[word2idx[w[0]] for w in s] for s in train]
X_train = pad_sequences(maxlen=max_len, sequences=X_train, padding="post", value=num_words-1)
X_dev = [[word2idx[w[0]] for w in s] for s in dev]
X_dev = pad_sequences(maxlen=max_len, sequences=X_dev, padding="post", value=num_words-1)
X_test = [[word2idx[w[0]] for w in s] for s in test]
X_test = pad_sequences(maxlen=max_len, sequences=X_test, padding="post", value=num_words-1)

y_train = [[tag2idx[w[1]] for w in s] for s in train]
y_train = pad_sequences(maxlen=max_len, sequences=y_train, padding="post", value=tag2idx["O"])
y_dev = [[tag2idx[w[1]] for w in s] for s in dev]
y_dev = pad_sequences(maxlen=max_len, sequences=y_dev, padding="post", value=tag2idx["O"])
y_test = [[tag2idx[w[1]] for w in s] for s in test]
y_test = pad_sequences(maxlen=max_len, sequences=y_test, padding="post", value=tag2idx["O"])

In [204]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, GRU
from tensorflow.keras.layers import TimeDistributed, SpatialDropout1D, Bidirectional

## Tạo Model BiGRU

In [205]:
model = Sequential() #Yêu cầu khỏi tạo mạng LSTM 1 chiều
#Input
input_word = Input(shape=(max_len,))
#Embedding
model = Embedding(input_dim=num_words, output_dim=max_len, input_length=max_len)(input_word)
#SpatialDropout1D
model = SpatialDropout1D(0.5)(model)
#GRU
model = Bidirectional(GRU(64, return_sequences=True))(model)
#LSTM
model = Bidirectional(GRU(units=100, return_sequences=True, recurrent_dropout=0.5))(model)
#TimeDistributed
out = TimeDistributed(Dense(num_tags, activation='softmax'))(model)
model = Model(input_word, out)
model.summary()


Model: "model_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_14 (InputLayer)       [(None, 95)]              0         
                                                                 
 embedding_5 (Embedding)     (None, 95, 95)            769690    
                                                                 
 spatial_dropout1d_2 (Spatia  (None, 95, 95)           0         
 lDropout1D)                                                     
                                                                 
 bidirectional_2 (Bidirectio  (None, 95, 128)          61824     
 nal)                                                            
                                                                 
 bidirectional_3 (Bidirectio  (None, 95, 200)          138000    
 nal)                                                            
                                                          

In [206]:
model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

In [207]:
import time
start_time = time.time()

history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_dev,y_dev),
    validation_split= 0.1,
    batch_size= 64, 
    epochs=8,
    verbose=1
)
print("[{}] Completed!".format(time.time() - start_time))

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
[328.43634009361267] Completed!


## Đánh giá Model

In [208]:
from sklearn.metrics import  f1_score
predict = np.argmax(model.predict(X_test), axis=-1)



In [209]:
# Đánh giá theo độ đo f1 score: micro 
f1_score(y_test.ravel(), predict.ravel(), average = 'micro')

0.9844842105263157

In [210]:
# Đánh giá theo độ đo f1 score: macro 
f1_score(y_test.ravel(), predict.ravel(), average = 'macro')

0.5964513192675934

In [211]:
# Xuất ra kết quả của F1-score và Accuracy cho từng nhãn thực thể. 
from sklearn.metrics import classification_report

a = {i: t for i, t in enumerate(tags)}
print(classification_report(y_test.ravel(), 
                            predict.ravel(),
                            labels=list(a.keys()), 
                            target_names=list(a.values())))

                       precision    recall  f1-score   support

               B-DATE       0.97      0.95      0.96      1649
B-SYMPTOM_AND_DISEASE       0.89      0.81      0.85      1136
     B-TRANSPORTATION       0.92      0.51      0.65       193
         I-PATIENT_ID       0.00      0.00      0.00        27
               I-NAME       0.00      0.00      0.00        13
     I-TRANSPORTATION       1.00      0.03      0.06        69
             B-GENDER       0.94      0.88      0.91       452
                I-AGE       0.00      0.00      0.00         6
       I-ORGANIZATION       0.83      0.82      0.83      2014
               B-NAME       0.91      0.40      0.56       318
                I-JOB       0.00      0.00      0.00       114
         B-PATIENT_ID       0.90      0.89      0.90      1988
                    O       0.99      1.00      0.99    262942
                B-AGE       0.90      0.88      0.89       573
           B-LOCATION       0.92      0.88      0.90  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Lưu Model

In [212]:
model.save('model_biGRU_bt2.h5')