In [1]:
import os
import sys
import numpy as np
import pandas as pd
from tqdm import tqdm
import json

In [2]:
def proprocessing(source_path):
    """Parse the raw annotated corpus into one dict per article.

    The file is read as UTF-8 (a leading byte-order mark is dropped), the
    full-width question mark and the ellipsis character are normalised to
    ``?`` and ``.``, and the whole text is lower-cased. Records are separated
    by a line of twenty dashes surrounded by blank lines; the text after the
    last separator is discarded.

    Inside a record, line 0 is the article text, line 1 is a column header
    (ignored) and every following line is a tab-separated annotation whose
    first three fields are converted to ``int``.

    Args:
        source_path: path of the raw corpus file.

    Returns:
        A list of dicts with keys ``"article"`` (str), ``"id"`` (first field
        of the first annotation, or ``None`` when the article has no
        annotations) and ``"items"`` (list of annotation rows).
    """
    separator = '\n\n--------------------\n\n'
    all_datas = []

    # 'utf-8-sig' decodes plain UTF-8 and silently strips a BOM if one is present.
    with open(source_path, 'r', encoding='utf-8-sig') as f:
        articles = f.read()

    normalized = articles.replace('？', '?').replace("…", '.').lower()

    # Every record is terminated by the separator, so the last chunk is dropped.
    records = normalized.split(separator)[:-1]

    for record in records:
        lines = record.split('\n')

        items = []
        for line in lines[2:]:  # start from 2: skip article text and column headers
            if not line.strip():
                # A blank row carries no annotation; int('') would raise on it.
                continue
            item = line.split('\t')
            item[:3] = [int(t) for t in item[:3]]
            items.append(item)

        all_datas.append({
            "article": lines[0],
            # Articles without any annotation keep the placeholder id.
            "id": items[0][0] if items else None,
            "items": items,
        })

    print(f"Theres {len(all_datas)} articles")

    return all_datas

# 1. Fasttext + CRF

In [3]:
# Raw annotated corpus that is parsed by `proprocessing`.
SOURCE_TXT = "./dataset/SampleData_deid.txt"
# Character-level "<char> <label>" file written by `save_data` and read back by `Dataset`.
TARGET_PATH = "./dataset/crf_data.txt"

In [4]:
def save_data(target_path , all_datas):
    """Export articles as character-level BIO lines.

    Each character of an article is written as ``"<char> <label>"`` on its own
    line, and a blank line terminates every article. Characters outside any
    annotation are labelled ``O``; the first character of an annotated span is
    ``B-<type>`` and the remaining ones ``I-<type>``.

    Args:
        target_path: output file; an existing file is overwritten.
        all_datas: list of dicts with an ``"article"`` string and an
            ``"items"`` list whose rows hold the span start at index 1, the
            exclusive span end at index 2 and the entity type as last field.
    """
    # Mode 'w' already truncates an existing file. The encoding is explicit
    # because `Dataset` reads this file back as utf-8, while the platform
    # default encoding may be unable to represent the article text.
    with open(target_path, 'w', encoding='utf-8') as f:
        for data in all_datas:
            article_words = list(data["article"])
            labels = ['O'] * len(article_words)
            for item in data["items"]:
                start, end, entity_type = item[1], item[2], item[-1]
                labels[start:end] = [f"I-{entity_type}"] * (end - start)
                # The first character of the span is re-tagged as the entity beginning.
                labels[start] = 'B-' + entity_type
            f.writelines(f"{w} {l}\n" for w, l in zip(article_words, labels))
            f.write("\n")

In [5]:
# Parse the raw corpus, then export it in the "<char> <label>" format that `Dataset` reads later.
all_datas = proprocessing(SOURCE_TXT)
save_data(TARGET_PATH,all_datas)
# Display one parsed article as a sanity check.
all_datas[5]

Theres 26 articles


{'article': '醫師：阿嬤回去狀況怎麽樣?家屬：這個就是她出院的時候有開軟便藥，drn藥。啊，我們出院當天開始配睡前，兩顆。醫師：兩顆。家屬：那，那個住院後有，還是有消化差的情況，目前是牛奶加水，100ml每餐。醫師：100，牛奶加水，100ml每餐。家屬：對。醫師：那加起來的話會到幾ml?家屬：就牛奶加水，就100ml，一天7餐。醫師：7餐，那這樣一天才700哦。變，變差。家屬：那可是還，還是會有50ml的會消化不良。醫師：真的哦?家屬：對。那，目前使用灌食袋這個部分的情況就請醫師那個，評估一下。醫師：好。家屬：第三個注明她有吃這類wakamoto。醫師：wakamoto哦?家屬：欸9顆，3餐飯後使用。那這個可以并服?醫師：可以哦，可以并服哦。家屬：可哦。好，那接下來作業期間她有追蹤尿液的細菌培養說有桿菌，那請問醫師這個部分需要做接觸性的隔離嗎?醫師：啊，不用，就一般性的就好了。家屬：一般性。醫師：你要處理她的尿布啊。你就掛手套處理啦。家屬：就一般性隔離。醫師：就可以了。這個在醫院我們也沒有在隔離啦。家屬：好。接觸尿布的時候帶手套?醫師：就是接觸體液，比方説，她的大小便啊，還是抽痰的時候，照顧的人還是戴一下手套這樣子。就是一般的，就是一般標準就可以了。家屬：好。那出院之後，沒有發燒，生命徵象也穩定。醫師：也穩定。家屬：也沒有抽痰。醫師：好。家屬：對。那剛來門診之前，她有做那個抽血。醫師：抽血的檢查。家屬：對。醫師：哦，報告還沒有出來這樣子。家屬：對，應該不會那麽快吧?醫師：對，不會那麽快。家屬：我抽完這樣，就直接過來這裏了。醫師：哦。家屬：那也只能下次再看看。醫師：在看看囉。我幫她聽一下。她消化的部分本來，本來在醫院還可以勒，後來反而變，在出院之後比較差了一點。家屬：對，那這樣子變成說一餐100ml還有50ml的那個未消化就很差啦。而且我們也是用灌食帶這個速度。醫師：你們也是用灌食袋這樣?家屬：對，目前是使用灌食袋，速度上也應該不至於說太快這樣。醫師：太快這樣。家屬：對。醫師：我們也有再開那個促進腸胃蠕動的。家屬：好。醫師：好像，有沒有脹氣。家屬：有脹氣嗎?醫師：一點點，我開一點，再開點脹氣，然後腸胃蠕動的再藥繼續，繼續吃好不好?家屬：好。醫師：便便也都還好哦?家屬：對，因爲目前是，那個睡前兩顆，有計時，有固定配了。醫師：好。那我，我依序給她，就是你們護

## 1-1 NER model

In [None]:
#!pip install sklearn-crfsuite

In [6]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from sklearn_crfsuite.metrics import flat_classification_report

# 1-2 TRAIN / TEST

In [7]:
# Load the "<token> <label>" file at `data_path` (e.g. `train.data`) and separate it into a list of labeled data for each text
# return:
#   data_list: a list of lists of tuples, storing the tokens and labels (wrapped in a tuple) of each text in the file
#   traindata_list: a list of lists, storing the training portion split from data_list
#   testdata_list: a list of lists, storing the testing portion split from data_list
#   traindata_article_id_list, testdata_article_id_list: the 0-based file-order indices of the texts in each portion
from sklearn.model_selection import train_test_split
import numpy as np
def Dataset(data_path, test_size=0.33, random_state=42):
    """Load a ``"<token> <label>"`` file and split its texts into train/test.

    Every non-blank line holds one token and its label separated by a single
    space; a blank line ends the current text. Texts are numbered from 0 in
    file order and these numbers are split together with the texts.

    Args:
        data_path: path of the labeled file (read as utf-8).
        test_size: forwarded to ``train_test_split``.
        random_state: forwarded to ``train_test_split``.

    Returns:
        ``(data_list, traindata_list, testdata_list,
        traindata_article_id_list, testdata_article_id_list)`` where each
        text is a list of ``(token, label)`` tuples.
    """
    with open(data_path, 'r', encoding='utf-8') as f:
        data = f.readlines()

    data_list, data_list_tmp = [], []
    article_id_list = []
    for row in data:
        if row == '\n':
            # A blank line closes the current text; its id is its file-order index.
            article_id_list.append(len(article_id_list))
            data_list.append(data_list_tmp)
            data_list_tmp = []
        else:
            # Split on the LAST space only: a token that is itself a space is
            # written as "  <label>" and must not be torn into empty strings.
            fields = row.strip('\n').rsplit(' ', 1)
            data_list_tmp.append((fields[0], fields[1]))
    if len(data_list_tmp) != 0:
        # The file did not end with a blank line: keep the last text AND give
        # it an id so both lists stay the same length for the split below.
        article_id_list.append(len(article_id_list))
        data_list.append(data_list_tmp)

    # Here we randomly split the data into a training and a testing dataset.
    # But you should take `development data` or `test data` as testing data.
    # At that time, you could just delete this call
    # and generate the data_list of `train data` and of `development/test data` with this function.
    traindata_list, testdata_list, traindata_article_id_list, testdata_article_id_list = train_test_split(
        data_list,
        article_id_list,
        test_size=test_size,
        random_state=random_state,
    )

    return data_list, traindata_list, testdata_list, traindata_article_id_list, testdata_article_id_list

In [8]:
# Reload the exported "<char> <label>" file and split its articles into train/test with `Dataset`'s seeded split.
data_list, traindata_list, testdata_list, traindata_article_id_list, testdata_article_id_list = Dataset(TARGET_PATH)

In [9]:
traindata_article_id_list

[5, 2, 12, 15, 3, 4, 21, 17, 22, 18, 25, 20, 7, 10, 14, 19, 6]

In [10]:
testdata_article_id_list

[8, 16, 0, 24, 11, 9, 13, 1, 23]

# 1-4 install fasttext and load model

In [11]:
#!pip install fasttext
# fast text model:  "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zh.300.bin.gz"

In [12]:
import fasttext
import fasttext.util
# Load the pretrained fastText binary from the local dataset folder
# (download URL is given in the cell above); the last expression displays
# the dimensionality of its word vectors.
model = fasttext.load_model('./dataset/cc.zh.300.bin')
model.get_dimension()



300

In [13]:
def embedding_to_feature(embedding):
    """Map an embedding vector to a feature dict keyed ``dim_0``, ``dim_1``, ..."""
    features = {}
    for position, value in enumerate(embedding):
        features[f"dim_{position}"] = value
    return features

In [14]:
def get_embedding_label( data , fasttext_model): # given list of ('w',label) , return crf input x and label y
    """Build CRF inputs from labeled articles using embedding features only.

    Args:
        data: list of articles, each a list of ``(token, label)`` tuples.
        fasttext_model: object indexable by token that returns its embedding.

    Returns:
        ``(x, y)``: per article, the list of feature dicts and the list of labels.
    """
    x, y = [], []

    for article in data:
        features, labels = [], []
        for pair in article:
            # One feature dict per token: the embedding spread over "dim_<i>" keys.
            features.append(dict(embedding_to_feature(fasttext_model[pair[0]])))
            labels.append(pair[-1])
        x.append(features)
        y.append(labels)

    return x, y

In [15]:
# Build embedding-only CRF features and label sequences for both splits.
x_train , y_train = get_embedding_label( traindata_list , model )
x_test , y_test = get_embedding_label( testdata_list , model )

# Sanity check: each split must have as many feature sequences as label sequences.
print(f" train_x {len(x_train)} train_y {len(y_train)} ")
print(f" test_x {len(x_test)} test_y {len(y_test)} ")

 train_x 17 train_y 17 
 test_x 9 test_y 9 


In [21]:
def train_test(x_train,y_train,x_test,y_test , c1 =0.1, c2=0.1 ,max_iterations = 50):
    """Fit a CRF on the training split and score it on the test split.

    Args:
        x_train, y_train: feature-dict sequences and label sequences used for fitting.
        x_test, y_test: feature-dict sequences and label sequences used for scoring.
        c1: value passed as the CRF's ``c1`` parameter.
        c2: value passed as the CRF's ``c2`` parameter.
        max_iterations: value passed as the CRF's ``max_iterations`` parameter.

    Returns:
        ``(y_pred, y_pred_mar, f1score, crf)``: predicted label sequences,
        predicted marginals, the weighted flat F1 computed over every learned
        label except ``'O'``, and the fitted model.
    """
    # crf model
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        max_iterations=max_iterations,
        all_possible_transitions=True,
        c1=c1,
        c2=c2,
    )
    # fit
    crf.fit(x_train, y_train)
    # pred
    y_pred = crf.predict(x_test)
    y_pred_mar = crf.predict_marginals(x_test)

    # Score entity labels only. Filtering (instead of list.remove) keeps this
    # working when the training labels happen to contain no 'O' at all.
    labels = [label for label in crf.classes_ if label != 'O']
    f1score = metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)

    return y_pred, y_pred_mar, f1score , crf

# 1-5 find best parameters

In [18]:
# Grid-search the CRF's c1/c2 values with the embedding-only features.
# NOTE(review): every candidate is scored on x_test/y_test, so the best score
# is selected on the same data it is reported on.
exp_result = []
best_f1 = (-1, 0, 0)   # (f1, c1, c2) of the best run so far
best_run = None        # full train_test() result of the best run

# linspace states the four grid points 0.001 ... 0.01 explicitly; with a float
# step, np.arange only reaches the end point through rounding error.
param_grid = np.linspace(0.001, 0.01, 4)

for c1 in param_grid:
    for c2 in param_grid:
        run = train_test(x_train, y_train, x_test, y_test,c1,c2)
        f1score = run[2]
        if best_f1[0] < f1score:
            best_f1 = (f1score, c1, c2)
            best_run = run
        exp_result.append([c1, c2, f1score])

# Expose the BEST run (not merely the last grid point) to the following cells.
if best_run is not None:
    y_pred, y_pred_mar, f1score , crf_model = best_run

print(f"Best f1 score is {best_f1[0]} c1 {best_f1[1]} c2 {best_f1[2]}")

Best f1 score is 0.6371470093953394 c1 0.010000000000000002 c2 0.010000000000000002


In [19]:
# Tabulate every (c1, c2, f1) triple collected by the grid search.
# Building the frame directly from the rows avoids assigning a 2-D list into
# an empty frame.
x = pd.DataFrame(exp_result, columns=['c1','c2','f1-score'])
x

Unnamed: 0,c1,c2,f1-score
0,0.001,0.001,0.605966
1,0.001,0.004,0.586633
2,0.001,0.007,0.576857
3,0.001,0.01,0.585787
4,0.004,0.001,0.582049
5,0.004,0.004,0.614845
6,0.004,0.007,0.591066
7,0.004,0.01,0.580986
8,0.007,0.001,0.586689
9,0.007,0.004,0.595671


# 2-1 CRF + jieba

In [23]:
#!pip install jieba

import jieba
import jieba.posseg as pseg

In [26]:
def get_pos_embedding_label( data , fasttext_model): # given list of ('w',label) , return crf input x and label y
    """Build CRF inputs with a POS-tag feature plus embedding features.

    The tokens of an article are concatenated and tagged with ``pseg.cut``;
    each word's tag is spread over its characters, and every token receives
    the tag found at its own character offset in the concatenated text.

    Args:
        data: list of articles, each a list of ``(token, label)`` tuples.
        fasttext_model: object indexable by token that returns its embedding.

    Returns:
        ``(x, y)``: per article, the list of feature dicts (``pos_tag`` first,
        then the embedding dimensions) and the list of labels.
    """
    x, y = [], []

    for article in data:
        ori_articles = ''.join(token[0] for token in article)

        # jieba pos tag: one tag per character of the concatenated text
        char_tags = []
        for word, flag in pseg.cut(ori_articles):
            char_tags.extend([flag] * len(word))

        features = []
        offset = 0  # character offset of the current token inside ori_articles
        for token in article:
            text = token[0]
            if text and offset < len(char_tags):
                pos_tag = char_tags[offset]
            else:
                # An empty token occupies no character, so it has no tag of its
                # own. Indexing by token position here would shift every later
                # tag or run past the end.
                # NOTE(review): 'x' is assumed to be jieba's tag for
                # unclassified tokens - confirm.
                pos_tag = 'x'
            offset += len(text)

            features.append({
                # add pos tag feature
                'pos_tag': pos_tag,
                # add embedding feature
                **embedding_to_feature(fasttext_model[text]),
            })

        x.append(features)
        y.append([token[-1] for token in article])

    return x, y

In [27]:
# Rebuild the features with the extra POS-tag feature; this overwrites the
# embedding-only x_train/x_test built in section 1.
x_train , y_train = get_pos_embedding_label( traindata_list , model )
x_test , y_test = get_pos_embedding_label( testdata_list , model )

# Sanity check: each split must have as many feature sequences as label sequences.
print(f" train_x {len(x_train)} train_y {len(y_train)} ")
print(f" test_x {len(x_test)} test_y {len(y_test)} ")

 train_x 17 train_y 17 
 test_x 9 test_y 9 


In [28]:
# Grid-search the CRF's c1/c2 values with the POS-tag + embedding features.
# NOTE(review): every candidate is scored on x_test/y_test, so the best score
# is selected on the same data it is reported on.
exp_result = []
best_f1 = (-1, 0, 0)   # (f1, c1, c2) of the best run so far
best_run = None        # full train_test() result of the best run

# linspace states the four grid points 0.001 ... 0.01 explicitly; with a float
# step, np.arange only reaches the end point through rounding error.
param_grid = np.linspace(0.001, 0.01, 4)

for c1 in param_grid:
    for c2 in param_grid:
        run = train_test(x_train, y_train, x_test, y_test,c1,c2 ,max_iterations=50 )
        f1score = run[2]
        if best_f1[0] < f1score:
            best_f1 = (f1score, c1, c2)
            best_run = run
        exp_result.append([c1, c2, f1score])

# Expose the BEST run (not merely the last grid point): the output cell below
# builds the submission from y_pred.
if best_run is not None:
    y_pred, y_pred_mar, f1score , crf_model = best_run

print(f"Best f1 score is {best_f1[0]} c1 {best_f1[1]} c2 {best_f1[2]}")

Best f1 score is 0.6517556543179831 c1 0.010000000000000002 c2 0.001


In [29]:
# Tabulate every (c1, c2, f1) triple collected by the grid search.
# Building the frame directly from the rows avoids assigning a 2-D list into
# an empty frame.
x = pd.DataFrame(exp_result, columns=['c1','c2','f1-score'])
x

Unnamed: 0,c1,c2,f1-score
0,0.001,0.001,0.610164
1,0.001,0.004,0.61101
2,0.001,0.007,0.647875
3,0.001,0.01,0.63729
4,0.004,0.001,0.62982
5,0.004,0.004,0.620533
6,0.004,0.007,0.634148
7,0.004,0.01,0.621558
8,0.007,0.001,0.610002
9,0.007,0.004,0.646743


## Output data
* Convert the model output into `output.tsv`
* The competition system only accepts uploads in this output format

In [None]:
def _bio_spans(tags):
    """Decode one sequence of B-/I-/O tags into ``(start, end, entity_type)`` spans.

    ``end`` is exclusive. A span opens at a ``B-`` tag and is extended by every
    directly following ``I-`` tag (the ``I-`` tag's own type is not compared,
    the span keeps the type of its ``B-`` tag). It is closed by ``O``, by the
    next ``B-`` tag, or by the end of the sequence. An ``I-`` tag with no open
    span is ignored.
    """
    spans = []
    start_pos = None
    entity_type = None
    for pos, tag in enumerate(tags):
        if tag[0] == 'I' and start_pos is not None:
            continue  # still inside the open span
        if start_pos is not None:
            spans.append((start_pos, pos, entity_type))
            start_pos = None  # reset so a later stray I- tag cannot reuse it
        if tag[0] == 'B':
            start_pos = pos
            entity_type = tag[2:]
    if start_pos is not None:
        # The span runs up to the last tag; there is no following tag to look at.
        spans.append((start_pos, len(tags), entity_type))
    return spans


# Build the submission text: a header row, then one row per predicted entity.
output_lines = ["article_id\tstart_position\tend_position\tentity_text\tentity_type"]
for test_id, predicted_tags in enumerate(y_pred):
    tokens = testdata_list[test_id]
    article_id = testdata_article_id_list[test_id]
    for start_pos, end_pos, entity_type in _bio_spans(predicted_tags):
        entity_text = ''.join(token[0] for token in tokens[start_pos:end_pos])
        output_lines.append(f"{article_id}\t{start_pos}\t{end_pos}\t{entity_text}\t{entity_type}")
output = '\n'.join(output_lines) + '\n'

In [None]:
# Persist the submission text built above as a UTF-8 encoded TSV file.
output_path = 'output.tsv'
with open(output_path, mode='w', encoding='utf-8') as tsv_file:
    tsv_file.write(output)

In [None]:
print(output)

## Note
* You may try `python-crfsuite` to train a neural network for NER tagging optimized by gradient descent and backpropagation
    * [Documentation](https://github.com/scrapinghub/python-crfsuite)
* You may try `CRF++` tool for NER tagging by CRF model
    * [Documentation](http://taku910.github.io/crfpp/)
    * Need design feature template
    * Can only be computed on CPU
* You may try other Traditional Chinese word embeddings (e.g. fasttext, bert, ...) as input features
* You may try adding other features to the NER model, e.g. POS-tag, word_length, word_position, ...
* You should upload the prediction output on the `development data` or `test data` provided later to the competition system. Note: don't upload the prediction output on the split testing dataset used in this baseline example.

-----------------------------------------------------