In [1]:
import os
import sys
import numpy as np
import pandas as pd
from tqdm import tqdm
import json

ModuleNotFoundError: No module named 'pandas'

## Preprocessing
* Convert the input data (e.g. `train.txt`) into the CRF model input format (e.g. `train.data`)
    * CRF model input format (ex. train.data):
        ```
        肝 O
        功 O
        能 O
        6 B-med_exam
        8 I-med_exam
        ```

In [None]:
def proprocessing(source_path , target_path):
    """Convert the annotated source file into CRF input format.

    The source file is read as a sequence of article chunks separated by
    ``\\n\\n--------------------\\n\\n``. Within a chunk the first line is the
    article text, the second line is a column header (skipped), and every
    following line is a tab-separated annotation row whose first three fields
    are integers (article id, start offset, end offset) and whose last field
    is the entity type.

    The target file receives one ``<character> <label>`` line per article
    character (BIO labels) and a blank line after each article. It is always
    written as UTF-8 so that it can be read back with an explicit UTF-8 decoder.

    Args:
        source_path: path of the annotated source text file.
        target_path: path of the CRF-format file to (over)write.

    Returns:
        A list of dicts with keys ``"article"`` (str), ``"id"`` (int, or None
        when the article has no annotation rows) and ``"items"`` (list of
        parsed annotation rows).
    """
    separator = '\n\n--------------------\n\n'

    # 'utf-8-sig' strips a leading BOM when present and is plain UTF-8 otherwise.
    with open(source_path, 'r', encoding='utf-8-sig') as f:
        # The file is expected to end with a separator, so the final chunk is dropped.
        article_item_list = f.read().split(separator)[:-1]

    all_datas = []

    # article_item: article text + header + annotation rows
    for article_item in article_item_list:
        lines = article_item.split('\n')

        temp = {"article": lines[0], "id": None, "items": []}

        for line in lines[2:]:  # start from 2: skip column headers
            if not line.strip():
                # Tolerate stray blank lines instead of failing on int('').
                continue
            item = line.split('\t')
            item[:3] = [int(t) for t in item[:3]]
            temp["items"].append(item)

        # An article without annotations keeps id=None instead of raising IndexError.
        if temp["items"]:
            temp["id"] = temp["items"][0][0]
        all_datas.append(temp)

    print(f"Theres {len(all_datas)} articles")

    # Explicit UTF-8: the default encoding is platform dependent and may not
    # be able to represent the article characters.
    with open(target_path, 'w', encoding='utf-8') as f:

        for data in all_datas:
            article_words = list(data["article"])
            labels = ['O'] * len(article_words)

            for item in data["items"]:
                start = item[1]
                # Clamp the span so slice assignment can never grow the label list.
                end = min(item[2], len(labels))
                if not 0 <= start < end:
                    continue
                labels[start:end] = [f"I-{item[-1]}"] * (end - start)
                labels[start] = 'B-' + item[-1]

            for w, l in zip(article_words, labels):
                f.write(f"{w} {l}\n")
            f.write("\n")

    return all_datas

In [None]:
# Annotated source file that `proprocessing` reads.
SOURCE_TXT = "./dataset/SampleData_deid.txt"
# CRF-format file that `proprocessing` writes and `Dataset` later reads back.
TARGET_PATH = "./dataset/data.txt"

In [None]:
# Write the CRF-format file and keep the parsed articles for inspection.
all_datas = proprocessing(SOURCE_TXT,TARGET_PATH)
# Display one parsed record (keys: "article", "id", "items"); needs at least six articles.
all_datas[5]

## NER model
### CRF (Conditional Random Field model)
* Using `sklearn-crfsuite` API

    (you may try `CRF++`, `python-crfsuite`, `pytorch-crfsuite`(neural network version))

In [None]:
!pip install sklearn-crfsuite

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from sklearn_crfsuite.metrics import flat_classification_report

In [None]:
def CRF(x_train, y_train, x_test, y_test):
    """Fit a CRF tagger on the training split and score it on the test split.

    Args:
        x_train: list of sequences of per-token feature dicts used for fitting.
        y_train: list of label sequences aligned with ``x_train``.
        x_test: list of sequences of per-token feature dicts to predict on.
        y_test: gold label sequences aligned with ``x_test``.

    Returns:
        A tuple ``(y_pred, y_pred_mar, f1score)``: the predicted label
        sequences, the per-token marginals from ``predict_marginals``, and the
        weighted F1 score computed over every learned label except ``'O'``.
    """
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True
    )
    crf.fit(x_train, y_train)

    y_pred = crf.predict(x_test)
    y_pred_mar = crf.predict_marginals(x_test)

    # Score entity labels only. Filtering (rather than list.remove) also works
    # when the training data happens to contain no 'O' label at all.
    labels = [label for label in crf.classes_ if label != 'O']
    f1score = metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)

    return y_pred, y_pred_mar, f1score

## Model Input: 
* input features:
    * word vector: pretrained traditional chinese word embedding by Word2Vec-CBOW
    
    (you may try adding other features, e.g. POS tag, word length, word position, ...) 

In [None]:
import numpy as np

In [None]:
# Load pretrained word vectors
# Get a dict of tokens (key) and their pretrained word vectors (value)
# Pretrained word2vec CBOW word vector: https://fgc.stpi.narl.org.tw/activity/videoDetail/4b1141305ddf5522015de5479f4701b1
dim = 0
word_vecs = {}
# Open the pretrained word vector file. The encoding is given explicitly so
# the Chinese tokens decode the same way on every platform.
# NOTE(review): assumes the vector file is UTF-8 encoded - confirm.
with open('./cna.cbow.cwe_p.tar_g.512d.0.txt', encoding='utf-8') as f:
    for line in f:
        tokens = line.strip().split()

        # Skip blank lines instead of failing on tokens[0].
        if not tokens:
            continue

        # The first line holds 2 integers: vocabulary_size, word_vector_dim
        if len(tokens) == 2:
            dim = int(tokens[1])
            continue

        word = tokens[0]
        vec = np.array([float(t) for t in tokens[1:]])
        word_vecs[word] = vec

In [None]:
# `vec` is the last vector parsed by the loading loop, so its shape is printed as a tuple.
print('vocabulary_size: ',len(word_vecs),' word_vector_dim: ',vec.shape)

Here we split the data into a training set and a testing set.
However, we will later provide `development data` and `test data`, which are the real testing datasets.

You should upload your predictions on the `development data` and `test data` to the system, not the predictions on this split-off testing set.

In [None]:
# Load `train.data` and separate into a list of labeled data of each text
# return:
#   data_list: a list of lists of tuples, storing tokens and labels (wrapped in tuple) of each text in `train.data`
#   traindata_list: a list of lists, storing training data_list splitted from data_list
#   testdata_list: a list of lists, storing testing data_list splitted from data_list
from sklearn.model_selection import train_test_split
def Dataset(data_path):
    """Load a CRF-format file and split its articles into train and test sets.

    Each non-blank line of the file is ``<token> <label>``; a blank line ends
    an article. Articles are numbered 0, 1, 2, ... in file order and those
    numbers are what the returned id lists contain.

    Args:
        data_path: path of the UTF-8 encoded CRF-format file.

    Returns:
        A tuple ``(data_list, traindata_list, testdata_list,
        traindata_article_id_list, testdata_article_id_list)`` where
        ``data_list`` is a list of articles, each a list of ``(token, label)``
        tuples, and the remaining four items come from ``train_test_split``
        (``test_size=0.33``, ``random_state=42``).
    """
    with open(data_path, 'r', encoding='utf-8') as f:
        data = f.readlines()

    data_list, data_list_tmp = list(), list()
    article_id_list = list()
    idx = 0
    for row in data:
        if row == '\n':
            article_id_list.append(idx)
            idx += 1
            data_list.append(data_list_tmp)
            data_list_tmp = []
        else:
            # Split on the LAST space only: the token itself may be a space
            # character, in which case a plain split(' ') would yield an empty
            # token and an empty label. Labels are assumed to contain no space.
            row = row.strip('\n').rsplit(' ', 1)
            data_list_tmp.append((row[0], row[1]))
    if len(data_list_tmp) != 0:
        # The file did not end with a blank line: register the id as well so
        # data_list and article_id_list keep the same length.
        article_id_list.append(idx)
        data_list.append(data_list_tmp)

    # Here the data is randomly split into a training set and a testing set.
    # When the real `development data` or `test data` is available, use it as
    # the testing data instead: drop this split and call this function once
    # for the training file and once for the development/test file.
    traindata_list, testdata_list, traindata_article_id_list, testdata_article_id_list=train_test_split(data_list,
                                                                                                    article_id_list,
                                                                                                    test_size=0.33,
                                                                                                    random_state=42)

    return data_list, traindata_list, testdata_list, traindata_article_id_list, testdata_article_id_list 

In [None]:
# look up word vectors
# turn each word into its pretrained word vector
# return a list of word vectors corresponding to each token in train.data
def Word2Vector(data_list, embedding_dict, unk_vector=None, unk_seed=0):
    """Replace every token with its pretrained word vector.

    Args:
        data_list: list of articles, each a list of ``(token, label)`` tuples.
        embedding_dict: mapping from token to its embedding vector.
        unk_vector: vector used for tokens missing from ``embedding_dict``.
            When None, one is generated with the same shape as the embeddings.
        unk_seed: seed for the generated unknown-word vector. A fixed seed
            makes separate calls (e.g. one for the training split and one for
            the testing split) share the same unknown-word vector.

    Returns:
        A list of lists of vectors, parallel to ``data_list``.

    Raises:
        ValueError: if ``embedding_dict`` is empty and no ``unk_vector`` is given.
    """
    if unk_vector is None:
        if not embedding_dict:
            raise ValueError("embedding_dict is empty and no unk_vector was given")
        # Peek at a single embedding for its shape without copying all values.
        sample_vector = next(iter(embedding_dict.values()))
        # Seeded generator: uniform values in [0, 1), identical on every call.
        unk_vector = np.random.default_rng(unk_seed).random(np.shape(sample_vector))

    return [
        [embedding_dict.get(pair[0], unk_vector) for pair in article]
        for article in data_list
    ]

In [None]:
# Input features: pretrained word vectors of each token
# Return a list of feature dicts, each feature dict corresponding to each token
def Feature(embed_list):
    """Turn every word vector into a CRF feature dict.

    Each vector becomes a dict mapping ``'dim_1'``, ``'dim_2'``, ... (1-based)
    to the corresponding vector component. The nesting of ``embed_list``
    (articles -> tokens) is preserved in the returned list.
    """
    def vector_to_features(vector):
        # Component k (1-based) is stored under the key 'dim_k'.
        return {'dim_' + str(number): component
                for number, component in enumerate(vector, start=1)}

    return [
        [vector_to_features(vector) for vector in article]
        for article in embed_list
    ]

In [None]:
# Get the labels of each tokens in train.data
# Return a list of lists of labels
def Preprocess(data_list):
    """Extract the label sequences from the labeled data.

    ``data_list`` is a list of articles, each a list of ``(token, label)``
    pairs; the result keeps the same nesting but holds only the labels.
    """
    return [[pair[1] for pair in article] for article in data_list]

## Training

In [None]:
# Read the CRF-format file back and split its articles into training and testing sets.
data_list, traindata_list, testdata_list, traindata_article_id_list, testdata_article_id_list = Dataset(TARGET_PATH)

In [None]:
# Ids of the training articles; `Dataset` numbers articles by their order in the data file.
traindata_article_id_list

In [None]:
# Ids of the testing articles; `Dataset` numbers articles by their order in the data file.
testdata_article_id_list

In [None]:
# Load Word Embedding
# Look up the pretrained word vector of every token in both splits
trainembed_list = Word2Vector(traindata_list, word_vecs)
testembed_list = Word2Vector(testdata_list, word_vecs)

print(f"trainembed_list {len(trainembed_list)} testembed_list {len(testembed_list)}")

# CRF - training data
x_train = Feature(trainembed_list) # one list of feature dicts per training article
y_train = Preprocess(traindata_list) # one list of labels per training article

# CRF - testing data (gold labels used for scoring)
x_test = Feature(testembed_list) # one list of feature dicts per testing article
y_test = Preprocess(testdata_list) # one list of labels per testing article

In [None]:
# Train on the word2vec features and evaluate on the held-out split.
y_pred, y_pred_mar, f1score = CRF(x_train, y_train, x_test, y_test)

In [None]:
# Weighted F1 over every label except 'O', as computed inside `CRF`.
f1score

# fasttext + find best parameters of CRF

In [None]:
!pip install fasttext

In [None]:
import fasttext
import fasttext.util
# Load the pretrained fastText binary model from the dataset directory
model = fasttext.load_model('./dataset/cc.zh.300.bin')
# Display the embedding dimension reported by the loaded model
model.get_dimension()

In [None]:
def embedding_to_feature(embedding):
    """Map an embedding vector to a feature dict keyed 'dim_0', 'dim_1', ... (0-based)."""
    features = {}
    for position, component in enumerate(embedding):
        features[f"dim_{position}"] = component
    return features

In [None]:
def get_embedding_label( data , fasttext_model):
    """Build the CRF inputs and labels from labeled articles.

    ``data`` is a list of articles, each a list of ``(token, label)`` pairs.
    Every token is looked up with ``fasttext_model[token]`` and converted by
    ``embedding_to_feature``; the last element of each pair is its label.

    Returns ``(x, y)``: per-article lists of feature dicts and of labels.
    """
    x, y = [], []
    for article in data:
        x.append([embedding_to_feature(fasttext_model[pair[0]]) for pair in article])
        y.append([pair[-1] for pair in article])
    return x, y

In [None]:
# Rebuild the CRF inputs from fastText embeddings.
# These assignments replace the x_train / y_train / x_test / y_test built earlier from word2vec vectors.
x_train , y_train = get_embedding_label( traindata_list , model )
x_test , y_test = get_embedding_label( testdata_list , model )

# Number of articles in each split
print(f" train_x {len(x_train)} train_y {len(y_train)} ")
print(f" test_x {len(x_test)} test_y {len(y_test)} ")

# train model

In [None]:
def train_test(x_train,y_train,x_test,y_test , c1 =0.1, c2=0.1):
    """Fit a CRF tagger with the given c1/c2 settings and score it.

    Args:
        x_train: list of sequences of per-token feature dicts used for fitting.
        y_train: list of label sequences aligned with ``x_train``.
        x_test: list of sequences of per-token feature dicts to predict on.
        y_test: gold label sequences aligned with ``x_test``.
        c1: value passed as the ``c1`` argument of ``sklearn_crfsuite.CRF``.
        c2: value passed as the ``c2`` argument of ``sklearn_crfsuite.CRF``.

    Returns:
        A tuple ``(y_pred, y_pred_mar, f1score, crf)``: the predicted label
        sequences, the per-token marginals, the weighted F1 score over every
        learned label except ``'O'``, and the fitted model.
    """
    # crf model
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        max_iterations=100,
        all_possible_transitions=True,
        c1=c1,
        c2=c2,
    )
    # fit
    crf.fit(x_train, y_train)
    # pred
    y_pred = crf.predict(x_test)
    y_pred_mar = crf.predict_marginals(x_test)

    # Score entity labels only. Filtering (rather than list.remove) also works
    # when the training data happens to contain no 'O' label at all.
    labels = [label for label in crf.classes_ if label != 'O']
    f1score = metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)

    return y_pred, y_pred_mar, f1score , crf

# find best parameters

In [None]:
# Grid search over the c1 / c2 settings of the CRF.
exp_result = []                 # one [c1, c2, f1score] row per trained model
best_f1 = (-1, 0, 0)            # (f1score, c1, c2) of the best run so far
best_run = None                 # full train_test() result of the best run

# Both parameters are searched over the same grid. The c2 loop previously
# iterated over the literal tuple (0.001, 0.01, 0.003) - i.e. the arange
# arguments themselves - instead of the values generated by np.arange.
c_grid = [float(c) for c in np.arange(0.001, 0.01, 0.003)]

for c1 in c_grid:
    for c2 in c_grid:
        run = train_test(x_train, y_train, x_test, y_test, c1, c2)
        f1score = run[2]
        if best_f1[0] < f1score:
            best_f1 = (f1score, c1, c2)
            best_run = run
        exp_result.append([c1, c2, f1score])

# Leave the globals pointing at the BEST run rather than at whichever model
# happened to be trained last, so later cells export the best predictions.
if best_run is not None:
    y_pred, y_pred_mar, f1score, crf_model = best_run

print(f"Best f1 score is {best_f1[0]} c1 {best_f1[1]} c2 {best_f1[2]}")

In [None]:
# Tabulate the grid-search results: one row per (c1, c2) combination.
# Building the frame directly from the rows avoids assigning a 2-D list into
# the columns of an empty DataFrame.
x = pd.DataFrame(exp_result, columns=['c1','c2','f1-score'])
x

## Output data
* Convert the model output into `output.tsv` 
* The competition system only accepts uploads in this output format

In [None]:
# Convert the predicted BIO tags into the tab-separated submission format:
# article_id, start_position, end_position (exclusive), entity_text, entity_type.
output_lines = ["article_id\tstart_position\tend_position\tentity_text\tentity_type"]

for test_id, tags in enumerate(y_pred):
    tokens = testdata_list[test_id]
    article_id = testdata_article_id_list[test_id]
    start_pos = None      # index of the 'B' tag of the entity being built
    entity_type = None    # type taken from that 'B' tag

    # A trailing sentinel 'O' flushes an entity that runs to the end of the
    # article, without ever indexing past the last prediction.
    for pos, tag in enumerate(list(tags) + ['O']):
        prefix = tag[:1]

        # An 'I' tag extends the open entity; its own type is not checked.
        if prefix == 'I' and start_pos is not None:
            continue

        # Any other tag closes the open entity. This also covers single-token
        # entities and entities that are directly followed by another 'B'.
        if start_pos is not None:
            entity_text = ''.join(tokens[position][0] for position in range(start_pos, pos))
            output_lines.append(
                str(article_id) + '\t' + str(start_pos) + '\t' + str(pos) + '\t' + entity_text + '\t' + entity_type
            )
            start_pos = None
            entity_type = None

        if prefix == 'B':
            start_pos = pos
            entity_type = tag[2:]

output = '\n'.join(output_lines) + '\n'

In [None]:
# Save the submission text as a UTF-8 encoded TSV file in the working directory.
output_path='output.tsv'
with open(output_path,'w',encoding='utf-8') as f:
    f.write(output)

In [None]:
# Show the full submission text that was written to the output file.
print(output)

## Note
* You may try `python-crfsuite` to train a neural network for NER tagging, optimized by gradient descent with backpropagation
    * [Documentation](https://github.com/scrapinghub/python-crfsuite)
* You may try `CRF++` tool for NER tagging by CRF model
    * [Documentation](http://taku910.github.io/crfpp/)
    * Need design feature template
    * Can only be computed on CPU
* You may try other Traditional Chinese word embeddings (e.g. fastText, BERT, ...) as input features
* You may try adding other features to the NER model, e.g. POS tag, word length, word position, ...
* You should upload the prediction output on the `development data` or `test data` provided later to the competition system. Note: do not upload the prediction output on the split-off testing dataset used in this baseline example.

-----------------------------------------------------