# Chinese Word Segmentation using RNN

Global file paths

In [1]:
input_file = 'data/training/msr_training.utf8'
test_file = 'data/testing/msr_test.utf8'
result_file = 'data/result/msr_rnn.utf8'
cws_info_filePath = 'rnn/cws.info'
cws_data_filePath = 'rnn/cws.data'
output_model_file = 'rnn/msr_training_word2vec.model'
output_vector_file = 'rnn/msr_training_word2vec.vector'
output_keras_model_file = 'rnn/cws_keras_model'
output_keras_model_weights_file = 'rnn/keras_model_weights'


### Preprocessing

In [2]:
import json
import h5py
import string
import codecs
import sys
import time

"""
vocab: char -> index
indexVocab: list of char ordered by the index in vocab
initProb: probability for each tag
tranProb: Transition probability for transition from tag 1 to tag 2
X: list of [char_1, char_2, .. char_ctxWindows]
y: list of tag indices
"""

corpus_tags = ['S', 'B', 'M', 'E']
retain_unknown = 'retain-unknown'
retain_padding = 'retain-padding'

def saveCwsInfo(path, cwsInfo):
    print('save cws info to %s'%path)
    fd = open(path, 'w')
    (initProb, tranProb), (vocab, indexVocab) = cwsInfo
    j = json.dumps((initProb, tranProb))
    fd.write(j + '\n')
    for char in vocab:
        fd.write(char.encode('utf-8') + '\t' + str(vocab[char]) + '\n')
    fd.close()

def loadCwsInfo(path):
    print('load cws info from %s'%path)
    fd = open(path, 'r')
    line = fd.readline()
    j = json.loads(line.strip())
    initProb, tranProb = j[0], j[1]
    lines = fd.readlines()
    fd.close()
    vocab = {}
    indexVocab = [0 for i in range(len(lines))]
    for line in lines:
        rst = line.strip().split('\t')
        if len(rst) < 2: continue
        char, index = rst[0].decode('utf-8'), int(rst[1])
        vocab[char] = index
        indexVocab[index] = char
    return (initProb, tranProb), (vocab, indexVocab)

def saveCwsData(path, cwsData):
    '''Save training samples'''
    print('save cws data to %s' % path)
    #use hdf5 with high efficiency
    fd = h5py.File(path,'w')
    (X, y) = cwsData
    fd.create_dataset('X', data = X)
    fd.create_dataset('y', data = y)
    fd.close()

def loadCwsData(path):
    '''load training samples'''
    print('load cws data from %s' % path)
    fd = h5py.File(path,'r')
    X = fd['X'][:]
    y = fd['y'][:]
    fd.close()
    return (X, y)

def sent2vec2(sent, vocab, ctxWindows = 5):

    charVec = []
    for char in sent:
        if char in vocab:
            charVec.append(vocab[char])
        else:
            charVec.append(vocab[retain_unknown])
    # padding on head and tail
    num = len(charVec)
    pad = int((ctxWindows - 1)/2)
    for i in range(pad):
        charVec.insert(0, vocab[retain_padding] ) # sentence head
        charVec.append(vocab[retain_padding] ) # sentence tail
    X = []
    for i in range(num):
        X.append(charVec[i:i + ctxWindows])
    return X

def sent2vec(sent, vocab, ctxWindows = 5):
    chars = []
    for char in sent:
        chars.append(char)
    return sent2vec2(chars, vocab, ctxWindows = ctxWindows)

def doc2vec(fname, vocab):
    fd = codecs.open(fname, 'r', 'utf-8')
    lines = fd.readlines()
    fd.close()

    X = []
    y = []

    tagSize = len(corpus_tags)
    tagCnt = [0 for i in range(tagSize)]
    tagTranCnt = [[0 for i in range(tagSize)] for j in range(tagSize)]
    
    sentCnt = 0
    initTagCnt = [0 for i in range(tagSize)]

    for line in lines:
        words = line.strip('\n').split()
        chars = []
        tags = []
        for word in words:
            if len(word) > 1:
                chars.append(word[0])
                tags.append(corpus_tags.index('B'))
                for char in word[1:(len(word) - 1)]:
                    chars.append(char)
                    tags.append(corpus_tags.index('M'))
                chars.append(word[-1])
                tags.append(corpus_tags.index('E'))
            else: 
                chars.append(word)
                tags.append(corpus_tags.index('S'))
                
        sentCnt += 1
        if len(words) > 0:
            if len(words[0]) > 1:
                initTagCnt[corpus_tags.index('B')] += 1
            else:
                initTagCnt[corpus_tags.index('S')] += 1

        lineVecX = sent2vec2(chars, vocab, ctxWindows = 7)

        lineVecY = []
        lastTag = -1
        for tag in tags:
            lineVecY.append(tag)
            tagCnt[tag] += 1
            if lastTag != -1:
                tagTranCnt[lastTag][tag] += 1
            lastTag = tag

        X.extend(lineVecX)
        y.extend(lineVecY)

    charCnt = sum(tagCnt)
    tranCnt = sum([sum(tag) for tag in tagTranCnt])
    initProb = []
    for i in range(tagSize):
        initProb.append(initTagCnt[i]/float(sentCnt))
    tranProb = []
    for i in range(tagSize):
        p = []
        for j in range(tagSize):
            p.append(tagTranCnt[i][j]/float(tranCnt))
        tranProb.append(p)

    return X, y, initProb, tranProb

def genVocab(fname, delimiters = [' ', '\n']):
    fd = codecs.open(fname, 'r', 'utf-8')
    data = fd.read()
    fd.close()

    vocab = {}
    indexVocab = []
    index = 0
    for char in data:
        if char not in delimiters and char not in vocab:
            vocab[char] = index
            indexVocab.append(char)
            index += 1

    vocab[retain_unknown] = len(vocab)
    vocab[retain_padding] = len(vocab)
    indexVocab.append(retain_unknown)
    indexVocab.append(retain_padding)
    return vocab, indexVocab

def load(fname):
    print 'train from file', fname
    vocab, indexVocab = genVocab(fname)
    X, y, initProb, tranProb = doc2vec(fname, vocab)
    print 'Total characters: ', len(X), len(y)
    print 'Total vocab: ', len(vocab), len(indexVocab)
    print 'Init prob: ', initProb
    print 'Transition prob: ', tranProb
    return (X, y), (initProb, tranProb), (vocab, indexVocab)



def preProcess():
    start_time = time.time()
    (X, y), (initProb, tranProb), (vocab, indexVocab) = load(input_file)
    saveCwsInfo(cws_info_filePath, ((initProb, tranProb), (vocab, indexVocab)))
    saveCwsData(cws_data_filePath, (X, y))

    end_time = time.time()
    print("used time : %d s" % (end_time - start_time))

In [13]:
preProcess()

train from file data/training/msr_training.utf8
Total characters:  4050469 4050469
Total vocab:  5171 5171
Init prob:  [0.30438083843357416, 0.6955501357507707, 0.0, 0.0]
Transition prob:  [[0.10778314698107833, 0.1514084213877909, 0.0, 0.0], [0.0, 0.0, 0.05428188006159124, 0.26236725602874794], [0.0, 0.0, 0.05345560079837499, 0.05428188006159124], [0.16643509822378974, 0.14998671645703562, 0.0, 0.0]]
save cws info to rnn/cws.info
save cws data to rnn/cws.data
used time : 18 s


In [3]:
# -*- coding: utf-8 -*-

# import modules & set up logging
import os
import sys
import logging
import multiprocessing
import time
import json

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

def output_vocab(vocab):
    for k, v in vocab.items():
        print(k)
        
def run_word2vec():
    start_time = time.time()
     
    model = Word2Vec(LineSentence(input_file), size=128, window=5, min_count=5, workers=multiprocessing.cpu_count())
 
    model.init_sims(replace=True)
    model.save(output_model_file)
    model.wv.save_word2vec_format(output_vector_file, binary=False)

    end_time = time.time()
    print("used time : %d s" % (end_time - start_time))

Using TensorFlow backend.


In [15]:
run_word2vec()

used time : 19 s


### Training

In [4]:
# -*- coding: utf-8 -*-

import numpy as np
import json
import h5py
import codecs
import time
import sys

from sklearn import model_selection

from keras.preprocessing import sequence
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.utils import np_utils
from keras.models import Sequential, model_from_json
from keras.layers.core import Dense, Dropout, Activation, TimeDistributedDense
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU, SimpleRNN

In [6]:
def train(cwsInfo, cwsData, modelPath, weightPath):

    (initProb, tranProb), (vocab, indexVocab) = cwsInfo
    (X, y) = cwsData

    train_X, test_X, train_y, test_y = model_selection.train_test_split(X, y , train_size=0.9, random_state=1)

    train_X = np.array(train_X)
    train_y = np.array(train_y)
    test_X = np.array(test_X)
    test_y = np.array(test_y)

    outputDims = len(corpus_tags)
    Y_train = np_utils.to_categorical(train_y, outputDims)
    Y_test = np_utils.to_categorical(test_y, outputDims)
    batchSize = 128
    vocabSize = len(vocab) + 1
    wordDims = 100
    maxlen = 7
    hiddenDims = 100

    w2vModel = Word2Vec.load(output_model_file)
    embeddingDim = w2vModel.vector_size
    embeddingUnknown = [0 for i in range(embeddingDim)]
    embeddingWeights = np.zeros((vocabSize + 1, embeddingDim))
    for word, index in vocab.items():
        if word in w2vModel:
            e = w2vModel[word]
        else:
            e = embeddingUnknown
        embeddingWeights[index, :] = e

    #LSTM
    model = Sequential()
    model.add(Embedding(output_dim = embeddingDim, input_dim = vocabSize + 1, 
        input_length = maxlen, mask_zero = True, weights = [embeddingWeights]))
    model.add(LSTM(output_dim = hiddenDims, return_sequences = True))
    model.add(LSTM(output_dim = hiddenDims, go_backwards = True, return_sequences = False))
    model.add(Dropout(0.5))
    model.add(Dense(outputDims))
    model.add(Activation('softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics=["accuracy"])

    result = model.fit(train_X, Y_train, batch_size = batchSize, 
                    nb_epoch = 5, validation_data = (test_X,Y_test))

    j = model.to_json()
    fd = open(modelPath, 'w')
    fd.write(j)
    fd.close()

    model.save_weights(weightPath)

    return model

def start_train():
    print 'Loading vocab...'
    start_time = time.time()
    cwsInfo = loadCwsInfo(cws_info_filePath)
    cwsData = loadCwsData(cws_data_filePath)
    print("Loading used time : ", time.time() - start_time)
    print 'Done!'

    print 'Training model...'
    start_time = time.time()
    model = train(cwsInfo, cwsData, output_keras_model_file, output_keras_model_weights_file)
    print("Training used time : ", time.time() - start_time)
    print 'Done!'

In [8]:
start_train()

Loading vocab...
load cws info from rnn/cws.info
load cws data from rnn/cws.data
('Loading used time : ', 0.17533612251281738)
Done!
Training model...
Train on 3645422 samples, validate on 405047 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
('Training used time : ', 7736.648426055908)
Done!


### Testing

In [5]:
import string_util
StringUtil = string_util.StringUtil()

In [6]:
def viterbi(obs, states, start_p, trans_p, emit_p):

    lenObs = len(obs)
    lenStates = len(states)

    V = [[0.0 for col in range(lenStates)] for row in range(lenObs)]
    path = [[0 for col in range(lenObs)] for row in range(lenStates)]

    #t = 0
    for y in range(lenStates):
        #V[0][y] = start_p[y] * emit_p[y][obs[0]]
        V[0][y] = start_p[y] * emit_p[y][0]
        path[y][0] = y

    #t > 1
    for t in range(1, lenObs):
        newpath = [[0.0 for col in range(lenObs)] for row in range(lenStates)]

        for y in range(lenStates):
            prob = -1
            state = 0
            for y0 in range(lenStates):
                #nprob = V[t - 1][y0] * trans_p[y0][y] * emit_p[y][obs[t]]
                nprob = V[t - 1][y0] * trans_p[y0][y] * emit_p[y][t]
                if nprob > prob:
                    prob = nprob
                    state = y0
                    # max prob
                    V[t][y] = prob
                    # back pointer
                    newpath[y][:t] = path[state][:t]
                    newpath[y][t] = y

        path = newpath

    prob = -1
    state = 0
    for y in range(lenStates):
        if V[lenObs - 1][y] > prob:
            prob = V[lenObs - 1][y]
            state = y

    return prob, path[state]

In [17]:
#### -*- coding: utf-8 -*-

def loadModel(modelPath, weightPath):

    fd = open(modelPath, 'r')
    j = fd.read()
    fd.close()

    model = model_from_json(j)

    model.load_weights(weightPath)

    return model


def cwsSent(sent, model, cwsInfo):
    (initProb, tranProb), (vocab, indexVocab) = cwsInfo
    vec = sent2vec(sent, vocab, ctxWindows = 7)
    vec = np.array(vec)
    probs = model.predict_proba(vec, verbose=0)
    #classes = model.predict_classes(vec)

    prob, path = viterbi(vec, corpus_tags, initProb, tranProb, probs.transpose())

    output = ''
    total_len = len(path)
    for i, t in enumerate(path):
        output += sent[i]
        if i < total_len - 1:
            if corpus_tags[t] == 'S' or corpus_tags[t] == 'E':
                output += StringUtil.SPACE

    return output

def cwsFile(fname, dstname, model, cwsInfo):
    fd = codecs.open(fname, 'r', 'utf-8')
    lines = fd.readlines()
    fd.close()

    fd = open(dstname, 'w')
    for line in lines:
        rst = cwsSent(line.strip(), model, cwsInfo)
        fd.write(rst.encode('utf-8') + '\n')
    fd.close()

def test():
    cwsInfo = loadCwsInfo(cws_info_filePath)
    print('Loading model...')
    start_time = time.time()
    model = loadModel(output_keras_model_file, output_keras_model_weights_file)
    print("Loading used time : ", time.time() - start_time)
    print('Done!')
    
    print 'Doing segmentation for the test file', test_file, '...'
    cwsFile(test_file, result_file, model, cwsInfo)
    print('Done!')

Test predict a sentence

In [15]:
s = u'星期天我们去吃重庆火锅'
cwsInfo = loadCwsInfo(cws_info_filePath)
model = loadModel(output_keras_model_file, output_keras_model_weights_file)
print cwsSent(s, model, cwsInfo)

load cws info from rnn/cws.info
星期天 我们 去 吃 重庆 火锅


Do segmentation for the test file

In [18]:
test()

load cws info from rnn/cws.info
Loading model...
('Loading used time : ', 0.6088700294494629)
Done!
Doing segmentation for the test file data/testing/msr_test.utf8 ...
Done!


In [19]:
print 'Test result:\n'
!head data/result/msr_rnn.utf8

Test result:

扬帆 远东 做 与 中国 合作 的 先行
希腊 的 经济 结构 较 特殊 。
海 运业 雄踞 全球 之 首 ， 按 吨位 计 占 世界 总数 的 １７％ 。
另外 旅游 、 侨汇 也是 经济 收入 的 重要 组成部分 ， 制造业 规模 相对 较小 。
多年来 ， 中 希 贸易 始终 处于 较低 的 水平 ， 希腊 几乎 没有 在 中国 投资 。
十几年 来 ， 改革开放 的 中国 经济 高速 发展 ， 远东 在 崛起 。
瓦西里斯 的 船只 中 有 ４０％ 驶 向 远东 ， 每个 月 几乎 都 有 两三条 船 停靠 中国 港口 。
他 感受 到 了 中国 经济 发展 的 大潮 。
他 要 与 中国人 合作 。
他 来到 中国 ， 成为 第一个 访 华 的 大 船主 。


In [18]:
print 'Gold:\n'
!head data/gold/msr_test_gold.utf8

Gold:

扬帆  远东  做  与  中国  合作  的  先行  
希腊  的  经济  结构  较  特殊  。
海运  业  雄踞  全球  之  首  ，  按  吨位  计  占  世界  总数  的  １７％  。
另外  旅游  、  侨汇  也是  经济  收入  的  重要  组成部分  ，  制造业  规模  相对  较小  。
多年来  ，  中  希  贸易  始终  处于  较低  的  水平  ，  希腊  几乎  没有  在  中国  投资  。
十几年  来  ，  改革开放  的  中国  经济  高速  发展  ，  远东  在  崛起  。
瓦西里斯  的  船只  中  有  ４０％  驶  向  远东  ，  每个  月  几乎  都  有  两三条  船  停靠  中国  港口  。
他  感受  到  了  中国  经济  发展  的  大潮  。
他  要  与  中国人  合作  。
他  来到  中国  ，  成为  第一个  访  华  的  大  船主  。


Calculate accuracy using MSR gold set

In [9]:
from itertools import izip

def tag_for_sentence(sentence):
    words = sentence.decode('utf-8').strip().split()
    tags = []
    for word in words:
        if len(word) > 1:
            tags.append('b')
            for char in word[1:(len(word) - 1)]:
                tags.append('m')
            tags.append('e')
        else: 
            tags.append('s')
            
    return tags


def tag_for_file(input_path, output_path):
    print 'Tagging for %s and output to %s...' % (input_path, output_path)
    start = time.time()
    
    with open(output_path, "w+") as output_file:
        for line in open(input_path, "r").readlines():
            tags = tag_for_sentence(line)
            for tag in tags:
                output_file.writelines(tag + StringUtil.NEWLINE)
    
    print 'Done. Total time taken %d seconds' % (time.time() - start)
    
    
def print_model_accuracy(result_file_path, gold_standard_file_path):
    count = 0
    correct = 0
    with open(gold_standard_file_path, 'r') as expect_file, open(result_file_path, 'r') as result_file:
        for expect_line, result_line in izip(expect_file, result_file):
            count += 1
            if expect_line.strip() == result_line.strip():
                correct +=1

    print "Accuracy: {0}".format(float(correct)/count)

In [19]:
tag_for_file("data/result/msr_rnn.utf8", "data/result/msr_rnn_tag.utf8")

Tagging for data/result/msr_rnn.utf8 and output to data/result/msr_rnn_tag.utf8...
Done. Total time taken 0 seconds


In [20]:
tag_for_file("data/gold/msr_test_gold.utf8", "data/gold/tag_msr.utf8")

Tagging for data/gold/msr_test_gold.utf8 and output to data/gold/tag_msr.utf8...
Done. Total time taken 0 seconds


In [21]:
print_model_accuracy("data/result/msr_rnn_tag.utf8", "data/gold/tag_msr.utf8")

Accuracy: 0.956865829514


### Visualization

In [163]:
from keras.utils.visualize_util import plot

model = loadModel(output_keras_model_file, output_keras_model_weights_file)
plot(model, to_file='img/rnn_model.png', show_shapes=True)

![RNN model diagram](img/rnn_model.png)

### Test MSR trained model against PKU test set

In [None]:
test_file = 'data/testing/pku_test.utf8'
result_file = 'data/result/pku_rnn.utf8'
test()

In [24]:
tag_for_file("data/result/pku_rnn.utf8", "data/result/pku_rnn_tag.utf8")

Tagging for data/result/pku_rnn.utf8 and output to data/result/pku_rnn_tag.utf8...
Done. Total time taken 0 seconds


In [25]:
tag_for_file("data/gold/pku_test_gold.utf8", "data/gold/tag_pku.utf8")

Tagging for data/gold/pku_test_gold.utf8 and output to data/gold/tag_pku.utf8...
Done. Total time taken 0 seconds


In [26]:
print_model_accuracy("data/result/pku_rnn_tag.utf8", "data/gold/tag_pku.utf8")

Accuracy: 0.842629954901


### Train model with PKU corpus and test against PKU test set

In [23]:
input_file = 'data/training/pku_training.utf8'
test_file = 'data/testing/pku_test.utf8'
result_file = 'data/result/pku_rnn.utf8'
cws_info_filePath = 'rnn/cws_pku.info'
cws_data_filePath = 'rnn/cws_pku.data'
output_model_file = 'rnn/pku_training_word2vec.model'
output_vector_file = 'rnn/pku_training_word2vec.vector'
output_keras_model_file = 'rnn/pku_cws_keras_model'
output_keras_model_weights_file = 'rnn/pku_keras_model_weights'

In [22]:
preProcess()

train from file data/training/pku_training.utf8
Total characters:  1826448 1826448
Total vocab:  4701 4701
Init prob:  [0.28729041286694174, 0.3204175536341577, 0.07187447986474294, 0.3204175536341577]
Transition prob:  [[0.12090999527496495, 0.16095605053463716, 0.0, 0.0], [0.0, 0.0, 0.047525332052668096, 0.2762701436432787], [0.0, 0.0, 0.025106866571428254, 0.047525332052668096], [0.16555659695672334, 0.15614968291363146, 0.0, 0.0]]
save cws info to rnn/cws_pku.info
save cws data to rnn/cws_pku.data
used time : 7 s


In [24]:
run_word2vec()

used time : 8 s


In [25]:
start_train()

Loading vocab...
load cws info from rnn/cws_pku.info
load cws data from rnn/cws_pku.data
('Loading used time : ', 0.09951400756835938)
Done!
Training model...
Train on 1643803 samples, validate on 182645 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
('Training used time : ', 3642.9727289676666)
Done!


In [None]:
test()

In [31]:
tag_for_file("data/result/pku_rnn.utf8", "data/result/pku_rnn_tag.utf8")

Tagging for data/result/pku_rnn.utf8 and output to data/result/pku_rnn_tag.utf8...
Done. Total time taken 0 seconds


In [30]:
tag_for_file("data/gold/pku_test_gold.utf8", "data/gold/tag_pku.utf8")

Tagging for data/gold/pku_test_gold.utf8 and output to data/gold/tag_pku.utf8...
Done. Total time taken 0 seconds


In [32]:
print_model_accuracy("data/result/pku_rnn_tag.utf8", "data/gold/tag_pku.utf8")

Accuracy: 0.9107350651


### Test PKU trained model against MSR test set

In [None]:
test_file = 'data/testing/msr_test.utf8'
result_file = 'data/result/msr_rnn.utf8'
test()

In [34]:
tag_for_file("data/result/msr_rnn.utf8", "data/result/msr_rnn_tag.utf8")

Tagging for data/result/msr_rnn.utf8 and output to data/result/msr_rnn_tag.utf8...
Done. Total time taken 0 seconds


In [35]:
tag_for_file("data/gold/msr_test_gold.utf8", "data/gold/tag_msr.utf8")

Tagging for data/gold/msr_test_gold.utf8 and output to data/gold/tag_msr.utf8...
Done. Total time taken 0 seconds


In [36]:
print_model_accuracy("data/result/msr_rnn_tag.utf8", "data/gold/tag_msr.utf8")

Accuracy: 0.869051558135


### Train model with combined corpus and test again combined test set

In [10]:
input_file = 'data/training/data.utf8'
test_file = 'data/testing/combined/test.utf8'
result_file = 'data/result/combined_rnn.utf8'
cws_info_filePath = 'rnn/cws_combined.info'
cws_data_filePath = 'rnn/cws_combined.data'
output_model_file = 'rnn/combined_training_word2vec.model'
output_vector_file = 'rnn/combined_training_word2vec.vector'
output_keras_model_file = 'rnn/combined_cws_keras_model'
output_keras_model_weights_file = 'rnn/combined_keras_model_weights'

In [38]:
preProcess()

train from file data/training/data.utf8
Total characters:  5880019 5880019
Total vocab:  5416 5416
Init prob:  [0.2786111745557285, 0.31317602885296797, 0.09503676773833554, 0.31317602885296797]
Transition prob:  [[0.1118399027631214, 0.1543448400321375, 0.0, 0.0], [0.0, 0.0, 0.05218757863996186, 0.2667420744340824], [0.0, 0.0, 0.04459519070250285, 0.05218757863996186], [0.16610093521911637, 0.15200189956911578, 0.0, 0.0]]
save cws info to rnn/cws_combined.info
save cws data to rnn/cws_combined.data
used time : 24 s


In [18]:
run_word2vec()

used time : 27 s


In [None]:
start_train()

Loading vocab...
load cws info from rnn/cws_combined.info
load cws data from rnn/cws_combined.data
('Loading used time : ', 0.22170186042785645)
Done!
Training model...
Train on 5292017 samples, validate on 588002 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5

In [None]:
test()

In [20]:
tag_for_file("data/result/combined_rnn.utf8", "data/result/combined_rnn_tag.utf8")

Tagging for data/result/combined_rnn.utf8 and output to data/result/combined_rnn_tag.utf8...
Done. Total time taken 0 seconds


In [22]:
tag_for_file("data/gold/combined/test.utf8", "data/gold/combined/tag_combined.utf8")

Tagging for data/gold/combined/test.utf8 and output to data/gold/combined/tag_combined.utf8...
Done. Total time taken 0 seconds


In [23]:
print_model_accuracy("data/result/combined_rnn_tag.utf8", "data/gold/combined/tag_combined.utf8")

Accuracy: 0.905878498156


## Demo

In [23]:
cwsInfo = loadCwsInfo(cws_info_filePath)
model = loadModel(output_keras_model_file, output_keras_model_weights_file)

load cws info from rnn/cws_combined.info


In [24]:
s = u'上海自来水来自海上'
print cwsSent(s, model, cwsInfo)

上海 自来水 来自 海上


In [25]:
s = u'星期天我们去吃重庆火锅'
print cwsSent(s, model, cwsInfo)

星期天 我们 去 吃 重庆 火锅


In [26]:
s = u'权力的游戏开播了'
print cwsSent(s, model, cwsInfo)

权力 的 游戏 开播 了
