This is the code version of [sequence labeling - NER](https://github.com/gaoisbest/NLP-Projects/tree/master/Sequence%20labeling%20-%20NER); please follow the link for more details.

In [None]:
# %load conlleval.py
# Python version of the evaluation script from CoNLL'00-
# Originates from: https://github.com/spyysalo/conlleval.py


# Intentional differences:
# - accept any space as delimiter by default
# - optional file argument (default STDIN)
# - option to set boundary (-b argument)
# - LaTeX output (-l argument) not supported
# - raw tags (-r argument) not supported

import sys
import re
import codecs
from collections import defaultdict, namedtuple

# Sentinel delimiter value: when the delimiter option equals this, input lines
# are split on any run of whitespace instead of on a specific character.
ANY_SPACE = '<SPACE>'


class FormatError(Exception):
    """Raised when an input line does not have the expected number of columns."""

# Result record: true positives, false positives, false negatives,
# precision, recall and F-score.
Metrics = namedtuple('Metrics', 'tp fp fn prec rec fscore')


class EvalCounts(object):
    """Mutable accumulator for the tallies gathered during an evaluation run."""

    def __init__(self):
        # number of correctly identified chunks
        self.correct_chunk = 0
        # number of correct chunk tags
        self.correct_tags = 0
        # number of chunks in corpus
        self.found_correct = 0
        # number of identified chunks
        self.found_guessed = 0
        # token counter (ignores sentence breaks)
        self.token_counter = 0

        # The same chunk tallies broken down by chunk type; missing types
        # read as zero.
        self.t_correct_chunk = defaultdict(int)
        self.t_found_correct = defaultdict(int)
        self.t_found_guessed = defaultdict(int)


def parse_args(argv):
    """Parse the evaluation command-line options.

    Args:
        argv: list of argument strings (without the program name).

    Returns:
        The argparse namespace with attributes ``boundary``, ``delimiter``,
        ``otag`` and ``file``.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='evaluate tagging results using CoNLL criteria',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument('-b', '--boundary', metavar='STR', default='-X-',
                        help='sentence boundary')
    parser.add_argument('-d', '--delimiter', metavar='CHAR',
                        default=ANY_SPACE,
                        help='character delimiting items in input')
    parser.add_argument('-o', '--otag', metavar='CHAR', default='O',
                        help='alternative outside tag')
    # Optional positional input file; None when not supplied.
    parser.add_argument('file', nargs='?', default=None)

    namespace = parser.parse_args(argv)
    return namespace


def parse_tag(t):
    """Split a tag such as ``'B-PER'`` into ``(prefix, type)`` at the first hyphen.

    A tag without a hyphen is returned as ``(t, '')``.
    """
    match = re.match(r'^([^-]*)-(.*)$', t)
    if not match:
        return (t, '')
    return match.groups()


def evaluate(iterable_sss, options=None):
    """Accumulate chunk-level and token-level counts over tagged lines.

    Each non-empty line must contain at least three delimiter-separated
    columns: the token first, the gold (corpus) tag second to last and the
    predicted tag last.  Empty lines, and lines whose first column equals
    ``options.boundary``, are treated as sentence boundaries.

    Args:
        iterable_sss: iterable of text lines (e.g. an open file).
        options: namespace as returned by ``parse_args``; the defaults are
            used when it is None.

    Returns:
        An ``EvalCounts`` instance holding the accumulated tallies.

    Raises:
        FormatError: if a non-empty line has a different number of columns
            than the first non-empty line, or fewer than three columns.
    """
    if options is None:
        options = parse_args([])    # use defaults

    counts = EvalCounts()
    num_features = None       # number of features per line
    in_correct = False        # currently processed chunk is correct until now
    last_correct = 'O'        # previous chunk tag in corpus
    last_correct_type = ''    # type of previous chunk tag in corpus
    last_guessed = 'O'        # previously identified chunk tag
    last_guessed_type = ''    # type of previously identified chunk tag

    for line in iterable_sss:
        line = line.rstrip('\r\n')

        if options.delimiter == ANY_SPACE:
            features = line.split()
        else:
            features = line.split(options.delimiter)

        # Only non-empty lines take part in the column-count check.  Taking
        # the reference count from a leading blank line would fix it at 0
        # and make every following line fail.
        if len(features) != 0:
            if num_features is None:
                num_features = len(features)
            elif num_features != len(features):
                raise FormatError('unexpected number of features: %d (%d)' %
                                  (len(features), num_features))

        if len(features) == 0 or features[0] == options.boundary:
            features = [options.boundary, 'O', 'O']
        if len(features) < 3:
            raise FormatError('unexpected number of features in line %s' % line)

        # Last column is the prediction, the one before it the gold tag.
        guessed, guessed_type = parse_tag(features.pop())
        correct, correct_type = parse_tag(features.pop())
        first_item = features.pop(0)

        if first_item == options.boundary:
            guessed = 'O'

        end_correct = end_of_chunk(last_correct, correct,
                                   last_correct_type, correct_type)
        end_guessed = end_of_chunk(last_guessed, guessed,
                                   last_guessed_type, guessed_type)
        start_correct = start_of_chunk(last_correct, correct,
                                       last_correct_type, correct_type)
        start_guessed = start_of_chunk(last_guessed, guessed,
                                       last_guessed_type, guessed_type)

        if in_correct:
            # A chunk counts as correct only when gold and predicted chunks
            # end at the same position with the same type.
            if (end_correct and end_guessed and
                last_guessed_type == last_correct_type):
                in_correct = False
                counts.correct_chunk += 1
                counts.t_correct_chunk[last_correct_type] += 1
            elif (end_correct != end_guessed or guessed_type != correct_type):
                in_correct = False

        if start_correct and start_guessed and guessed_type == correct_type:
            in_correct = True

        if start_correct:
            counts.found_correct += 1
            counts.t_found_correct[correct_type] += 1
        if start_guessed:
            counts.found_guessed += 1
            counts.t_found_guessed[guessed_type] += 1
        if first_item != options.boundary:
            if correct == guessed and guessed_type == correct_type:
                counts.correct_tags += 1
            counts.token_counter += 1

        last_guessed = guessed
        last_correct = correct
        last_guessed_type = guessed_type
        last_correct_type = correct_type

    # A chunk still open at the end of the input is counted as correct.
    if in_correct:
        counts.correct_chunk += 1
        counts.t_correct_chunk[last_correct_type] += 1

    return counts


def uniq(iterable):
    """Return the distinct items of `iterable` as a list, in first-seen order."""
    seen = set()
    unique = []
    for item in iterable:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique


def calculate_metrics(correct, guessed, total):
    """Derive precision, recall and F-score from raw chunk counts.

    Args:
        correct: number of correctly identified chunks (true positives).
        guessed: number of chunks the tagger produced.
        total: number of chunks in the reference.

    Returns:
        A ``Metrics`` tuple; any ratio whose denominator is zero is 0.
    """
    tp = correct
    fp = guessed - correct
    fn = total - correct

    if tp + fp == 0:
        precision = 0
    else:
        precision = 1.*tp / (tp + fp)

    if tp + fn == 0:
        recall = 0
    else:
        recall = 1.*tp / (tp + fn)

    if precision + recall == 0:
        fscore = 0
    else:
        fscore = 2 * precision * recall / (precision + recall)

    return Metrics(tp, fp, fn, precision, recall, fscore)


def metrics(counts):
    """Compute overall and per-type metrics from accumulated counts.

    Returns:
        ``(overall, by_type)`` where ``overall`` is a ``Metrics`` tuple and
        ``by_type`` maps each chunk type seen in the reference or in the
        predictions to its own ``Metrics`` tuple.
    """
    overall = calculate_metrics(
        counts.correct_chunk, counts.found_guessed, counts.found_correct
    )
    chunk_types = uniq(
        list(counts.t_found_correct) + list(counts.t_found_guessed)
    )
    by_type = {
        chunk_type: calculate_metrics(
            counts.t_correct_chunk[chunk_type],
            counts.t_found_guessed[chunk_type],
            counts.t_found_correct[chunk_type],
        )
        for chunk_type in chunk_types
    }
    return overall, by_type


def report(counts, out=None):
    """Write the evaluation summary for `counts` to a text stream.

    Args:
        counts: accumulated tallies, as returned by ``evaluate``.
        out: object with a ``write`` method; ``sys.stdout`` when None.
    """
    stream = sys.stdout if out is None else out
    overall, by_type = metrics(counts)

    stream.write('processed %d tokens with %d phrases; ' %
                 (counts.token_counter, counts.found_correct))
    stream.write('found: %d phrases; correct: %d.\n' %
                 (counts.found_guessed, counts.correct_chunk))

    # The overall line is only written when at least one token was seen.
    if counts.token_counter > 0:
        accuracy = 100.*counts.correct_tags/counts.token_counter
        stream.write('accuracy: %6.2f%%; ' % accuracy)
        stream.write('precision: %6.2f%%; ' % (100.*overall.prec))
        stream.write('recall: %6.2f%%; ' % (100.*overall.rec))
        stream.write('FB1: %6.2f\n' % (100.*overall.fscore))

    # One line per chunk type, in sorted order.
    for chunk_type, scores in sorted(by_type.items()):
        stream.write('%17s: ' % chunk_type)
        stream.write('precision: %6.2f%%; ' % (100.*scores.prec))
        stream.write('recall: %6.2f%%; ' % (100.*scores.rec))
        stream.write('FB1: %6.2f  %d\n' %
                     (100.*scores.fscore, counts.t_found_guessed[chunk_type]))


def report_notprint(counts, out=None):
    """Build the evaluation summary as a list of lines instead of printing it.

    Args:
        counts: accumulated tallies, as returned by ``evaluate``.
        out: accepted for signature compatibility; nothing is written to it.

    Returns:
        A list of newline-terminated strings: the totals line, the overall
        line (only when at least one token was seen) and one line per chunk
        type in sorted order.
    """
    overall, by_type = metrics(counts)

    totals = [
        'processed %d tokens with %d phrases; ' %
        (counts.token_counter, counts.found_correct),
        'found: %d phrases; correct: %d.\n' %
        (counts.found_guessed, counts.correct_chunk),
    ]
    final_report = ["".join(totals)]

    if counts.token_counter > 0:
        summary = [
            'accuracy: %6.2f%%; ' %
            (100.*counts.correct_tags/counts.token_counter),
            'precision: %6.2f%%; ' % (100.*overall.prec),
            'recall: %6.2f%%; ' % (100.*overall.rec),
            'FB1: %6.2f\n' % (100.*overall.fscore),
        ]
        final_report.append("".join(summary))

    for chunk_type, scores in sorted(by_type.items()):
        row = [
            '%17s: ' % chunk_type,
            'precision: %6.2f%%; ' % (100.*scores.prec),
            'recall: %6.2f%%; ' % (100.*scores.rec),
            'FB1: %6.2f  %d\n' %
            (100.*scores.fscore, counts.t_found_guessed[chunk_type]),
        ]
        final_report.append("".join(row))
    return final_report


def end_of_chunk(prev_tag, tag, prev_type, type_):
    """Tell whether a chunk ended between the previous and the current word.

    Args:
        prev_tag, tag: previous and current chunk tags (e.g. 'B', 'I', 'O').
        prev_type, type_: previous and current chunk types.

    Returns:
        True if a chunk boundary lies between the two words, else False.
    """
    # 'E' and 'S' always close a chunk; '[' and ']' chunks are assumed to
    # have length 1.
    if prev_tag in ('E', 'S', ']', '['):
        return True

    # An open chunk is closed by a new chunk start or an outside tag.
    if prev_tag in ('B', 'I') and tag in ('B', 'S', 'O'):
        return True

    # A change of type closes whatever chunk was open.
    if prev_tag not in ('O', '.') and prev_type != type_:
        return True

    return False


def start_of_chunk(prev_tag, tag, prev_type, type_):
    """Tell whether a chunk started between the previous and the current word.

    Args:
        prev_tag, tag: previous and current chunk tags (e.g. 'B', 'I', 'O').
        prev_type, type_: previous and current chunk types.

    Returns:
        True if the current word opens a new chunk, else False.
    """
    # 'B' and 'S' always open a chunk; '[' and ']' chunks are assumed to
    # have length 1.
    if tag in ('B', 'S', '[', ']'):
        return True

    # An inside/end tag with no open chunk before it starts a new chunk.
    if prev_tag in ('E', 'S', 'O') and tag in ('E', 'I'):
        return True

    # A change of type starts a new chunk.
    if tag not in ('O', '.') and prev_type != type_:
        return True

    return False


def return_report(input_file):
    """Evaluate a UTF-8 tagged file and return the report as a list of lines.

    Args:
        input_file: path of the file to evaluate with the default options.
    """
    with codecs.open(input_file, mode="r", encoding="utf8") as tagged_file:
        counts = evaluate(tagged_file)
    report_lines = report_notprint(counts)
    return report_lines


def main(argv):
    """Command-line entry point: evaluate a file or stdin and print the report.

    Args:
        argv: full argument vector; the first element (program name) is
            skipped.
    """
    args = parse_args(argv[1:])

    if args.file is not None:
        # Note: the file is opened without an explicit encoding.
        with open(args.file) as f:
            counts = evaluate(f, args)
    else:
        counts = evaluate(sys.stdin, args)

    report(counts)

# Run the evaluation when executed as a script; main() returns None, so the
# process exits with status 0 unless an exception propagates.
if __name__ == '__main__':
    sys.exit(main(sys.argv))

In [None]:
# %load utils_io.py

import os
import re
import codecs
import shutil
import jieba as jie
import random
import math
import logging
import numpy as np
import json


def iob2(tags):
    """Validate IOB tags and rewrite IOB1 tags to IOB2 in place.

    Args:
        tags: list of tag strings; modified in place.

    Returns:
        False as soon as a tag is neither 'O' nor of the form 'I-x' / 'B-x';
        True otherwise.
    """
    for idx, tag in enumerate(tags):
        if tag == 'O':
            continue

        parts = tag.split('-')
        if len(parts) != 2 or parts[0] not in ['I', 'B']:
            return False
        if parts[0] == 'B':
            continue

        # An 'I-' tag only continues a chunk when the previous tag exists,
        # is not 'O' and carries the same type; otherwise it opens a chunk
        # and becomes 'B-'.
        continues_chunk = (
            idx != 0
            and tags[idx - 1] != 'O'
            and tags[idx - 1][1:] == tag[1:]
        )
        if not continues_chunk:
            tags[idx] = 'B' + tag[1:]
    return True

def iob_iobes(tags):
    """Convert a sequence of IOB tags to the IOBES scheme.

    Returns:
        A new list of tags; the input list is left untouched.

    Raises:
        Exception: if a tag is not 'O' and does not start with 'B' or 'I'.
    """
    converted = []
    last_index = len(tags) - 1
    for pos, tag in enumerate(tags):
        if tag == 'O':
            converted.append(tag)
            continue

        prefix = tag.split('-')[0]
        if prefix not in ('B', 'I'):
            raise Exception('Invalid IOB format!')

        # The chunk goes on only if the next tag is an inside tag.
        chunk_continues = (
            pos < last_index and tags[pos + 1].split('-')[0] == 'I'
        )
        if chunk_continues:
            converted.append(tag)
        elif prefix == 'B':
            converted.append(tag.replace('B-', 'S-'))
        else:
            converted.append(tag.replace('I-', 'E-'))
    return converted

def full_to_half(s):
    """Return `s` with full-width characters replaced by half-width ones.

    The ideographic space U+3000 becomes an ASCII space and code points
    U+FF01..U+FF5E are shifted down by 0xFEE0; everything else is kept.
    """
    def _narrow(char):
        code = ord(char)
        if code == 0x3000:
            code = 32
        elif 0xFF01 <= code <= 0xFF5E:
            code -= 0xfee0
        return chr(code)

    return ''.join(_narrow(char) for char in s)

def replace_html(s):
    """Replace a fixed set of HTML entities in `s` with plain characters.

    The substitutions are applied one after another in the order listed;
    ``&mdash;`` is removed and a non-breaking space becomes a normal space.
    """
    substitutions = (
        ('&quot;', '"'),
        ('&amp;', '&'),
        ('&lt;', '<'),
        ('&gt;', '>'),
        ('&nbsp;', ' '),
        ("&ldquo;", "“"),
        ("&rdquo;", "”"),
        ("&mdash;", ""),
        ("\xa0", " "),
    )
    for entity, replacement in substitutions:
        s = s.replace(entity, replacement)
    return s

def input_from_line(line, char_to_id):
    """Turn one raw sentence into the model input structure.

    The line is first normalised (full-width to half-width characters, HTML
    entities replaced).  Spaces are then mapped to '$' before the character
    ids and segmentation features are computed, matching what
    ``load_sentences`` does for a leading space in the training data.

    Args:
        line: the raw sentence string.
        char_to_id: mapping from character to id; must contain '<UNK>' when
            the line has a character that is not in the mapping.

    Returns:
        A list of four single-element batches:
        ``[[normalised line], [char ids], [segmentation features], [[]]]``.
    """
    line = full_to_half(line)
    line = replace_html(line)
    inputs = list()
    # The displayed string keeps its spaces; only the features use '$'.
    inputs.append([line])
    # str.replace returns a new string, so the result must be re-bound;
    # calling it as a bare statement leaves the spaces in place.
    line = line.replace(' ', '$')
    inputs.append([[char_to_id[char] if char in char_to_id else char_to_id['<UNK>'] for char in line]])
    inputs.append([get_seg_features(line)])
    # Empty tag list.
    inputs.append([[]])
    return inputs


def get_seg_features(string):
    """Encode the jieba word segmentation of `string`, one code per character.

    Codes follow a BIES layout: S (single-character word) = 0, B = 1,
    I = 2, E = 3.  For example, words of lengths 1, 1, 1, 3, 2 are encoded
    as [0, 0, 0, 1, 2, 3, 1, 3].
    """
    seg_feature = []

    for word in jie.cut(string):
        length = len(word)
        if length == 1:
            seg_feature.append(0)
            continue
        codes = [2] * length
        codes[0], codes[-1] = 1, 3
        seg_feature.extend(codes)
    return seg_feature


def result_to_json(string, tags):
    """Collect the entities described by IOBES `tags` over `string`.

    Args:
        string: the sentence, one character per tag.
        tags: tag strings such as 'B-ORG', 'I-ORG', 'E-ORG', 'S-ORG', 'O'.

    Returns:
        ``{'string': string, 'entities': [...]}`` where every entity is a
        dict with keys 'word', 'start', 'end' (exclusive) and 'type', and
        ``word == string[start:end]`` always holds, also for malformed tag
        sequences.
    """
    item = {'string': string, 'entities': []}
    entity_name = ""
    entity_start = 0
    for idx, (char, tag) in enumerate(zip(string, tags)):
        prefix = tag[0]
        if prefix == 'S':
            item['entities'].append({'word': char, 'start': idx, 'end': idx+1, 'type':tag[2:]})
            # A single-character entity ends any unfinished one.
            entity_name = ""
        elif prefix == 'B':
            # Start afresh: characters of an unfinished entity must not leak
            # into the new one.
            entity_name = char
            entity_start = idx
        elif prefix == 'I':
            if not entity_name:
                # No open entity (missing 'B'): the span starts here.
                entity_start = idx
            entity_name += char
        elif prefix == 'E':
            if not entity_name:
                entity_start = idx
            entity_name += char
            item['entities'].append({'word': entity_name, 'start': entity_start, 'end': idx + 1, 'type': tag[2:]})
            entity_name = ""
        else:
            entity_name = ""
            entity_start = idx
    return item

def save_config(config, config_file):
    """Write `config` to `config_file` as UTF-8 JSON.

    The output is indented by four spaces and non-ASCII characters are
    written as-is rather than escaped.
    """
    with open(config_file, mode='w', encoding='utf8') as handle:
        json.dump(config, handle, ensure_ascii=False, indent=4)


def load_config(config_file):
    """Read a UTF-8 JSON file and return the decoded object."""
    with open(config_file, encoding='utf8') as handle:
        config = json.load(handle)
    return config

def load_word2vec(emb_path, id_to_word, word_dim, old_weights):
    """Initialise embedding rows from a pre-trained text embedding file.

    Each valid line of the file has ``word_dim + 1`` whitespace-separated
    fields: the word followed by its vector.  Other lines are counted as
    invalid and skipped.

    Args:
        emb_path: path of the UTF-8 embedding file.
        id_to_word: mapping with keys ``0 .. len(id_to_word) - 1``.
        word_dim: expected vector length.
        old_weights: indexable weight table; rows are overwritten in place.

    Returns:
        The same object as `old_weights`, with a row replaced for every word
        found directly, after lowercasing, or after lowercasing and
        replacing digits by '0'.
    """
    new_weights = old_weights
    print('Loading pretrained embeddings from {}...'.format(emb_path))
    pre_trained = {}
    emb_invalid = 0
    # Use a context manager so the file is closed even if parsing fails.
    with codecs.open(emb_path, 'r', 'utf-8') as emb_file:
        for line in emb_file:
            # line format: [character_a, 0.0824, -0.335, ..., 'word_dim' embeddings]
            fields = line.rstrip().split()
            if len(fields) == word_dim + 1:  # valid embedding
                pre_trained[fields[0]] = np.array([float(x) for x in fields[1:]]).astype(np.float32)
            else:
                emb_invalid += 1
    if emb_invalid > 0:
        print('WARNING: %i invalid lines' % emb_invalid)

    c_found = 0
    c_lower = 0
    c_zeros = 0
    n_words = len(id_to_word)
    # Lookup table initialization
    for i in range(n_words):
        word = id_to_word[i]
        lowered = word.lower()
        zeroed = re.sub(r'\d', '0', lowered)
        if word in pre_trained:
            new_weights[i] = pre_trained[word]
            c_found += 1
        elif lowered in pre_trained:
            new_weights[i] = pre_trained[lowered]
            c_lower += 1
        elif zeroed in pre_trained:
            new_weights[i] = pre_trained[zeroed]
            c_zeros += 1

    n_initialized = c_found + c_lower + c_zeros
    # Guard against an empty vocabulary, which would divide by zero.
    coverage = 100. * n_initialized / n_words if n_words else 0.
    print('Loaded %i pre-trained embeddings.' % len(pre_trained))
    print('%i / %i (%.4f%%) words have been initialized with pretrained embeddings.' % (n_initialized, n_words,
        coverage))
    print('%i found directly, %i after lowercasing, %i after lowercasing + zero.' % (c_found, c_lower, c_zeros))
    return new_weights


def make_path(params):
    """Create the summary and checkpoint directories if they are missing.

    Args:
        params: object with ``summary_dir`` and ``ckpt_dir`` attributes.
    """
    for attribute in ('summary_dir', 'ckpt_dir'):
        if not os.path.isdir(getattr(params, attribute)):
            os.makedirs(getattr(params, attribute))


def tag_mapping(sentences):
    """Count the tags in `sentences` and build tag <-> id mappings.

    Each sentence is a list of tokens whose last element is the tag.

    Returns:
        ``(dico, tag_to_id, id_to_tag)`` where ``dico`` maps tag to count.
    """
    tag_sequences = []
    for sentence in sentences:
        # Keep only the last field (the tag) of every token.
        tag_sequences.append([token[-1] for token in sentence])

    dico = create_dictionary(tag_sequences)
    tag_to_id, id_to_tag = create_mapping(dico)
    print('Found %i unique named entity tags.' % len(dico))
    return dico, tag_to_id, id_to_tag

def char_mapping(sentences, lower):
    """Count the characters in `sentences` and build char <-> id mappings.

    Each sentence is a list of tokens whose first element is the character.

    Args:
        sentences: list of sentences.
        lower: when true, characters are lowercased before counting.

    Returns:
        ``(dico, char_to_id, id_to_char)`` where ``dico`` maps character to
        count and also holds the '<PAD>' and '<UNK>' symbols.
    """
    if lower:
        chars = [[token[0].lower() for token in sentence]
                 for sentence in sentences]
    else:
        chars = [[token[0] for token in sentence] for sentence in sentences]

    dico = create_dictionary(chars)
    # The two special symbols are given very large counts.
    dico['<PAD>'] = 10000001
    dico['<UNK>'] = 10000000
    char_to_id, id_to_char = create_mapping(dico)

    total_chars = sum(len(sentence_chars) for sentence_chars in chars)
    print('Found %i unique words (%i in total).' % (len(dico), total_chars))
    return dico, char_to_id, id_to_char

def augment_with_pretrained(dictionary, ext_emb_path, chars):
    """Add characters that have a pre-trained embedding to `dictionary`.

    Args:
        dictionary: mapping from character to count; extended in place with
            new characters at count 0.
        ext_emb_path: path of the UTF-8 embedding file; the first
            whitespace-separated field of each line is the character.
        chars: None to add every character in the embedding file, or an
            iterable of characters to add only those that have an embedding
            (directly, lowercased, or lowercased with digits replaced
            by '0').

    Returns:
        ``(dictionary, char_to_id, id_to_char)`` rebuilt from the augmented
        dictionary.
    """
    print('Loading pretrained embeddings from %s...' % ext_emb_path)
    assert os.path.isfile(ext_emb_path)

    # 'pretrained' contains characters that have pre-trained embedding vector
    # line format: character_a 0.0824 -0.335 ... 100 embeddings
    pretrained = set()
    with codecs.open(ext_emb_path, 'r', 'utf-8') as emb_file:
        for line in emb_file:
            fields = line.rstrip().split()
            # Skip blank lines: they have no first field to index.
            if fields:
                pretrained.add(fields[0])

    if chars is None: # add every character that has a pre-trained embedding to the dictionary
        for char in pretrained:
            if char not in dictionary:
                dictionary[char] = 0
    else: # add the words that are given by `chars` (typically the words in the development and test sets.)
        for char in chars:
            lowered = char.lower()
            candidates = (char, lowered, re.sub(r'\d', '0', lowered))
            if any(x in pretrained for x in candidates) and char not in dictionary:
                dictionary[char] = 0

    char_to_id, id_to_char = create_mapping(dictionary)
    return dictionary, char_to_id, id_to_char

def create_dictionary(item_list):
    """Count how often each item occurs across a list of sequences.

    Args:
        item_list: a list (exactly of type ``list``) of iterables of items.

    Returns:
        A dict mapping each item to its number of occurrences.
    """
    assert type(item_list) is list
    counts = {}
    for items in item_list:
        for item in items:
            counts[item] = counts.get(item, 0) + 1
    return counts

def create_mapping(dico):
    """Build item <-> id mappings from an item-count dictionary.

    Ids are assigned by decreasing count, with ties broken by the item in
    ascending order.

    Returns:
        ``(item_to_id, id_to_item)``.
    """
    ranked = sorted(dico.items(), key=lambda pair: (-pair[1], pair[0]))
    ordered_items = [item for item, _count in ranked]
    id_to_item = dict(enumerate(ordered_items))
    item_to_id = {item: index for index, item in enumerate(ordered_items)}
    return item_to_id, id_to_item

def zero_digits(s):
    """Return `s` with every digit replaced by '0'."""
    # Raw string: '\d' in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return re.sub(r'\d', '0', s)

def load_sentences(path, zeros):
    """Read character-based tagged sentences from a UTF-8 file.

    [1] A line must contain at least a character and its tag; lines with
        fewer than two whitespace-separated fields are ignored.
    [2] Sentences are separated by empty lines.
    [3] Sentences whose first field contains 'DOCSTART' are dropped.

    Args:
        path: path of the input file.
        zeros: when true, every digit is replaced by '0'.

    Returns:
        A list of sentences, each a list of ``[char, ..., tag]`` lists.
    """
    sentences = []
    sentence = []
    # The context manager closes the file; iterating over a bare
    # codecs.open() call would leave the handle open.
    with codecs.open(path, 'r', 'utf8') as data_file:
        for line in data_file:
            line = zero_digits(line.rstrip()) if zeros else line.rstrip()
            if not line:
                # Blank line: end of the current sentence.
                if len(sentence) > 0 and 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []
                continue
            if line[0] == " ":
                # The character itself is a space: represent it as '$' so
                # it survives the whitespace split below.
                line = "$" + line[1:]
            word = line.split()
            if len(word) >= 2:
                sentence.append(word)
    # Flush the last sentence when the file does not end with a blank line.
    if len(sentence) > 0 and 'DOCSTART' not in sentence[0][0]:
        sentences.append(sentence)
    return sentences

def clean(params, maps_file_name, config_file_name, results_file_path):
    """Remove the saved model, training logs and generated files.

    Args:
        params: object with ``data_dir``, ``ckpt_dir`` and ``summary_dir``
            attributes.
        maps_file_name: file name inside ``params.data_dir`` to delete.
        config_file_name: file name inside ``params.data_dir`` to delete.
        results_file_path: directory name inside ``params.data_dir`` to
            delete.

    Anything that does not exist is skipped.  ``__pycache__`` in the current
    working directory is removed as well.
    """
    def _remove_file(path):
        if os.path.isfile(path):
            os.remove(path)

    def _remove_tree(path):
        if os.path.isdir(path):
            shutil.rmtree(path)

    _remove_file(os.path.join(params.data_dir, maps_file_name))
    _remove_tree(params.ckpt_dir)
    _remove_tree(params.summary_dir)
    _remove_tree(os.path.join(params.data_dir, results_file_path))
    _remove_tree('__pycache__')
    _remove_file(os.path.join(params.data_dir, config_file_name))

In [None]:
# %load utils_train.py

import os
import re
import codecs
import shutil
import jieba as jie
import random
import math
import logging
import numpy as np
import json

def update_tag_scheme(sentences, tag_scheme):
    """Validate the tags of every sentence and rewrite them to `tag_scheme`.

    Args:
        sentences: list of sentences; each token's last element is its tag
            and is overwritten in place.
        tag_scheme: 'iob' (tags become IOB2) or 'iobes'.

    Raises:
        Exception: if a sentence is not in IOB format, or if `tag_scheme`
            is neither 'iob' nor 'iobes'.
    """
    for index, sentence in enumerate(sentences):
        tags = [token[-1] for token in sentence]

        # iob2 validates the tags and converts IOB1 to IOB2 in place.
        if not iob2(tags):
            s_str = '\n'.join(' '.join(token) for token in sentence)
            raise Exception('Sentences should be given in IOB format! ' +
                            'Please check sentence %i:\n%s' % (index, s_str))

        if tag_scheme == 'iob':
            final_tags = tags
        elif tag_scheme == 'iobes':
            final_tags = iob_iobes(tags)
        else:
            raise Exception('Unknown tagging scheme!')

        for token, final_tag in zip(sentence, final_tags):
            token[-1] = final_tag

def iob2(tags):
    """Check that `tags` are valid IOB tags, converting IOB1 to IOB2 in place.

    Returns:
        True when every tag is 'O', 'I-x' or 'B-x'; False at the first tag
        that is not.
    """
    for position, tag in enumerate(tags):
        if tag == 'O':
            continue

        pieces = tag.split('-')
        if len(pieces) != 2 or pieces[0] not in ['I', 'B']:
            return False
        if pieces[0] == 'B':
            continue

        # From here on the tag is 'I-x'.  It must become 'B-x' unless it
        # directly follows a non-'O' tag of the same type.
        follows_same_type = (
            position != 0
            and tags[position - 1] != 'O'
            and tags[position - 1][1:] == tag[1:]
        )
        if follows_same_type:
            continue
        tags[position] = 'B' + tag[1:]
    return True

def iobes_iob(tags):
    """Convert a sequence of IOBES tags to the IOB scheme.

    Returns:
        A new list in which 'S-' tags become 'B-' and 'E-' tags become 'I-'.

    Raises:
        Exception: if a tag's prefix is not one of B, I, S, E, O.
    """
    converted = []
    for tag in tags:
        prefix = tag.split('-')[0]
        if prefix in ('B', 'I', 'O'):
            converted.append(tag)
        elif prefix == 'S':
            converted.append(tag.replace('S-', 'B-'))
        elif prefix == 'E':
            converted.append(tag.replace('E-', 'I-'))
        else:
            raise Exception('Invalid format!')
    return converted

def iob_iobes(tags):
    """Return the IOBES version of a list of IOB tags.

    A 'B-' tag not followed by an 'I-' tag becomes 'S-'; an 'I-' tag not
    followed by an 'I-' tag becomes 'E-'.

    Raises:
        Exception: if a tag is not 'O' and does not start with 'B' or 'I'.
    """
    result = []
    count = len(tags)
    for index, tag in enumerate(tags):
        if tag == 'O':
            result.append(tag)
            continue

        head = tag.split('-')[0]
        if head != 'B' and head != 'I':
            raise Exception('Invalid IOB format!')

        next_is_inside = (
            index + 1 < count and tags[index + 1].split('-')[0] == 'I'
        )
        if next_is_inside:
            result.append(tag)
        elif head == 'B':
            result.append(tag.replace('B-', 'S-'))
        else:
            result.append(tag.replace('I-', 'E-'))
    return result

class BatchManager(object):
    """Splits examples into length-sorted, zero-padded batches.

    Each example is ``[string, chars, segs, tags]``: four parallel lists for
    one sentence.
    """

    def __init__(self, data,  batch_size):
        # All batches are built up front and kept in memory.
        self.batch_data = self.sort_and_pad(data, batch_size)
        self.len_data = len(self.batch_data)

    def sort_and_pad(self, data, batch_size):
        """Sort `data` by sentence length and cut it into padded batches."""
        num_batch = int(math.ceil(len(data) / batch_size))
        # Sorting by length keeps the amount of padding per batch small.
        by_length = sorted(data, key=lambda example: len(example[0]))
        return [
            self.pad_data(by_length[k * batch_size:(k + 1) * batch_size])
            for k in range(num_batch)
        ]

    @staticmethod
    def pad_data(data):
        """Pad every field of every example with zeros to the batch maximum.

        Returns:
            ``[strings, chars, segs, targets]``, each a list with one padded
            list per example.
        """
        max_length = max([len(example[0]) for example in data])
        strings, chars, segs, targets = [], [], [], []
        for example in data:
            string, char, seg, target = example
            filler = [0] * (max_length - len(string))
            # The character strings are padded with integer zeros as well.
            strings.append(string + filler)
            chars.append(char + filler)
            segs.append(seg + filler)
            targets.append(target + filler)
        return [strings, chars, segs, targets]

    def iter_batch(self, shuffle=False):
        """Yield the batches one by one, shuffling their order in place first
        when `shuffle` is true."""
        if shuffle:
            random.shuffle(self.batch_data)
        yield from self.batch_data


def prepare_dataset(sentences, char_to_id, tag_to_id, lower=False, train=True):
    """Convert sentences into model-ready examples.

    Args:
        sentences: list of sentences; each token's first element is the
            character and its last element is the tag.
        char_to_id: character-to-id mapping containing '<UNK>'.
        tag_to_id: tag-to-id mapping containing 'O'.
        lower: when true, characters are lowercased before the id lookup.
        train: when true the real tag ids are used; otherwise every tag id
            is the id of 'O'.

    Returns:
        A list of ``[string, chars, segs, tags]`` lists:
        - string: character list
        - chars: character id list
        - segs: word segmentation encoding, see get_seg_features
        - tags: tag id list
    """
    none_index = tag_to_id['O']

    def normalize(char):
        if lower:
            return char.lower()
        return char

    data = []
    for sentence in sentences:
        string = [token[0] for token in sentence]

        # Characters missing from the vocabulary fall back to '<UNK>'.
        chars = []
        for char in string:
            key = normalize(char)
            chars.append(char_to_id[key if key in char_to_id else '<UNK>'])

        segs = get_seg_features("".join(string))

        if train:
            tags = [tag_to_id[token[-1]] for token in sentence]
        else:
            tags = [none_index] * len(chars)

        data.append([string, chars, segs, tags])

    return data

def get_seg_features(string):
    """Return one segmentation code per character of `string`, using jieba.

    Codes follow a BIES layout: S (single-character word) = 0, B = 1,
    I = 2, E = 3.  For example, words of lengths 1, 1, 1, 3, 2 are encoded
    as [0, 0, 0, 1, 2, 3, 1, 3].
    """
    seg_feature = []

    for word in jie.cut(string):
        word_length = len(word)
        if word_length == 1:
            seg_feature.append(0)
            continue
        # Multi-character word: B at the start, E at the end, I in between.
        word_codes = [2] * word_length
        word_codes[0] = 1
        word_codes[-1] = 3
        seg_feature += word_codes
    return seg_feature


def zero_digits(s):
    """Return `s` with every digit replaced by '0'."""
    # Raw string: '\d' in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return re.sub(r'\d', '0', s)



In [None]:
# %load model.py
import numpy as np
import tensorflow as tf

from utils_io import result_to_json
from utils_train import iobes_iob


class BrandsNERModel:
    """
    Model for recognizing brands.

    Graph: character embedding (optionally concatenated with a word segmentation
    embedding) -> dropout -> bi-directional LSTM -> two fully connected layers ->
    CRF or softmax cost. The whole graph is built in __init__ with the TensorFlow 1.x API.
    """

    def __init__(self, config):
        """
        Build placeholders, layers, cost, train op and saver.
        :param config: mapping that provides 'learning_rate', 'char_dim', 'num_chars', 'word_dim',
                       'num_tags', 'num_units', 'max_gradient_norm' and 'use_crf'.
        """
        self.learning_rate = config['learning_rate']
        self.char_dims = config['char_dim']
        self.num_chars = config['num_chars']
        self.word_dims = config['word_dim']
        self.num_words_types = 4 # BIES
        self.num_tags = config['num_tags']
        self.rnn_units = config['num_units']
        self.max_gradient_norm = config['max_gradient_norm']

        # input placeholders, all of shape [batch_size, num_steps]
        self.char_inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name='char_inputs')
        self.word_inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name='word_inputs')
        self.tags = tf.placeholder(dtype=tf.int32, shape=[None, None], name='brand_tags')
        self.keep_prob = tf.placeholder(dtype=tf.float32, name='dropout_keep_prob')

        self.batch_size = tf.shape(input=self.char_inputs)[0]
        self.num_steps = tf.shape(input=self.char_inputs)[1]

        self.global_step = tf.Variable(initial_value=0, dtype=tf.int32, trainable=False, name='global_step')
        # model's best development F1 score
        self.best_dev_f1 = tf.Variable(initial_value=0.0, dtype=tf.float32, trainable=False, name='best_dev_f1')
        # model's best test F1 score
        self.best_test_f1 = tf.Variable(initial_value=0.0, dtype=tf.float32, trainable=False, name='best_test_f1')

        # use crf or not
        self.use_crf = config['use_crf']

        # model architecture
        # embedding layer
        char_word_embeddings = self.embedding_layer(self.char_inputs, self.word_inputs)

        # dropout
        # according to 'Neural Architectures for Named Entity Recognition' Section 4.3
        rnn_inputs = tf.nn.dropout(x=char_word_embeddings, keep_prob=self.keep_prob, name='lstm_inputs_dropout')

        # get the actual sequence length of this batch:
        # sign(abs(id)) is 1 for every non-zero character id, so id 0 is counted as padding
        self.seq_lengths = tf.cast(tf.reduce_sum(input_tensor=tf.sign(tf.abs(self.char_inputs)), axis=1), tf.int32)

        # bi-directional rnn layer
        rnn_outputs = self.birnn_layer(rnn_inputs, self.rnn_units, self.seq_lengths)

        self.logits = self.projection_layer(rnn_outputs)

        self.cost = self.cost_layer(self.logits, self.seq_lengths, self.use_crf)

        self.train_op = self.optimize(self.cost)

        self.saver = tf.train.Saver(tf.global_variables())

    def embedding_layer(self, char_inputs, word_inputs):
        """
        Character and word segmentation embedding.
        :param char_inputs: character id tensor.
        :param word_inputs: word segmentation feature id tensor, only used when self.word_dims > 0.
        :return: concatenated character and word segmentation embedding.
        """
        char_and_word_embeddings = []
        with tf.variable_scope('char_embedding_layer'), tf.device('/cpu:0'):
            self.char_embeddings = tf.get_variable(name='char_embeddings', shape=[self.num_chars, self.char_dims], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer())
            char_and_word_embeddings.append(tf.nn.embedding_lookup(params=self.char_embeddings, ids=char_inputs, name='char_embeddings_lookup'))
        if self.word_dims > 0:
            with tf.variable_scope('word_embedding_layer'), tf.device('/cpu:0'):
                self.word_embeddings = tf.get_variable(name='word_embeddings', shape=[self.num_words_types, self.word_dims], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer())
                char_and_word_embeddings.append(tf.nn.embedding_lookup(params=self.word_embeddings, ids=word_inputs, name='word_embeddings_loopup'))
        # shape of rtn_embeddings: [None, None, self.char_dims + self.word_dims]
        # axis=2 is also ok
        rtn_embeddings = tf.concat(values=char_and_word_embeddings, axis=-1)
        return rtn_embeddings

    def birnn_layer(self, rnn_inputs, rnn_num_units, seq_lengths):
        """
        Bi-directional LSTM model.
        :param rnn_inputs: embedded inputs.
        :param rnn_num_units: number of units of each direction's cell.
        :param seq_lengths: actual length of each sequence in the batch.
        :return: concatenated forward and backward outputs.
        """
        with tf.variable_scope('birnn_layer'):
            rnn_cells = {}
            for tmp in ['forward', 'backward']:
                # according to 'Neural Architectures for Named Entity Recognition' Section 2.1
                # CoupledInputForgetGateLSTMCell from paper 'LSTM: A Search Space Odyssey', where f = 1- i.
                rnn_cells[tmp] = tf.contrib.rnn.CoupledInputForgetGateLSTMCell(num_units=rnn_num_units, use_peepholes=True, initializer=tf.contrib.layers.xavier_initializer())
            # obtain 'contextual word representation' through bi-rnn according to https://guillaumegenthial.github.io/sequence-tagging-with-tensorflow.html
            outputs, output_states = tf.nn.bidirectional_dynamic_rnn(cell_fw=rnn_cells['forward'], cell_bw=rnn_cells['backward'], inputs=rnn_inputs, sequence_length=seq_lengths, dtype=tf.float32)
        # concatenate forward output and backward output: [None, None, self.rnn_units + self.rnn_units]
        return tf.concat(values=outputs, axis=-1)

    def projection_layer(self, rnn_outputs):
        """
        Two hidden fully connected layers for computing tag scores for each character.
        :param rnn_outputs: the outputs of birnn_layer
        :return: tag scores matrix P, reshaped to [-1, self.num_steps, self.num_tags]
        """
        with tf.variable_scope('projections'):
            project_W = tf.get_variable(name='project_W', shape=[2 * self.rnn_units, self.rnn_units], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer())
            # use tf.zeros_initializer()
            project_b = tf.get_variable(name='project_b', shape=[self.rnn_units], dtype=tf.float32, initializer=tf.zeros_initializer())
            # flatten batch and time so that one matrix multiplication handles every character
            rnn_outputs_flat = tf.reshape(tensor=rnn_outputs, shape=[-1, 2 * self.rnn_units])
            project_Z = tf.nn.xw_plus_b(x=rnn_outputs_flat, weights=project_W, biases=project_b, name='projection_Z')
            # activation
            project_A = tf.tanh(project_Z, name='projection_A')
        with tf.variable_scope('logits'):
            logits_W = tf.get_variable(name='logits_W', shape=[self.rnn_units, self.num_tags], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer())
            logits_b = tf.get_variable(name='logits_b', shape=[self.num_tags], dtype=tf.float32, initializer=tf.zeros_initializer())
            logits = tf.nn.xw_plus_b(x=project_A, weights=logits_W, biases=logits_b, name='matrix_P')
            logits_format = tf.reshape(tensor=logits, shape=[-1, self.num_steps, self.num_tags])

        # NOTE: a single hidden layer (projecting the 2 * self.rnn_units outputs directly to
        # self.num_tags) was also tried by the author, but its F1 score was about 0.3% lower.
        return logits_format

    def cost_layer(self, logits, seq_lengths, use_crf):
        """
        Model cost for crf layer and softmax layer.
        Also registers the cost summaries and stores the merged summary op in self.summary_op.
        :param logits: tag scores from projection_layer.
        :param seq_lengths: actual length of each sequence in the batch.
        :param use_crf: if true use the CRF log-likelihood, otherwise the masked softmax cross entropy.
        :return: scalar cost tensor.
        """
        if use_crf:
            with tf.variable_scope('crf_cost'):
                # reference codes see https://github.com/glample/tagger/blob/master/model.py line 288
                small = -1000.0
                # let y_-1 (i.e., self.num_tags) and y_n(i.e., self.num_tag+1) be the start and end tags of the sequence, respectively.
                # shape of seq_start_logits: [self.batch_size, 1, self.num_tags + 2], here 2 means the start and end tags
                # let seq_0 be the added start character of the sequence, then the ner tag of seq_0 is y_-1, since the second last 0 (tf.zeros()) > -1000.0 (small * tf.ones())
                seq_start_logits = tf.concat(values=[small * tf.ones(shape=[self.batch_size, 1, self.num_tags]), tf.zeros(shape=[self.batch_size, 1, 1]), small * tf.ones(shape=[self.batch_size, 1, 1])], axis=-1)

                # shape of seq_end_logits: [self.batch_size, 1, self.num_tags+2]
                # let seq_n be the added end character of the sequence, then the ner tag of seq_n is y_n, since the last 0 (tf.zeros()) > -1000.0 (small * tf.ones())
                seq_end_logits = tf.concat(values=[small * tf.ones(shape=[self.batch_size, 1, self.num_tags]), small * tf.ones(shape=[self.batch_size, 1, 1]), tf.zeros(shape=[self.batch_size, 1, 1])], axis=-1)

                # padding the logits with small values, small values make sure that padding will not affect actual predicted values.
                # shape of padded_logits: [self.batch_size, self.num_steps, self.num_tags + 2]
                padded_logits = tf.concat(values=[logits, tf.cast(small * tf.ones([self.batch_size, self.num_steps, 2]), tf.float32)], axis=-1)

                # shape of final_logits: [self.batch_size, self.num_steps + 2, self.num_tags + 2]
                final_logits = tf.concat(values=[seq_start_logits, padded_logits, seq_end_logits], axis=1)

                # padding the actual tags
                # the ner tag of padded seq_0 and seq_n is self.num_tags and self.num_tags+1, respectively
                # shape of padded final_tags: [self.batch_size, self.num_steps + 2]
                seq_start_tags = tf.cast(self.num_tags * tf.ones(shape=[self.batch_size, 1]), tf.int32)
                seq_end_tags = tf.cast((self.num_tags+1) * tf.ones(shape=[self.batch_size, 1]), tf.int32)
                final_tags = tf.concat(values=[seq_start_tags, self.tags, seq_end_tags], axis=-1)

                # +2 accounts for the added start and end positions
                log_likelihood, self.transition_matrix = tf.contrib.crf.crf_log_likelihood(inputs=final_logits, tag_indices=final_tags, sequence_lengths=seq_lengths+2)
                crf_cost = tf.reduce_mean(-log_likelihood)

                tf.summary.scalar(name='cost', tensor=crf_cost)
                tf.summary.histogram(name='histogram_cost', values=crf_cost)
                self.summary_op = tf.summary.merge_all()
                return crf_cost
        else:
            with tf.variable_scope('softmax_cost'):
                cross_entropy_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.tags, logits=logits, name='cross_entropy_cost')
                # only compute seq_lengths cost
                # maxlen is pinned to num_steps so that the mask always has the same
                # [batch_size, num_steps] shape as the loss, whatever the longest length is
                masked_seq = tf.sequence_mask(lengths=seq_lengths, maxlen=self.num_steps)
                cross_entropy_loss = tf.boolean_mask(tensor=cross_entropy_loss, mask=masked_seq)
                softmax_cost = tf.reduce_mean(cross_entropy_loss)

                tf.summary.scalar(name='cost', tensor=softmax_cost)
                tf.summary.histogram(name='histogram_cost', values=softmax_cost)
                self.summary_op = tf.summary.merge_all()
                return softmax_cost

    def optimize(self, cost):
        """
        Adam optimizer with gradient clipping.
        :param cost: scalar cost tensor to minimize.
        :return: the training op, which also increments self.global_step.
        """
        with tf.variable_scope('optimizer'):
            optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
            # compute_gradients return: [(gradient_a, variable_a), (gradient_b, variable_b)]
            # gradients: (gradient_a, gradient_b)
            # variables: (variable_a, variable_b)
            gradients, variables = zip(*optimizer.compute_gradients(loss=cost))
            clipped_gradients, _ = tf.clip_by_global_norm(t_list=gradients, clip_norm=self.max_gradient_norm)
            optimize = optimizer.apply_gradients(grads_and_vars=zip(clipped_gradients, variables), global_step=self.global_step)
        return optimize

    def step(self, sess, mini_batch, is_training, keep_prop):
        """
        Run the model one time.
        :param sess: tensorflow session.
        :param mini_batch: mini_batch data, unpacked as (strings, chars, segs, tags).
        :param is_training: flag denotes whether training step or testing step.
        :param keep_prop: keep probability of dropout.
        :return: (global_step, cost, summary) when training, otherwise (seq_lengths, logits).
        """
        _, tmp_chars, tmp_words, tmp_tags = mini_batch

        mini_batch_fd = {}
        mini_batch_fd[self.char_inputs] = np.asarray(tmp_chars)
        mini_batch_fd[self.word_inputs] = np.asarray(tmp_words)
        mini_batch_fd[self.keep_prob] = keep_prop

        if is_training:
            # the tags are only needed by the cost, so they are fed for training steps only
            mini_batch_fd[self.tags] = np.asarray(tmp_tags)
            _, mini_batch_global_step, mini_batch_cost, mini_batch_summary = sess.run([self.train_op, self.global_step, self.cost, self.summary_op], feed_dict=mini_batch_fd)
            return mini_batch_global_step, mini_batch_cost, mini_batch_summary
        else:
            seq_length, predictions = sess.run([self.seq_lengths, self.logits], feed_dict=mini_batch_fd)
            return seq_length, predictions

    def decode(self, logits, seq_length, transition_matrix):
        """
        Decode the best tags via Viterbi algorithm.
        :param logits: predicted tag scores.
        :param seq_length: actual sequence lengths.
        :param transition_matrix: tag transition matrix.
        :return: best tags for each character.
        """
        best_tags = []
        small = -1000.0
        # same start / end scores as the ones added in cost_layer
        start_logits = np.asarray(a=[[small] * self.num_tags + [0, small]])
        end_logits = np.asarray(a=[[small] * self.num_tags + [small, 0]])
        # iterate each sequence
        for tmp_logits, tmp_length in zip(logits, seq_length):
            # drop the padded time steps
            tmp_logits = tmp_logits[:tmp_length]
            padded_logits = np.concatenate([tmp_logits, small * np.ones(shape=[tmp_length, 2])], axis=1)
            # final_logits: [seq_len+2, num_tags+2]
            final_logits = np.concatenate([start_logits, padded_logits, end_logits], axis=0)
            best_tag, _ = tf.contrib.crf.viterbi_decode(score=final_logits, transition_params=transition_matrix)
            # 1:len(best_tag)-1 means excludes start and end tags
            best_tags.append(best_tag[1:len(best_tag)-1])
        return best_tags

    @staticmethod
    def _argmax_paths(scores):
        """
        Pick the highest scoring tag id at each position (softmax decoding).
        Computed with NumPy on the already fetched scores, so no operation is
        added to the TensorFlow graph.
        :param scores: array-like whose last axis holds the tag scores.
        :return: int32 array of tag ids, the last axis of scores is reduced.
        """
        return np.argmax(scores, axis=-1).astype(np.int32)

    def evaluate(self, sess, data_manager, id_to_tag):
        """
        Evaluate the model performance on dev or test data set.
        :param sess: tensorflow session.
        :param data_manager: dev or test data manager.
        :param id_to_tag: convert tag id to tag token.
        :return: [character_1 - real tag_1 - predicted tag_1, character_2 - real tag_2 - predicted tag_2, ...]
        """
        results = []
        # the transition matrix does not change during evaluation, fetch it once
        transition_matrix = sess.run(self.transition_matrix) if self.use_crf else None

        for mini_batch in data_manager.iter_batch():
            batch_strings = mini_batch[0]
            batch_tags = mini_batch[-1]
            batch_lengths, batch_logits = self.step(sess, mini_batch, is_training=False, keep_prop=1.0)

            if self.use_crf:
                batch_paths = self.decode(batch_logits, batch_lengths, transition_matrix)
            else:
                batch_paths = self._argmax_paths(batch_logits)

            for i in range(len(batch_strings)):
                length = batch_lengths[i]
                string = batch_strings[i][:length]
                # real tags
                gold_tags = iobes_iob([id_to_tag[int(x)] for x in batch_tags[i][:length]])
                # predicted tags
                pred_tags = iobes_iob([id_to_tag[int(x)] for x in batch_paths[i][:length]])
                # for each sample in one batch, store the character, real tag and predicted tag
                result = [' '.join([char, gold, pred]) for char, gold, pred in zip(string, gold_tags, pred_tags)]
                # stores the whole data for all mini-batches
                results.append(result)
        return results

    def evaluate_line(self, sess, inputs, id_to_tag):
        """
        Tag one single input line.
        :param sess: tensorflow session.
        :param inputs: mini-batch formatted data, inputs[0][0] is the text handed to result_to_json.
        :param id_to_tag: convert tag id to tag token.
        :return: the value of result_to_json for the text and its predicted tags.
        """
        lengths, scores = self.step(sess, inputs, is_training=False, keep_prop=1.0)

        if self.use_crf:
            transition_matrix = sess.run(self.transition_matrix)
            batch_paths = self.decode(scores, lengths, transition_matrix)
        else:
            batch_paths = self._argmax_paths(scores)

        tags = [id_to_tag[idx] for idx in batch_paths[0]]
        return result_to_json(inputs[0][0], tags)


In [None]:
# %load train.py

import os
import pickle
import itertools
import tensorflow as tf
import numpy as np
from collections import OrderedDict

from model import BrandsNERModel
from conlleval import return_report
from utils_train import BatchManager, prepare_dataset, update_tag_scheme
from utils_io import clean, make_path, load_sentences, augment_with_pretrained, tag_mapping, char_mapping, load_config, save_config, load_word2vec

# Command line flags of the training script, read through FLAGS below.
flags = tf.app.flags

# input
flags.DEFINE_string('data_dir',          'data',   'Path for training, development, testing and embedding data.')

# output
flags.DEFINE_string('summary_dir',  'summaries',   'Path for training and testing summaries.')
flags.DEFINE_string('ckpt_dir',     'checkpoints',  'Path for saving model checkpoints.')

# pre-processing
flags.DEFINE_boolean('zeros',            True,       'Replace digits with zero.')
flags.DEFINE_boolean('lower',            True,       'Convert character to lower case.')
flags.DEFINE_string('tag_schema',     'iobes',    'Tagging schema iobes or iob')

# bi-directional lstm + crf model
flags.DEFINE_integer('word_dim',            20,        'Embedding dimension for word, 0 if not used.')
flags.DEFINE_integer('char_dim',           100,        'Embedding dimension for character.')
flags.DEFINE_integer('num_units',          100,        'Number of recurrent units in LSTM cell.')

# training
flags.DEFINE_float('learning_rate',       0.001,      'Initial learning rate.')
flags.DEFINE_float('max_gradient_norm',       5,      'Clip gradients to this norm.')
# the batch size is a count of samples, so it is declared as an integer flag
flags.DEFINE_integer('batch_size',           20,      'Batch size to use during training.')
# this value is fed as the keep probability of tf.nn.dropout, not as a drop rate
flags.DEFINE_float('keep_prop',             0.5,      'Dropout keep probability.')
flags.DEFINE_boolean('use_crf',            True,      'Use crf layer or softmax layer as the top layer.')
flags.DEFINE_integer('num_epoch',           100,      'Number of epochs.')

# util
flags.DEFINE_boolean('clean',              True,      'Clean all the training-related folders and files.')

FLAGS = flags.FLAGS

def create_model(session, Model_class, path, load_vec, config, id_to_char):
    """
    Build the model, then either restore it from a checkpoint or initialize it.
    :param session: tensorflow session.
    :param Model_class: model class, instantiated as Model_class(config).
    :param path: checkpoint directory.
    :param load_vec: callable invoked as load_vec(embedding_file, id_to_char, char_dim, weights),
                     its result is assigned to the character embeddings.
    :param config: model configuration, 'character_embedding_file' and 'char_dim' are read here.
    :param id_to_char: id -> character mapping handed to load_vec.
    :return: the restored or freshly initialized model.
    """
    model = Model_class(config)
    checkpoint = tf.train.get_checkpoint_state(checkpoint_dir=path)

    # re-use trained parameters whenever a checkpoint is available
    if checkpoint and checkpoint.model_checkpoint_path:
        print('Reading model parameters from %s' % checkpoint.model_checkpoint_path)
        model.saver.restore(session, checkpoint.model_checkpoint_path)
        return model

    print('Created model with fresh parameters.')
    session.run(tf.global_variables_initializer())
    # assign character embeddings: read the initialized values, let load_vec
    # update them, then write the result back into the variable
    initial_weights = session.run(model.char_embeddings.read_value())
    loaded_weights = load_vec(config['character_embedding_file'], id_to_char, config['char_dim'], initial_weights)
    session.run(model.char_embeddings.assign(loaded_weights))
    return model


def train(training_file_name, dev_file_name, test_file_name, maps_file_name, character_embedding_file_name, config_file_name):
    """
    Train main entrance.
    Every file name is resolved relative to FLAGS.data_dir.
    :param training_file_name: training data file name.
    :param dev_file_name: development data file name.
    :param test_file_name: test data file name.
    :param maps_file_name: pickle file holding char_to_id, id_to_char, tag_to_id, id_to_tag (created if missing).
    :param character_embedding_file_name: pre-trained character embedding file name.
    :param config_file_name: model configuration file name (created if missing).
    """
    training_file = os.path.join(FLAGS.data_dir, training_file_name)
    dev_file = os.path.join(FLAGS.data_dir, dev_file_name)
    test_file = os.path.join(FLAGS.data_dir, test_file_name)
    maps_file = os.path.join(FLAGS.data_dir, maps_file_name)
    embedding_file = os.path.join(FLAGS.data_dir, character_embedding_file_name)
    config_file = os.path.join(FLAGS.data_dir, config_file_name)

    # load data sets
    # brands.train, dev, test tagging schema: IOB
    # train_sentences format: [[['a', 'B-SHOE'], ['b', 'I-SHOE'], ['c', I-SHOE], ['d', 'O'], ...], [next training example data]]
    train_sentences = load_sentences(training_file, FLAGS.zeros)
    dev_sentences = load_sentences(dev_file, FLAGS.zeros)
    test_sentences = load_sentences(test_file, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    # train_sentences format: [[['a', 'B-SHOE'], ['b', 'I-SHOE'], ['c', E-SHOE], ['d', 'O'], ...], [next training example data]]
    # all three data sets must use the same scheme, because tag_to_id is built from the
    # converted training tags and is then used to encode the dev and test tags as well
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(dev_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps.pkl if not exist
    # maps.pkl contains: char_to_id, id_to_char, tag_to_id, id_to_tag
    if not os.path.isfile(maps_file):
        print('create map file')
        # create dictionary for each character
        dict_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
        # update dictionary by add the characters in embedding files or dev data set
        dev_chars = list(itertools.chain.from_iterable([[w[0] for w in s] for s in dev_sentences]))
        dict_chars, char_to_id, id_to_char = augment_with_pretrained(dict_chars_train.copy(), embedding_file, dev_chars)
        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        # pickle data
        with open(maps_file, 'wb') as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        print('load map file')
        # NOTE: pickle must only be used on trusted files; this one is written by this script
        with open(maps_file, 'rb') as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # convert character, tag, word segmentation to id
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower)
    print('%i / %i / %i sentences in train / dev / test.' % (len(train_data), len(dev_data), len(test_data)))

    # prepare mini-batch data
    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    # make path for store summary and model if not exist
    make_path(FLAGS)

    if os.path.isfile(config_file):
        config = load_config(config_file)
    else:
        config = OrderedDict()
        config['num_chars'] = len(char_to_id)
        config['char_dim'] = FLAGS.char_dim
        config['num_tags'] = len(tag_to_id)
        config['word_dim'] = FLAGS.word_dim
        config['num_units'] = FLAGS.num_units
        config['batch_size'] = FLAGS.batch_size
        config['character_embedding_file'] = embedding_file
        config['max_gradient_norm'] = FLAGS.max_gradient_norm
        config['keep_prop'] = FLAGS.keep_prop
        config['learning_rate'] = FLAGS.learning_rate
        config['zeros'] = FLAGS.zeros
        config['lower'] = FLAGS.lower
        config['use_crf'] = FLAGS.use_crf
        save_config(config, config_file)

    # config parameters for the tf.Session
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    # number of mini-batches per epoch
    steps_per_epoch = train_manager.len_data

    with tf.Session(config=tf_config) as sess:
        # build the model before the writer: the writer records sess.graph when it is
        # created, so creating it first would log a graph without the model
        model = create_model(sess, BrandsNERModel, FLAGS.ckpt_dir, load_word2vec, config, id_to_char)
        train_writer = tf.summary.FileWriter(logdir=FLAGS.summary_dir, graph=sess.graph)
        try:
            loss = []
            for i in range(FLAGS.num_epoch):
                for mini_batch in train_manager.iter_batch(shuffle=True):
                    global_step, mini_batch_cost, mini_batch_summary = model.step(sess, mini_batch, is_training=True, keep_prop=FLAGS.keep_prop)
                    train_writer.add_summary(summary=mini_batch_summary, global_step=global_step)
                    loss.append(mini_batch_cost)
                    if global_step % 100 == 0:
                        # report the mean loss since the previous report
                        print('iteration:{} step:{}/{}, NER loss:{:>9.6f}'.format(i+1, global_step%steps_per_epoch, steps_per_epoch, np.mean(loss)))
                        loss = []
                # evaluate the model on development data
                best = evaluate(sess, model, 'dev', dev_manager, id_to_tag)
                # if have better dev F1 score until now, then save the model
                if best:
                    model.saver.save(sess=sess, save_path=os.path.join(FLAGS.ckpt_dir, 'Brands_ner.ckpt'), global_step=model.global_step.eval())
                # report the test F1 score
                evaluate(sess, model, 'test', test_manager, id_to_tag)
        finally:
            # flush pending summaries and release the event file even if training fails
            train_writer.close()


def evaluate(sess, model, name, data, id_to_tag):
    """
    Evaluate the model on one data set, print the report and track the best F1 score.
    :param sess: tensorflow session.
    :param model: model providing evaluate(), best_dev_f1 and best_test_f1.
    :param name: 'dev' or 'test', selects which best-score variable is compared and updated.
    :param data: data manager of the data set.
    :param id_to_tag: convert tag id to tag token.
    :return: whether the F1 score beats the stored best one; None for any other name.
    """
    print('====================== evaluate:{}'.format(name))

    # ner_results contains 'character - real tag - predicted tag' for all samples in 'data'
    ner_results = model.evaluate(sess, data, id_to_tag)
    eval_lines = test_ner(ner_results, FLAGS.data_dir)

    for report_line in eval_lines:
        print(report_line)
    # the F1 score is the last field of the second report line
    f1 = float(eval_lines[1].strip().split()[-1])

    # pick the variable holding the best score and the message announcing a new one
    if name == 'dev':
        best_variable = model.best_dev_f1
        message = 'new best dev f1 score:{:>.3f}'
    elif name == 'test':
        best_variable = model.best_test_f1
        message = 'new best test f1 score:{:>.3f}'
    else:
        return None

    previous_best = best_variable.eval()
    if f1 > previous_best:
        sess.run(best_variable.assign(f1))
        print(message.format(f1))
    return f1 > previous_best

def test_ner(results, path):
    """
    Write the predictions to a file and report the performance.
    :param results: list of blocks, each block is a list of text lines.
    :param path: directory in which 'Brands_ner_predict.utf8' is written.
    :return: the value of return_report for the written file.
    """
    output_file = os.path.join(path, 'Brands_ner_predict.utf8')
    with open(output_file, 'w', encoding='utf-8') as f:
        lines = []
        for block in results:
            lines.extend(line + '\n' for line in block)
            # an empty line separates two blocks
            lines.append('\n')
        f.writelines(lines)
    return return_report(output_file)

def main(_):
    """
    Script entry point: call clean() when FLAGS.clean is set, then start training.
    :param _: unused positional argument.
    """
    maps_name = 'maps.pkl'
    config_name = 'BrandsNERModel.config'
    if FLAGS.clean:
        clean(FLAGS, maps_name, config_name, 'Brands_ner_predict.utf8')
    train('brands.train', 'brands.dev', 'brands.test', maps_name, 'wiki_100.utf8', config_name)

# Hand main over to tf.app.run only when this file is executed as a script.
if __name__ == '__main__':
    tf.app.run(main)