# 导库

In [4]:
from os.path import join
from codecs import open

import numpy as np
import pandas as pd


# 数据预处理：index 形式

In [5]:
def read_corpus(split, data_dir="./ResumeNER"):
    """Read one split of a character-level BMES-tagged corpus.

    The file ``<data_dir>/<split>.char.bmes`` is parsed line by line. Every
    non-blank line must hold two whitespace-separated fields, ``<char> <tag>``;
    a blank (or whitespace-only) line ends the current sentence.

    Args:
        split: Name of the split, used as the file-name prefix
            (e.g. ``'train'``, ``'dev'``, ``'test'``).
        data_dir: Directory that contains the ``*.char.bmes`` files.

    Returns:
        A tuple ``(word_lists, tag_lists)`` of two parallel lists: one list of
        characters and one list of tags per sentence, in file order.

    Raises:
        ValueError: If a non-blank line does not contain exactly two fields.
    """
    word_lists = []
    tag_lists = []
    word_list = []
    tag_list = []
    with open(join(data_dir, split+".char.bmes"), 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            # str.split() with no argument discards '\n', '\r\n' and any other
            # surrounding whitespace, so the check below is independent of the
            # file's line-ending convention.
            fields = line.split()
            if fields:
                if len(fields) != 2:
                    raise ValueError(
                        "{}.char.bmes line {}: expected '<char> <tag>', got {!r}".format(
                            split, line_no, line))
                word, tag = fields
                word_list.append(word)
                tag_list.append(tag)
            elif word_list:
                # Sentence separator. Repeated blank lines are ignored so that
                # no empty sentence is emitted.
                word_lists.append(word_list)
                tag_lists.append(tag_list)
                word_list = []
                tag_list = []
    # Flush the final sentence when the file does not end with a blank line;
    # without this the last sentence would be silently lost.
    if word_list:
        word_lists.append(word_list)
        tag_lists.append(tag_list)
    return word_lists, tag_lists
    

def build_map(lists):
    """Build an element -> id vocabulary.

    Args:
        lists: An iterable of sequences (e.g. sentences of characters or tags).

    Returns:
        A dict mapping every distinct element to an integer id. Ids start at 0
        and are handed out in order of first appearance.
    """
    maps = {}
    for sequence in lists:
        for item in sequence:
            # len(maps) is evaluated before the call, so an unseen item gets
            # the next free id and a known item keeps its existing one.
            maps.setdefault(item, len(maps))
    return maps


def build_index(lists, vocabs):
    """Translate sequences of elements into sequences of vocabulary ids.

    Args:
        lists: An iterable of sequences whose elements are looked up.
        vocabs: A dict mapping element -> id.

    Returns:
        A list with one list of ids per input sequence. An element that is
        missing from ``vocabs`` is mapped to ``None`` (the result of
        ``dict.get``).
    """
    lookup = vocabs.get
    return [[lookup(item) for item in sequence] for sequence in lists]

In [6]:
# Load the characters and tags of every split; each print shows the number of
# sentences next to the number of tag sequences for that split.
train_word_lists, train_tag_lists = read_corpus(split='train')
print(len(train_word_lists), len(train_tag_lists))

dev_word_lists, dev_tag_lists = read_corpus(split='dev')
print(len(dev_word_lists), len(dev_tag_lists))

test_word_lists, test_tag_lists = read_corpus(split='test')
print(len(test_word_lists), len(test_tag_lists))

# Concatenate the three splits (train, then dev, then test).
word_lists = [*train_word_lists, *dev_word_lists, *test_word_lists]
tag_lists = [*train_tag_lists, *dev_tag_lists, *test_tag_lists]

print(len(word_lists), len(tag_lists))

3821 3821
463 463
477 477
4761 4761


In [10]:
# Build the character and tag vocabularies from `word_lists` / `tag_lists`
# and store each of them as a JSON file.
import json

word_vocab = build_map(word_lists)
tag_vocab = build_map(tag_lists)

print(len(word_vocab), len(tag_vocab))
print(tag_vocab)

for json_path, mapping in (
    ('./ResumeNER/word2id.json', word_vocab),
    ('./ResumeNER/tag2id.json', tag_vocab),
):
    with open(json_path, 'w') as f:
        json.dump(mapping, f)

1902 28
{'B-NAME': 0, 'E-NAME': 1, 'O': 2, 'B-CONT': 3, 'M-CONT': 4, 'E-CONT': 5, 'B-RACE': 6, 'E-RACE': 7, 'B-TITLE': 8, 'M-TITLE': 9, 'E-TITLE': 10, 'B-EDU': 11, 'M-EDU': 12, 'E-EDU': 13, 'B-ORG': 14, 'M-ORG': 15, 'E-ORG': 16, 'M-NAME': 17, 'B-PRO': 18, 'M-PRO': 19, 'E-PRO': 20, 'S-RACE': 21, 'S-NAME': 22, 'B-LOC': 23, 'M-LOC': 24, 'E-LOC': 25, 'M-RACE': 26, 'S-ORG': 27}


In [13]:
# Convert every split from characters / tags to integer ids.
train_word_index_lists, train_tag_index_lists = (
    build_index(train_word_lists, word_vocab),
    build_index(train_tag_lists, tag_vocab),
)

dev_word_index_lists, dev_tag_index_lists = (
    build_index(dev_word_lists, word_vocab),
    build_index(dev_tag_lists, tag_vocab),
)

test_word_index_lists, test_tag_index_lists = (
    build_index(test_word_lists, word_vocab),
    build_index(test_tag_lists, tag_vocab),
)

# Sanity check: per split, the raw and indexed lists should have equal lengths.
print('train:', *map(len, (train_word_lists, train_word_index_lists,
                           train_tag_lists, train_tag_index_lists)))
print('dev:', *map(len, (dev_word_lists, dev_word_index_lists,
                         dev_tag_lists, dev_tag_index_lists)))
print('test:', *map(len, (test_word_lists, test_word_index_lists,
                          test_tag_lists, test_tag_index_lists)))



train: 3821 3821 3821 3821
dev: 463 463 463 463
test: 477 477 477 477


In [17]:
# One row per training sentence: raw characters, character ids, tags, tag ids.
# NOTE(review): every cell holds a Python list, which the CSV will store as
# text — confirm that whatever reads the file parses it back.
train_columns = {
    'words': train_word_lists,
    'words_index': train_word_index_lists,
    'tags': train_tag_lists,
    'tags_index': train_tag_index_lists,
}
train_row_ids = list(range(len(train_word_lists)))
df_train_data = pd.DataFrame(train_columns, index=train_row_ids)
df_train_data.to_csv("./ResumeNER/df_train_data.csv")
df_train_data.head()

Unnamed: 0,words,words_index,tags,tags_index
0,"[高, 勇, ：, 男, ，, 中, 国, 国, 籍, ，, 无, 境, 外, 居, 留, ...","[0, 1, 2, 3, 4, 5, 6, 6, 7, 4, 8, 9, 10, 11, 1...","[B-NAME, E-NAME, O, O, O, B-CONT, M-CONT, M-CO...","[0, 1, 2, 2, 2, 3, 4, 4, 5, 2, 2, 2, 2, 2, 2, ..."
1,"[1, 9, 6, 6, 年, 出, 生, ，, 汉, 族, ，, 中, 共, 党, 员, ...","[14, 15, 16, 16, 17, 18, 19, 4, 20, 21, 4, 5, ...","[O, O, O, O, O, O, O, O, B-RACE, E-RACE, O, B-...","[2, 2, 2, 2, 2, 2, 2, 2, 6, 7, 2, 8, 9, 9, 10,..."
2,"[2, 0, 0, 7, 年, 1, 0, 月, 至, 今, 任, 人, 和, 投, 资, ...","[55, 56, 56, 57, 17, 14, 56, 58, 59, 60, 61, 6...","[O, O, O, O, O, O, O, O, O, O, O, B-ORG, M-ORG...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 14, 15, 15, ..."
3,"[2, 0, 0, 7, 年, 1, 2, 月, 至, 2, 0, 1, 3, 年, 2, ...","[55, 56, 56, 57, 17, 14, 55, 58, 59, 55, 56, 1...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
4,"[2, 0, 1, 3, 年, 2, 月, 至, 今, 任, 山, 东, 三, 维, 石, ...","[55, 56, 14, 69, 17, 55, 58, 59, 60, 61, 78, 7...","[O, O, O, O, O, O, O, O, O, O, B-ORG, M-ORG, M...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 14, 15, 15, 15,..."


In [18]:
# One row per dev sentence: raw characters, character ids, tags, tag ids.
# NOTE(review): every cell holds a Python list, which the CSV will store as
# text — confirm that whatever reads the file parses it back.
dev_columns = {
    'words': dev_word_lists,
    'words_index': dev_word_index_lists,
    'tags': dev_tag_lists,
    'tags_index': dev_tag_index_lists,
}
dev_row_ids = list(range(len(dev_word_lists)))
df_dev_data = pd.DataFrame(dev_columns, index=dev_row_ids)
df_dev_data.to_csv("./ResumeNER/df_dev_data.csv")
df_dev_data.head()

Unnamed: 0,words,words_index,tags,tags_index
0,"[吴, 重, 阳, ，, 中, 国, 国, 籍, ，, 大, 学, 本, 科, ，, 教, ...","[344, 458, 533, 4, 5, 6, 6, 7, 4, 144, 27, 25,...","[B-NAME, M-NAME, E-NAME, O, B-CONT, M-CONT, M-...","[0, 17, 1, 2, 3, 4, 4, 5, 2, 11, 12, 12, 13, 2..."
1,"[历, 任, 公, 司, 副, 总, 经, 理, 、, 总, 工, 程, 师, ，]","[28, 61, 70, 71, 88, 89, 53, 37, 32, 89, 29, 3...","[O, O, B-ORG, E-ORG, B-TITLE, M-TITLE, M-TITLE...","[2, 2, 14, 16, 8, 9, 9, 10, 2, 8, 9, 9, 10, 2]"
2,"[2, 0, 0, 9, 年, 5, 月, 至, 今, ，, 受, 聘, 为, 公, 司, ...","[55, 56, 56, 15, 17, 93, 58, 59, 60, 4, 500, 8...","[O, O, O, O, O, O, O, O, O, O, O, O, O, B-ORG,...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 14, 16..."
3,"[兼, 任, 中, 国, 科, 技, 会, 堂, 专, 家, 委, 员, 会, 专, 家, ...","[251, 61, 5, 6, 26, 123, 39, 1407, 51, 52, 316...","[O, O, B-ORG, M-ORG, M-ORG, M-ORG, M-ORG, E-OR...","[2, 2, 14, 15, 15, 15, 15, 16, 8, 9, 9, 9, 9, ..."
4,"[谢, 卫, 东, 先, 生, ：, 1, 9, 6, 6, 年, 1, 2, 月, 出, ...","[265, 559, 79, 140, 19, 2, 14, 15, 16, 16, 17,...","[B-NAME, M-NAME, E-NAME, O, O, O, O, O, O, O, ...","[0, 17, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,..."


In [19]:
# One row per test sentence: raw characters, character ids, tags, tag ids.
# NOTE(review): every cell holds a Python list, which the CSV will store as
# text — confirm that whatever reads the file parses it back.
test_columns = {
    'words': test_word_lists,
    'words_index': test_word_index_lists,
    'tags': test_tag_lists,
    'tags_index': test_tag_index_lists,
}
test_row_ids = list(range(len(test_word_lists)))
df_test_data = pd.DataFrame(test_columns, index=test_row_ids)
df_test_data.to_csv("./ResumeNER/df_test_data.csv")
df_test_data.head()

Unnamed: 0,words,words_index,tags,tags_index
0,"[常, 建, 良, ，, 男, ，]","[232, 315, 1090, 4, 3, 4]","[B-NAME, M-NAME, E-NAME, O, O, O]","[0, 17, 1, 2, 2, 2]"
1,"[1, 9, 6, 3, 年, 出, 生, ，, 工, 科, 学, 士, ，, 高, 级, ...","[14, 15, 16, 69, 17, 18, 19, 4, 29, 26, 27, 10...","[O, O, O, O, O, O, O, O, B-PRO, E-PRO, B-EDU, ...","[2, 2, 2, 2, 2, 2, 2, 2, 18, 20, 11, 13, 2, 8,..."
2,"[1, 9, 8, 5, 年, 8, 月, —, 1, 9, 9, 3, 年, 在, 国, ...","[14, 15, 110, 93, 17, 110, 58, 1619, 14, 15, 1...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-O...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 14,..."
3,"[1, 9, 9, 3, 年, 5, 月, —, 1, 9, 9, 9, 年, 5, 月, ...","[14, 15, 15, 69, 17, 93, 58, 1619, 14, 15, 15,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
4,"[1, 9, 9, 9, 年, 5, 月, —, 2, 0, 1, 0, 年, 4, 月, ...","[14, 15, 15, 15, 17, 93, 58, 1619, 55, 56, 14,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."


# torchtext 处理成 batch

In [2]:
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

from torchtext import data
from torchtext import datasets
from torchtext import vocab

In [None]:
# 1. Define the fields
# NOTE(review): the text field is tokenized with spaCy, lower-cased and
# padded/truncated to 40 tokens — confirm these settings match the data
# stored in the TSV files loaded below.
text_field = data.Field(
    tokenize='spacy',
    lower=True,
    fix_length=40,
    batch_first=True,
)
label_field = data.LabelField(dtype=torch.long)


In [None]:
# 2. Define the datasets
# NOTE(review): BASE_PATH is not defined in the visible part of this notebook;
# it must name the directory that holds train.tsv / dev.tsv / test.tsv.
train, dev = data.TabularDataset.splits(
    path=BASE_PATH, train='train.tsv', validation='dev.tsv', format='tsv',
    skip_header=True,
    fields=[('text', text_field), ('label', label_field)])

# A split that is loaded on its own cannot go through `splits`; it uses the
# TabularDataset constructor directly.
# - The path is joined rather than string-concatenated so it resolves the same
#   way as `splits(path=BASE_PATH, ...)`, with or without a trailing separator.
# - The `index` column gets its own RawField instead of `label_field`, so that
#   `label_field.build_vocab(..., test)` does not add test row ids to the
#   label vocabulary.
test = data.TabularDataset(
    join(BASE_PATH, 'test.tsv'), format='tsv', skip_header=True,
    fields=[('index', data.RawField()), ('text', text_field)])

print("the size of train: {}, dev:{}, test:{}".format(
    len(train), len(dev), len(test)))


In [None]:
# Inspect one example (position 1) from each split
for labelled_split in (train, dev):
    sample = labelled_split[1]
    print(sample.text, sample.label)

# Only the text is printed for the test split.
print(test[1].text)

In [None]:
# 3. Build the vocabularies; the text vocabulary is built from the tokens of
#    text_field (max_size=25000) over all three splits.
# vectors = vocab.Vectors(embedding_file, cache_dir)
all_splits = (train, dev, test)

text_field.build_vocab(
    *all_splits,
    max_size=25000,
    vectors='glove.6B.100d',
    unk_init=torch.Tensor.normal_,
)

label_field.build_vocab(*all_splits)



In [None]:
# Show the size of the text vocabulary, then of the label vocabulary.
len_vocab = len(text_field.vocab)
print(len_vocab, len(label_field.vocab), sep='\n')
# for step, batch in enumerate(tqdm(train_iter, desc="Iteration")):
#     print(batch.text, batch.label)
    

In [None]:
# 4. Build the iterators
# `device` is not defined in the visible part of the notebook, so define it
# here (GPU when available, CPU otherwise) to keep this cell runnable on a
# fresh kernel.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Batches of 128 for train and dev, sorted within each batch by text length.
train_iter, dev_iter = data.BucketIterator.splits(
        (train, dev), batch_sizes=(128, 128), sort_key=lambda x: len(x.text),
        sort_within_batch=True, repeat=False, shuffle=True, device=device)

# The test split is handled on its own again: a single unsorted batch that
# holds the whole split.
test_iter = data.Iterator(test, batch_size=len(test), train=False,
                          sort=False, device=device)

print("the size of train_iter: {}, dev_iter:{}, test_iter:{}".format(
    len(train_iter), len(dev_iter), len(test_iter)))



In [None]:
# Inspect the iterator: print the tensor shapes of the first training batch
# seed_everything()
for batch_idx, batch in enumerate(train_iter):
    X_train_var, y_train_var = batch
    print(batch_idx, X_train_var.shape, y_train_var.shape)
    break
