In [None]:
import preprocessing as ps
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Embedding, GlobalAveragePooling1D

from collections import Counter

In [None]:
# Hyper-parameters and data settings shared by the cells below.
ngram_range = 2  # passed to ps.build_ngram_tokens and ps.pad_ngram_data
max_features = 100000  # vocabulary cap given to get_words_batch; later reassigned by ps.build_ngram_tokens
batch_size = 32  # mini-batch size given to model.fit
embedding_dims = 128  # output dimension of the Embedding layer
epochs = 10  # number of epochs given to model.fit
maxlen = 1000  # limit passed to ps.read_data_maxlen; overwritten with 800 before padding
index = 3  # passed to ps.read_data_maxlen -- presumably selects a field/column; TODO confirm

In [None]:
# Read the train / test / validation splits through the project helper.
# Each call returns a (data, label) pair; the meaning of `index` and `maxlen`
# is defined inside ps.read_data_maxlen, which is not visible in this notebook.
data_train, label_train = ps.read_data_maxlen('../train.txt', index, maxlen)
data_test, label_test = ps.read_data_maxlen('../test.txt', index, maxlen)
data_val, label_val = ps.read_data_maxlen('../val.txt', index, maxlen)

In [None]:
def seg_all(data):
    """Apply ``ps.segment`` to every item of ``data`` and return the results as a list.

    Args:
        data: iterable of items accepted by ``ps.segment``.

    Returns:
        A new list holding ``ps.segment(item)`` for each item, in input order.
    """
    segmented = []
    for text in data:
        segmented.append(ps.segment(text))
    return segmented

In [None]:
# Segment every split with ps.segment (via seg_all).
# NOTE: the inputs are overwritten in place, so re-running this cell passes the
# already-segmented output back into ps.segment.
data_train = seg_all(data_train)
data_test = seg_all(data_test)
data_val = seg_all(data_val)

In [None]:
def save_to_file(data, label, filename):
    """Write segmented documents and their labels to a UTF-8 text file.

    One line is written per document: its tokens joined by single spaces,
    a tab, the label, and a newline.

    Args:
        data: sequence of token sequences (each token a str).
        label: sequence of str labels, indexed in parallel with ``data``.
        filename: path of the file to create or overwrite.
    """
    with open(filename, 'w', encoding='utf-8') as out:
        for idx, tokens in enumerate(data):
            row = '\t'.join((' '.join(tokens), label[idx]))
            out.write(row + '\n')

def load_file(filename):
    """Read a file written by ``save_to_file`` back into tokens and labels.

    Each non-empty line is expected to look like ``tok tok ...<TAB>label``.

    Args:
        filename: path of a UTF-8 text file.

    Returns:
        (data, label): ``data`` is a list of token lists (first tab-separated
        field split on single spaces); ``label`` is a list with the second
        tab-separated field of each line, without the line terminator.

    Raises:
        IndexError: if a non-empty line contains no tab character.
    """
    data = []
    label = []
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            # Drop the line terminator first; otherwise every label would
            # carry a trailing '\n' and no longer match what was saved.
            line = line.rstrip('\n')
            if not line:
                # Completely empty lines carry no record.
                continue
            fields = line.split('\t')
            data.append(fields[0].split(' '))
            label.append(fields[1])
    return data, label

In [None]:
# Cache the segmented splits on disk (one "tokens<TAB>label" line per document)
# so they can be reloaded with load_file instead of being segmented again.
save_to_file(data_train, label_train, '../train_seg.txt')
save_to_file(data_test, label_test, '../test_seg.txt')
save_to_file(data_val, label_val, '../val_seg.txt')

In [None]:
# Reload the cached segmented splits; this replaces the in-memory data_* / label_*
# with the versions parsed from the *_seg.txt files.
data_train, label_train = load_file('../train_seg.txt')
data_test, label_test = load_file('../test_seg.txt')
data_val, label_val = load_file('../val_seg.txt')

In [None]:
# Longest document (number of tokens) in the train, test and validation splits.
for split_docs in (data_train, data_test, data_val):
    print(max(len(doc) for doc in split_docs))

In [None]:
# Average document length (number of tokens) in the train, test and validation splits.
for split_docs in (data_train, data_test, data_val):
    print(np.mean([len(doc) for doc in split_docs]))

In [None]:
def get_words(data, max_features):
    """Build a frequency-ranked vocabulary from tokenised documents.

    Args:
        data: iterable of token sequences.
        max_features: keep at most this many of the most frequent tokens
            (``None`` keeps all of them).

    Returns:
        (words, word_to_id, id_to_word): ``words`` is a tuple of tokens in
        descending frequency order; ``word_to_id`` maps token -> rank and
        ``id_to_word`` maps rank -> token. All three are empty when no
        token is found.
    """
    counter = Counter()
    for line in data:
        # Count in place. Concatenating every line into one ever-growing list
        # re-copies that list on each step, which is quadratic in corpus size.
        counter.update(line)

    # Built without zip(*...) so an empty vocabulary yields () instead of raising.
    words = tuple(word for word, _ in counter.most_common(max_features))

    word_to_id = {word: idx for idx, word in enumerate(words)}
    id_to_word = dict(enumerate(words))
    return words, word_to_id, id_to_word

In [None]:
def get_words_line(data, max_features, verbose=True):
    """Build a frequency-ranked vocabulary, counting one document at a time.

    Args:
        data: sequence of token sequences.
        max_features: keep at most this many of the most frequent tokens
            (``None`` keeps all of them).
        verbose: when true (the default, matching the previous behaviour),
            print the number of documents up front and, every 50 documents,
            the running document count and vocabulary size.

    Returns:
        (words, word_to_id, id_to_word): ``words`` is a tuple of tokens in
        descending frequency order; ``word_to_id`` maps token -> rank and
        ``id_to_word`` maps rank -> token. All three are empty when no
        token is found.
    """
    if verbose:
        print(len(data))

    counter = Counter()
    for cnt, line in enumerate(data, start=1):
        # update() mutates the counter; `counter + Counter(line)` would build
        # a brand-new Counter of the whole vocabulary for every document.
        counter.update(line)

        if verbose and cnt % 50 == 0:
            print(cnt, len(counter))

    # Built without zip(*...) so an empty vocabulary yields () instead of raising.
    words = tuple(word for word, _ in counter.most_common(max_features))

    word_to_id = {word: idx for idx, word in enumerate(words)}
    id_to_word = dict(enumerate(words))
    return words, word_to_id, id_to_word

In [None]:
def get_words_batch(data, max_features, batch_size):
    """Build a frequency-ranked vocabulary, walking the corpus in batches.

    Args:
        data: sequence of token sequences (must support ``len`` and slicing).
        max_features: keep at most this many of the most frequent tokens
            (``None`` keeps all of them).
        batch_size: number of documents processed per batch; it does not
            affect the result, only how the corpus is traversed.

    Returns:
        (words, word_to_id, id_to_word, counter): ``words`` is a tuple of
        tokens in descending frequency order; ``word_to_id`` maps
        token -> rank; ``id_to_word`` maps rank -> token; ``counter`` is the
        list of ``(token, count)`` pairs for the kept tokens, in the same
        order. Everything is empty when no token is found.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    counts = Counter()
    # Stepping by batch_size covers every document exactly once, including a
    # final short batch and inputs shorter than a single batch.
    for start in range(0, len(data), batch_size):
        for line in data[start:start + batch_size]:
            counts.update(line)

    counter = counts.most_common(max_features)
    # Built without zip(*...) so an empty vocabulary yields () instead of raising.
    words = tuple(word for word, _ in counter)

    word_to_id = {word: idx for idx, word in enumerate(words)}
    id_to_word = dict(enumerate(words))

    return words, word_to_id, id_to_word, counter

In [None]:
# Build the vocabulary (at most max_features words) from all three splits,
# processed 1000 documents at a time. `counter` is the list of (word, count)
# pairs returned by get_words_batch, ordered by descending count.
# NOTE(review): the vocabulary includes words from the test and validation splits.
words, word_to_id, id_to_word, counter = get_words_batch(data_train + data_test + data_val, max_features, 1000)

In [None]:
# Derive the class inventory from the labels of every split. Using only the
# validation labels would leave any class that occurs solely in the training or
# test data without an id, and would make the output layer too small.
all_labels = list(label_train) + list(label_test) + list(label_val)
class_set, cls_to_id, id_to_cls = ps.get_classes(all_labels)

In [None]:
# Convert every split with ps.tokenize, sharing the same word_to_id / cls_to_id
# mappings; the last argument is the number of classes.
# Presumably this yields word-id sequences and encoded labels -- TODO confirm
# against ps.tokenize.
X_train, y_train = ps.tokenize(data_train, label_train, word_to_id, cls_to_id, len(class_set))
X_test, y_test = ps.tokenize(data_test, label_test, word_to_id, cls_to_id, len(class_set))
X_val, y_val = ps.tokenize(data_val, label_val, word_to_id, cls_to_id, len(class_set))

In [None]:
# Length of the longest entry in each tokenised split: train, test, validation.
for split_seqs in (X_train, X_test, X_val):
    print(max(len(seq) for seq in split_seqs))

In [None]:
# Overrides the maxlen = 1000 set in the configuration cell; from here on maxlen
# is the length passed to ps.pad_ngram_data and to the Embedding layer's input_length.
maxlen = 800

In [None]:
# Build the n-gram token table from the training split only, then apply
# ps.pad_ngram_data to all three splits with that table.
# NOTE: max_features is reassigned to the value returned by ps.build_ngram_tokens,
# and X_* are overwritten, so re-running this cell feeds the updated values back in.
token_indice, max_features = ps.build_ngram_tokens(X_train, max_features, ngram_range)
X_train = ps.pad_ngram_data(X_train, token_indice, maxlen, ngram_range)
X_test = ps.pad_ngram_data(X_test, token_indice, maxlen, ngram_range)
X_val = ps.pad_ngram_data(X_val, token_indice, maxlen, ngram_range)

In [None]:
# Turn the label containers of all three splits into NumPy arrays.
y_train, y_test, y_val = map(np.array, (y_train, y_test, y_val))

In [None]:
# Shuffle each split while keeping features and labels aligned.
# The shuffled labels are written back to y_* -- the names used by the shape
# check and by model.fit. Storing them only in Y_* would leave y_* in the
# original order while X_* is replaced by its shuffled version, pairing samples
# with the wrong labels.
X_train, y_train = ps.data_shuffle(X_train, y_train)
X_test, y_test = ps.data_shuffle(X_test, y_test)
X_val, y_val = ps.data_shuffle(X_val, y_val)

# Upper-case aliases kept for any code that refers to them.
Y_train, Y_test, Y_val = y_train, y_test, y_val

In [None]:
# Sanity check: shapes of the feature arrays, then of the label arrays.
for split_array in (X_train, X_test, X_val, y_train, y_test, y_val):
    print(split_array.shape)

In [None]:
# Build the model
print('Build model...')
model = Sequential()

# Start with an embedding layer that maps each of the max_features token
# indices to an embedding_dims-dimensional vector; inputs are sequences of
# length maxlen.
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))

# GlobalAveragePooling1D averages the embeddings over the whole sequence,
# giving one embedding_dims-sized vector per document.
model.add(GlobalAveragePooling1D())

# Project onto one output unit per class and normalise with softmax
# (multi-class classification, not a single sigmoid unit).
model.add(Dense(len(class_set), activation='softmax'))


model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()  # print a layer-by-layer overview of the model

In [None]:
# Train for `epochs` epochs with mini-batches of `batch_size`, evaluating on the
# validation split after each epoch. X_train / y_train (and X_val / y_val) must
# be row-aligned: row i of the labels has to belong to row i of the features.
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(X_val, y_val))