In [1]:
import preprocessing as ps
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Embedding, GlobalAveragePooling1D

from collections import Counter

Using TensorFlow backend.


In [2]:
# Hyper-parameters and data settings shared by the cells below.
ngram_range = 2  # forwarded to ps.build_ngram_tokens and ps.pad_ngram_data
max_features = 100000  # vocabulary cap for get_words_batch; later rebound by ps.build_ngram_tokens
batch_size = 32  # mini-batch size given to model.fit
embedding_dims = 128  # output dimension of the Embedding layer
epochs = 10  # number of epochs given to model.fit
maxlen = 1000  # forwarded to ps.read_data_maxlen; reassigned to 800 before the padding step
index = 3  # forwarded to ps.read_data_maxlen -- presumably selects the text field; TODO confirm

In [3]:
# Read the train / test / validation files; each call yields a (data, label) pair.
# `index` and `maxlen` are forwarded unchanged to ps.read_data_maxlen.
data_train, label_train = ps.read_data_maxlen('../train.txt', index, maxlen)
data_test, label_test = ps.read_data_maxlen('../test.txt', index, maxlen)
data_val, label_val = ps.read_data_maxlen('../val.txt', index, maxlen)

In [4]:
def seg_all(data):
    """Apply ``ps.segment`` to every item of ``data`` and return the results as a list."""
    return list(map(ps.segment, data))

In [5]:
# Segment the training texts. The name is rebound, so re-running this cell
# would feed already-segmented data back into seg_all.
data_train = seg_all(data_train)

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/65/1sj9q72d15gg80vt9c70v9d80000gn/T/jieba.cache
Loading model cost 1.352 seconds.
Prefix dict has been built succesfully.


In [7]:
# Sanity check: show the first segmented training document.
print(data_train[0])

['近日', '编辑', '进口', '现代', '合肥', '地区', '经销商', '了解', '即日起', '该店', '购买', '现代', '飞思', '现金', '直降', '目前', '店内', '现车', '充足', '颜色', '齐全', '详情请', '咨询', '经销商', '以下', '飞思', '车型', '最新', '价格', '变化', '表伤', '汲敌', '浠表', '邢殖担', '颜色', '详情请', '咨询', '经销商', '方面', '雅科', '车型', '配置', '图片', '报价', '质保', '周期', '五年', '公里', '店内', '建议', '保养', '周期', '万公里', '更换', '一次', '机油', '机滤', '费用', '大概', '更换', '一次', '机油', '三滤', '费用', '大概', '一切', '保养', '信息', '当天', '保养', '信息', '为准', '点评', '现代', '雅科', '一款', '极具', '性价比', '豪华车', '不管', '配置', '空间', '动力', '不输于', '德系', '日系', '豪车', '发动机', '更是', '获得', '全球', '十佳', '发动机', '称号', '即日起', '购买', '飞思', '享元', '现金', '优惠', '感兴趣', '朋友', '不妨', '经销商', '咨询', '汽车', '点评', '网易', '丽君']


In [8]:
# data_train was already segmented in an earlier cell, so only the test and
# validation splits are segmented here.
data_test = seg_all(data_test)
data_val = seg_all(data_val)

In [9]:
def save_to_file(data, label, filename):
    """Write one sample per line: the space-joined tokens, a TAB, then the label.

    Args:
        data: Sequence of token lists (each token a string).
        label: Sequence of label strings, indexed in parallel with ``data``.
        filename: Destination path; the file is overwritten and encoded as UTF-8.
    """
    with open(filename, 'w', encoding='utf-8') as out:
        for row, tokens in enumerate(data):
            record = '\t'.join((' '.join(tokens), label[row]))
            out.write(record + '\n')

def load_file(filename):
    """Read a file written by ``save_to_file`` back into token lists and labels.

    Each non-blank line is expected to hold the space-separated tokens, a TAB,
    then the label.

    Args:
        filename: Path of the UTF-8 text file to read.

    Returns:
        ``(data, label)`` where ``data[i]`` is the list of tokens on line ``i``
        and ``label[i]`` is that line's label string, without the line terminator.

    Raises:
        IndexError: If a non-blank line contains no TAB separator.
    """
    data = []
    label = []
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            # Strip the terminator first; otherwise every label comes back as
            # 'label\n' and no longer matches what save_to_file was given.
            line = line.rstrip('\n')
            if not line:
                continue  # tolerate blank lines (e.g. a trailing empty line)
            fields = line.split('\t')
            data.append(fields[0].split(' '))
            label.append(fields[1])
    return data, label

In [10]:
# Cache the segmented splits on disk so they can be reloaded by the next cell.
save_to_file(data_train, label_train, '../train_seg.txt')
save_to_file(data_test, label_test, '../test_seg.txt')
save_to_file(data_val, label_val, '../val_seg.txt')

In [11]:
# Reload the cached segmented splits; this rebinds the data_* / label_* names.
data_train, label_train = load_file('../train_seg.txt')
data_test, label_test = load_file('../test_seg.txt')
data_val, label_val = load_file('../val_seg.txt')

In [12]:
# Longest document (in tokens) per split.
print(max(map(len, data_train)))
print(max(map(len, data_test)))
print(max(map(len, data_val)))      

427
413
427


In [13]:
# Mean document length (in tokens) per split.
print(np.mean(list(map(len, data_train))))
print(np.mean(list(map(len, data_test))))
print(np.mean(list(map(len, data_val))))  

163.15654
171.8274
178.2666


In [14]:
def get_words(data, max_features):
    """Build a frequency-ranked vocabulary from tokenised documents.

    Args:
        data: Iterable of token lists.
        max_features: Maximum number of distinct tokens to keep.

    Returns:
        ``(words, word_to_id, id_to_word)`` where ``words`` is a tuple of the
        kept tokens ordered from most to least frequent, and the two dicts map
        each token to its rank and back. All three are empty when there are no
        tokens to count.
    """
    counter = Counter()
    for line in data:
        # Count in place; concatenating every document into one list first
        # copies the growing list each time and is quadratic in corpus size.
        counter.update(line)

    # Build the tuple directly so an empty corpus yields () rather than
    # failing to unpack zip(*[]).
    words = tuple(word for word, _ in counter.most_common(max_features))

    word_to_id = {word: idx for idx, word in enumerate(words)}
    id_to_word = dict(enumerate(words))
    return words, word_to_id, id_to_word

In [15]:
def get_words_line(data, max_features):
    """Build a frequency-ranked vocabulary, counting one document at a time.

    Prints ``len(data)`` up front and, after every 50th document, the number of
    documents processed and the number of distinct tokens seen so far.

    Args:
        data: Sequence of token lists.
        max_features: Maximum number of distinct tokens to keep.

    Returns:
        ``(words, word_to_id, id_to_word)`` where ``words`` is a tuple of the
        kept tokens ordered from most to least frequent, and the two dicts map
        each token to its rank and back. All three are empty when there are no
        tokens to count.
    """
    print(len(data))
    counter = Counter()
    for cnt, line in enumerate(data, start=1):
        # Update in place: `counter + Counter(line)` builds a brand-new Counter
        # of the whole vocabulary for every document.
        counter.update(line)

        if cnt % 50 == 0:
            print(cnt, len(counter))

    # Build the tuple directly so an empty corpus yields () rather than
    # failing to unpack zip(*[]).
    words = tuple(word for word, _ in counter.most_common(max_features))

    word_to_id = {word: idx for idx, word in enumerate(words)}
    id_to_word = dict(enumerate(words))
    return words, word_to_id, id_to_word

In [16]:
def get_words_batch(data, max_features, batch_size):
    """Build a frequency-ranked vocabulary from tokenised documents.

    Args:
        data: Iterable of token lists.
        max_features: Maximum number of distinct tokens to keep.
        batch_size: Kept for backward compatibility. Counts are accumulated
            document by document, which gives the same totals as counting in
            batches, so the value no longer affects the result.

    Returns:
        ``(words, word_to_id, id_to_word, counter)`` where ``words`` is a tuple
        of the kept tokens ordered from most to least frequent, the two dicts
        map each token to its rank and back, and ``counter`` is the list of
        ``(token, count)`` pairs in that same order. All four are empty when
        there are no tokens to count.
    """
    totals = Counter()
    for line in data:
        totals.update(line)

    # Despite its name, the fourth return value has always been the ranked list
    # produced by most_common(), not a Counter instance.
    counter = totals.most_common(max_features)

    # Build the tuple directly so an empty corpus yields () rather than
    # failing to unpack zip(*[]).
    words = tuple(word for word, _ in counter)

    word_to_id = {word: idx for idx, word in enumerate(words)}
    id_to_word = dict(enumerate(words))

    return words, word_to_id, id_to_word, counter

In [17]:
# Build the vocabulary over all three splits concatenated together.
# NOTE(review): this lets test/validation tokens into the vocabulary -- confirm that is intended.
words, word_to_id, id_to_word, counter = get_words_batch(data_train + data_test + data_val, max_features, 1000)

In [18]:
# Derive the class set and the class<->id mappings from the validation labels only.
# NOTE(review): a class present in train/test but absent from val would have no id -- verify coverage.
class_set, cls_to_id, id_to_cls = ps.get_classes(label_val)

In [19]:
# Convert each split with ps.tokenize, using the shared word and class mappings.
X_train, y_train = ps.tokenize(data_train, label_train, word_to_id, cls_to_id, len(class_set))
X_test, y_test = ps.tokenize(data_test, label_test, word_to_id, cls_to_id, len(class_set))
X_val, y_val = ps.tokenize(data_val, label_val, word_to_id, cls_to_id, len(class_set))

In [20]:
# Longest sequence per split after ps.tokenize.
print(max(map(len, X_train)))
print(max(map(len, X_test)))
print(max(map(len, X_val)))

419
410
419


In [24]:
# Inspect the vocabulary entries from rank 90000 onward (the least frequent kept tokens).
words[90000:]

('一小撮',
 '李准',
 '谴丛',
 '班规',
 '心主',
 '校到',
 '广为传颂',
 '博士论文',
 '诬蔑',
 '大座',
 '史密森',
 '七五折',
 '海明威',
 '没吵',
 '鱼尾纹',
 '风场',
 '写稿',
 '冰洋',
 '竞速赛',
 '新朝',
 '中何',
 '逐步推广',
 '拉至',
 '幻靼',
 '淡水鱼',
 '刘良宗',
 '光之城',
 '薛耀明',
 '富印',
 '只闻',
 '戏太多',
 '危重症',
 '比均',
 '只称',
 '姜作臣',
 '用集',
 '欲火',
 '职炙担',
 '笔鸪',
 '两舰',
 '柯俊雄',
 '蛱煊',
 '露馈',
 '下战书',
 '班会课',
 '停备',
 '亚胺',
 '长镜头',
 '照会',
 '饲凹',
 '死命',
 '数百亿元',
 '宋宁华',
 '年过花甲',
 '但关',
 '包容心',
 '中共中央国务院',
 '肥妈',
 '性急',
 '湿冷',
 '冯飞',
 '大岭',
 '分能',
 '掠走',
 '甜筒',
 '接吻戏',
 '戴家',
 '谐校',
 '睡熟',
 '右面',
 '商务英语',
 '滞时',
 '欺辱',
 '一两点',
 '说话算话',
 '多屏',
 '千部',
 '梁晓雪',
 '水梨',
 '丈鲜',
 '睡袍',
 '桑原',
 '蛙声',
 '三声',
 '慷幼',
 '至死不渝',
 '难返',
 '以耗',
 '保带',
 '保埃岸',
 '巨嘴鸟',
 '金泰熙',
 '荆早',
 '立誓',
 '加快速度',
 '淮北市',
 '开胃菜',
 '以太网',
 '蠢醋',
 '装得',
 '嗜欲',
 '蓟始',
 '担疤',
 '西伯利亚地区',
 '恳徊',
 '特卖会',
 '六世',
 '政治危机',
 '鞭长莫及',
 '侄器',
 '刑宇',
 '恢故',
 '中高配',
 '宫市',
 '降本增效',
 '区时',
 '止乎礼',
 '喝饱水',
 '蛟斐',
 '土肥',
 '地摊儿',
 '雪梨',
 '董必武',
 '饱暖思淫欲',
 '华埠',
 '寒暑',
 '帷拢',
 '触点',
 '魁北克省',
 '瓦及',
 '不合情理',
 '缴个',
 

In [21]:
# Override the earlier maxlen (1000); this value is passed to ps.pad_ngram_data
# and used as the Embedding layer's input_length.
maxlen = 800

In [None]:
# Build the n-gram token table from the training data only, then transform every
# split with ps.pad_ngram_data.
# max_features and X_* are rebound here, so re-running this cell does not start
# from the same inputs.
token_indice, max_features = ps.build_ngram_tokens(X_train, max_features, ngram_range)
X_train = ps.pad_ngram_data(X_train, token_indice, maxlen, ngram_range)
X_test = ps.pad_ngram_data(X_test, token_indice, maxlen, ngram_range)
X_val = ps.pad_ngram_data(X_val, token_indice, maxlen, ngram_range)

In [None]:
# Convert the label containers returned by ps.tokenize to NumPy arrays.
y_train = np.array(y_train)
y_test = np.array(y_test)
y_val = np.array(y_val)

In [None]:
# Shuffle each split with ps.data_shuffle.
# NOTE: the returned labels are bound to Y_* (capital Y); the lowercase y_* names
# still refer to the arrays that were passed in, so pair X_* with Y_* downstream.
X_train, Y_train = ps.data_shuffle(X_train, y_train)
X_test, Y_test = ps.data_shuffle(X_test, y_test)
X_val, Y_val = ps.data_shuffle(X_val, y_val)

In [None]:
# Report the array shapes of every split.
# NOTE(review): the y_* names printed here are the arrays passed to ps.data_shuffle;
# the labels it returned are bound to Y_*.
print(X_train.shape)
print(X_test.shape)
print(X_val.shape)
print(y_train.shape)
print(y_test.shape)
print(y_val.shape)

In [None]:
# Build the model
print('Build model...')
model = Sequential()

# Start with an embedding layer that maps vocabulary indices
# to embedding_dims-dimensional vectors.
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))

# GlobalAveragePooling1D averages the embeddings over the
# whole sequence, giving one vector per document.
model.add(GlobalAveragePooling1D())

# Project onto one output unit per class (len(class_set)) and
# normalise the outputs with softmax.
model.add(Dense(len(class_set), activation='softmax'))


model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()  # print the layer overview

In [None]:
# Train on the arrays returned by ps.data_shuffle. It returned the labels as
# Y_train / Y_val alongside the shuffled X_train / X_val; the lowercase y_* names
# are the arrays that were passed into it and may not be aligned with X_*.
model.fit(X_train, Y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(X_val, Y_val))