In [1]:
from os.path import isfile, isdir, join
import tarfile
import gzip

news20 = "20news-bydate.tar.gz"
news20_path = 'news20'

models_path = 'models'

temp_path = 'temp'

if not isdir(news20_path):
    gz = gzip.GzipFile(news20)
    tarfilename = news20.replace('.gz', '')
    open(tarfilename, 'wb').write(gz.read())
    
    with tarfile.TarFile(tarfilename) as tarf:
        tarf.extractall(news20_path)
        tarf.close()
    
    gz.close()

In [2]:
import os
assert isdir(news20_path), "The news20 is not availaible"

train_path = news20_path + '/20news-bydate-train'
test_path = news20_path + '/20news-bydate-test'

def get_x_y():
    train_x = []
    train_y = []
    test_x = []
    test_y = []
    
    for foldername in os.listdir(train_path):
        folder_path = join(train_path, foldername)
        
        for filename in os.listdir(folder_path):
            f = open(folder_path + '/' + filename, 'rb').read()
            train_x.append(f)
            train_y.append(foldername)
            
    for foldername in os.listdir(test_path):   
        folder_path = join(test_path, foldername)
        
        for filename in os.listdir(folder_path):
            f = open(folder_path + '/' + filename, 'rb').read()
            test_x.append(f)
            test_y.append(foldername)
            
    return train_x, train_y, test_x, test_y

In [3]:
train_x, train_y, test_x, test_y = get_x_y()
print(test_x[:1])
# print(train_y[1:11314:500])
print("训练集一共有文档{}".format(len(train_x)))
print("测试集一共有文档{}".format(len(test_x)))

[b"From: mattf@cac.washington.edu (Matthew Freedman)\nSubject: Non-Roman Font Availability\nArticle-I.D.: shelley.1rmgleINNa0g\nDistribution: world\nOrganization: U.W. Information Systems\nLines: 16\nNNTP-Posting-Host: elvis.cac.washington.edu\n\nCan anybody tell me anything about the availibility of non-Roman fonts\nfor X-Windows? Especially Unicode and/or han idiographic fonts.\n\nAlso, how about conversion tools for getting PC/Macintosh fonts into a\nformat suitable for X? I would assume it is not too difficult for\nbitmap fonts.\n\nThe FAQ's for this group and comp.fonts are not very helpful on these\nquestions. \n\n-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n= Matthew M. Freedman                                                 =\n= U. of Washington Information Systems       mattf@cac.washington.edu =\n= 4545 15th Ave. NE; 3rd Floor               (206) 543-5593           =\n= Seattle, WA  98105                                                  =\n-=-=-=-

In [4]:
from sklearn import preprocessing
lb = preprocessing.LabelBinarizer()
lb.fit(train_y)

train_y_lb = lb.transform(train_y)
test_y_lb = lb.transform(test_y)

le = preprocessing.LabelEncoder() 
le.fit(train_y)
train_y_le = le.transform(train_y)
test_y_le = le.transform(test_y)

print(test_y_lb[-2:])
print(test_y_le[:3])
print(train_y_le[:3])

[[0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]]
[5 5 5]
[5 5 5]


In [5]:
# define network parameters
embed_size = 50 # how big is each word vector
max_features = 20000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 1000 # max number of words in a comment to use
num_filters = 100 # the number of CNN filters

In [6]:
from keras.preprocessing.text import Tokenizer
#把文档从字符串转为字符标量列表
def get_train_sequence():
    max_nb_token = 20000
    filters='!"#$%&()*+,-./:;<=>?@[\\]_^`{|}~\t\n'
    train_x, _, test_x, _ = get_x_y()    
    tokenizer = Tokenizer(num_words = max_nb_token, filters = filters, split = " ") #最大20，000个单词
    tokenizer.fit_on_texts([x.decode('gbk', 'ignore') for x in train_x])
    train_x_sequence = tokenizer.texts_to_sequences([x.decode('gbk', 'ignore') for x in train_x])
    test_x_sequence = tokenizer.texts_to_sequences([x.decode('gbk', 'ignore') for x in test_x])
    return train_x_sequence, test_x_sequence, tokenizer

Using TensorFlow backend.


In [7]:
import sys, os, re, csv, codecs, gc
import numpy as np
import pandas as pd
import tensorflow as tf
#=================Keras==============
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Conv1D, Conv2D,Embedding, Dropout, Activation, Permute
from keras.layers import Bidirectional, MaxPooling1D, MaxPooling2D, Reshape, Flatten, Concatenate, BatchNormalization, GlobalMaxPool1D, GlobalMaxPool2D,SpatialDropout1D
from keras import backend
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers, backend
#=================gensim=============
import gensim

In [8]:
x_train, x_test, tokenizer = get_train_sequence()

glove_dir = "glove_vector/"
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')

embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(os.path.join(glove_dir, 'glove.6B.50d.txt')))

all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
emb_mean,emb_std

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [9]:
from  sklearn.metrics  import f1_score
#输入模型预测后再比较
def get_f1_score(clf, input_data, target_data):
    predict_data = clf.predict(input_data)
    f1_macro = f1_score(target_data, predict_data,  average = 'macro')
    f1_micro = f1_score(target_data, predict_data,  average = 'micro')
    return f1_macro, f1_micro

#直接比较输入输出
def get_f1_score_pure(input_data, target_data):
    f1_macro = f1_score(target_data, input_data,  average = 'macro')
    f1_micro = f1_score(target_data, input_data,  average = 'micro')
    return f1_macro, f1_micro

In [10]:
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D
filter_sizes = [3, 4, 5]

def get_model():    
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.4)(x)
    x = Reshape((maxlen, embed_size, 1))(x)
    
    conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], embed_size), kernel_initializer='normal',
                                                                                    activation='elu')(x)
    conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[1], embed_size), kernel_initializer='normal',
                                                                                    activation='elu')(x)
    conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[2], embed_size), kernel_initializer='normal',
                                                                                    activation='elu')(x)
    
    maxpool_0 = MaxPool2D(pool_size=(maxlen - filter_sizes[0] + 1, 1))(conv_0)
    maxpool_1 = MaxPool2D(pool_size=(maxlen - filter_sizes[1] + 1, 1))(conv_1)
    maxpool_2 = MaxPool2D(pool_size=(maxlen - filter_sizes[2] + 1, 1))(conv_2)
        
    z = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2])   
    z = Flatten()(z)
    z = Dropout(0.1)(z)
        
    outp = Dense(20, activation="sigmoid")(z)
    
    model = Model(inputs=inp, outputs=outp)
    return model

In [11]:
# 优化器我这里用了adam
from sklearn.utils import shuffle
import math
from keras.callbacks import LearningRateScheduler, ModelCheckpoint, EarlyStopping, TensorBoard,ReduceLROnPlateau

#动态调节学习率
estop = EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=0, mode='auto')
rlronp = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=10, verbose=0, mode='auto', epsilon=0.0001, cooldown=0, min_lr=0)
def step_decay(epoch):
    initial_lrate = 0.004
    drop = 0.6
    epochs_drop = 5.0
    lrate = initial_lrate * math.pow(drop,math.floor((1+epoch)/epochs_drop))
    return lrate
lrate = LearningRateScheduler(step_decay)

def train_cnn_network(model, x_train, y_train):
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    x_train, y_train = shuffle(x_train, y_train)
    model.fit(x_train, y_train, batch_size=32, validation_split=0.1, verbose = 2, shuffle = True, epochs=10, callbacks = [lrate])

In [12]:
#取文档序列作为输入
from keras.preprocessing import sequence
y_train = train_y_lb
y_test = test_y_lb

word_index = tokenizer.word_index
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(
    x_train, maxlen=maxlen, padding = "post", value = 0)
x_test = sequence.pad_sequences(
    x_test, maxlen=maxlen, padding = "post", value  = 0)

model = get_model()
train_cnn_network(model, x_train, y_train)

Pad sequences (samples x time)
Train on 10182 samples, validate on 1132 samples
Epoch 1/10
 - 79s - loss: 0.1550 - acc: 0.9544 - val_loss: 0.0860 - val_acc: 0.9735
Epoch 2/10
 - 77s - loss: 0.0839 - acc: 0.9718 - val_loss: 0.0627 - val_acc: 0.9803
Epoch 3/10
 - 77s - loss: 0.0622 - acc: 0.9791 - val_loss: 0.0490 - val_acc: 0.9846
Epoch 4/10
 - 77s - loss: 0.0480 - acc: 0.9842 - val_loss: 0.0502 - val_acc: 0.9844
Epoch 5/10
 - 76s - loss: 0.0350 - acc: 0.9882 - val_loss: 0.0416 - val_acc: 0.9872
Epoch 6/10
 - 77s - loss: 0.0300 - acc: 0.9899 - val_loss: 0.0397 - val_acc: 0.9874
Epoch 7/10
 - 76s - loss: 0.0273 - acc: 0.9906 - val_loss: 0.0383 - val_acc: 0.9884
Epoch 8/10
 - 77s - loss: 0.0243 - acc: 0.9917 - val_loss: 0.0397 - val_acc: 0.9883
Epoch 9/10
 - 77s - loss: 0.0216 - acc: 0.9926 - val_loss: 0.0390 - val_acc: 0.9883
Epoch 10/10
 - 76s - loss: 0.0170 - acc: 0.9942 - val_loss: 0.0381 - val_acc: 0.9885


In [13]:
import datetime
def eva_cnn_network(model, x_train, x_test):
    begin_time = datetime.datetime.now()

    train_predict_y = model.predict(x_train)
    train_predict_y_nb = [np.argmax(doc) for doc in train_predict_y]
    train_target_y_nb = [np.argmax(doc) for doc in y_train]
    f1_macro_train_train, f1_micro_train_train = get_f1_score_pure(
        train_predict_y_nb, train_target_y_nb)

    test_predict_y = model.predict(x_test)
    test_predict_y_nb = [np.argmax(doc) for doc in test_predict_y]
    test_target_y_nb = [np.argmax(doc) for doc in y_test]
    f1_macro_test, f1_micro_test = get_f1_score_pure(
        test_predict_y_nb, test_target_y_nb)

    print("F1 Macro on train data: {}, F1 Micro: {}; \
          test data:Macro{} Micro{}".format(f1_macro_train_train, 
                                            f1_micro_train_train, 
                                            f1_macro_test, f1_micro_test))
    print("测试CNN 网络一共花了{}时间".format(
        datetime.datetime.now() - begin_time))

In [14]:
eva_cnn_network(model, x_train, x_test)

F1 Macro on train data: 0.9874831712885681, F1 Micro: 0.987537564079901;           test data:Macro0.8315883530964655 Micro0.837360594795539
测试CNN 网络一共花了0:00:40.927720时间
