In [2]:
import numpy as np
import pandas as pd
import cPickle
from collections import defaultdict
import re

import gensim
from gensim.models.doc2vec import TaggedLineDocument

from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans
import time


from sklearn.decomposition import TruncatedSVD

import keras
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten, Reshape
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.optimizers import Adadelta
from keras.constraints import unitnorm
from keras.regularizers import l2
from keras.utils import np_utils
from keras import callbacks

import lda

import matplotlib.pyplot as plt

from __future__ import division
%matplotlib inline

Using Theano backend.


# doc2vec

## model training

In [3]:
# Corpus locations, relative to the notebook's working directory.
# all_file is the train+test concatenation written by the following cell.
train_file, test_file, all_file = (
    "../data/train_text-norm.csv",
    "../data/test_text-norm.csv",
    "../data/all_text-norm",
)

In [4]:
# Concatenate the train and test corpora (train first) into a single file so
# that one doc2vec model can be fitted on both.
filenames = [train_file, test_file]
with open(all_file, 'w') as outfile:
    for fname in filenames:
        line = ''
        with open(fname) as infile:
            for line in infile:
                outfile.write(line)
        # Later cells address documents by line index and split train/test on
        # the train line count. If a source file lacks a trailing newline its
        # last line would fuse with the next file's first line and shift every
        # following index by one, so terminate the final line explicitly.
        if line and not line.endswith('\n'):
            outfile.write('\n')

In [5]:
# Report the number of lines in each corpus file. Each file is opened in a
# `with` block so the handle is closed as soon as it has been counted
# (the bare open(...) calls used before were never closed explicitly).
for label, path in (('train_file', train_file),
                    ('test_file', test_file),
                    ('all_file', all_file)):
    with open(path) as handle:
        # Single pre-formatted string: prints the same text under Python 2 and 3.
        print('%s %d' % (label, sum(1 for _ in handle)))

train_file 1600000
test_file 359
all_file 1600359


In [6]:
# Stream the combined corpus from disk. Presumably gensim treats every line of
# all_file as one document tagged by its line number (later cells look vectors
# up with docvecs[i]) -- confirm against the TaggedLineDocument docs.
documents = TaggedLineDocument(all_file)

# alpha == min_alpha pins the learning rate; the schedule is stepped by hand
# in the loop below instead.
doc2vec_model = gensim.models.Doc2Vec(alpha=0.025, min_alpha=0.025) 
doc2vec_model.build_vocab(documents)

# NOTE(review): with range(1) the decrement runs only after the single train()
# call, so it never influences training. If the epoch count is raised, note
# that 0.025 - 13 * 0.002 < 0: alpha becomes negative from the 13th decrement.
for epoch in range(1):   # increase the number of epochs for more accurate vectors
    doc2vec_model.train(documents)
    doc2vec_model.alpha -= 0.002  # decrease the learning rate
    doc2vec_model.min_alpha = doc2vec_model.alpha  # fix the learning rate, no decay

# Persist the trained model next to the data directory.
doc2vec_model.save("../model/doc2vec.model")

In [7]:
# Count the lines of the combined corpus; the `with` block closes the handle
# (the previous bare open(...) was never closed explicitly).
with open(all_file) as corpus:
    num_lines = sum(1 for _ in corpus)

# Gather the learned vector for every line index into one 2-D array,
# keeping the same row order as the lines of all_file.
doc_vectors = np.array([doc2vec_model.docvecs[i] for i in range(num_lines)])

In [8]:
# Sanity check: expect (number of corpus lines, vector size).
print('doc_vectors %s' % (doc_vectors.shape,))

doc_vectors (1600359L, 300L)


## data preparation

### p2c parameter

In [9]:
# The combined corpus was written train-first, so the first n_train rows of
# doc_vectors belong to the training set and the remainder to the test set.
# Count the training lines once (the file was previously scanned twice) and
# close the handle afterwards.
with open(train_file) as handle:
    n_train = sum(1 for _ in handle)

train_data = doc_vectors[:n_train]
test_data = doc_vectors[n_train:]

In [10]:
# Confirm that the split sizes match the line counts of the source files.
print('train_data %s' % (train_data.shape,))
print('test_data %s' % (test_data.shape,))

train_data (1600000L, 300L)
test_data (359L, 300L)


### ue parameter

In [18]:
# Load the precomputed train/test frames from disk.
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
df0_train_s_p2v, df0_test_s_p2v = map(
    pd.read_pickle,
    ('../data/df0_train_s_p2v.pkl', '../data/df0_test_s_p2v.pkl'),
)

In [19]:
# Row counts should line up with train_data / test_data.
print('df0_train_s_p2v %s' % (df0_train_s_p2v.shape,))
print('df0_test_s_p2v %s' % (df0_test_s_p2v.shape,))

df0_train_s_p2v (1600000, 14)
df0_test_s_p2v (359, 14)


### all parameter

In [16]:
# Build the final design matrices: the doc2vec columns followed by every
# column of the pickled frame except the first; the first column is used as
# the target.
#
# Fix: NumPy needs an explicit ':' to select all rows. The R-style
# `values[,1:]` / `values[,0]` used before is a SyntaxError in Python.
train_x = np.c_[train_data, df0_train_s_p2v.values[:, 1:]]
train_y = df0_train_s_p2v.values[:, 0]

test_x = np.c_[test_data, df0_test_s_p2v.values[:, 1:]]
test_y = df0_test_s_p2v.values[:, 0]

# Single pre-formatted strings print the same text under Python 2 and 3.
print('train_x shape: %s' % (train_x.shape,))
print('train_y shape: %s' % (train_y.shape,))

print('test_x shape: %s' % (test_x.shape,))
print('test_y shape: %s' % (test_y.shape,))

array([[1, 2, 1, 2],
       [3, 4, 3, 4]])

# The cells below have not been run to completion

# MODEL TRAINING

In [None]:
def plot_history(history):
    """Draw the training curves stored on a fit-history object.

    Two figures are shown one after the other: accuracy ('acc' vs 'val_acc')
    and loss ('loss' vs 'val_loss'), each plotted against the epoch index.

    history -- object whose ``history`` attribute maps those four keys to
               per-epoch sequences (e.g. the value returned by ``fit``).
    """
    curves = history.history
    # (training key, validation key, figure title, y-axis label)
    panels = (
        ('acc', 'val_acc', 'model accuracy', 'accuracy'),
        ('loss', 'val_loss', 'model loss', 'loss'),
    )
    for fit_key, held_out_key, heading, y_label in panels:
        plt.plot(curves[fit_key])
        plt.plot(curves[held_out_key])
        plt.title(heading)
        plt.ylabel(y_label)
        plt.xlabel('epoch')
        # The second curve is the validation series, labelled 'test' here.
        plt.legend(['train', 'test'], loc='upper left')
        plt.show()
    
def get_test_acc(test_x_cat,test_y,model):
    """Return the fraction of samples whose predicted class equals the label.

    test_x_cat -- inputs passed unchanged to ``model.predict_proba``
    test_y     -- integer class labels, one per sample
    model      -- object exposing ``predict_proba`` that returns one row of
                  class scores per sample
    """
    class_scores = model.predict_proba(test_x_cat)
    # The predicted class is the column holding the largest score in each row.
    predicted = np.argmax(class_scores, axis=1)
    hits = (test_y == predicted)
    return np.mean(hits)
    
# Render figures inline; repeats the magic already issued in the imports cell.
%matplotlib inline

In [None]:
# One-hot encode the labels for the softmax / categorical_crossentropy model.
# NOTE(review): the network ends in Dense(2), so this assumes the labels are
# exactly 0 and 1 -- confirm against the pickled frames.
train_y_cat, test_y_cat = map(np_utils.to_categorical, (train_y, test_y))

# Add singleton axes so each sample has shape (1, n_features, 1), matching
# the input_shape given to the first Convolution2D layer.
train_x_cat = np.reshape(train_x, (train_x.shape[0], 1, train_x.shape[1], 1))
test_x_cat = np.reshape(test_x, (test_x.shape[0], 1, test_x.shape[1], 1))

In [None]:
# NOTE(review): this import would normally live in the imports cell at the top.
from keras.optimizers import SGD

# Plain SGD with the library's default hyper-parameters.
sgd = SGD()

# Small CNN over the feature vector: one convolution + pooling stage, then a
# 10-unit hidden layer with dropout and a 2-way softmax output.
# Presumably the input is laid out as (channels, rows, cols) = (1, n_features, 1)
# under the Theano backend -- confirm the image dim ordering.
model_cnn = Sequential([
    Convolution2D(10, 5, 1,
                  border_mode="valid",
                  activation="relu",
                  input_shape=(1, train_x.shape[1], 1)),
    MaxPooling2D(pool_size=(2,1)),
    Flatten(),
    Dense(10),
    Activation('relu'),
    Dropout(0.5),
    Dense(2),
    Activation('softmax'),
])

model_cnn.compile(loss="categorical_crossentropy", optimizer=sgd,metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of the compiled network.
model_cnn.summary()

In [None]:
# Train for 300 epochs with 20% of the training rows held out for validation.
# The returned history object feeds plot_history() below.
# NOTE(review): no random seed is set anywhere above, so runs are not reproducible.
history = model_cnn.fit(train_x_cat,train_y_cat,nb_epoch=300,validation_split=0.2)

# MODEL EVALUATION

In [None]:
# Show the accuracy and loss curves recorded during training.
plot_history(history)

In [None]:
# Accuracy on the held-out test set (fraction of argmax predictions equal to test_y).
get_test_acc(test_x_cat,test_y,model_cnn)