In [1]:
import numpy as np
import pandas as pd
import cPickle
from collections import defaultdict
import re

import gensim

from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.decomposition import TruncatedSVD

import keras
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten, Reshape
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.optimizers import Adadelta
from keras.constraints import unitnorm
from keras.regularizers import l2
from keras.utils import np_utils
from keras import callbacks

import lda

import matplotlib.pyplot as plt

from __future__ import division
%matplotlib inline

Using Theano backend.


# load data

## df

In [2]:
# df

# Load the train and test DataFrames from their pickle files (train first).
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df0_train_s, df0_test_s = map(
    pd.read_pickle,
    ['../data/df0_train_s_ue0.pkl', '../data/df0_test_s_ue0.pkl']
)

## datasets

In [3]:
def get_idx_from_sent(sent, word_idx_map, max_l, kernel_size=5):
    """
    Convert a sentence into a zero-padded list of word indices.

    The result starts with ``kernel_size - 1`` zeros, followed by the index
    of every whitespace-separated token of ``sent`` that is a key of
    ``word_idx_map`` (unknown tokens are dropped), and is then right-padded
    with zeros until it holds ``max_l + 2 * (kernel_size - 1)`` entries.
    A sentence with more than ``max_l`` known tokens is not truncated, so
    the result can be longer than that.
    """
    margin = kernel_size - 1
    target_len = max_l + 2 * margin

    # Leading padding, then the indices of the known tokens in order.
    indices = [0 for _ in range(margin)]
    indices.extend(word_idx_map[tok] for tok in sent.split() if tok in word_idx_map)

    # Trailing padding up to the fixed target length.
    while len(indices) < target_len:
        indices.append(0)
    return indices

def make_idx_data(revs, word_idx_map, max_l, kernel_size=5):
    """
    Build the train / validation / test index matrices from ``revs``.

    Each entry of ``revs`` is read through its ``'text'``, ``'ue'``, ``'y'``
    and ``'split'`` keys. A row consists of the padded word indices of the
    text (see ``get_idx_from_sent``), followed by the values of ``rev['ue']``
    and finally the label ``rev['y']`` as the last element.

    Rows go to the training set when ``rev['split'] == 1``, to the
    validation set when it equals ``0`` and to the test set otherwise.

    Returns ``[train, val, test]`` as integer NumPy arrays; the integer
    dtype is forced, so non-integer row values are cast during conversion.
    """
    train, val, test = [], [], []
    for rev in revs:
        row = get_idx_from_sent(rev['text'], word_idx_map, max_l, kernel_size)
        row.extend(rev['ue'])   # extra features appended after the word indices
        row.append(rev['y'])    # label is always the last column
        if rev['split'] == 1:
            train.append(row)
        elif rev['split'] == 0:
            val.append(row)
        else:
            test.append(row)
    # Builtin ``int`` is what the ``np.int`` alias pointed to; the alias is
    # deprecated since NumPy 1.20 and removed in 1.24.
    return [np.array(rows, dtype=int) for rows in (train, val, test)]

# load data
print "loading data..."
x = cPickle.load(open("../data/twitter-train-val-test_ue.pickle", "rb"))
revs, W, word_idx_map, vocab = x[0], x[1], x[2], x[3]
print "data loaded!"

print 'max_l should be set :', np.max(pd.DataFrame(revs)['num_words'])  

loading data...
data loaded!
max_l should be set : 34


In [4]:
# Derive the maximum sentence length from the loaded data instead of
# hard-coding it, so it cannot drift out of sync with the pickle contents.
max_l = int(np.max(pd.DataFrame(revs)['num_words']))
datasets = make_idx_data(revs, word_idx_map, max_l=max_l, kernel_size=5)

In [5]:
# Train data preparation
N = datasets[0].shape[0]
conv_input_width = W.shape[1]
conv_input_height = int(datasets[0].shape[1]-1)

# For each word write a word index (not vector) to X tensor
train_X = np.zeros((N, conv_input_height), dtype=np.int)
train_Y = np.zeros((N, 2), dtype=np.int)
for i in xrange(N):
    for j in xrange(conv_input_height):
        train_X[i, j] = datasets[0][i, j]
    train_Y[i, datasets[0][i, -1]] = 1
    
print 'train_X.shape = {}'.format(train_X.shape)
print 'train_Y.shape = {}'.format(train_Y.shape)
print '----------------------------------------------------------'


# Validation data preparation
Nv = datasets[1].shape[0]

# For each word write a word index (not vector) to X tensor
val_X = np.zeros((Nv, conv_input_height), dtype=np.int)
val_Y = np.zeros((Nv, 2), dtype=np.int)
for i in xrange(Nv):
    for j in xrange(conv_input_height):
        val_X[i, j] = datasets[1][i, j]
    val_Y[i, datasets[1][i, -1]] = 1
    
print 'val_X.shape = {}'.format(val_X.shape)
print 'val_Y.shape = {}'.format(val_Y.shape)
print '----------------------------------------------------------'



# Test data preparation
Nt = datasets[2].shape[0]

# For each word write a word index (not vector) to X tensor
test_X = np.zeros((Nt, conv_input_height), dtype=np.int)
test_Y = np.zeros((Nt, 2), dtype=np.int)
for i in xrange(Nt):
    for j in xrange(conv_input_height):
        test_X[i, j] = datasets[2][i, j]
    test_Y[i, datasets[2][i, -1]] = 1
    
print 'test_X.shape = {}'.format(test_X.shape)
print 'test_Y.shape = {}'.format(test_Y.shape)
print '----------------------------------------------------------'

# Other Information
max_l = np.max(pd.DataFrame(revs)['num_words'])     #记录最长句子的单词量
print 'number of sentences: ' + str(len(revs))
print 'vocab size: ' + str(len(vocab))
print 'max sentence length: ' + str(max_l)
print 'W shape',W.shape
print 'word_idx_map length',len(word_idx_map)
print 'conv_input_height',conv_input_height
print 'conv_input_width',conv_input_width

train_X.shape = (4028L, 55L)
train_Y.shape = (4028L, 2L)
----------------------------------------------------------
val_X.shape = (972L, 55L)
val_Y.shape = (972L, 2L)
----------------------------------------------------------
test_X.shape = (5000L, 55L)
test_Y.shape = (5000L, 2L)
----------------------------------------------------------
number of sentences: 10000
vocab size: 18154
max sentence length: 34
W shape (18155L, 500L)
word_idx_map length 18154
conv_input_height 55
conv_input_width 500


# pure UE

In [6]:
# data

select_cols = [u'pos_num', u'neg_num', u'pos',u'ef0',u'ef1', u'ef2', u'ef3', u'ef4', u'ef5', u'ef6', u'ef7', u'ef8', u'ef9']

train_x = df0_train_s[select_cols]
train_y = np.array(df0_train_s['polarity'])

test_x = df0_test_s[select_cols]
test_y = np.array(df0_test_s['polarity'])

print 'train_x shape:',train_x.shape
print 'train_y shape:',train_y.shape

print 'test_x shape:',test_x.shape
print 'test_y shape:',test_y.shape

train_x shape: (5000, 13)
train_y shape: (5000L,)
test_x shape: (5000, 13)
test_y shape: (5000L,)


## svm

In [7]:
model_svm = SVC()
model_svm.fit(train_x,train_y)

train_acc_ue_svm = model_svm.score(train_x,train_y)
test_acc_ue_svm = model_svm.score(test_x,test_y)

print 'Traing Accuracy:',train_acc_ue_svm
print 'Testing Accuracy:',test_acc_ue_svm

Traing Accuracy: 0.6524
Testing Accuracy: 0.6512


## lr

In [8]:
model_lr = LogisticRegression()
model_lr.fit(train_x,train_y)

train_acc_ue_lr = model_lr.score(train_x,train_y)
test_acc_ue_lr = model_lr.score(test_x,test_y)

print 'Traing Accuracy:',train_acc_ue_lr
print 'Testing Accuracy:',test_acc_ue_lr

Traing Accuracy: 0.668
Testing Accuracy: 0.659


## rf

In [9]:
model_rf = RandomForestClassifier()
model_rf.fit(train_x,train_y)

train_acc_ue_rf = model_rf.score(train_x,train_y)
test_acc_ue_rf = model_rf.score(test_x,test_y)

print 'Traing Accuracy:',model_rf.score(train_x,train_y)
print 'Testing Accuracy:',model_rf.score(test_x,test_y)

Traing Accuracy: 0.9264
Testing Accuracy: 0.612


# w2v_UE

In [10]:
# data

train_Y_label = np.argmax(train_Y,axis=1)
test_Y_label = np.argmax(test_Y,axis=1)

print 'train_X shape:',train_X.shape
print 'train_Y_label shape:',train_Y_label.shape

print 'test_X shape:',test_X.shape
print 'test_Y_label shape:',test_Y_label.shape

train_X shape: (4028L, 55L)
train_Y_label shape: (4028L,)
test_X shape: (5000L, 55L)
test_Y_label shape: (5000L,)


## svm

In [11]:
model_svm = SVC()
model_svm.fit(train_X,train_Y_label)

train_acc_w2v_svm = model_svm.score(train_X,train_Y_label)
test_acc_w2v_svm = model_svm.score(test_X,test_Y_label)

print 'Traing Accuracy:',train_acc_w2v_svm
print 'Testing Accuracy:',test_acc_w2v_svm

Traing Accuracy: 1.0
Testing Accuracy: 0.506


## lr

In [12]:
model_lr = LogisticRegression()
model_lr.fit(train_X,train_Y_label)

train_acc_w2v_lr = model_lr.score(train_X,train_Y_label)
test_acc_w2v_lr = model_lr.score(test_X,test_Y_label)

print 'Traing Accuracy:',train_acc_w2v_lr
print 'Testing Accuracy:',test_acc_w2v_lr

Traing Accuracy: 0.54865938431
Testing Accuracy: 0.5388


## rf

In [13]:
model_rf = RandomForestClassifier()
model_rf.fit(train_X,train_Y_label)

train_acc_w2v_rf = model_rf.score(train_X,train_Y_label)
test_acc_w2v_rf = model_rf.score(test_X,test_Y_label)

print 'Traing Accuracy:',train_acc_ue_rf
print 'Testing Accuracy:',test_acc_ue_rf

Traing Accuracy: 0.9264
Testing Accuracy: 0.612


# s2v_UE

In [14]:
# data

select_cols = [u'pos_num', u'neg_num', u'pos', u'ef0', u'ef1', u'ef2', u'ef3', u'ef4', u'ef5', u'ef6', u'ef7', u'ef8', u'ef9']

train_x = np.c_[df0_train_s[select_cols].values, np.array((df0_train_s['sen_vec']).tolist())]
train_y = np.array(df0_train_s['polarity'])

test_x =  np.c_[df0_test_s[select_cols].values, np.array((df0_test_s['sen_vec']).tolist())]
test_y = np.array(df0_test_s['polarity'])

print 'train_x shape:',train_x.shape
print 'train_y shape:',train_y.shape

print 'test_x shape:',test_x.shape
print 'test_y shape:',test_y.shape

train_x shape: (5000L, 513L)
train_y shape: (5000L,)
test_x shape: (5000L, 513L)
test_y shape: (5000L,)


## svm

In [15]:
model_svm = SVC()
model_svm.fit(train_x,train_y)

train_acc_s2v_svm = model_svm.score(train_x,train_y)
test_acc_s2v_svm = model_svm.score(test_x,test_y)

print 'Traing Accuracy:',train_acc_s2v_svm
print 'Testing Accuracy:',test_acc_s2v_svm

Traing Accuracy: 0.6588
Testing Accuracy: 0.659


## lr

In [16]:
model_lr = LogisticRegression()
model_lr.fit(train_x,train_y)

train_acc_s2v_lr = model_lr.score(train_x,train_y)
test_acc_s2v_lr = model_lr.score(test_x,test_y)

print 'Traing Accuracy:',train_acc_s2v_lr
print 'Testing Accuracy:',test_acc_s2v_lr

Traing Accuracy: 0.7352
Testing Accuracy: 0.7306


## rf

In [17]:
model_rf = RandomForestClassifier()
model_rf.fit(train_x,train_y)

train_acc_s2v_rf = model_rf.score(train_x,train_y)
test_acc_s2v_rf = model_rf.score(test_x,test_y)

print 'Traing Accuracy:',model_rf.score(train_x,train_y)
print 'Testing Accuracy:',model_rf.score(test_x,test_y)

Traing Accuracy: 0.9886
Testing Accuracy: 0.6642


# all statistics

In [18]:
# Four parallel lists, one entry per (feature set, classifier) pair:
# feature sets vary slowest, classifiers cycle svm -> lr -> rf.
data = [
    'ue', 'ue', 'ue',
    'w2v_ue', 'w2v_ue', 'w2v_ue',
    's2v_ue', 's2v_ue', 's2v_ue'
]
method = [
    'svm', 'lr', 'rf',
    'svm', 'lr', 'rf',
    'svm', 'lr', 'rf'
]
train = [
    train_acc_ue_svm, train_acc_ue_lr, train_acc_ue_rf,
    train_acc_w2v_svm, train_acc_w2v_lr, train_acc_w2v_rf,
    train_acc_s2v_svm, train_acc_s2v_lr, train_acc_s2v_rf
]
test = [
    test_acc_ue_svm, test_acc_ue_lr, test_acc_ue_rf,
    test_acc_w2v_svm, test_acc_w2v_lr, test_acc_w2v_rf,
    test_acc_s2v_svm, test_acc_s2v_lr, test_acc_s2v_rf
]

In [19]:
# Collect the parallel result lists into one summary table.
df_stat = pd.DataFrame({
    'data': data,
    'method': method,
    'train_acc': train,
    'test_acc': test
})

In [20]:
# Show the summary with an explicit column order.
df_stat.loc[:, ['data', 'method', 'train_acc', 'test_acc']]

Unnamed: 0,data,method,train_acc,test_acc
0,ue,svm,0.6524,0.6512
1,ue,lr,0.668,0.659
2,ue,rf,0.9264,0.612
3,w2v_ue,svm,1.0,0.506
4,w2v_ue,lr,0.548659,0.5388
5,w2v_ue,rf,0.983863,0.6104
6,s2v_ue,svm,0.6588,0.659
7,s2v_ue,lr,0.7352,0.7306
8,s2v_ue,rf,0.9886,0.6642


In [21]:
# Persist the accuracy summary table as a pickle file.
df_stat.to_pickle('../data/df_stat.pkl')