In [1]:
import numpy as np
import pandas as pd
import cPickle
from collections import defaultdict
import re

import gensim

from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.decomposition import TruncatedSVD

import keras
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten, Reshape
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.optimizers import Adadelta
from keras.constraints import unitnorm
from keras.regularizers import l2
from keras.utils import np_utils
from keras import callbacks

import lda

import matplotlib.pyplot as plt

from __future__ import division
%matplotlib inline

Using Theano backend.


# load basic data

In [28]:
# 载入原始数据
df0 = pd.read_csv('../data/training.1600000.processed.noemoticon.csv',header=None,names=['polarity','id','date','query','name','text'])
df1 = pd.read_csv("../data/train_text-norm.csv",header=None,names=['text'])
df0['text'] = df1['text']

np.random.seed(0)
# 划分训练集测试集
sss = StratifiedShuffleSplit(df0['polarity'], 3, test_size=0.5, random_state=0)
for train_index, test_index in sss:
    df0_train, df0_test = df0.iloc[train_index,:], df0.iloc[test_index,:]
    
df0_train_s = df0_train.sample(5000,random_state=0)
df0_test_s = df0_test.sample(5000,random_state=0)

df0_train_s.polarity[df0_train_s.polarity==4]=1
df0_test_s.polarity[df0_test_s.polarity==4]=1

# build user feature

In [3]:
# document matrix
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=1,stop_words='english')
tf = tf_vectorizer.fit_transform(np.append(df0_train_s['text'].values,df0_test_s['text'].values))
# vocab
vocab = tf_vectorizer.get_feature_names()

In [4]:
# 连接外部词库
positive_words = pd.read_table('../data/positive-words.txt',header=None).values.ravel()
negtive_words = pd.read_table('../data/negative-words.txt',header=None).values.ravel()

In [5]:
def get_number_words(s,option_words):
    return len(set(s.split()) & set(option_words))

In [6]:
# 0 means negative, 1 means positive
# train data
df0_train_s['pos_num'] = df0_train_s.text.apply(lambda s:get_number_words(s,positive_words))
df0_train_s['neg_num'] = df0_train_s.text.apply(lambda s:get_number_words(s,negtive_words))
df0_train_s['pos'] = df0_train_s['pos_num']-df0_train_s['neg_num']
# test data
df0_test_s['pos_num'] = df0_test_s.text.apply(lambda s:get_number_words(s,positive_words))
df0_test_s['neg_num'] = df0_test_s.text.apply(lambda s:get_number_words(s,negtive_words))
df0_test_s['pos'] = df0_test_s['pos_num']-df0_test_s['neg_num']

# build event feature

In [7]:
# 设定lda参数
n_topics = 10
n_iter = 1500

## 建立模型
lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter, random_state=1)
lda_model.fit(tf)  # fit train data tf  
## 输出模型的topic分类参数
topic_word = lda_model.topic_word_ 
n_top_words = 20
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-n_top_words:-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))



Topic 0: day happy good today hope going morning great weather night better birthday soon tomorrow nice weekend looking time ll
Topic 1: twitter know just thanks new got like don did won tweets hey doesn work lol sure sorry working need
Topic 2: like know lol don just good ll want say did thanks sorry really oh think haha yeah people im
Topic 3: http com bit twitpic ly tinyurl www th thanks tweet love plurk new blog miss pic says live check
Topic 4: quot love new just watching haha amp like wait song hope time good best girl read listening awesome movie
Topic 5: just good like going im lol amp think know watching eat home dont movie time got food tonight let
Topic 6: today just ve time oh amp got fun game day school missed guess old doing play week playing going
Topic 7: love lt really im miss lol amp thank just haha oh need night come sad going wish awesome tonight
Topic 8: just got bad like today feel think sick really hate right little sorry want hurts head need phone hair
Topic 9: 

In [12]:
print 'lda_model.doc_topic的大小是：',lda_model.doc_topic_.shape
# event feature for train data
ef_train = (lda_model.doc_topic_)[:len(df0_train_s),]
# event feature for test data
ef_test = (lda_model.doc_topic_)[len(df0_train_s):,]

lda_model.doc_topic的大小是： (10000L, 10L)


In [14]:
# ef_train_df
ef_train_df = pd.DataFrame(ef_train)
ef_train_df.columns = ["ef" + str(i) for i in xrange(0,10)] #***
ef_train_df.index=df0_train_s.index

# ef_test_df
ef_test_df = pd.DataFrame(ef_test)
ef_test_df.columns = ["ef" + str(i) for i in xrange(0,10)] #***
ef_test_df.index=df0_test_s.index

In [15]:
# df0
df0_train_s = pd.concat([df0_train_s,ef_train_df],axis=1)
df0_test_s = pd.concat([df0_test_s,ef_test_df],axis=1)

# w2v_ue feature

In [25]:
select_cols = [u'pos_num', u'neg_num', u'pos',u'ef0',u'ef1', u'ef2', u'ef3', u'ef4', u'ef5', u'ef6', u'ef7', u'ef8', u'ef9']

In [26]:
# 定义预处理函数
def build_data_train_test(data_train, data_test, train_ratio = 0.8, clean_string=True):
    """
    Loads data and split into train and test sets.
    """
    revs = []
    vocab = defaultdict(float)
    # Pre-process train data set
    for i in xrange(data_train.shape[0]):
        line = data_train['text'].iloc[i]
        y = data_train['polarity'].iloc[i]
        rev = []
        rev.append(line.strip())
        if clean_string:
            orig_rev = clean_str(' '.join(rev))
        else:
            orig_rev = ' '.join(rev).lower()
        words = set(orig_rev.split())
        for word in words:
            vocab[word] += 1
        datum  = {'y': y, 
                  'text': orig_rev,
                  'num_words': len(orig_rev.split()),
                  'ue':data_train.iloc[i,][select_cols].values,
                  'split': int(np.random.rand() < train_ratio)}
        revs.append(datum)
        
    # Pre-process test data set
    for i in xrange(data_test.shape[0]):
        line = data_test['text'].iloc[i]
        y = data_test['polarity'].iloc[i]
        rev = []
        rev.append(line.strip())
        if clean_string:
            orig_rev = clean_str(' '.join(rev))
        else:
            orig_rev = ' '.join(rev).lower()
        words = set(orig_rev.split())
        for word in words:
            vocab[word] += 1
        datum  = {'y': y, 
                  'text': orig_rev,
                  'num_words': len(orig_rev.split()),
                  'ue':data_test.iloc[i,][select_cols].values,
                  'split': -1}
        revs.append(datum)
        
    return revs, vocab

    
def get_W(word_vecs, k=300):
    """
    Get word matrix. W[i] is the vector for word indexed by i
    """
    vocab_size = len(word_vecs)
    word_idx_map = dict()
    W = np.zeros(shape=(vocab_size+1, k), dtype=np.float32)
    W[0] = np.zeros(k, dtype=np.float32)
    i = 1
    for word in word_vecs:
        W[i] = word_vecs[word]
        word_idx_map[word] = i
        i += 1
    return W, word_idx_map

def load_bin_vec(fname, vocab):
    """
    Loads 300x1 word vecs from Google (Mikolov) word2vec
    """
    word_vecs = {}
    with open(fname, 'rb') as f:
        header = f.readline()
        vocab_size, layer1_size = map(int, header.split())
        binary_len = np.dtype('float32').itemsize * layer1_size
        for line in xrange(vocab_size):
            word = []
            while True:
                ch = f.read(1)
                if ch == ' ':
                    word = ''.join(word)
                    break
                if ch != '\n':
                    word.append(ch)   
            if word in vocab:
                word_vecs[word] = np.fromstring(f.read(binary_len), dtype='float32')  
            else:
                f.read(binary_len)
    return word_vecs

def add_unknown_words(word_vecs, vocab, min_df=1, k=300):
    """
    For words that occur in at least min_df documents, create a separate word vector.    
    0.25 is chosen so the unknown vectors have (approximately) same variance as pre-trained ones
    """
    for word in vocab:
        if word not in word_vecs and vocab[word] >= min_df:
            word_vecs[word] = np.random.uniform(-0.25,0.25,k)  

def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)     
    string = re.sub(r"\'s", " \'s", string) 
    string = re.sub(r"\'ve", " \'ve", string) 
    string = re.sub(r"n\'t", " n\'t", string) 
    string = re.sub(r"\'re", " \'re", string) 
    string = re.sub(r"\'d", " \'d", string) 
    string = re.sub(r"\'ll", " \'ll", string) 
    string = re.sub(r",", " , ", string) 
    string = re.sub(r"!", " ! ", string) 
    string = re.sub(r"\(", " \( ", string) 
    string = re.sub(r"\)", " \) ", string) 
    string = re.sub(r"\?", " \? ", string) 
    string = re.sub(r"\s{2,}", " ", string)    
    return string.strip().lower()

def build_dict(model_word2vec,vocab):
    w2v = {}
    for i in model_word2vec.index2word:
        if i in vocab:
            w2v[i] = model_word2vec[i]
    return (w2v)

In [27]:
# 数据预处理
#w2v_file = '../GoogleNews-vectors-negative300.bin'  #谷歌w2v
revs, vocab = build_data_train_test(df0_train_s, df0_test_s, train_ratio=0.8, clean_string=True)  #获得文本信息及词汇信息
max_l = np.max(pd.DataFrame(revs)['num_words'])     #记录最长句子的单词量
print 'data loaded!'
print 'number of sentences: ' + str(len(revs))
print 'vocab size: ' + str(len(vocab))
print 'max sentence length: ' + str(max_l)
print 'loading word2vec vectors...',

# w2v构建
model_word2vec = gensim.models.Word2Vec.load("../model/model_word2vec.model")
w2v = build_dict(model_word2vec,vocab)
print 'word2vec loaded!'
print 'num words already in word2vec: ' + str(len(w2v))

add_unknown_words(w2v, vocab,k=500) #***
W, word_idx_map = get_W(w2v,k=500)  #****

# 数据存储*****
cPickle.dump([revs, W, word_idx_map, vocab], open('../data/twitter-train-val-test_ue.pickle', 'wb')) #****
print 'dataset created!'

data loaded!
number of sentences: 10000
vocab size: 18154
max sentence length: 34
loading word2vec vectors... word2vec loaded!
num words already in word2vec: 13440
dataset created!


# s2v_ue feature

In [21]:
# 构建sen_vec特征
def AvgWord2Vec(sentence, vec_size=500):
    global model_word2vec
    vector = np.zeros(vec_size)
    num = len(sentence)
    for word in sentence:
        try:
            vector += model_word2vec[word]
        except KeyError:
            num -= 1
    if num > 0:
        return (vector / num)
    else:
        return (vector)

model_word2vec = gensim.models.Word2Vec.load("../model/model_word2vec.model")

df0_train_s['sen_vec'] = map(lambda x: AvgWord2Vec(x)  ,map(lambda x: x.split(),df0_train_s['text']))
df0_test_s['sen_vec'] = map(lambda x: AvgWord2Vec(x)  ,map(lambda x: x.split(),df0_test_s['text']))

# save data

w2v_ue保存在：'../data/twitter-train-val-test_ue.pickle'

s2v_ue及其对应的原始数据保存在：'../data/df0_train_s_ue0.pkl'和'../data/df0_test_s_ue0.pkl'中

In [23]:
df0_train_s.to_pickle('../data/df0_train_s_ue0.pkl')
df0_test_s.to_pickle('../data/df0_test_s_ue0.pkl')