In [1]:
from sklearn.model_selection import train_test_split
from gensim.models.word2vec import Word2Vec
import numpy as np
import pandas as pd
import jieba
from sklearn.externals import joblib
from sklearn.svm import SVC

import sys
import imp
imp.reload(sys)

<module 'sys' (built-in)>

In [2]:
# 加载文件，导入数据,分词
def loadfile():
    neg = pd.read_excel('data/neg.xls',header=None,index=None)
    pos = pd.read_excel('data/pos.xls',header=None,index=None)
    # neg[0] 和 pos[0] 数据（成列，一列一条文本）
    
    
    cw = lambda x: list(jieba.cut(x))
    pos['words'] = pos[0].apply(cw)
    neg['words'] = neg[0].apply(cw)
    # neg['words‘] 和 pos['words'] 存放分词后的结果，一条文本处理后是一个list
    
    # use 1 for positive sentiment, 0 for negative，得到所有两类样本的labels
    y = np.concatenate((np.ones(len(pos)), np.zeros(len(neg))))

    # 按给定比例随机划分训练集和测试集
    x_train, x_test, y_train, y_test = train_test_split(np.concatenate((pos['words'], neg['words'])), y, test_size=0.2)
    
    np.save('svm_data/y_train.npy',y_train)
    np.save('svm_data/y_test.npy',y_test)
    return x_train,x_test

In [3]:
# x_train,x_test=loadfile()

In [4]:
# 对每个句子的所有词向量取均值作为每个评论的输入: sum each word's (1, n_dim) ,then divide it by num of words in a text
# 这样做忽略了单词之间的排序顺序对情感分析的影响
def buildWordVector(text, size, imdb_w2v):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in text:
        try:
            vec += imdb_w2v[word].reshape((1, size))    # (300,) -> (1, 300)
            count += 1.
        except KeyError:
            continue
    if count != 0:
        vec /= count
    return vec

# 训练词向量，分别对x_train 和 x_test 得到各自 (num_examples, n_dim) 的映射矩阵
def get_train_vecs(x_train,x_test):
    n_dim = 300
    # Initialize model and build vocab
    imdb_w2v = Word2Vec(size=n_dim, min_count=10)
    imdb_w2v.build_vocab(x_train)
    
    # Train the model over train_reviews (this may take several minutes)
    imdb_w2v.train(sentences=x_train, total_examples=imdb_w2v.corpus_count, epochs=imdb_w2v.epochs)
    
    train_vecs = np.concatenate([buildWordVector(z, n_dim,imdb_w2v) for z in x_train])
    # train_vecs = scale(train_vecs)
    
    np.save('svm_data/train_vecs.npy',train_vecs)
    print(train_vecs.shape)
    
    # Train word2vec on test tweets
    imdb_w2v.train(sentences=x_test, total_examples=imdb_w2v.corpus_count, epochs=imdb_w2v.epochs)
    imdb_w2v.save('svm_data/w2v_model/w2v_model.pkl')
     
    # Build test tweet vectors then scale
    test_vecs = np.concatenate([buildWordVector(z, n_dim,imdb_w2v) for z in x_test])
    
    # test_vecs = scale(test_vecs)
    np.save('svm_data/test_vecs.npy',test_vecs)
    
    print(test_vecs.shape)
    print('data preprocessing and word embedding finished...')

至此完成所有预处理和embedding，并将相应的参数矩阵保存

In [5]:
def get_data():
    train_vecs = np.load('svm_data/train_vecs.npy')
    y_train = np.load('svm_data/y_train.npy')
    test_vecs = np.load('svm_data/test_vecs.npy')
    y_test = np.load('svm_data/y_test.npy') 
    return train_vecs,y_train,test_vecs,y_test

In [6]:
# # explore the data
# train_vecs,y_train,test_vecs,y_test = get_data()
# print(x_train.shape)
# print(x_test.shape)
# print(train_vecs.shape)
# print(test_vecs.shape)
# print(y_train.shape)
# print(y_test.shape)

In [7]:
##训练svm模型
def svm_train(train_vecs,y_train,test_vecs,y_test):
    clf = SVC(kernel='rbf',verbose=True)
    clf.fit(train_vecs,y_train)
    joblib.dump(clf, 'svm_data/svm_model/model.pkl')
    print(clf.score(test_vecs,y_test))
    
    
##得到待预测单个句子的词向量    
def get_predict_vecs(words):
    n_dim = 300
    imdb_w2v = Word2Vec.load('svm_data/w2v_model/w2v_model.pkl')
    train_vecs = buildWordVector(words, n_dim,imdb_w2v)

    return train_vecs
    
####对单个句子进行情感判断    
def svm_predict(string):
    words = jieba.lcut(string)   # 先把要分析的句子进行分词
    words_vecs = get_predict_vecs(words)  # 对分词后的结果求词向量
    clf = joblib.load('svm_data/svm_model/model.pkl')
     
    result = clf.predict(words_vecs)
    
    if int(result[0]) == 1:
        print(string,' positive')
    else:
        print(string,' negative')


In [8]:
def train():
    x_train,x_test=loadfile() #得到句子分词后的结果，并把类别标签保存为y_train。npy,y_test.npy
    get_train_vecs(x_train,x_test) #计算词向量并保存为train_vecs.npy,test_vecs.npy
    train_vecs,y_train,test_vecs,y_test=get_data()#导入训练数据和测试数据
    svm_train(train_vecs,y_train,test_vecs,y_test)#训练svm并保存模型
    

if __name__=='__main__':
    
    train()
    
    # 对输入句子情感进行判断
    string='电池充完了电连手机都打不开.简直烂的要命.真是金玉其外,败絮其中!连5号电池都不如'
    #string='牛逼的手机，从3米高的地方摔下去都没坏，质量非常好'    
    svm_predict(string)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/5y/czdghcd91q3gnr9rmh8ft7040000gn/T/jieba.cache
Loading model cost 0.886 seconds.
Prefix dict has been built succesfully.
  


(16884, 300)
(4221, 300)
data preprocessing and word embedding finished...
[LibSVM]0.8012319355602938
电池充完了电连手机都打不开.简直烂的要命.真是金玉其外,败絮其中!连5号电池都不如  negative
