In [34]:
import pandas as pd
import jieba
from gensim.models.word2vec import LineSentence, Word2Vec
from scipy.linalg import norm
import gensim
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
import warnings
warnings.filterwarnings('ignore')

INFILE = './data/train_data.csv'
TRAIN_DATA_WORD = './data/word2vect/train_data_word.txt'
MODELFILE = './data/word2vect/movie_word2vect_model.bin'
STOPWORD_FILE = './data/stopwords.txt'
class WORD2VECT_LR():
    def __init__(self, infile, infile_encoding, mfile, swfile, swfile_encoding, dimension):
        #输入文件
        self.infile = infile
        #输入文件编码
        self.infile_encoding = infile_encoding
        #word2vect model文件
        self.mfile = mfile
        #停顿词文件
        self.swfile = swfile
        #停顿词文件编码
        self.swfile_encoding = swfile_encoding
        
        #vect的维度
        self.dimension = dimension
        
        #停顿词列表
        self.stopwords = []
        
        self.word2vect = None
        
        self.vect_list = []
    def jieba_tokenizer(self,x):
        words =  jieba.cut(x, cut_all=False)
        return " ".join(words)

    def get_stop_words(self):
        stopwords=[]
        with open(self.swfile,'r',encoding=self.infile_encoding) as fp:
            for line in fp.readlines():
                stopwords.append(line.strip())
            return stopwords
    
    def content2vect(self):
        '''
        输入文件转换成vect
        1）输入文件先切换成空格分割的词文件
        2）空格分割的句子文件转换成vect
        '''
        space = ' '
        if self.swfile:
            self.stopwords = get_stop_words(self.swfile, self.swfile_encoding)
        
        #1）输入文件先切换成空格分割的词文件
        f_word = open(TRAIN_DATA_WORD, 'w', encoding='utf-8')
        with open(self.infile, 'r', encoding=self.infile_encoding) as f:
            for line in f:
                line = line.split(',')
                words_list = [x for x in jieba.cut(line[1], cut_all=False) if x not in self.stopwords]
                f_word.write(space.join(words_list))
        f_word.close()
        
        #2）空格分割的句子文件转换成vect
        f_word = open(TRAIN_DATA_WORD, 'r', encoding='utf-8')
        model = Word2Vec(LineSentence(f_word), sg=0, size=self.dimension, window=5, min_count=5, workers=9)
        model.wv.save_word2vec_format(self.mfile, binary=True)
        f_word.close()
            

    def load_model(self):
        self.word2vect = gensim.models.KeyedVectors.load_word2vec_format(self.mfile, binary=True)

    def sentence_vector(self, s):
            words = [x for x in jieba.cut(s, cut_all=False) if x not in self.stopwords]
            v = np.zeros(self.dimension)
            cnt = 0
            for word in words:
                if word not in self.word2vect.wv.vocab:
                    continue
                else:
                    v += self.word2vect[word]

            v /= len(words)
            self.vect_list.append(v)
            
    def train(self):
        model = self.load_model()
        df = pd.DataFrame()
        df = pd.read_csv(INFILE, encoding="utf-8")
        df['comment'].apply(self.sentence_vector)
        X = np.array(self.vect_list)
    
        y = df['label']

        X_train, X_test, y_train, y_test  = train_test_split(X, y,test_size=0.1)

        lr = LogisticRegression()
        lr = lr.fit(X_train, y_train)
        
        y_pred = lr.predict(X_test)
        print('Test set accuracy: %3f' % lr.score(X_test, y_test))
        return lr

In [35]:
if __name__ == '__main__':
    model  = WORD2VECT_LR(INFILE, "utf_8", MODELFILE,STOPWORD_FILE , "utf-8", 128)
    #model.content2vect()
    model = model.train()
    

Test set accuracy: 0.687543
