In [19]:
import jieba
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# Data locations, relative to the notebook's working directory.
TRAIN_FILE = "./data/train_data.csv"                # labelled training comments
TEST_FILE = "./data/test_data.csv"                  # comments to predict labels for
SUBMIT_FILE = "./data/tfidf_lr_submission.csv"      # where predictions are written

class TF_IDF_LR:
    '''
    Text classifier built from jieba word segmentation, TF-IDF features and
    logistic regression.

    Expected call order: get_data() -> train() -> predict(). Each step stores
    its results on the instance for the next one (the train/test split, then
    the fitted vectorizer and model).
    '''

    def __init__(self, input_file, input_file_encoding, stop_words_file=False):
        '''
        input_file:          path of the labelled training CSV; it must provide
                             the `comment` and `label` columns
        input_file_encoding: text encoding used to read input_file
        stop_words_file:     stop-word file name, or False when there is none.
                             NOTE: the value is only stored; nothing in this
                             class reads it yet.
        '''
        self.input_file = input_file
        self.input_file_encoding = input_file_encoding
        self.stop_words_file = stop_words_file

    def load_data(self):
        '''
        Read the training CSV into a pandas DataFrame and return it.
        '''
        return pd.read_csv(self.input_file, encoding=self.input_file_encoding)

    def jieba_tokenizer(self, x):
        '''
        Segment one comment with jieba (cut_all=True) and return the words
        joined by single spaces.

        Missing values (None, or the float NaN that pandas yields for an empty
        CSV cell) give an empty string instead of failing inside jieba.
        '''
        if x is None or (isinstance(x, float) and x != x):
            return ''
        words = jieba.cut(x, cut_all=True)
        return " ".join(words)

    def get_data(self, test_size=0.1, random_state=None):
        '''
        Load the data, segment the `comment` column and split it into train
        and test parts, stored as self.X_train, self.X_test, self.y_train and
        self.y_test.

        test_size:    forwarded to train_test_split (default 0.1)
        random_state: forwarded to train_test_split; leave as None for a new
                      random split on every call, or pass an int to make the
                      split (and the accuracy printed by train) repeatable
        '''
        # NOTE(review): train_test_split comes from the notebook's import of
        # sklearn.cross_validation, a module newer scikit-learn releases no
        # longer ship; sklearn.model_selection provides the same function.
        df = self.load_data()
        X = df['comment'].apply(self.jieba_tokenizer)
        y = df['label']
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state)

    def train(self, max_features=10000):
        '''
        Fit TF-IDF + logistic regression on the training split, print the
        accuracy on the held-out split, and keep the fitted objects in
        self.tfidf_vec and self.model. get_data() must have been called first.

        max_features: vocabulary size cap for the vectorizer (None = no cap)
        '''
        # Held-out accuracy recorded while tuning max_features. Every run used
        # a fresh unseeded split, so the figures are not strictly comparable:
        #   None 0.812344 |   500 0.766281 |  1000 0.772181 |  1500 0.794872
        #   2000 0.795326 |  3000 0.804402 |  4000 0.804629 |  5000 0.809621
        #   6000 0.811436 |  7000 0.810302 |  8000 0.811436 |  9000 0.815975
        #  10000 0.811436 | 11000 0.818017 | 12000 0.815067 | 15000 0.812798
        #
        # A stop-word set of the most frequent, non-informative words
        # ({'的','了','我','看','电影','在'}) was tried and not kept, so no stop
        # words are used. None is the vectorizer's documented "no stop words"
        # value; the previous empty dict behaved the same but is not a
        # supported type.
        # NOTE(review): with the default token_pattern, scikit-learn keeps only
        # tokens of two or more characters, so single-character words are
        # already dropped before n-grams are built.
        self.tfidf_vec = TfidfVectorizer(smooth_idf=True,   # documented as a bool
                                         analyzer='word',
                                         encoding='utf-8',
                                         preprocessor=None,
                                         ngram_range=(1, 2),
                                         max_features=max_features,
                                         stop_words=None)

        # Learn the vocabulary/IDF on the training text only, then reuse it
        # to vectorize the held-out text.
        X_tfidf_train = self.tfidf_vec.fit_transform(self.X_train)
        X_tfidf_test = self.tfidf_vec.transform(self.X_test)

        lr = LogisticRegression()
        lr = lr.fit(X_tfidf_train, self.y_train)

        print('Test set accuracy: %3f' % lr.score(X_tfidf_test, self.y_test))

        self.model = lr

    def predict(self, test_file, submit_file, encoding='utf-8'):
        '''
        Predict a label for every row of test_file and write the result to
        submit_file as a CSV with a single `label` column (no index column).
        train() must have been called first.

        test_file:   CSV with a header row and a `comment` column
        submit_file: output CSV path
        encoding:    text encoding of test_file (default 'utf-8')
        '''
        test_df = pd.read_csv(test_file, encoding=encoding, header=0)

        # Segment each comment the same way the training text was segmented.
        X = test_df['comment'].apply(self.jieba_tokenizer)

        # Vectorize with the vocabulary fitted in train().
        X_vect = self.tfidf_vec.transform(X)

        y_pred = self.model.predict(X_vect)

        res = pd.DataFrame({'label': y_pred})
        res.to_csv(submit_file, index=False)

In [20]:
if __name__ == '__main__':
    # End-to-end run: split the training data, fit the model (this prints the
    # held-out accuracy), then write predictions for the test file.
    tf_idf = TF_IDF_LR(input_file=TRAIN_FILE, input_file_encoding='utf-8')
    tf_idf.get_data()
    tf_idf.train()
    tf_idf.predict(test_file=TEST_FILE, submit_file=SUBMIT_FILE)

Test set accuracy: 0.825505


In [31]:
# Gauge how far this run's predictions are from the previously submitted
# xlgao_submission.csv, which scored roughly 0.82 accuracy.
# If the two files differ too much, this run is likely worse than the earlier
# submission and is not worth submitting.
# NOTE(review): calculate_accurate is a project helper not shown here; it
# presumably returns the share of rows on which the two files agree — confirm
# in utils. file1 is the same path as SUBMIT_FILE.

from utils import calculate_accurate
file1 = "./data/tfidf_lr_submission.csv"
file2 = "./data/xlgao_submission.csv"

calculate_accurate(file1, file2)
    

0.9512
