# 中文情感预测

In [1]:
#!/usr/bin/env python
# -*- coding: utf-8  -*-
import sys
import jieba # 用于中文分词
import pandas

# 默认提供的数据集data.csv在macOS下直接预读乱码，转存utf-8一份
SORUCE_FILE = "clean_data.csv"

In [3]:
# 从源数据中主要提取comment列和sentiment列
source_file_df =  pandas.read_csv(SORUCE_FILE, header=None, names=['a', 'b', 'comment', 'frequence', 'sentiment'])

In [4]:
source_file_df.head()

Unnamed: 0,a,b,comment,frequence,sentiment
0,1,10558400000.0,东西好吃,41,1
1,2,10558400000.0,味道可以,37,1
2,3,10558400000.0,贵,35,0
3,4,10558400000.0,超快送达,35,1
4,5,10558400000.0,松子太好吃,32,1


In [5]:
# 以comment列内容为属性
X = source_file_df[['comment']]
# 以sentiment列内容为lable，分类只有两类0消极或1积极
y = source_file_df.sentiment
X.shape, y.shape
# print(X, y)

((8854, 1), (8854,))

In [6]:
# 调包侠关键步骤：使用jieba抽取comment列的内容进行分词，分词结果放到cutted_comment列中
X['cutted_comment'] = X.comment.apply(lambda x: " ".join(jieba.cut(x)))
# 可以看出comment列分词后放到了cutted_comment
X.head()

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/h3/xhgwsrzx56vclhvlkydknskw0000gn/T/jieba.cache
Loading model cost 0.896 seconds.
Prefix dict has been built succesfully.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,comment,cutted_comment
0,东西好吃,东西 好吃
1,味道可以,味道 可以
2,贵,贵
3,超快送达,超快 送达
4,松子太好吃,松子 太 好吃


In [7]:
# 使用train_test_split将数据集切分，按照作业要求取6000条数据用于训练
# 肉眼看train_test_split默认给打乱数据集了？？？？
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=6000)

In [8]:
# 6000个训练属性，对应6000个标签
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6000, 2), (2854, 2), (6000,), (2854,))

In [9]:
# 文本向量化处理，sklearntt提供两个方法：CountVectorizer TfidfVectorizer 
# 选择CountVectorizer，原因：1. 数据集似乎是精心准备，不用做复杂的停用词处理 2. 搜到的多数教程以CountVectorizer为例，选此少踩坑
# 主要参考博客 https://www.cnblogs.com/Lin-Yi/p/8974108.html
from sklearn.feature_extraction.text import CountVectorizer
count_vec = CountVectorizer()


In [10]:
x_count_train = count_vec.fit_transform(X_train.cutted_comment)
x_count_test = count_vec.transform(X_test.cutted_comment)

In [11]:
# 使用朴素贝叶斯分类器  分别对两种提取出来的特征值进行学习和预测
from sklearn.naive_bayes import MultinomialNB
mnb_count = MultinomialNB()
mnb_count.fit(x_count_train, y_train)   # 学习
mnb_count_y_predict = mnb_count.predict(x_count_test) #预测

In [12]:
mnb_count.score(x_count_test, y_test)

0.9663629992992292

In [13]:
from sklearn.naive_bayes import BernoulliNB
bn_count = BernoulliNB()
bn_count.fit(x_count_train, y_train)
bn_count_y_predict = bn_count.predict(x_count_test)
bn_count.score(x_count_test, y_test)

0.9632095304835319

In [14]:
# sklearn.naive_bayes.ComplementNB
from sklearn.naive_bayes import ComplementNB
cnb_count = ComplementNB()
cnb_count.fit(x_count_train, y_train)
cnb_count_y_predict = cnb_count.predict(x_count_test)
cnb_count.score(x_count_test, y_test)

0.9081990189208129