* 数据准备过程

In [1]:
from sklearn.model_selection import train_test_split

corpus = ['口味 喜欢', '不错 喜欢', '喜欢 喜欢', '菜量 实惠', '不错',
          '不怎么样', '后悔', '不喜欢', '难吃', '差评']
label = [1,1,1,1,1, 0,0,0,0,0]
X_train, X_test, y_train, y_test = train_test_split(corpus, label, 
                                                    test_size=0.4)
print ('X_train: ', X_train)
print ('y_train: ', y_train)
print ('X_test: ', X_test)
print ('y_test: ', y_test)

X_train:  ['后悔', '差评', '菜量 实惠', '不错', '不喜欢', '难吃']
y_train:  [0, 0, 1, 1, 0, 0]
X_test:  ['口味 喜欢', '不错 喜欢', '不怎么样', '喜欢 喜欢']
y_test:  [1, 1, 0, 1]


* 使用CountVectorizer提取特征
  - 函数定义与参数：https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
cvect = CountVectorizer()
X_train_c = cvect.fit_transform(X_train)
print(cvect.get_feature_names())
print(X_train_c.toarray())  

['不喜欢', '不错', '后悔', '实惠', '差评', '菜量', '难吃']
[[0 0 1 0 0 0 0]
 [0 0 0 0 1 0 0]
 [0 0 0 1 0 1 0]
 [0 1 0 0 0 0 0]
 [1 0 0 0 0 0 0]
 [0 0 0 0 0 0 1]]


* 使用TfidfVectorizer提取特征

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
tvect = TfidfVectorizer()
X_train_t = tvect.fit_transform(X_train)
print(tvect.get_feature_names())
print(X_train_t.toarray())

['不喜欢', '不错', '后悔', '实惠', '差评', '菜量', '难吃']
[[0.         0.         1.         0.         0.         0.
  0.        ]
 [0.         0.         0.         0.         1.         0.
  0.        ]
 [0.         0.         0.         0.70710678 0.         0.70710678
  0.        ]
 [0.         1.         0.         0.         0.         0.
  0.        ]
 [1.         0.         0.         0.         0.         0.
  0.        ]
 [0.         0.         0.         0.         0.         0.
  1.        ]]


In [4]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train_t, y_train)

train_accuracy = nb.score(X_train_t, y_train)
print (train_accuracy)

X_test_t = tvect.transform(X_test)
print(tvect.get_feature_names())
print (X_test_t.toarray())
y_predict = nb.predict(X_test_t)
print ('y_predict: ', y_predict)
print ('y_test', y_test)

1.0
['不喜欢', '不错', '后悔', '实惠', '差评', '菜量', '难吃']
[[0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0.]]
y_predict:  [0 1 0 0]
y_test [1, 1, 0, 1]


In [10]:
from sklearn import metrics
print(metrics.classification_report(y_test, y_predict))
metrics.confusion_matrix(y_test, y_predict)

              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       1.00      0.67      0.80         3

   micro avg       0.75      0.75      0.75         4
   macro avg       0.75      0.83      0.73         4
weighted avg       0.88      0.75      0.77         4



array([[1, 0],
       [1, 2]])