In [1]:
from utils import tokenize, load_curpus
import numpy as np

#### 加载数据

In [2]:
import pandas as pd

# Read both splits through the project helper `load_curpus`. Later cells
# iterate each corpus as (content, sentiment) pairs, where `content` is a
# sequence of string tokens.
train_data = load_curpus("weibo2018/train.txt")
test_data = load_curpus("weibo2018/test.txt")

# Tabular view of each split with one row per (content, sentiment) pair.
train_df, test_df = (
    pd.DataFrame(rows, columns=["content", "sentiment"])
    for rows in (train_data, test_data)
)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/dy/xjy0y7v97js5x1bghby2fnkm0000gn/T/jieba.cache
Loading model cost 0.901 seconds.
Prefix dict has been built succesfully.


In [3]:
# Stop-word list: one entry per line of stopwords.txt, with surrounding
# whitespace (including the trailing newline) stripped.
stopwords = []
with open("stopwords.txt", "r", encoding="utf8") as stopword_file:
    stopwords.extend(line.strip() for line in stopword_file)

#### 词袋模型

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Join every tokenised post into one space-separated string, train split
# first and test split second, so the vocabulary is built over both.
# NOTE(review): fitting the vocabulary on the test split as well lets test
# data influence feature selection; consider fitting on the train split only.
data_str = [
    " ".join(content)
    for corpus in (train_data, test_data)
    for content, sentiment in corpus
]

# The pattern must be a raw string: "\[" and "\w" are invalid escape sequences
# in a normal literal and trigger a warning on current Python versions.
# It keeps optional square brackets attached to a token, so bracketed tokens
# such as "[doge]" survive as single features.
vectorizer = CountVectorizer(
    token_pattern=r'\[?\w+\]?',
    stop_words=stopwords,
    max_features=1000,  # keep only the 1000 most frequent terms
)
vectorizer.fit_transform(data_str)

<10500x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 76762 stored elements in Compressed Sparse Row format>

In [5]:
# Turn the training split into model inputs: each post is re-joined into a
# space-separated token string and its label is collected in the same order.
# (The former `X, y = [], sentiment` assignment was never read and is removed.)
X_data, y_data = [], []
for content, sentiment in train_data:
    X_data.append(" ".join(content))
    y_data.append(sentiment)

# Bag-of-words counts from the already-fitted vectorizer.
X_train = vectorizer.transform(X_data)
y_train = y_data

In [6]:
# Turn the test split into model inputs: each post is re-joined into a
# space-separated token string and its label is collected in the same order.
# (The former `X, y = [], sentiment` assignment was never read and is removed.)
X_data, y_data = [], []
for content, sentiment in test_data:
    X_data.append(" ".join(content))
    y_data.append(sentiment)

# Bag-of-words counts from the already-fitted vectorizer.
X_test = vectorizer.transform(X_data)
y_test = y_data

#### XGBoost

In [7]:
from xgboost import XGBClassifier

xgb = XGBClassifier(
    n_estimators=200,      # 200 trees
    max_depth=10,          # maximum depth of each tree is 10
    subsample=0.7,         # subsample the training rows
    # Imbalanced classes: moderately lowering the positive-sentiment weight
    # helps improve overall accuracy.
    scale_pos_weight=0.5,
)

In [8]:
# Train the classifier on the bag-of-words training matrix and its labels.
# The call is the cell's last expression, so its return value is displayed.
xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints=None,
       learning_rate=0.300000012, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=nan, monotone_constraints=None,
       n_estimators=200, n_jobs=0, num_parallel_tree=1,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=0.5, subsample=0.7, tree_method=None,
       validate_parameters=False, verbosity=None)

In [9]:
# Predicted labels for the test matrix, used by the evaluation cell below.
result = xgb.predict(X_test)

  if diff:


#### 模型评估
XGBoost真是机器学习界的一大杀器，在这种简单任务上真是又快又好

In [10]:
from sklearn import metrics

# Per-class precision / recall / F1 on the test split.
report_text = metrics.classification_report(y_test, result)
print(report_text)

# Overall accuracy on the same predictions.
accuracy = metrics.accuracy_score(y_test, result)
print("准确率:", accuracy)

             precision    recall  f1-score   support

          0       0.76      0.86      0.81       155
          1       0.93      0.88      0.91       345

avg / total       0.88      0.87      0.88       500

准确率: 0.874


#### 查看权重最高的“重要特征”
越靠前，说明其越重要

In [11]:
# Vocabulary terms, positionally aligned with the vectorizer's output columns.
# scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
# get_feature_names(); prefer the new accessor and fall back on old releases
# so the cell runs on both.
if hasattr(vectorizer, "get_feature_names_out"):
    features = vectorizer.get_feature_names_out()
else:
    features = vectorizer.get_feature_names()

# One importance score per feature column, as reported by the fitted model.
weights = xgb.feature_importances_

In [12]:
# Column indices ordered from highest to lowest importance, limited to 50.
top_indices = weights.argsort()[::-1][:50]

# Print "<rank> :  <term>" for each, with rank starting at 0.
for rank, feature_idx in enumerate(top_indices):
    print(rank, ": ", features[feature_idx])

0 :  [心]
1 :  难受
2 :  哈哈哈
3 :  mmp
4 :  同济大学
5 :  呵呵
6 :  [微笑]
7 :  伤心
8 :  恶心
9 :  [doge]
10 :  逼
11 :  [哈哈]
12 :  去死吧
13 :  sb
14 :  分手
15 :  服
16 :  cnmd
17 :  [爱你]
18 :  [摊手]
19 :  [拜拜]
20 :  心碎
21 :  感谢
22 :  婊
23 :  完美
24 :  哒
25 :  骂
26 :  diss
27 :  不想
28 :  无语
29 :  委屈
30 :  [兔子]
31 :  [太开心]
32 :  [笑而不语]
33 :  哭哭
34 :  哈哈
35 :  [污]
36 :  [泪]
37 :  [怒]
38 :  愿
39 :  mlgb
40 :  [嘻嘻]
41 :  好看
42 :  滚
43 :  傻
44 :  [喵喵]
45 :  不行
46 :  [失望]
47 :  [偷笑]
48 :  丧
49 :  [憧憬]
