# 语料加载

In [1]:
import jieba
import random
import pandas as pd

In [3]:
# Read the stop-word list (one entry per line) and keep only the raw values.
# quoting=3 is csv.QUOTE_NONE, so quote characters are treated as ordinary text.
stopwords = pd.read_csv(
    "data/stopwords.txt",
    sep="\t",
    names=["stopword"],
    index_col=False,
    quoting=3,
    encoding="utf-8",
)["stopword"].values

In [7]:
# Load the four labelled corpora, one CSV file per category.
laogong_df, laopo_df, erzi_df, nver_df = [
    pd.read_csv(csv_path, encoding="utf-8")
    for csv_path in (
        "data/beilaogongda.csv",
        "data/beilaopoda.csv",
        "data/beierzida.csv",
        "data/beinverda.csv",
    )
]

In [9]:
# Discard every row that contains a missing value, rebinding each name to the
# cleaned frame instead of mutating the loaded frame in place.
laogong_df = laogong_df.dropna()
laopo_df = laopo_df.dropna()
erzi_df = erzi_df.dropna()
nver_df = nver_df.dropna()

In [24]:
# Pull the `segment` text column out of each frame as a plain Python list.
laogong, laopo, erzi, nver = [
    frame["segment"].values.tolist()
    for frame in (laogong_df, laopo_df, erzi_df, nver_df)
]

# 分词和去停用词

In [34]:
def preprocess_text(content_lines, sentences, category, stop_words=None, tokenizer=None):
    """Tokenize raw lines and append cleaned ``(text, category)`` pairs to ``sentences``.

    Each line is segmented into tokens. A token is dropped when it is
    whitespace-only, purely numeric, a single character long, or a stop word.
    The surviving tokens are joined with single spaces.

    Args:
        content_lines: Iterable of raw text lines.
        sentences: List that is extended in place with ``(text, category)`` tuples.
        category: Label stored alongside every line of ``content_lines``.
        stop_words: Optional iterable of tokens to discard. When omitted, the
            module-level ``stopwords`` is used.
        tokenizer: Optional callable mapping one line to a list of tokens.
            When omitted, ``jieba.lcut`` is used.

    Returns:
        None. Results are delivered through ``sentences``.

    A line whose processing raises is reported with ``print`` and skipped, so
    one bad record does not abort the whole corpus.
    """
    if tokenizer is None:
        tokenizer = jieba.lcut
    # Build the lookup once per call: a set gives constant-time membership
    # tests, whereas `in` on an array compares against every entry per token.
    stop_set = set(stopwords if stop_words is None else stop_words)

    for line in content_lines:
        try:
            # Single pass over the tokens applying all four filters.
            kept = [
                tok
                for tok in tokenizer(line)
                if tok.strip()                  # drop whitespace-only tokens
                and not str(tok).isdigit()      # drop pure numbers
                and len(tok) > 1                # drop single-character tokens
                and tok not in stop_set         # drop stop words
            ]
            sentences.append((" ".join(kept), category))
        except Exception as e:
            # Deliberate best effort: report the offending line and move on.
            print(line, str(e))
            continue

In [35]:
# Collect every cleaned line; the position of a corpus in the tuple is its label
# (0 = laogong, 1 = laopo, 2 = erzi, 3 = nver).
sentences = []
for category_id, corpus_lines in enumerate((laogong, laopo, erzi, nver)):
    preprocess_text(corpus_lines, sentences, category_id)

In [36]:
# Shuffle with a dedicated, seeded generator so the order is identical on every
# run and the global `random` state is left untouched. Without a fixed seed the
# input order to any later seeded split would still change between runs.
random.Random(123).shuffle(sentences)

In [37]:
# Preview the first five (text, label) pairs after shuffling.
sentences[:5]

[('报警 人称 女儿 民警 到场', 3),
 ('报警 女儿 持械 民警 到场', 3),
 ('报警 儿子 持械 民警 到场', 2),
 ('报警 人称 民警 走后 老公 民警 到场', 0),
 ('报警 妻子 一拳 无需 救护 民警 携带 防护 设备', 1)]

# 抽取词袋特征（词频向量）

In [39]:
from sklearn.feature_extraction.text import CountVectorizer

# Word-level bag-of-words counts with the vocabulary capped at 4000 entries.
vec = CountVectorizer(analyzer="word", max_features=4000)

In [40]:
from sklearn.model_selection import train_test_split

# Separate the (text, label) pairs into two parallel tuples.
x = tuple(text for text, _ in sentences)
y = tuple(label for _, label in sentences)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=123)

In [41]:
# Learn the vocabulary from the training texts only.
vec.fit(x_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=4000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

# 建模

In [43]:
from sklearn.naive_bayes import MultinomialNB

# Vectorize the training texts, then fit a multinomial Naive Bayes model on the counts.
train_counts = vec.transform(x_train)
classifier = MultinomialNB()
classifier.fit(train_counts, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

# 评估、计算准确率

In [44]:
# `score` reports the mean accuracy of the classifier on the held-out test split.
classifier.score(vec.transform(x_test), y_test)

0.9952267303102625

# 模型对比

## 改变特征向量模型

In [45]:
from sklearn.feature_extraction.text import CountVectorizer

# Count 1- to 4-grams of words, with the vocabulary capped at 20000 entries.
vec = CountVectorizer(analyzer="word", ngram_range=(1, 4), max_features=20000)

vec.fit(x_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=20000, min_df=1,
        ngram_range=(1, 4), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [47]:
# Refit a fresh Naive Bayes model on the features produced by the current `vec`.
ngram_train_counts = vec.transform(x_train)
classifier = MultinomialNB()
classifier.fit(ngram_train_counts, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [48]:
# Mean accuracy of the refitted classifier on the same test split.
classifier.score(vec.transform(x_test), y_test)

0.9880668257756563

## 改变训练模型

### SVM

In [50]:
from sklearn.svm import SVC

# Linear-kernel SVM trained on the features produced by the current `vec`.
svm = SVC(kernel="linear").fit(vec.transform(x_train), y_train)

svm.score(vec.transform(x_test), y_test)

0.9976133651551312

### XGBoost

In [51]:
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
import numpy as np

In [53]:
# Wrap the vectorized splits in XGBoost's DMatrix container; only the training
# matrix carries labels.
xgb_train = xgb.DMatrix(vec.transform(x_train), label=y_train)
xgb_test = xgb.DMatrix(vec.transform(x_test))

# The original cell used the misspelled names below; keep them as aliases so
# any code that still refers to them continues to work.
xbg_train = xgb_train
sgb_test = xgb_test

Ps. XGBoost 的用法目前还不熟悉，这里只准备了数据和参数，训练与评估部分留待后续深究

XGBoost 的常用调参参数

In [55]:
    params = {  
            'booster': 'gbtree',     #使用gbtree
            'objective': 'multi:softmax',  # 多分类的问题、  
            # 'objective': 'multi:softprob',   # 多分类概率  
            #'objective': 'binary:logistic',  #二分类
            'eval_metric': 'merror',   #logloss
            'num_class': 4,  # 类别数，与 multisoftmax 并用  
            'gamma': 0.1,  # 用于控制是否后剪枝的参数,越大越保守，一般0.1、0.2这样子。  
            'max_depth': 8,  # 构建树的深度，越大越容易过拟合  
            'alpha': 0,   # L1正则化系数  
            'lambda': 10,  # 控制模型复杂度的权重值的L2正则化项参数，参数越大，模型越不容易过拟合。  
            'subsample': 0.7,  # 随机采样训练样本  
            'colsample_bytree': 0.5,  # 生成树时进行的列采样  
            'min_child_weight': 3,  
            # 这个参数默认是 1，是每个叶子里面 h 的和至少是多少，对正负样本不均衡时的 0-1 分类而言  
            # 假设 h 在 0.01 附近，min_child_weight 为 1 叶子节点中最少需要包含 100 个样本。  
            'silent': 0,  # 设置成1则没有运行信息输出，最好是设置为0.  
            'eta': 0.03,  # 如同学习率  
            'seed': 1000,  
            'nthread': -1,  # cpu 线程数  
            'missing': 1 
        }  

# 总结

整个示例代码可以当做模板来用

从优化和提高模型准确率来说，主要有两方面可以尝试：

- 特征向量的构建，除了词袋模型，可以考虑使用 word2vec 和 doc2vec 等；
- 模型上可以选择有监督的分类算法、集成学习以及神经网络等。