In [1]:
import codecs

import gensim
import numpy as np
import xgboost as xgb
from sklearn import  preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

##  1 数据准备

In [2]:
# !wget https://raw.githubusercontent.com/foochane/text-classification/master/output/data_clean_split.txt

In [3]:
# Read the data: one document per line, formatted as "<label>\t<text>".
DATA_PATH = 'output/data_clean_split.txt'

labels = []
text = []
# Built-in open() only breaks lines on \n / \r / \r\n. codecs.open() +
# readlines() additionally splits on Unicode line separators (e.g. \u2028,
# \x0c), which can cut a document in half.
with open(DATA_PATH, 'r', encoding='utf-8') as f:
    for line_no, document in enumerate(f, start=1):
        # Blank lines (e.g. a trailing newline at the end of the file) carry
        # no document; skip them instead of failing on temp[1].
        if not document.strip():
            continue
        temp = document.split('\t')
        if len(temp) < 2:
            raise ValueError(
                '%s:%d: expected "<label>\\t<text>", got %r'
                % (DATA_PATH, line_no, document))
        labels.append(temp[0])
        text.append(temp[1].strip())

# Encode the string labels as integer class ids.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

# Split every sentence into its whitespace-separated words.
text_s2w = [s.split() for s in text]

## 2 构建word2vec模型

### 2.1 训练word2vec模型
参数说明：

min_count: 对于词频 < min_count 的单词，将舍弃（其实最合适的方法是用 UNK 符号代替，即所谓的『未登录词』，这里我们简化起见，认为此类低频词不重要，直接抛弃）

workers: 可以并行执行的核心数，需要安装 Cython 才能起作用（安装 Cython 的方法很简单，直接 pip install cython）

size: 词向量的维度，即神经网络隐层的节点数（注意：在 gensim 4.x 中该参数已更名为 vector_size）

window: 目标词汇的上下文单词距目标词的最长距离，很好理解，比如 CBOW 模型是用一个词的上下文预测这个词，那这个上下文总得有个限制，如果取得太多，距离目标词太远，有些词就没啥意义了，而如果取得太少，又信息不足，所以 window 就是上下文的一个最长距离

In [4]:
# Train word2vec on the tokenised corpus.
# Note: `size` is the gensim 3.x keyword name for the vector dimensionality.
model = gensim.models.Word2Vec(
    sentences=text_s2w,
    size=100,     # dimensionality of each word vector
    window=8,     # maximum distance between a target word and its context
    min_count=5,  # discard words that occur fewer than 5 times
    workers=6,    # number of parallel workers
)

### 2.2 word2vec模型的简单使用

In [5]:
# 1 Build the word -> embedding-vector lookup dictionary
embeddings_index = {
    word: vector
    for word, vector in zip(model.wv.index2word, model.wv.vectors)
}
print('Found %s word vectors.' % len(embeddings_index))

Found 87117 word vectors.


In [6]:
# 2 Get the vector of a single word.
# Index through model.wv: indexing the Word2Vec model object directly is
# deprecated in gensim 3.x and was removed in gensim 4.0.
model.wv['汽车']

  


array([ 1.8342775 ,  2.1845498 ,  0.1811772 ,  0.44513613,  0.6321447 ,
       -2.2120235 , -3.4583235 ,  2.8061318 , -0.25104186,  0.85629886,
       -0.92454964, -1.6123239 , -0.80640805, -0.73272705, -0.74616235,
       -3.0445638 , -0.320902  ,  1.4514409 ,  3.7497303 ,  1.3578444 ,
        1.1719748 , -0.91839045, -2.2939498 , -0.13946047,  2.2899468 ,
       -2.454715  ,  1.7183454 , -1.2504009 ,  0.3535927 , -0.9718751 ,
       -1.3168051 ,  1.9108007 ,  0.7765807 , -0.9604371 ,  0.98970056,
        0.39219838, -1.5062613 ,  3.5600126 , -1.1126146 , -1.1381696 ,
        2.9837532 ,  0.96416277, -0.22817   ,  0.3855935 ,  0.19987127,
        0.63912606, -1.4296261 , -3.427696  ,  0.43095294, -0.9331193 ,
       -0.9451589 ,  0.92573446, -2.2235925 ,  0.5591363 , -0.31658825,
       -3.301475  , -0.8763386 , -2.3563743 ,  0.62115496,  1.9578915 ,
        2.289198  , -0.36153495,  0.21389788, -1.1329963 ,  2.2054768 ,
        1.9896835 ,  0.5370136 ,  0.23726553, -0.44702885, -2.17

In [7]:
# 3 List the words most similar to a given word.
# Query through model.wv: calling most_similar on the Word2Vec model object
# itself is deprecated in gensim 3.x and was removed in gensim 4.0.
model.wv.most_similar('人民日报')

  


[('光明日报', 0.8853372931480408),
 ('中国青年报', 0.7971559166908264),
 ('社论', 0.7969412803649902),
 ('经济日报', 0.7926549911499023),
 ('海外版', 0.7842204570770264),
 ('年月日', 0.781338632106781),
 ('新民晚报', 0.7726690769195557),
 ('时报', 0.7552007436752319),
 ('讲话', 0.7549471855163574),
 ('中共中央国务院', 0.7518625259399414)]

In [8]:
# 4 Save the trained model to disk
model.save('/tmp/w2v_model')

In [None]:
# 5 Load the saved model back from disk
# NOTE(review): model_load is not referenced by the other cells visible in
# this notebook; they keep using `model`.
model_load = gensim.models.Word2Vec.load('/tmp/w2v_model')

## 3 训练数据处理



In [None]:
# Convert a sentence into a normalised vector.
def sent2vec(s, embeddings=None, dim=100):
    """Turn a whitespace-tokenised sentence into a unit-length vector.

    The vectors of all words that have an embedding are summed and the sum is
    divided by its L2 norm. Words without an embedding are ignored.

    :param s: sentence whose words are separated by whitespace
    :param embeddings: word -> vector lookup that raises ``KeyError`` for
        unknown words; defaults to the notebook's trained ``model.wv``
    :param dim: length of the zero vector returned when no word of the
        sentence has an embedding (100 matches the trained model)
    :return: 1-D numpy array; all zeros when the sentence has no known word
        or when the summed vector has zero norm
    """
    if embeddings is None:
        # model.wv[...] replaces the deprecated model[...] lookup.
        embeddings = model.wv

    vectors = []
    for w in s.split():
        try:
            vectors.append(embeddings[w])
        except KeyError:
            # Out-of-vocabulary word: skip it. Any other error is a real
            # problem and must not be swallowed.
            continue

    if not vectors:
        return np.zeros(dim)

    v = np.sum(vectors, axis=0)  # element-wise sum over the word vectors
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        # Word vectors cancelled out exactly; dividing would yield NaNs.
        return np.zeros(v.shape)
    return v / norm

In [None]:
# Vectorise every document with sent2vec and stack the results into a single
# numpy array (one row per document).
text_s2v = np.array([sent2vec(s) for s in tqdm(text)])

# Hold out 10% of the documents for validation, stratified by label.
# train_test_split is imported in the notebook's import cell.
x_train_w2v, x_valid_w2v, y_train, y_valid = train_test_split(
    text_s2v, y,
    test_size=0.1,
    stratify=y,
    shuffle=True,
    random_state=42,
)

  # This is added back by InteractiveShellApp.init_path()
100%|██████████| 9249/9249 [01:04<00:00, 144.41it/s]


## 4 调用模型进行分类

In [None]:
# Define the loss function
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi-class version of the logarithmic loss metric.

    :param actual: true classes, either a 1-D array-like of integer class
        indices or an (n_samples, n_classes) one-hot matrix
    :param predicted: (n_samples, n_classes) array-like with one predicted
        probability per class
    :param eps: probabilities are clipped to [eps, 1 - eps] before the log
    :return: mean negative log-likelihood of the true classes
    """
    # Accept plain lists as well as numpy arrays.
    actual = np.asarray(actual)
    predicted = np.asarray(predicted)

    # Convert class indices to a one-hot matrix if needed.
    if actual.ndim == 1:
        n_samples = actual.shape[0]
        one_hot = np.zeros((n_samples, predicted.shape[1]))
        one_hot[np.arange(n_samples), actual] = 1
        actual = one_hot

    clipped = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    log_likelihood = np.sum(actual * np.log(clipped))
    return -1.0 / rows * log_likelihood

In [None]:
# Fit a simple XGBoost classifier on the word2vec sentence features.
clf = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=7,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    nthread=10,
    silent=False,
)
clf.fit(x_train_w2v, y_train)

# Score the predicted class probabilities on the validation set.
predictions = clf.predict_proba(x_valid_w2v)
print("logloss: %0.3f " % multiclass_logloss(y_valid, predictions))


size = 100: logloss: 0.375 /0.368 