In [1]:
import codecs

import gensim
import numpy as np
import xgboost as xgb
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

##  1 数据准备

In [2]:
# !wget https://raw.githubusercontent.com/foochane/text-classification/master/output/data_clean_split.txt

In [3]:
# Read the data: every line holds a label and a space-separated text, separated by a tab
labels = []
text = []
with codecs.open('output/data_clean_split.txt', 'r', encoding='utf-8') as f:
    document_split = f.readlines()
    for line_no, document in enumerate(document_split, start=1):
        if not document.strip():
            # Blank lines (e.g. a stray newline at the end of the file) carry no sample
            continue
        temp = document.split('\t')
        if len(temp) < 2:
            # Fail with the offending line number instead of a bare IndexError
            raise ValueError(
                'line %d has no tab-separated text field: %r' % (line_no, document))
        labels.append(temp[0])
        text.append(temp[1].strip())

# Encode the string labels as integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

# Split every sentence into its individual words
text_s2w = [s.split() for s in text]

## 2 构建word2vec模型

### 2.1 训练word2vec模型


In [4]:
# Train the word2vec model on the tokenised sentences
model = gensim.models.Word2Vec(
    text_s2w,
    size=100,      # dimensionality of each word vector
    window=8,      # maximum distance between the target word and its context words
    min_count=5,   # words occurring fewer than 5 times are discarded
    workers=6,     # number of cores used in parallel
)

参数说明：

- min_count: 对于词频 < min_count 的单词，将舍弃（其实最合适的方法是用 UNK 符号代替，即所谓的『未登录词』，这里我们简化起见，认为此类低频词不重要，直接抛弃）

- workers: 可以并行执行的核心数，需要安装 Cython 才能起作用（安装 Cython 的方法很简单，直接 pip install cython）

- size: 词向量的维度，即神经网络隐层节点数

- window: 目标词汇的上下文单词距目标词的最长距离，很好理解，比如 CBOW 模型是用一个词的上下文预测这个词，那这个上下文总得有个限制，如果取得太多，距离目标词太远，有些词就没啥意义了，而如果取得太少，又信息不足，所以 window 就是上下文的一个最长距离

### 2.2 word2vec模型的简单使用
#### 2.2.1 构建词嵌入字典

In [5]:

# Map every vocabulary word to its trained vector
embeddings_index = {
    word: vector
    for word, vector in zip(model.wv.index2word, model.wv.vectors)
}
print('Found %s word vectors.' % len(embeddings_index))

Found 87117 word vectors.


#### 2.2.2 获取某个词的向量

In [6]:
# Look the vector up through the KeyedVectors object (model.wv);
# indexing the model itself is deprecated and emits a DeprecationWarning
model.wv['汽车']

  """Entry point for launching an IPython kernel.


array([ 0.65716314,  0.6816527 ,  1.7237903 ,  0.12500447,  2.709019  ,
       -0.7488362 , -3.402965  , -2.5708413 ,  0.09961936,  0.49622235,
       -1.6381997 , -1.4871106 ,  1.1957139 ,  3.3516102 ,  1.4163692 ,
        2.102778  ,  1.035202  ,  2.4707658 ,  1.2491568 , -1.7699925 ,
       -1.793375  ,  0.41351292, -3.3450906 ,  0.19671081, -1.1741619 ,
       -1.367899  ,  0.14432155,  1.6968429 , -2.5475447 ,  0.3613438 ,
       -2.6219337 ,  1.0359944 , -0.9151234 , -1.1271834 , -1.7256584 ,
        1.8681693 , -2.3287654 ,  0.38760173, -0.22028962,  1.468779  ,
        1.3121998 , -2.4796908 , -0.70897305, -1.4625944 , -2.2880292 ,
        0.15544365, -0.39448664, -2.2615948 , -1.5946577 , -1.1362418 ,
        2.8668537 ,  1.4167479 ,  1.3603674 ,  1.852212  , -0.80646837,
        0.34428045, -2.8292348 , -0.26375222,  0.14192149,  0.12021378,
        1.7613864 ,  1.7598226 ,  3.221818  , -0.8587414 , -0.01432012,
       -1.6610274 , -3.0720038 , -4.026038  , -1.0249809 , -0.90

#### 2.2.3 查看某个词与其他词的相似度

In [7]:
# Query nearest neighbours through the KeyedVectors object (model.wv);
# calling most_similar on the model itself is deprecated
model.wv.most_similar('人民日报')

  """Entry point for launching an IPython kernel.


[('光明日报', 0.8941681385040283),
 ('年月日', 0.8077759742736816),
 ('经济日报', 0.7815771102905273),
 ('社论', 0.778007447719574),
 ('评论员', 0.7708622217178345),
 ('海外版', 0.769585132598877),
 ('中国青年报', 0.7640052437782288),
 ('讲话', 0.7531139850616455),
 ('伍绍祖', 0.7515900731086731),
 ('解放日报', 0.7509524822235107)]

#### 2.2.4 保存模型

In [8]:
# Persist the trained Word2Vec model to disk at a fixed path under /tmp
model.save('/tmp/w2v_model')

#### 2.2.5 加载模型

In [None]:
# Load a Word2Vec model from disk and bind it to a new name, leaving `model` untouched
model_load = gensim.models.Word2Vec.load('/tmp/w2v_model')

## 3 训练数据处理



In [None]:
# Turn a sentence into a single L2-normalised vector
def sent2vec(s, vectors=None, dim=None):
    """Convert a whitespace-tokenised sentence into one normalised vector.

    The vectors of all known words are summed and the sum is scaled to unit
    L2 norm. Words without a vector are ignored.

    :param s: sentence as a string of whitespace-separated words
    :param vectors: mapping-like object where ``vectors[word]`` returns the
        word's vector and raises ``KeyError`` for unknown words; defaults to
        the notebook-level ``model.wv``
    :param dim: length of the zero vector returned when nothing can be
        embedded; defaults to ``vectors.vector_size`` when that attribute
        exists, otherwise 100
    :return: 1-D numpy array; all zeros when no word of the sentence is
        known or when the summed vector has zero norm
    """
    if vectors is None:
        vectors = model.wv
    if dim is None:
        dim = getattr(vectors, 'vector_size', 100)

    found = []
    for w in s.split():
        try:
            found.append(vectors[w])
        except KeyError:
            # Out-of-vocabulary word: skip it, but let any other error surface
            continue

    if not found:
        return np.zeros(dim)

    # Rows are words and columns are embedding dimensions; sum over the words
    v = np.asarray(found).sum(axis=0)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        # Vectors cancelled out exactly; dividing would produce NaN
        return np.zeros(dim)

    return v / norm

In [None]:
# Vectorise every document; the array is built in one step so that
# text_s2v is never rebound from a list to an ndarray
text_s2v = np.array([sent2vec(s) for s in tqdm(text)])

# Stratified 90/10 train/validation split
# (train_test_split is imported in the notebook's top import cell)
x_train_w2v, x_valid_w2v, y_train, y_valid = train_test_split(
    text_s2v, y,
    stratify=y,
    random_state=42,
    test_size=0.1,
    shuffle=True,
)

  # This is added back by InteractiveShellApp.init_path()
 25%|██▍       | 2276/9249 [00:17<00:33, 209.10it/s]

## 4 调用模型进行分类

In [None]:
# Define the loss function
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi-class version of the logarithmic loss metric.

    :param actual: array of true classes, either a 1-D array of integer class
        indices or an already one-hot encoded 2-D array
    :param predicted: matrix of predicted probabilities, one column per class
    :param eps: probabilities are clipped to [eps, 1 - eps] before the log
    :return: negative log-likelihood averaged over the rows of ``actual``
    """
    n_samples = actual.shape[0]

    # One-hot encode class indices so both input forms share the same formula
    if len(actual.shape) == 1:
        one_hot = np.zeros((n_samples, predicted.shape[1]))
        one_hot[np.arange(n_samples), actual] = 1
        actual = one_hot

    clipped = np.clip(predicted, eps, 1 - eps)
    log_likelihood = np.sum(actual * np.log(clipped))
    return -1.0 / n_samples * log_likelihood

In [None]:
# Fit a simple XGBoost model on the word2vec features
clf = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=7,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    nthread=10,
    silent=False,
)
clf.fit(x_train_w2v, y_train)

# Score the validation split with the multi-class log loss of the predicted probabilities
predictions = clf.predict_proba(x_valid_w2v)
print("logloss: %0.3f " % multiclass_logloss(y_valid, predictions))


size = 100: logloss: 0.375 /0.368 