<a href="https://colab.research.google.com/github/foochane/text-classification/blob/master/15word2vec%E7%9A%84%E8%AF%8D%E5%B5%8C%E5%85%A5%2Bxgboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import codecs

import gensim
import numpy as np
import xgboost as xgb
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

##  1 数据准备

In [0]:
import os 
if not os.path.exists('output'):
    os.makedirs('output')
if not os.path.exists('output/data_clean_split.txt'):
  !wget -P ./output https://raw.githubusercontent.com/foochane/text-classification/master/output/data_clean_split.txt

In [0]:
# Read the corpus: every line is split on tabs, field 0 is the label and
# field 1 is the document text.
with codecs.open('output/data_clean_split.txt', 'r', encoding='utf-8') as f:
    document_split = f.readlines()

fields_per_line = [document.split('\t') for document in document_split]
labels = [fields[0] for fields in fields_per_line]
text = [fields[1].strip() for fields in fields_per_line]

# Encode the string labels as integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

# Split every document into its individual words (whitespace-separated)
text_s2w = list(map(str.split, text))

## 2 构建word2vec模型

### 2.1 训练word2vec模型


In [0]:
# Word2Vec hyper-parameters, kept together so they are easy to tune.
w2v_params = dict(
    min_count=5,   # words occurring fewer than 5 times are dropped
    workers=6,     # number of parallel worker threads
    window=8,      # maximum distance between a target word and its context words
    size=100,      # dimensionality of the word vectors
)
model = gensim.models.Word2Vec(text_s2w, **w2v_params)

参数说明：

- min_count: 对于词频 < min_count 的单词，将舍弃（其实最合适的方法是用 UNK 符号代替，即所谓的『未登录词』，这里我们简化起见，认为此类低频词不重要，直接抛弃）

- workers: 可以并行执行的核心数，需要安装 Cython 才能起作用（安装 Cython 的方法很简单，直接 pip install cython）

- size: 词向量的维度，即神经网络隐层的节点数

- window: 目标词汇的上下文单词距目标词的最长距离，很好理解，比如 CBOW 模型是用一个词的上下文预测这个词，那这个上下文总得有个限制，如果取得太多，距离目标词太远，有些词就没啥意义了，而如果取得太少，又信息不足，所以 window 就是上下文的一个最长距离

### 2.2 word2vec模型的简单使用
#### 2.2.1 构建词嵌入字典

In [7]:
# Map every vocabulary word to its embedding vector.
embeddings_index = {
    word: vector
    for word, vector in zip(model.wv.index2word, model.wv.vectors)
}
print('Found %s word vectors.' % len(embeddings_index))

Found 87117 word vectors.


#### 2.2.2 获取某个词的向量

In [8]:
# Look the vector up through the KeyedVectors object (`model.wv`); indexing
# the Word2Vec model directly is deprecated and was removed in gensim 4.
model.wv['汽车']

  """Entry point for launching an IPython kernel.


array([ 2.3111808e+00, -3.1423843e-01,  1.1657915e+00,  2.4726887e+00,
        9.8205251e-01,  1.6136947e+00,  4.5870215e-02,  2.2275944e-01,
       -5.8411784e-03,  6.1064994e-01,  1.2467331e+00, -2.3204267e+00,
        1.2762122e+00, -5.2863240e-01, -1.5698267e+00, -5.5249399e-01,
        1.2271531e+00, -7.8824210e-01, -7.0738721e-01, -2.9825904e+00,
       -1.8991225e+00,  2.4633343e+00, -2.0911546e+00,  2.9926260e+00,
       -7.1663857e-01,  7.0350718e-01, -3.7784261e-01,  2.0060134e+00,
       -1.2401059e+00,  3.9565125e-03,  1.2215015e+00,  1.0248302e+00,
        5.7203490e-01, -2.2812138e+00,  9.4041961e-01, -3.2780641e-01,
        3.6779909e+00, -8.4591168e-01,  3.0106833e+00,  2.9739454e+00,
        1.4285785e+00, -3.4431670e+00, -2.8534022e-01, -4.5611352e-01,
        1.5401051e+00,  1.8888358e+00, -1.6028727e+00, -7.8392017e-01,
        6.0268289e-01,  1.8838952e+00, -9.1625470e-01,  1.0150412e+00,
        1.0806491e+00,  4.6332877e-02, -8.9380598e-01, -1.7845334e+00,
      

#### 2.2.3 查看与某个词最相似的词

In [9]:
# Query nearest neighbours through the KeyedVectors object (`model.wv`);
# calling most_similar on the Word2Vec model itself is deprecated and was
# removed in gensim 4.
model.wv.most_similar('人民日报')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('光明日报', 0.8673247694969177),
 ('海外版', 0.8082146644592285),
 ('经济日报', 0.7917121648788452),
 ('年月日', 0.7912720441818237),
 ('社论', 0.7694905400276184),
 ('全国代表大会', 0.7467828989028931),
 ('批示', 0.7455558776855469),
 ('文艺报', 0.73807293176651),
 ('文汇报', 0.7371698617935181),
 ('伍绍祖', 0.7356415390968323)]

#### 2.2.4 保存模型

In [10]:
# Persist the trained Word2Vec model to disk so it can be reloaded later.
model.save('/tmp/w2v_model')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


#### 2.2.5 加载模型

In [11]:
# Load the Word2Vec model saved at this path into a separate variable.
model_load = gensim.models.Word2Vec.load('/tmp/w2v_model')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


## 3 训练数据处理



In [0]:
# Turn a sentence into a single L2-normalized vector built from its word embeddings.
def sent2vec(s, embeddings=None, dim=100):
    """Convert a whitespace-separated sentence into one unit-length vector.

    The vectors of all in-vocabulary words are summed and the sum is divided
    by its L2 norm.

    :param s: sentence as a string of whitespace-separated words
    :param embeddings: mapping supporting ``embeddings[word]`` that raises
        ``KeyError`` for unknown words; defaults to the notebook's trained
        ``model.wv``
    :param dim: length of the zero vector returned when no usable vector can
        be built; should match the embedding dimensionality
    :return: 1-D numpy array; all zeros when no word of the sentence is in
        the vocabulary or when the summed vector has zero norm
    """
    if embeddings is None:
        # `.wv` replaces the deprecated `model[word]` lookup.
        embeddings = model.wv

    word_vectors = []
    for w in s.split():
        try:
            word_vectors.append(embeddings[w])
        except KeyError:
            # Out-of-vocabulary word: skip it. Any other
            # error is a real problem and is allowed to propagate.
            continue

    if not word_vectors:
        return np.zeros(dim)

    v = np.sum(word_vectors, axis=0)  # element-wise sum over all word vectors
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        # Vectors cancelled out exactly; avoid dividing by zero (NaN output).
        return np.zeros(dim)

    return v / norm

In [13]:
# Vectorize every document with sent2vec and stack the results into one numpy array
text_s2v = np.array([sent2vec(s) for s in tqdm(text)])

# Stratified train/validation split: 10% of the documents are held out,
# with a fixed random_state so the split is reproducible.
x_train_w2v, x_valid_w2v, y_train, y_valid = train_test_split(
    text_s2v, y,
    stratify=y,
    random_state=42,
    test_size=0.1,
    shuffle=True,
)

  # Remove the CWD from sys.path while we load stuff.
100%|██████████| 9249/9249 [00:45<00:00, 205.47it/s]


## 4 调用模型进行分类

In [0]:
# Evaluation metric
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi-class version of the logarithmic loss metric.

    :param actual: array of true classes; either 1-D class indices or an
        already one-hot encoded 2-D matrix
    :param predicted: matrix of predicted probabilities, one column per class
    :param eps: probabilities are clipped to [eps, 1 - eps] before the log
    :return: negative sum of the log-probabilities of the true classes,
        divided by the number of rows
    """
    # One-hot encode `actual` when it is given as a 1-D array of class indices.
    if actual.ndim == 1:
        n_samples = actual.shape[0]
        one_hot = np.zeros((n_samples, predicted.shape[1]))
        one_hot[np.arange(n_samples), actual] = 1
        actual = one_hot

    # Clip so that log() never sees exactly 0 or 1.
    safe_probs = np.clip(predicted, eps, 1 - eps)
    n_rows = actual.shape[0]
    total = np.sum(actual * np.log(safe_probs))
    return -1.0 / n_rows * total

In [15]:
# Fit a simple XGBoost classifier on the word2vec sentence features
xgb_params = dict(
    max_depth=7,
    n_estimators=200,
    colsample_bytree=0.8,
    subsample=0.8,
    nthread=10,
    learning_rate=0.1,
    silent=False,
)
clf = xgb.XGBClassifier(**xgb_params)
clf.fit(x_train_w2v, y_train)

# Class probabilities on the validation set, scored with the log-loss metric
predictions = clf.predict_proba(x_valid_w2v)
valid_logloss = multiclass_logloss(y_valid, predictions)

print("logloss: %0.3f " % valid_logloss)

logloss: 0.377 



size = 100: logloss: 0.375 /0.368 