# 基于维基百科的词向量构建

## Step-01: Download Wikipedia Chinese Corpus: https://dumps.wikimedia.org/zhwiki/20190720/

```
wget https://dumps.wikimedia.org/zhwiki/20190720/zhwiki-20190720-pages-articles-multistream.xml.bz2
```

## Step-02: Use https://github.com/attardi/wikiextractor to extract the Wikipedia corpus

```
git clone https://github.com/attardi/wikiextractor.git
cd wikiextractor
python WikiExtractor.py -o zhwiki zhwiki-20190720-pages-articles-multistream.xml.bz2
```

## Step-03: Use gensim to train word vectors

### 用jieba分词，存储为文件corpus_zhwiki.txt

In [18]:
import os
import re
import jieba
def preprocess_file(in_file, fout):
    """Segment one text file with jieba and write the result to ``fout``.

    Every line of ``in_file`` is stripped; blank lines and lines that begin
    with a ``<...>`` tag are skipped. Each remaining line is cut into words
    by ``jieba.cut`` and written to ``fout`` as a single space-separated
    line terminated by a newline.

    Args:
        in_file: Path of a UTF-8 encoded text file to read.
        fout: Writable text stream that receives the segmented lines.
    """
    # Compile once per file rather than once per line: the pattern is
    # loop-invariant.
    tag_pattern = re.compile(r'<.*?>')
    with open(in_file, 'r', encoding='utf-8') as fin:
        for line in fin:
            sline = line.strip()
            # Skip blank lines.
            if not sline:
                continue
            # Skip lines that start with a markup tag (presumably the
            # <doc ...> / </doc> wrappers in the extracted dump -- confirm).
            if tag_pattern.match(sline):
                continue
            # Word segmentation; tokens are joined with single spaces.
            fout.write(' '.join(jieba.cut(sline)) + '\n')

def preprocess(in_folder, out_file):
    """Segment every file under ``in_folder`` and append it to ``out_file``.

    The folder is walked recursively with ``os.walk`` and each file found is
    passed to ``preprocess_file``. ``out_file`` is opened in append mode, so
    existing content is kept and new lines are added after it.

    Args:
        in_folder: Root directory whose files are processed.
        out_file: Path of the UTF-8 corpus file to append to.
    """
    # The context manager closes (and therefore flushes) the output file
    # even if processing one of the inputs raises.
    with open(out_file, 'a', encoding='utf-8') as fout:
        for root, _dirs, files in os.walk(in_folder):
            for file_name in files:
                preprocess_file(os.path.join(root, file_name), fout)

```
preprocess('zhwiki', 'corpus_zhwiki.txt')
```

### gensim训练模型

In [29]:
import sys
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import multiprocessing

def word2vec_train(in_file, out_model, out_vector):
    """Train a Word2Vec model on a segmented corpus and save the results.

    Args:
        in_file: Path of the corpus file, read through ``LineSentence``.
        out_model: Path the trained model is saved to via ``model.save``.
        out_vector: Path the word vectors are written to in word2vec text
            format (``binary=False``).
    """
    # Use one training worker per CPU reported by multiprocessing.
    worker_count = multiprocessing.cpu_count()
    w2v = Word2Vec(LineSentence(in_file), workers=worker_count)

    # Persist the full model, then the plain-text vectors.
    w2v.save(out_model)
    w2v.wv.save_word2vec_format(out_vector, binary=False)

```
word2vec_train('corpus_zhwiki.txt', 'zhwiki.word2vec.model', 'zhwiki.word2vec.vectors')
```

## Step-04: 使用词向量模型测试同义词

In [32]:
from gensim.models import Word2Vec
zh_wiki_word2vec_model = Word2Vec.load('zhwiki.word2vec.model')

In [34]:
zh_wiki_word2vec_model.wv.most_similar(u"足球")

[('足球运动', 0.7593598365783691),
 ('排球', 0.7379463911056519),
 ('冰球', 0.7345577478408813),
 ('手球', 0.7277976274490356),
 ('足球联赛', 0.7249800562858582),
 ('板球', 0.7136474847793579),
 ('橄欖球隊', 0.6775943040847778),
 ('橄欖球', 0.6566696166992188),
 ('踢球', 0.656656801700592),
 ('德甲球', 0.6550092697143555)]

In [35]:
zh_wiki_word2vec_model.wv.most_similar(u"上海")

[('天津', 0.8689776659011841),
 ('北京', 0.8609808683395386),
 ('南京', 0.8212747573852539),
 ('杭州', 0.8176919221878052),
 ('广州', 0.7898254990577698),
 ('上海市', 0.7564722895622253),
 ('武汉', 0.7561065554618835),
 ('北平', 0.7394840717315674),
 ('重庆', 0.729672908782959),
 ('成都', 0.71846604347229)]

## Step-05: 词向量可视化

https://www.kaggle.com/jeffd23/visualizing-word-vectors-with-t-sne

In [36]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline

def tsne_plot(model):
    """Create a t-SNE model of the word vectors and plot it.

    Every word in the model's vocabulary is projected to two dimensions
    with ``TSNE`` and drawn as a labelled point on a 16x16 inch figure.

    Args:
        model: Object exposing word vectors through ``model.wv`` (e.g. a
            loaded gensim ``Word2Vec`` model).
    """
    keyed_vectors = model.wv
    # gensim >= 4.0 exposes the vocabulary as ``key_to_index``; earlier
    # releases only have ``vocab``. Support both.
    vocab = getattr(keyed_vectors, 'key_to_index', None)
    if vocab is None:
        vocab = keyed_vectors.vocab

    labels = list(vocab)
    # Index through ``wv``: ``model[word]`` is deprecated in gensim 3.x and
    # removed in 4.0, while ``model.wv[word]`` works in both.
    tokens = [keyed_vectors[word] for word in labels]

    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    plt.figure(figsize=(16, 16))
    # One scatter call per word, as in the original, so each point is
    # drawn as its own series.
    for label, (x, y) in zip(labels, new_values):
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()

In [None]:
tsne_plot(zh_wiki_word2vec_model)

  # This is added back by InteractiveShellApp.init_path()
