## WikiCorpus 清理文檔資料

In [1]:
# coding: utf-8
"""
Extract the article text from the Wikipedia dump
"""
from gensim.corpora import WikiCorpus


# Open the compressed Wikipedia dump. An empty dictionary is supplied so that
# WikiCorpus does not build its own vocabulary from the dump (gensim only
# builds one when `dictionary` is None).
wiki_corpus = WikiCorpus(
    '../wiki_data/zhwiki-20220101-pages-articles-multistream1.xml-p1p187712.bz2',
    dictionary={},
)


# Write every article as a single line of space-joined tokens and report
# progress each time another 10000 articles have been written.
with open('wiki_text.txt', 'w', encoding='utf-8') as out_file:
    print('Start to preprocess.')
    for article_no, tokens in enumerate(wiki_corpus.get_texts(), start=1):
        out_file.write('{}\n'.format(' '.join(tokens)))

        if article_no % 10000 == 0:
            print(article_no)

Start to preprocess.
10000
20000
30000


## Jieba 斷詞

In [2]:
# coding: utf-8
"""
Tokenize
"""
# jieba斷詞
import jieba
# 簡轉繁工具
from opencc import OpenCC


# Initial
cc = OpenCC('s2t')


# Tokenize
import os

# open(..., 'w') does not create parent directories, so make sure the output
# folder exists; otherwise a fresh run fails with FileNotFoundError.
os.makedirs('jieba_seg', exist_ok=True)

# Read the extracted text line by line, convert each line from Simplified to
# Traditional Chinese, segment it with jieba, and write the tokens back out
# space-separated with one output line per input line.
with open('wiki_text.txt', 'r', encoding='utf-8') as f, \
        open('jieba_seg/wiki_text_seg.txt', 'w', encoding='utf-8') as new_f:
    for data in f:
        data = cc.convert(data)
        # jieba yields whitespace (spaces, tabs, the trailing newline) as
        # tokens too; drop all of them so only real words are written.
        words = [word for word in jieba.cut(data) if word.strip()]
        # Terminate every line explicitly instead of relying on the newline
        # token happening to survive the filter.
        new_f.write(' '.join(words) + '\n')

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 1.050 seconds.
Prefix dict has been built successfully.


## FastText (Word Embedding) 模型訓練

In [3]:
# coding: utf-8
from gensim.models import word2vec, fasttext

 
# Settings
import os

seed = 666            # RNG seed passed to the model
sg = 0                # training algorithm: 0 = CBOW, 1 = skip-gram (gensim convention)
window_size = 10      # context window, in tokens
vector_size = 100     # dimensionality of the word vectors
min_count = 1         # keep every token, even ones that occur only once
workers = 8           # number of training threads
epochs = 5            # passes over the corpus
batch_words = 10000   # words per batch handed to each worker
# NOTE(review): the loading cell further down this notebook reads
# 'fasttext_model/fasttext.model', not this path -- confirm which is intended.
model_path = 'jieba_model/fasttext.model'


# Create the output directory *before* training: model.save() does not create
# missing folders, and failing there would throw away the whole training run.
os.makedirs(os.path.dirname(model_path), exist_ok=True)


# Train
# LineSentence streams the segmented corpus from disk, one sentence per line
# with whitespace-separated tokens.
train_data = word2vec.LineSentence('jieba_seg/wiki_text_seg.txt')
model = fasttext.FastText(
    train_data,
    min_count=min_count,
    vector_size=vector_size,
    workers=workers,
    epochs=epochs,
    window=window_size,
    sg=sg,
    seed=seed,
    batch_words=batch_words,
)


model.save(model_path)

KeyboardInterrupt: 

## 載入預訓練模型

In [51]:
# coding: utf-8
"""
Load the trained FastText model and time how long loading takes
"""
from gensim.models import word2vec
# from fasttext_train import EpochLogger
import time

# Measure the wall-clock time needed to read the saved model back from disk
# and print it in seconds.
load_started = time.time()
model = word2vec.Word2Vec.load('fasttext_model/fasttext.model')
load_seconds = time.time() - load_started
print(load_seconds)

    

2.3657643795013428


In [83]:
# Print the ten nearest neighbours for two variant spellings of the same
# query word so the two result lists can be compared.
# (To inspect a raw vector instead: print(model.wv['生物']))
for position, query in enumerate(('臺中', '台中')):
    if position > 0:
        # Separator between the two result lists.
        print('\n')
    print(query + ' -->')
    similarities = model.wv.most_similar(query, topn=10)
    for neighbour in similarities:
        print(neighbour)
    

臺中 -->
('臺中市', 0.7898079752922058)
('豐原', 0.7198904156684875)
('烏日', 0.7067549228668213)
('文心', 0.6914694905281067)
('臺中港', 0.6855792999267578)
('彰化', 0.6835096478462219)
('taichung', 0.6778730750083923)
('仁友', 0.6771103739738464)
('潭子', 0.674151599407196)
('北屯', 0.6613706946372986)


台中 -->
('benjamin', 0.8150575757026672)
('samuel', 0.7168459296226501)
('joseph', 0.7030758857727051)
('robert', 0.6753789186477661)
('herbert', 0.6723706126213074)
('thomas', 0.6719692349433899)
('richard', 0.662871778011322)
('abraham', 0.661365270614624)
('charles', 0.6542357802391052)
('kenneth', 0.6532087922096252)
