# 作業 : 調整 word2vec 模型的不同訓練參數

# [作業目標]
- 調整 word2vec 模型的不同參數, 分別觀察效果並比較

# [作業重點]
- 調整 word2vec 模型的不同訓練參數, 分別觀察效果並比較

In [1]:
# 下載語料庫(text8)
!wget http://mattmahoney.net/dc/text8.zip
!unzip text8.zip

--2021-01-14 13:29:35--  http://mattmahoney.net/dc/text8.zip
Resolving mattmahoney.net (mattmahoney.net)... 67.195.197.24
Connecting to mattmahoney.net (mattmahoney.net)|67.195.197.24|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 31344016 (30M) [application/zip]
Saving to: ‘text8.zip’


2021-01-14 13:29:50 (2.23 MB/s) - ‘text8.zip’ saved [31344016/31344016]

Archive:  text8.zip
  inflating: text8                   


In [2]:
# Load gensim and its word2vec module
import gensim
from gensim.models import word2vec

# Suppress all warning messages (this also hides deprecation warnings)
import warnings
warnings.filterwarnings("ignore")

# Word2Vec 訓練參數
- size : 詞向量的維度（gensim 4.0 起此參數改名為 vector_size）
- min_count : 最小次數，一個詞出現的次數若小於 min_count，則拋棄不參與訓練。
- window : 上下文視窗大小，也就是訓練時，目標詞的前後最多各看幾個詞作為上下文。
- 更多參數說明，請參閱官方文件
https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec

In [3]:
# Train word2vec embeddings with gensim on the text8 corpus.
# Text8Corpus streams the extracted 'text8' file as sentences for training.
sentences = word2vec.Text8Corpus('text8')
# Baseline for comparison: only size=10, with every other parameter left at its default.
model = word2vec.Word2Vec(
    sentences=sentences,
    size=10,       # dimensionality of each word vector
    min_count=3,   # drop words that occur fewer than 3 times
    window=5,      # context window size
)

In [4]:
# Show the words most similar to 'woman'.
# Query through model.wv (the KeyedVectors): the model-level most_similar()
# is a deprecated alias, and the other cells already use model.wv.
model.wv.most_similar(positive=['woman'])

[('siblings', 0.9515708684921265),
 ('dressed', 0.9446461200714111),
 ('girls', 0.931336522102356),
 ('babies', 0.9206811189651489),
 ('loved', 0.91548091173172),
 ('angry', 0.9091489315032959),
 ('heard', 0.8999865055084229),
 ('younger', 0.8966966271400452),
 ('husband', 0.8949397802352905),
 ('pregnant', 0.891036868095398)]

In [5]:
# Word analogy: find the words closest to (king + woman - man).
# `positive` words are added to the query and `negative` words subtracted;
# topn=5 limits the result to the five best matches.
# Query through model.wv: the model-level most_similar() is a deprecated alias.
model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=5)

[('deposed', 0.9536306262016296),
 ('brennus', 0.9466484785079956),
 ('reigned', 0.9464030861854553),
 ('visited', 0.9313496351242065),
 ('empress', 0.9307871460914612)]

In [6]:
# Pick the word that fits least well with the rest of the group.
model.wv.doesnt_match(words="breakfast cereal dinner lunch".split())

'cereal'

In [7]:
# Similarity score between the vectors of the two words.
model.wv.similarity(w1='woman', w2='man')

0.84294856

In [8]:
# Show the learned vector for 'computer'.
# Index model.wv (the KeyedVectors) rather than the model itself:
# model[...] is deprecated and the other cells already use model.wv.
model.wv['computer']

array([ 1.910302  ,  2.5080907 ,  4.329898  ,  0.22177306, -0.80510587,
       -5.710003  , -0.45526665,  6.4719987 , -0.5645616 , -4.5551887 ],
      dtype=float32)