# Applications of word2vec

In [2]:
from gensim.models import word2vec
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import os

## turn back to main directory
## Resolve the parent directory only once per kernel: a bare os.chdir("../") climbs one
## more level every time this cell is re-run, which breaks the relative paths used below.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.abspath(os.pardir)
os.chdir(PROJECT_ROOT)
os.getcwd()

'/home/jovyan/Davis_Practice/NLP'

In [3]:
## load the trained word2vec model (produced by part1 03_word2vec_build.ipynb)
## NOTE(review): the note above says "part1" while the path points into part2/ -- confirm
MODEL_PATH = 'part2/word2vec_model/CBOW'
model = word2vec.Word2Vec.load(MODEL_PATH)

## similarity

In [4]:
## list the 10 vocabulary entries closest to the query word
model.wv.most_similar(positive=['AI'], topn=10)

[('人工智慧', 0.6728485822677612),
 ('人工智能', 0.6132781505584717),
 ('機器人', 0.5838004946708679),
 ('ai', 0.5285176634788513),
 ('AlphaGo', 0.5206490755081177),
 ('圍棋', 0.4965389668941498),
 ('自動化', 0.48469728231430054),
 ('AlphaGO', 0.48197638988494873),
 ('演算法', 0.4817844331264496),
 ('IOT', 0.47610437870025635)]

In [7]:
## list the 10 entries closest to 'AI' once the 'game' direction is subtracted
model.wv.most_similar(
    positive=['AI'],
    negative=['game'],
    topn=10,
)

[('人工智慧', 0.4605787396430969),
 ('人工智能', 0.4210156202316284),
 ('自動化', 0.3925265073776245),
 ('馬斯克', 0.37045812606811523),
 ('量子', 0.3544861078262329),
 ('天網', 0.3482709527015686),
 ('自駕車', 0.33507484197616577),
 ('奧創', 0.33338847756385803),
 ('植入', 0.3301090598106384),
 ('機器人', 0.32774603366851807)]

## clustering

In [8]:
## create a dictionary: words as keys, corpus counts as values
## gensim >= 4.0 removed `wv.vocab`; there the counts are read with `get_vecattr`.
## Fall back to the old attribute so the cell still runs on gensim < 4.0.
wv = model.wv
if hasattr(wv, "key_to_index"):
    words = {word: wv.get_vecattr(word, "count") for word in wv.key_to_index}
else:
    words = {word: vocab.count for word, vocab in wv.vocab.items()}

In [9]:
## keep the TOP_N_WORDS most frequent words (ties keep their original dictionary order)
## NOTE: this rebinds `words` from the count dictionary to an array of words, so
## re-run the previous cell before re-running this one.
TOP_N_WORDS = 10000
top_words = sorted(words, key=words.get, reverse=True)[:TOP_N_WORDS]
## build the array from the words alone so the counts are never coerced to strings
words = np.array(top_words)
words

array(['人', '八卦', '有沒有', ..., '唯有', '考到', '派遣'], dtype='<U20')

In [10]:
## look up the embedding of every selected word in the trained model
## (presumably a 2-D array with one row per entry of `words` -- confirm with vecs.shape)
vecs = model.wv[words]

In [11]:
## cluster the word vectors into 50 groups with k-means
## random_state fixes the centroid initialisation so cluster ids are reproducible;
## n_init is pinned to the historical default (10) so newer scikit-learn releases
## neither warn about nor silently change the number of restarts.
kmeans = KMeans(n_clusters=50, n_init=10, random_state=42)
cluster = kmeans.fit_predict(vecs)

In [12]:
## tabulate each word next to the id of the cluster it was assigned to
## built column-wise so the cluster ids keep their integer dtype (building row-wise
## and transposing turns every column into object dtype)
df = pd.DataFrame(
    {'words': words, 'no. cluster': cluster},
    columns=['words', 'no. cluster'],
)
df.head(n=5)

Unnamed: 0,words,no. cluster
0,人,14
1,八卦,4
2,有沒有,29
3,說,14
4,好,14


In [13]:
## lay the clusters out side by side: one column per cluster id holding that cluster's words
## (shorter columns are padded with NaN by the column-wise concat)
## Series.rename(cluster_id) names each column after its cluster; a Series has no
## `columns` axis, so rename(columns=...) raises TypeError on pandas >= 1.0.
data = pd.concat(
    [
        group['words'].reset_index(drop=True).rename(cluster_id)
        for cluster_id, group in df.groupby('no. cluster')
    ],
    axis=1,
)
## show only the first rows instead of dumping the whole table
data.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,穿,字,台,一個,八卦,工作,停電,問題,男,之內,...,看到,聽,這種,醫生,年,小心,肥宅,禁止,影片,處理
1,衣服,這是,台北,後,請,公司,發電,是否,女,殺,...,發現,歌,事,愛滋,月,身體,最近,板規,電影,資料
2,一件,話,地方,前,一下,老闆,反核,下,歲,館長,...,找,唱,罵,醫院,小時,手,題,政府,拍,電話
3,T,寫,高雄,已經,請問,薪水,地震,社會,男生,神,...,玩,音樂,相信,健康,幾天,懶,小弟,要求,節目,查
4,內褲,懂,南部,兩個,鄉民,勞工,台電,完全,小孩,最強,...,跑,唱歌,垃圾,機率,天,身上,安安,法律,廣告,現場
5,戴,意思,附近,未滿,相關,年輕人,建設,需要,甲甲,地球,...,開,好聽,嗆,醫師,一年,摸,本魯,開放,電視,檢舉
6,t,內容,台南,一次,注意,員工,環保,發生,結婚,seafood,...,變成,歌手,討厭,感冒,幾年,眼睛,鬧板,安全,當年,人員
7,穿著,名字,城市,每個,謝謝,賺錢,限電,無法,男人,王,...,記得,首歌,崩潰,檢查,久,尿,想到,規定,直播,收到
8,頭髮,講,地區,日,問,加班,核能,一種,生,角色,...,抓,歌詞,嘴,有效,一個月,硬,小魯,理由,經典,資訊
9,褲子,一句,台北市,幾個,分享,企業,核電,認為,女人,主角,...,上面,表演,一群,病,禮拜,屁股,有個,法,聖,確定
