In [2]:
#導入相關模組
import json
import random
import re

import jieba
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

#讀取jieba所需的自建辭典
jieba.load_userdict("E:/Python 3.7/pyetl/Demodb0103/gym/data/dict.txt")
import time

In [3]:
#讀取要斷詞的json檔
DATASET_DIR = 'E:/Python 3.7/pyetl/Demodb0103/gym/data/gymz.json'
with open(DATASET_DIR, encoding='utf8') as f:
    dataset = json.load(f)

In [6]:
#讀取停止詞典
STOP_WORDS_DIR = 'E:/Python 3.7/pyetl/Demodb0103/gym/data/stopWords.txt'
with open(STOP_WORDS_DIR, encoding='utf8') as f:
    stop_words = f.read().splitlines() 

In [7]:
#抽出文章的標題與內容
content_list = list(map(lambda d: d['content'], dataset))
title_list = list(map(lambda d: d['title'], dataset))

In [8]:
#替換不需要的句子，並正規化
# start = time.clock()
gym_list =[]
for i in content_list:
    i = re.sub('※ 發信站: 批踢踢實業坊',' ', i)
    i = re.sub('※ 文章網址:',' ', i)
    i = re.sub('※ 編輯:',' ', i)
    i = re.sub('(臺灣)',' ', i)
    i = re.sub('來自:',' ', i)
    #只保留中文
    rule = re.compile(r"[^\u4e00-\u9fa5]")
    i = rule.sub('', i)
    gym_list.append(i)


In [10]:
#將文章正規化，並進行jieba斷詞
rule = re.compile(r"[^\u4e00-\u9fa5]")
gym_list = [list(jieba.cut(rule.sub('', speech))) for speech in gym_list]

In [11]:
##去除停止詞
for idx, speech in enumerate(gym_list):
    gym_list[idx] = ' '.join([word for word in speech if word not in stop_words])
# end = time.clock()
# print('運行時間: ' + str(end - start))

In [13]:
#Word2Vec
# 資料形式為一個文檔各為list元素為字串
from gensim.models import Word2Vec

#全文章轉為各list包起來
doc_clean1 = [doc.split() for doc in gym_list]

# 建立模型
#window:CBOW下決定Word2Vec一次取多少詞來預測中間詞
#min_count:出現次數大於等於min_count的詞，才會納入Word2Vec的詞典中
#negative:Negative Sampling的取樣數量，5~20適合小數據，2~5適合大數據
# worker=使用多核計算機進行更快的訓練
model = Word2Vec(doc_clean1,window=5, negative=5,min_count=1,size=250, iter=10)

# # 基於2d PCA擬合數據
# X = model[model.wv.vocab]
# pca = PCA(n_components=2)
# result = pca.fit_transform(X)

In [15]:
#顯示指定詞的關聯詞
model.wv.most_similar('肥')

[('很胖', 0.8617879152297974),
 ('很瘦', 0.8611124753952026),
 ('紙片', 0.8551267385482788),
 ('瘦子', 0.8493390679359436),
 ('胖子', 0.8449896574020386),
 ('想瘦', 0.8367601037025452),
 ('回娘家', 0.8110179305076599),
 ('下屆齡', 0.8108158111572266),
 ('瘦下', 0.810620903968811),
 ('變美', 0.8088332414627075)]

In [17]:
import pandas as pd
#顯示各個指定詞的關聯詞
def most_similar(w2v_model, words, topn=10):
    similar_df = pd.DataFrame()
    for word in words:
        try:
            similar_words = pd.DataFrame(w2v_model.wv.most_similar(word, topn=topn), columns=[word, 'cos'])
            similar_df = pd.concat([similar_df, similar_words], axis=1)
        except:
            print(word, "not found in Word2Vec model!")
    return similar_df
most_similar(model, ['增肌', '肥'])

Unnamed: 0,增肌,cos,肥,cos.1
0,減脂,0.79888,很胖,0.861788
1,增脂,0.784306,很瘦,0.861112
2,增肌期,0.777738,紙片,0.855127
3,減脂期,0.773392,瘦子,0.849339
4,實施實,0.77062,胖子,0.84499
5,訓外,0.769521,想瘦,0.83676
6,主感謝,0.757237,回娘家,0.811018
7,增肌先,0.754268,下屆齡,0.810816
8,抓五個,0.734153,瘦下,0.810621
9,增肌為,0.729781,變美,0.808833


In [265]:
# import word2vec #需要word2vec模組
# indexes = model.cosine(u'增肌')
# for index in indexes[0]:
#     print (model.vocab[index])

ModuleNotFoundError: No module named 'word2vec'

In [18]:
# fasttext
from gensim.models import FastText
model2 = FastText(doc_clean1,window=5, negative=5,min_count=1,size=250, iter=10)


In [19]:
#顯示指定詞的關聯詞
model2.wv.most_similar('肥')

[('很瘦', 0.8827726244926453),
 ('很胖', 0.8802703619003296),
 ('紙片', 0.8683609962463379),
 ('人紙片', 0.8507423400878906),
 ('瘦子', 0.8441587090492249),
 ('胖子', 0.83818519115448),
 ('想瘦', 0.8364203572273254),
 ('瘦瘦', 0.828603982925415),
 ('臉胖要', 0.8275570273399353),
 ('變胖變', 0.8255159258842468)]

In [20]:
#顯示各個指定詞的關聯詞
def most_similar(fas_model, words, topn=10):
    similar_df = pd.DataFrame()
    for word in words:
        try:
            similar_words = pd.DataFrame(fas_model.wv.most_similar(word, topn=topn), columns=[word, 'cos'])
            similar_df = pd.concat([similar_df, similar_words], axis=1)
        except:
            print(word, "not found in Word2Vec model!")
    return similar_df
most_similar(model2, ['增肌', '肥'])

Unnamed: 0,增肌,cos,肥,cos.1
0,增肌量,0.966287,很瘦,0.882773
1,增肌菜,0.964887,很胖,0.88027
2,增肌增,0.963615,紙片,0.868361
3,增肌壯,0.963447,人紙片,0.850742
4,增肌健,0.962881,瘦子,0.844159
5,增肌消,0.962729,胖子,0.838185
6,增肌當,0.962651,想瘦,0.83642
7,增肌文,0.962407,瘦瘦,0.828604
8,增肌臉,0.962385,臉胖要,0.827557
9,增肌早,0.962175,變胖變,0.825516
