In [1]:
# Word2Vec 구현
# (WikiDocs 참고 : https://wikidocs.net/60855)

In [2]:
# Import Module
import re
import urllib.request
import zipfile

import gensim
import nltk # (Natural Language Tool Kit)
from gensim.models import Word2Vec, KeyedVectors
from lxml import etree
from nltk.tokenize import word_tokenize, sent_tokenize

In [3]:
# Data Download
# urllib.request.urlretrieve("https://raw.githubusercontent.com/GaoleMeng/RNN-and-FFNN-textClassification/master/ted_en-20160408.xml", filename="ted_en-20160408.xml")

# Data Preprocessing
# Parse the XML inside a context manager so the file handle is released as
# soon as parsing is done instead of staying open for the kernel's lifetime.
with open("ted_en-20160408.xml", "r", encoding="UTF8") as target_file:
    target_text = etree.parse(target_file)

# Keep only the text nodes that sit under <content> elements.
content_text = '\n'.join(target_text.xpath('//content/text()'))
# Strip parenthesised annotations (e.g. background-sound markers).
content_text = re.sub(r'\([^)]*\)', '', content_text)

# Sentence segmentation.
# NOTE: sent_tokenize / word_tokenize need the NLTK "punkt" tokenizer data to
# be installed already (nltk.download("punkt")).
sent_text = sent_tokenize(content_text)

# Lower-case each sentence, then collapse every run of characters other than
# a-z / 0-9 (punctuation, whitespace, symbols) into a single space.
normalized_text = [
    re.sub(r"[^a-z0-9]+", " ", sentence.lower())
    for sentence in sent_text
]

# One list of word tokens per sentence: the input format used for training.
sentences = [word_tokenize(sentence) for sentence in normalized_text]

In [4]:
# Train Word2Vec Model
# gensim 4.0 renamed the embedding-dimension keyword from `size` to
# `vector_size`, and each major version rejects the other spelling with a
# TypeError. Select the keyword the installed version accepts so this cell
# runs on both 3.x and 4.x.
dim_kwarg = "vector_size" if int(gensim.__version__.split(".")[0]) >= 4 else "size"
word2vec = Word2Vec(
    sentences=sentences,
    window=5,
    min_count=5,
    workers=4,
    sg=1,  # 1 = skip-gram training algorithm
    **{dim_kwarg: 100},
)

# Get Result
# Print the nearest neighbours of each probe word. `model_result` ends up
# holding the result for the last word ("study"), as before.
for query_word in ("man", "study"):
    model_result = word2vec.wv.most_similar(query_word)
    print(f"Similar Words with {query_word} : {model_result}")

# Save Model
# Writes only the word vectors, in the word2vec text format.
word2vec.wv.save_word2vec_format("word2vec(gensim)")

Similar Words with man : [('woman', 0.7622225284576416), ('guy', 0.7191064357757568), ('gentleman', 0.6915066242218018), ('rabbi', 0.6880382299423218), ('soldier', 0.6879733800888062), ('joe', 0.6849181652069092), ('boy', 0.6792556047439575), ('dancer', 0.6760324835777283), ('michelangelo', 0.6685644388198853), ('lady', 0.6656143069267273)]
Similar Words with study : [('biomechanics', 0.6969070434570312), ('studies', 0.695245623588562), ('anthropology', 0.6941718459129333), ('psychology', 0.678261399269104), ('conducted', 0.675713300704956), ('survey', 0.6755701303482056), ('phd', 0.6609260439872742), ('undertook', 0.6603608727455139), ('demonstrated', 0.6579597592353821), ('thorough', 0.6526756286621094)]
