In [6]:
import os
# Switch the working directory to the course-material folder so that the
# relative file names used by later cells resolve against it.
path = 'C:/Users/leemj/Desktop/텍스트마이닝1_실습자료'
os.chdir(path)

In [7]:
import warnings
# Install a global 'ignore' filter so that no warnings are shown by later cells.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below — confirm that blanket suppression is intended.
warnings.filterwarnings('ignore')

In [8]:
import re
import nltk

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

from gensim.models import word2vec

In [9]:
# Read the full review table, then carve out the two partitions named in the
# 'type' column. Each partition drops the 'type' column and is re-indexed
# from 0.
data = pd.read_csv('imdb_review_classification.csv', engine='python')
train, test = (
    data[data['type'] == split_name]
    .drop(columns='type')
    .reset_index(drop=True)
    for split_name in ('train', 'test')
)

In [10]:
# Lazily-filled cache so the stemmer and the stop-word set are built once
# instead of once per call (the function is invoked for every sentence).
_WORDLIST_RESOURCES = {}


def review_to_wordlist(review, remove_stopwords=False):
    """Convert a piece of review text into a list of stemmed, lower-case words.

    Steps: strip HTML markup with BeautifulSoup, replace every character that
    is not an ASCII letter with a space, lower-case and split on whitespace,
    optionally drop NLTK's English stop words, then stem each remaining word
    with the English Snowball stemmer.

    Parameters
    ----------
    review : str
        Raw text (may contain HTML).
    remove_stopwords : bool, default False
        When True, English stop words are removed before stemming.

    Returns
    -------
    list of str
        The stemmed tokens in their original order.
    """
    review_text = BeautifulSoup(review, 'html.parser').get_text()
    review_text = re.sub('[^a-zA-Z]', ' ', review_text)
    words = review_text.lower().split()

    if remove_stopwords:
        stops = _WORDLIST_RESOURCES.get('stops')
        if stops is None:
            # Loaded only on first use so the stop-word corpus is not required
            # when remove_stopwords is False.
            stops = frozenset(stopwords.words('english'))
            _WORDLIST_RESOURCES['stops'] = stops
        words = [w for w in words if w not in stops]

    stemmer = _WORDLIST_RESOURCES.get('stemmer')
    if stemmer is None:
        stemmer = SnowballStemmer('english')
        _WORDLIST_RESOURCES['stemmer'] = stemmer
    return [stemmer.stem(w) for w in words]

def review_to_sentences(review, tokenizer, remove_stopwords=False):
    """Split a review into sentences and turn each sentence into a word list.

    Parameters
    ----------
    review : str
        Raw review text; leading/trailing whitespace is stripped before
        sentence splitting.
    tokenizer
        Object whose ``tokenize(text)`` method returns the sentence strings.
    remove_stopwords : bool, default False
        Forwarded to ``review_to_wordlist`` for every sentence.

    Returns
    -------
    list of list of str
        One word list per non-empty sentence, in sentence order.
    """
    raw_sentences = tokenizer.tokenize(review.strip())
    # Forward remove_stopwords: it used to be accepted but silently ignored,
    # so stop words were never removed regardless of the argument.
    return [
        review_to_wordlist(raw_sentence, remove_stopwords)
        for raw_sentence in raw_sentences
        if len(raw_sentence) > 0
    ]

In [11]:
# Fetch NLTK's 'punkt' package, then load the English sentence tokenizer from
# it; the tokenizer is later passed to review_to_sentences.
nltk.download('punkt')
tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\leemj\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [12]:
# Tokenize every review (train first, then test) into sentences of stemmed
# words; the combined list is what word2vec is trained on.
sentences = []
total = train.shape[0] + test.shape[0]
# One stream over both partitions replaces two copy-pasted loops.
reviews = (review for frame in (train, test) for review in frame['review'])
for i, review in enumerate(reviews, start=1):
    # Report progress every 5000 reviews.
    if i % 5000 == 0:
        print('{}/{} preprocessing'.format(i, total))
    sentences += review_to_sentences(review, tokenizer, remove_stopwords=False)
print(len(sentences))

5000/50000 preprocessing
10000/50000 preprocessing
15000/50000 preprocessing
20000/50000 preprocessing
25000/50000 preprocessing
30000/50000 preprocessing
35000/50000 preprocessing
40000/50000 preprocessing
45000/50000 preprocessing
50000/50000 preprocessing
540589


In [13]:
num_features = 300 # dimensionality of the word vectors
min_word_count = 40 # minimum word count
num_workers = 4 # number of parallel worker threads
context = 10 # context window size
downsampling = 1e-3 # downsampling setting for frequent words

In [14]:
import logging
# Surface INFO-level log messages (training progress) in the cell output.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s : %(levelname)s: %(message)s')

# Train word2vec on the tokenized sentences with the hyperparameters set in
# the previous cell.
model = word2vec.Word2Vec(
    sentences,
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)
model

2019-06-27 22:13:06,936 : INFO: collecting all words and their counts
2019-06-27 22:13:06,938 : INFO: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-06-27 22:13:07,012 : INFO: PROGRESS: at sentence #10000, processed 208074 words, keeping 10464 word types
2019-06-27 22:13:07,123 : INFO: PROGRESS: at sentence #20000, processed 416297 words, keeping 14566 word types
2019-06-27 22:13:07,192 : INFO: PROGRESS: at sentence #30000, processed 624721 words, keeping 17731 word types
2019-06-27 22:13:07,287 : INFO: PROGRESS: at sentence #40000, processed 828515 words, keeping 20402 word types
2019-06-27 22:13:07,361 : INFO: PROGRESS: at sentence #50000, processed 1041534 words, keeping 22657 word types
2019-06-27 22:13:07,417 : INFO: PROGRESS: at sentence #60000, processed 1249208 words, keeping 24380 word types
2019-06-27 22:13:07,472 : INFO: PROGRESS: at sentence #70000, processed 1464197 words, keeping 26331 word types
2019-06-27 22:13:07,551 : INFO: PROGRESS: at sentenc

2019-06-27 22:14:44,872 : INFO: EPOCH 1 - PROGRESS: at 1.68% examples, 1444 words/s, in_qsize 7, out_qsize 0
2019-06-27 22:14:59,837 : INFO: EPOCH 1 - PROGRESS: at 1.86% examples, 1378 words/s, in_qsize 7, out_qsize 0
2019-06-27 22:15:02,158 : INFO: EPOCH 1 - PROGRESS: at 2.05% examples, 1475 words/s, in_qsize 7, out_qsize 0
2019-06-27 22:15:17,379 : INFO: EPOCH 1 - PROGRESS: at 2.23% examples, 1411 words/s, in_qsize 8, out_qsize 0
2019-06-27 22:15:20,409 : INFO: EPOCH 1 - PROGRESS: at 2.42% examples, 1488 words/s, in_qsize 7, out_qsize 0
2019-06-27 22:15:34,551 : INFO: EPOCH 1 - PROGRESS: at 2.59% examples, 1443 words/s, in_qsize 8, out_qsize 0
2019-06-27 22:15:37,808 : INFO: EPOCH 1 - PROGRESS: at 2.76% examples, 1509 words/s, in_qsize 7, out_qsize 0
2019-06-27 22:15:51,783 : INFO: EPOCH 1 - PROGRESS: at 2.94% examples, 1467 words/s, in_qsize 7, out_qsize 0
2019-06-27 22:15:55,140 : INFO: EPOCH 1 - PROGRESS: at 3.11% examples, 1523 words/s, in_qsize 7, out_qsize 0
2019-06-27 22:16:10

KeyboardInterrupt: 

In [None]:
# NOTE(review): per the gensim 3.x docs, replace=True keeps only the
# normalized vectors, so the model cannot be trained further afterwards —
# confirm this is acceptable before running.
model.init_sims(replace=True)# unload memory that is no longer needed
# File name (relative to the working directory) the trained model is saved under.
model_name = '300features_40minwords_10text'
model.save(model_name)

In [None]:
# Show the vocabulary words whose vectors are most similar to 'film'.
model.wv.most_similar(positive=['film'])

In [None]:
# Show the vocabulary words whose vectors are most similar to 'like'.
model.wv.most_similar(positive=['like'])

In [None]:
# Ask the model which of the four words does not belong with the rest.
model.wv.doesnt_match('man woman child kitchen'.split())

In [None]:
# Display the model's vocabulary mapping.
# NOTE(review): this renders the entire vocabulary in the cell output;
# consider showing only a slice or its length.
model.wv.vocab

In [None]:
# Visualize the embeddings with t-SNE
from sklearn.manifold import TSNE
import matplotlib as mpl
import matplotlib.pyplot as plt
import gensim
import gensim.models as g
# Draw minus signs as ASCII hyphens instead of the Unicode minus glyph.
mpl.rcParams['axes.unicode_minus'] = False
model_name = '300features_40minwords_10text'
# The file was written by word2vec.Word2Vec.save, so it must be loaded with
# the matching class; Doc2Vec.load is the wrong loader for a Word2Vec model.
model = g.Word2Vec.load(model_name)

In [None]:
# Collect the vector of every vocabulary word; row k of X belongs to vocab[k].
vocab = list(model.wv.vocab)
# Index the KeyedVectors (model.wv) rather than the model itself: model[...]
# is a deprecated alias in gensim 3.x, and the other cells already use model.wv.
X = model.wv[vocab]

print(len(X))
print(X[0][:10])
# Fix the seed so the 2-D layout is reproducible between runs.
tsne = TSNE(n_components=2, random_state=0)
# Visualize only the first 100 words
X_tsne = tsne.fit_transform(X[:100,:])

In [None]:
# Table of the 2-D t-SNE coordinates: one row per word (the first 100
# vocabulary entries, used as the index), columns 'x' and 'y'.
df = pd.DataFrame(X_tsne,index=vocab[:100],columns=['x','y'])

In [None]:
# Scatter the 2-D word coordinates on one large canvas and write each word
# next to its point.
fig, ax = plt.subplots(figsize=(40, 40))
ax.scatter(df['x'], df['y'])

for point in df.itertuples():
    ax.annotate(point.Index, (point.x, point.y), fontsize=30)
plt.show()