## 1. 20뉴스그룹 데이터 전처리
- https://wikidocs.net/69141

In [23]:
import pandas as pd
import numpy as np
import nltk
import tqdm
from nltk.corpus import stopwords
from sklearn.datasets import fetch_20newsgroups
from tensorflow.keras.preprocessing.text import Tokenizer

In [4]:
# 하나의 샘플에 최소 단어 2개는 있어야 함. 
# 그래야만 중심 단어, 주변 단어의 관계가 성립하며 그렇지 않으면 샘플을 구성할 수 없어 에러가 발생

dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data
print('총 샘플 수 :',len(documents))

총 샘플 수 : 11314


In [5]:
news_df = pd.DataFrame({'document':documents})
# 특수 문자 제거
news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ")

# 길이가 3이하인 단어는 제거 (길이가 짧은 단어 제거)
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

# 전체 단어에 대한 소문자 변환
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())

  news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ")


In [6]:
news_df

Unnamed: 0,document,clean_doc
0,Well i'm not sure about the story nad it did s...,well sure about story seem biased what disagre...
1,"\n\n\n\n\n\n\nYeah, do you expect people to re...",yeah expect people read actually accept hard a...
2,Although I realize that principle is not one o...,although realize that principle your strongest...
3,Notwithstanding all the legitimate fuss about ...,notwithstanding legitimate fuss about this pro...
4,"Well, I will have to change the scoring on my ...",well will have change scoring playoff pool unf...
...,...,...
11309,"Danny Rubenstein, an Israeli journalist, will ...",danny rubenstein israeli journalist will speak...
11310,\n,
11311,\nI agree. Home runs off Clemens are always m...,agree home runs clemens always memorable kinda...
11312,I used HP DeskJet with Orange Micros Grappler ...,used deskjet with orange micros grappler syste...


In [7]:
# 빈값이 있는지 확인
news_df.replace("", float("NaN"), inplace=True)
news_df.isnull().values.any()

True

In [8]:
# null인 값 삭제 
news_df.dropna(inplace=True)
print('총 샘플 수 :',len(news_df))

총 샘플 수 : 10995


In [9]:
# 불용어를 제거, 모든 샘플 중 단어가 1개 이하인 경우도 삭제 
stop_words = stopwords.words('english')
tokenized_doc = news_df['clean_doc'].apply(lambda x: x.split())
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])
tokenized_doc = tokenized_doc.to_list()

In [11]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokenized_doc)
# 정수 인코딩 
word2idx = tokenizer.word_index
idx2word = {value : key for key, value in word2idx.items()}
encoded = tokenizer.texts_to_sequences(tokenized_doc)

In [13]:
encoded[:2]

[[9,
  59,
  603,
  207,
  3278,
  1495,
  474,
  702,
  9470,
  13686,
  5533,
  15229,
  702,
  442,
  702,
  70,
  1148,
  1095,
  1036,
  20294,
  984,
  705,
  4295,
  702,
  217,
  207,
  1979,
  15230,
  13686,
  4865,
  4520,
  87,
  1530,
  6,
  52,
  149,
  581,
  661,
  4406,
  4988,
  4866,
  1920,
  755,
  10668,
  1103,
  7838,
  442,
  957,
  10669,
  634,
  51,
  228,
  2669,
  4989,
  178,
  66,
  222,
  4521,
  6066,
  68,
  4296],
 [1027,
  532,
  2,
  60,
  98,
  582,
  107,
  800,
  23,
  79,
  4522,
  333,
  7839,
  864,
  421,
  3825,
  458,
  6488,
  458,
  2700,
  4730,
  333,
  23,
  9,
  4731,
  7263,
  186,
  310,
  146,
  170,
  642,
  1260,
  107,
  33571,
  13,
  985,
  33572,
  33573,
  9471,
  11491]]

In [14]:
vocab_size = len(word2idx) + 1 
print('단어 집합의 크기 :', vocab_size)

단어 집합의 크기 : 64282


## 2. 네거티브 샘플링을 통한 데이터셋 구성하기

In [15]:
from tensorflow.keras.preprocessing.sequence import skipgrams
# 네거티브 샘플링
# 10개만 해보자 
skip_grams = [skipgrams(sample, vocabulary_size=vocab_size, window_size=10) for sample in encoded[:10]]

In [17]:
# 첫번째 샘플인 skip_grams[0] 내 skipgrams로 형성된 데이터셋 확인
pairs, labels = skip_grams[0][0], skip_grams[0][1]
for i in range(5):
    print("({:s} ({:d}), {:s} ({:d})) -> {:d}".format(
          idx2word[pairs[i][0]], pairs[i][0], 
          idx2word[pairs[i][1]], pairs[i][1], 
          labels[i]))

(world (70), described (984)) -> 1
(look (66), dominican (35851)) -> 0
(commited (7838), mcovingt (12663)) -> 0
(media (702), incidences (20294)) -> 1
(reports (755), reason (149)) -> 1


In [20]:
print('전체 샘플 수 :',len(skip_grams))

전체 샘플 수 : 10


In [21]:
# 첫번째 뉴스그룹 샘플에 대해서 생긴 pairs와 labels의 개수
print(len(pairs))
print(len(labels))

2220
2220


In [26]:
# 모든 뉴스그룹 샘풀에 대해 수행
skip_grams = [skipgrams(sample, vocabulary_size=vocab_size, window_size=10) for sample in encoded]

## 3. Skip-Gram with Negative Sampling(SGNS) 구현하기

In [28]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Reshape, Activation, Input
from tensorflow.keras.layers import Dot
from tensorflow.keras.utils import plot_model
from IPython.display import SVG

In [29]:
# 하이퍼파라미터인 임베딩 벡터의 차원은 100으로 설정하고 2개의 임베딩 층 추가 ㅠㅠ
embedding_dim = 100

# 중심 단어를 위한 임베딩 테이블
w_inputs = Input(shape=(1, ), dtype='int32')
word_embedding = Embedding(vocab_size, embedding_dim)(w_inputs)

# 주변 단어를 위한 임베딩 테이블
c_inputs = Input(shape=(1, ), dtype='int32')
context_embedding  = Embedding(vocab_size, embedding_dim)(c_inputs)

In [40]:
dot_product = Dot(axes=2)([word_embedding, context_embedding])
dot_product = Reshape((1,), input_shape=(1, 1))(dot_product)
output = Activation('sigmoid')(dot_product)

model = Model(inputs=[w_inputs, c_inputs], outputs=output)
model.summary()
model.compile(loss='binary_crossentropy', optimizer='adam')
plot_model(model, to_file='model3.png', show_shapes=True, show_layer_names=True, rankdir='TB')

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1, 100)       6428200     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 100)       6428200     input_2[0][0]                    
____________________________________________________________________________________________

In [1]:
for epoch in range(1, 2):
    loss = 0
    for _, elem in enumerate(skip_grams):
        first_elem = np.array(list(zip(*elem[0]))[0], dtype='int32')
        second_elem = np.array(list(zip(*elem[0]))[1], dtype='int32')
        labels = np.array(elem[1], dtype='int32')
        X = [first_elem, second_elem]
        Y = labels
        loss += model.train_on_batch(X,Y)  
    print('Epoch :',epoch, 'Loss :',loss)


NameError: name 'skip_grams' is not defined

## 4. 결과 확인하기

In [None]:
import gensim

f = open('vectors.txt' ,'w')
f.write('{} {}\n'.format(vocab_size-1, embed_size))
vectors = model.get_weights()[0]
for word, i in tokenizer.word_index.items():
    f.write('{} {}\n'.format(word, ' '.join(map(str, list(vectors[i, :])))))
f.close()

# 모델 로드
w2v = gensim.models.KeyedVectors.load_word2vec_format('./vectors.txt', binary=False)

In [None]:
w2v.most_similar(positive=['soldiers'])

In [None]:
w2v.most_similar(positive=['doctor'])

In [None]:
w2v.most_similar(positive=['police'])

In [None]:
w2v.most_similar(positive=['knife'])

In [None]:
w2v.most_similar(positive=['engine'])