이 자료는 위키독스 딥 러닝을 이용한 자연어 처리 입문의 네거티브 샘플링 구현하기 튜토리얼입니다.  

링크 : https://wikidocs.net/69141  

2021년 10월 14일에 마지막으로 테스트되었습니다.

# 1. 20뉴스그룹 데이터 전처리하기

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.datasets import fetch_20newsgroups
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Fetch the 20 Newsgroups posts with headers, footers and quoted replies removed,
# shuffled with a fixed random_state so the document order is reproducible.
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
# The raw post texts; everything below works on this list.
documents = dataset.data
# Report how many documents were loaded.
print('총 샘플 수 :',len(documents))

총 샘플 수 : 11314


In [3]:
# Wrap the raw documents in a DataFrame so each cleaning step is a column operation.
news_df = pd.DataFrame({'document': documents})
# Replace every non-alphabetic character with a space.
# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave punctuation and digits in place.
news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ", regex=True)
# Drop short words: keep only the words longer than 3 characters.
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
# Lowercase every remaining word.
news_df['clean_doc'] = news_df['clean_doc'].str.lower()

  This is separate from the ipykernel package so we can avoid doing imports until


In [4]:
# Preview the first rows to compare the raw text with its cleaned version.
news_df.head()

Unnamed: 0,document,clean_doc
0,Well i'm not sure about the story nad it did s...,well sure about story seem biased what disagre...
1,"\n\n\n\n\n\n\nYeah, do you expect people to re...",yeah expect people read actually accept hard a...
2,Although I realize that principle is not one o...,although realize that principle your strongest...
3,Notwithstanding all the legitimate fuss about ...,notwithstanding legitimate fuss about this pro...
4,"Well, I will have to change the scoring on my ...",well will have change scoring playoff pool unf...


In [5]:
# Check whether any cell of the DataFrame is null.
news_df.isnull().values.any()

False

In [6]:
# Check for empty values: turn empty strings into NaN so that the same
# null check also detects documents that ended up with no text.
news_df.replace("", float("NaN"), inplace=True)
news_df.isnull().values.any()

True

In [7]:
# Drop the rows that contain NaN and report how many documents remain.
news_df.dropna(inplace=True)
print('총 샘플의 수:' , len(news_df))

총 샘플의 수: 10995


In [8]:
# Remove stopwords.
import nltk
nltk.download('stopwords')
stop_words = stopwords.words('english')  # English stopword list from NLTK
# Every token of every document is tested for membership, so look it up in a
# set (constant time) instead of scanning the stopword list each time.
stop_word_set = set(stop_words)
tokenized_doc = news_df['clean_doc'].apply(lambda x: x.split())  # whitespace tokenization
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_word_set])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [9]:
# Convert the pandas Series of token lists into a plain Python list.
tokenized_doc = tokenized_doc.to_list()

In [10]:
# A document with fewer than two tokens cannot provide both a center word and
# a context word, so record the positions of such documents for removal.
drop_train = [
    position
    for position, tokens in enumerate(tokenized_doc)
    if len(tokens) < 2
]

In [11]:
# Show the indices of the documents that will be removed.
print(drop_train)

[44, 260, 353, 1651, 1839, 2321, 2336, 2371, 2862, 2963, 3290, 3387, 3395, 3396, 3421, 3563, 3591, 3713, 3874, 3897, 4180, 4524, 4587, 4617, 4947, 4970, 5129, 5525, 6015, 6227, 6652, 6723, 6883, 7080, 7956, 8000, 8156, 8212, 8283, 8588, 8867, 8903, 9045, 9555, 9696, 10439, 10447, 10564, 10707, 10730, 10750, 10838, 10896, 10908, 10967]


In [12]:
# Remove the documents listed in drop_train.
# A plain list filter replaces np.delete here: the token lists have different
# lengths, and NumPy >= 1.24 raises an error instead of a warning when asked
# to build an array from such ragged input.
drop_indices = set(drop_train)
tokenized_doc = [doc for index, doc in enumerate(tokenized_doc) if index not in drop_indices]

  arr = asarray(arr)


In [13]:
# Report the number of documents left after the removal.
print("총 샘플 수:", len(tokenized_doc))

총 샘플 수: 10940


In [14]:
# Fit a Keras Tokenizer on the token lists so every word gets an integer index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokenized_doc)

# Integer-encode the documents, then keep both lookup directions:
# word -> index (from the tokenizer) and index -> word (its inverse).
encoded = tokenizer.texts_to_sequences(tokenized_doc)
word2idx = tokenizer.word_index
idx2word = dict(zip(word2idx.values(), word2idx.keys()))

In [15]:
# Show the first two integer-encoded samples.

print(encoded[:2])

[[9, 59, 603, 207, 3278, 1495, 474, 702, 9470, 13686, 5533, 15227, 702, 442, 702, 70, 1148, 1095, 1036, 20294, 984, 705, 4294, 702, 217, 207, 1979, 15228, 13686, 4865, 4520, 87, 1530, 6, 52, 149, 581, 661, 4406, 4988, 4866, 1920, 755, 10668, 1102, 7837, 442, 957, 10669, 634, 51, 228, 2669, 4989, 178, 66, 222, 4521, 6066, 68, 4295], [1026, 532, 2, 60, 98, 582, 107, 800, 23, 79, 4522, 333, 7838, 864, 421, 3825, 458, 6488, 458, 2700, 4730, 333, 23, 9, 4731, 7262, 186, 310, 146, 170, 642, 1260, 107, 33568, 13, 985, 33569, 33570, 9471, 11491]]


In [16]:
# One more than the number of indexed words, so that the largest word index
# is still a valid position when vocab_size is used as a table size.
# NOTE(review): presumably index 0 is reserved by the Keras Tokenizer — confirm.
vocab_size = len(word2idx) + 1
print('단어 집합의 크기 :', vocab_size)

단어 집합의 크기 : 64277


In [17]:
from tensorflow.keras.preprocessing.sequence import skipgrams

In [18]:
# Negative sampling: generate skip-gram (pairs, labels) data for the first
# 10 encoded documents only, as a quick preview.
skip_grams = []
for sample in encoded[:10]:
    skip_grams.append(skipgrams(sample, vocabulary_size=vocab_size, window_size=10))

# Inspect the dataset produced for the first document, skip_grams[0].
pairs, labels = skip_grams[0]
for i in range(5):
    target, context = pairs[i]
    print("({:s} ({:d}), {:s} ({:d})) -> {:d}".format(
          idx2word[target], target,
          idx2word[context], context,
          labels[i]))

(israeli (442), austria (4866)) -> 1
(daily (1920), israeli (442)) -> 1
(austria (4866), nosebleed (29526)) -> 0
(clearly (661), soldiers (957)) -> 1
(soldiers (957), look (66)) -> 1


In [19]:
# Number of documents for which skip-gram data was generated.
print('전체 샘플 수:' , len(skip_grams))

전체 샘플 수: 10


In [20]:
# Number of pairs and labels generated for the first sample.
print(len(pairs))
print(len(labels))

2220
2220


In [21]:
# Generate skip-gram data for every encoded newsgroup document;
# this rebinds skip_grams, replacing any earlier value.

skip_grams = [skipgrams(sample, vocabulary_size=vocab_size, window_size=10) for sample in encoded]

# 2. Skip-Gram with Negative Sampling(SGNS) 구현하기

In [30]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Reshape, Activation, Input
from tensorflow.keras.layers import Dot
from tensorflow.keras.utils import plot_model
from IPython.display import SVG
import tensorflow as tf

In [31]:
# Size of each word vector; used as the output dimension of the Embedding layers.
embedding_dim = 100

In [32]:
# Embedding table for center words: one integer word index in, one vector out.
w_inputs = Input(shape=(1,), dtype='int32')
word_embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim)
word_embedding = word_embedding_layer(w_inputs)

# Separate embedding table for context words, fed by its own input.
c_inputs = Input(shape=(1,), dtype='int32')
context_embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim)
context_embedding = context_embedding_layer(c_inputs)

In [33]:
# Dot product of the center and context vectors along the embedding axis.
pair_similarity = Dot(axes=2)([word_embedding, context_embedding])
# Collapse the result to a single value per pair.
dot_product = Reshape(target_shape=(1,), input_shape=(1, 1))(pair_similarity)
# Squash the score through a sigmoid so it can be compared with the 0/1 labels.
output = Activation(activation='sigmoid')(dot_product)

In [34]:
# Build, summarize and compile the SGNS model on the first GPU.
with tf.device('/device:GPU:0'):
    # Two inputs (center word index, context word index) -> one sigmoid score.
    model = Model(inputs=[w_inputs, c_inputs], outputs=output)
    model.summary()
    # Binary cross-entropy matches the 0/1 labels that accompany each pair.
    model.compile(loss='binary_crossentropy', optimizer='adam')
    # Save a diagram of the architecture to skip_gram.png.
    plot_model(model, to_file='skip_gram.png', show_shapes=True, show_layer_names=True, rankdir='TB')

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, 1, 100)       6427700     ['input_3[0][0]']                
                                                                                                  
 embedding_3 (Embedding)        (None, 1, 100)       6427700     ['input_4[0][0]']                
                                                                                            

In [35]:
# Train for 5 epochs, using the skip-gram data of one document as one batch.
with tf.device('/device:GPU:0'):
    for epoch in range(1, 6):
        # Sum of the per-batch losses over this epoch.
        loss = 0
        for elem in skip_grams:
            # elem is (pairs, labels); a document that produced no pairs has
            # nothing to train on and would otherwise fail when unzipped.
            if not elem[0]:
                continue
            # Unzip the (center, context) pairs once into two parallel columns.
            first_elem, second_elem = zip(*elem[0])
            X = [np.array(first_elem, dtype='int32'), np.array(second_elem, dtype='int32')]
            Y = np.array(elem[1], dtype='int32')
            loss += model.train_on_batch(X, Y)
        print('Epoch :',epoch, 'Loss :',loss)

Epoch : 1 Loss : 4626.999836221337
Epoch : 2 Loss : 3667.6702266596258


# 3. 결과 확인하기

In [36]:
import gensim

In [37]:
# Export the learned vectors as text: a "<word count> <dimension>" header line,
# then one "<word> <v1> <v2> ..." line per word.
# NOTE(review): get_weights()[0] is presumably the center-word embedding table — confirm.
vectors = model.get_weights()[0]
# The context manager closes the file even if a write fails; the explicit
# encoding keeps the output independent of the platform's default encoding.
with open('vectors.txt', 'w', encoding='utf-8') as f:
    # vocab_size - 1 because only the words in word_index are written out.
    f.write('{} {}\n'.format(vocab_size-1, embedding_dim))
    for word, i in tokenizer.word_index.items():
        f.write('{} {}\n'.format(word, ' '.join(map(str, vectors[i]))))

In [38]:
# Load the exported text-format vectors (hence binary=False) into gensim
# KeyedVectors so they can be queried for similar words.
w2v = gensim.models.KeyedVectors.load_word2vec_format('./vectors.txt', binary=False)

In [39]:
# Words most similar to 'disease' in the trained vectors, with their scores.
w2v.most_similar(positive=['disease'])

[('lyme', 0.6410250663757324),
 ('treatment', 0.6345786452293396),
 ('patients', 0.6202646493911743),
 ('diseases', 0.6182955503463745),
 ('organism', 0.5998130440711975),
 ('yeast', 0.5960279107093811),
 ('quack', 0.5926334261894226),
 ('infection', 0.5876334309577942),
 ('doctors', 0.5827353596687317),
 ('dietary', 0.5818358659744263)]

In [40]:
# Words most similar to 'soldiers' in the trained vectors, with their scores.
w2v.most_similar(positive=['soldiers'])

[('villages', 0.7981277704238892),
 ('lebanon', 0.7938109636306763),
 ('wounded', 0.791567325592041),
 ('syrians', 0.7861190438270569),
 ('civilians', 0.7822908163070679),
 ('troops', 0.7757846713066101),
 ('terrorist', 0.774803876876831),
 ('massacred', 0.7662931680679321),
 ('baku', 0.7624136209487915),
 ('slaughtered', 0.7620065212249756)]

In [41]:
# Words most similar to 'police' in the trained vectors, with their scores.
w2v.most_similar(positive=['police'])

[('governments', 0.6240807771682739),
 ('compromise', 0.6172217726707458),
 ('minorities', 0.6166990995407104),
 ('officers', 0.6066833734512329),
 ('prisons', 0.5983291864395142),
 ('banning', 0.587188184261322),
 ('casualties', 0.5846469402313232),
 ('grounds', 0.5840885639190674),
 ('enforcement', 0.5830293297767639),
 ('civilian', 0.5794256925582886)]

In [42]:
# Words most similar to 'hero' in the trained vectors, with their scores.
w2v.most_similar(positive=['hero'])

[('blaming', 0.602494478225708),
 ('unconvincing', 0.5977667570114136),
 ('lamb', 0.5845485329627991),
 ('grew', 0.5796825289726257),
 ('crucify', 0.5793640613555908),
 ('merciful', 0.571392297744751),
 ('thessalonians', 0.5708247423171997),
 ('proceeding', 0.5648764371871948),
 ('begun', 0.5648080110549927),
 ('parents', 0.5646618604660034)]

In [43]:
# Words most similar to 'engine' in the trained vectors, with their scores.
w2v.most_similar(positive=['engine'])

[('tires', 0.5625557899475098),
 ('steering', 0.5574847459793091),
 ('seat', 0.5367389917373657),
 ('tune', 0.5289426445960999),
 ('ground', 0.5249112844467163),
 ('throttle', 0.5156267881393433),
 ('valve', 0.507521390914917),
 ('slip', 0.5051360726356506),
 ('gear', 0.5050402283668518),
 ('pickup', 0.5048443078994751)]

In [44]:
# Words most similar to 'doctor' in the trained vectors, with their scores.
w2v.most_similar(positive=['doctor'])

[('quack', 0.6410490274429321),
 ('atkins', 0.6277726292610168),
 ('pain', 0.6146212220191956),
 ('oral', 0.5994016528129578),
 ('anecdotal', 0.583179771900177),
 ('yeast', 0.5770706534385681),
 ('treatments', 0.574281632900238),
 ('tradition', 0.5669472813606262),
 ('weight', 0.5656561851501465),
 ('diet', 0.5589014291763306)]

In [45]:
# Words most similar to 'money' in the trained vectors, with their scores.
w2v.most_similar(positive=['money'])

[('revisionists', 0.45867007970809937),
 ('budget', 0.43610745668411255),
 ('smoke', 0.4102879762649536),
 ('benz', 0.40686142444610596),
 ('spending', 0.3967535197734833),
 ('calif', 0.39170384407043457),
 ('tosar', 0.3898450434207916),
 ('taxes', 0.3777411878108978),
 ('funds', 0.37328413128852844),
 ('disgust', 0.3714861273765564)]