In [2]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/gieunkwak/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gieunkwak/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### word2vec을 이용한 단어 embedding

In [124]:
# GPT야, simpsons 캐릭터 이름이 들어간 랜덤 문장 10개를 생성해줘

sentences = ["Homer Simpson forgot his lunch at home, so he had to buy a burger on his way to work.",
    "Marge was busy knitting a new sweater for Bart's upcoming school play.",
    "Lisa Simpson played a beautiful saxophone solo at the school concert.",
    "Mr. Burns secretly plotted another scheme from his office at the Springfield Nuclear Power Plant.",
    "Ned Flanders offered to help Homer fix the fence between their houses.",
    "Bart Simpson tried a new prank at school, but it didn't go as planned.",
    "Milhouse and Bart spent the afternoon playing video games and forgot to do their homework.",
    "Maggie Simpson's adorable giggle filled the room as she played with her toys.",
    "Apu had a busy day at the Kwik-E-Mart, dealing with a rush of customers.",
    "Krusty the Clown decided to change his show a bit to attract a new audience."]

In [125]:
# preprocessing
# get rid of stopwords, lower case

sentences = [s.lower().replace(".", "").split(" ") for s in sentences]

In [126]:
sentences[0]

['homer',
 'simpson',
 'forgot',
 'his',
 'lunch',
 'at',
 'home,',
 'so',
 'he',
 'had',
 'to',
 'buy',
 'a',
 'burger',
 'on',
 'his',
 'way',
 'to',
 'work']

In [129]:
# train word2vec

skip_gram = Word2Vec(sentences, vector_size=300, min_count=1, window=5, sg=1)
cbow = Word2Vec(sentences, vector_size=100, min_count=1, window=5, sg=0)

In [130]:
print("{} 의 vector representation : \n{}".format('homer', skip_gram.wv.get_vector(skip_gram.wv.key_to_index['homer'])))

homer 의 vector representation : 
[ 4.3701060e-04  2.2055381e-03  3.3186206e-03  2.9940126e-03
 -2.6567397e-03  2.1234783e-03 -1.8821131e-03 -2.3819870e-04
  1.6127196e-04  2.1778548e-03  1.5117687e-03  1.5151121e-03
  3.1674108e-03  1.1945881e-04 -2.0241914e-03 -2.1457686e-03
  2.2043383e-03 -1.7624284e-03 -9.4591797e-04  1.2793364e-03
 -7.2408590e-04 -2.0029263e-03 -7.5762207e-04  4.2041132e-04
  7.6597626e-04  2.0379657e-03 -1.7428604e-03  1.0297104e-03
  2.4278299e-03  6.9946179e-04  1.7967316e-03 -1.5996851e-03
  2.0529374e-03 -2.5236835e-03  1.1533408e-03 -3.0791005e-03
 -8.4886025e-04 -3.0566400e-03 -5.1756331e-04 -1.7981926e-03
 -1.2938769e-03  3.8499379e-04  9.4194955e-04 -5.1014154e-04
 -2.6721673e-03 -1.9110956e-03  2.8245136e-04 -1.2844786e-03
 -3.1514806e-03 -2.4119446e-04  2.2119523e-03  1.9877201e-03
 -3.3284628e-03  1.0791372e-03 -2.0474067e-03 -3.0453433e-03
  3.5906611e-05 -8.0902988e-05 -2.3292252e-03 -2.0569263e-03
 -7.9015619e-04  2.3795723e-03 -2.5085832e-03  2.559

In [131]:
skip_gram.wv.most_similar("homer")

[('video', 0.14035673439502716),
 ('his', 0.12365606427192688),
 ('adorable', 0.11183691769838333),
 ('burger', 0.10865367203950882),
 ('planned', 0.09786190837621689),
 ('she', 0.09258976578712463),
 ('do', 0.09069041162729263),
 ('as', 0.08781418949365616),
 ('concert', 0.08753109723329544),
 ('lisa', 0.08640199154615402)]

In [132]:
homer_vector = skip_gram.wv.get_vector(skip_gram.wv.key_to_index['homer'])
video_vector = skip_gram.wv.get_vector(skip_gram.wv.key_to_index['video'])

In [133]:
# 유사도 계산하기 from scratch
import numpy as np
from numpy.linalg import norm

def cosine_similarity(vector_a, vector_b):
    """Calculate the cosine similarity between two vectors."""
    dot_product = np.dot(vector_a, vector_b)
    norm_a = norm(vector_a)
    norm_b = norm(vector_b)
    similarity = dot_product / (norm_a * norm_b)
    return similarity


In [134]:
cosine_similarity(homer_vector, video_vector)

0.14035673

### Simpsons dataset을 활용한 Word2Vec

![](https://images.edrawmax.com/what-is/simpsons-family-tree/example.png) <br>
출처 : https://images.edrawmax.com/what-is/simpsons-family-tree/example.png

In [3]:
import re
import pandas as pd
from time import time

import spacy

In [4]:
df = pd.read_csv('simpsons_dataset.csv')
df.shape

(158314, 2)

In [5]:
df.head()

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [7]:
df.isnull().sum()

raw_character_text    17814
spoken_words          26459
dtype: int64

In [33]:
df.loc[0, 'spoken_words']

"No, actually, it was a little of both. Sometimes when a disease is in all the magazines and all the news shows, it's only natural that you think you have it."

#### 데이터 전처리

In [36]:
# We are lemmatizing and removing the stopwords and non-alphabetic characters for each line of dialogue.

nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

def cleaning(doc):
    # Lemmatizes and remove stopwords
    txt = [token.lemma_ for token in doc if not token.is_stop]
    if len(txt) > 2:
        return ' '.join(txt)

In [37]:
# only keep alphabets
cleaner = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['spoken_words'])

In [42]:
t = time()
txt = [cleaning(doc) for doc in nlp.pipe(cleaner, batch_size=5000)]

In [39]:
txt[0]

'actually little disease magazine news show natural think'

In [40]:
# dataframe에 넣어서 null이 있는 대화는 삭제
# 주로 null은 특정 행동을 했지만 대화가 없었을 때임

df_clean = pd.DataFrame({'clean': txt})
df_clean = df_clean.dropna().drop_duplicates()
df_clean.shape

(85956, 1)

In [80]:
# 하나의 문장을 여러 단위의 단어로 분할
sentences = [s.split(' ') for s in df_clean['clean']]

In [81]:
len(sentences)

85956

#### Word2Vec 모델 훈련

In [63]:
from gensim.models import Word2Vec

In [64]:
# help(Word2Vec)

- `window` : 문장 내에서 현재 단어와 예측 단어 사이의 최대 거리. ex) 타겟 단어의 왼쪽과 오른쪽 n번째 단어
- `vector_size` : 단어 벡터의 차원 수
- `min_count` : 이 값보다 총 절대 빈도수가 낮은 모든 단어를 무시함 - (2, 100)
- `sg` : 1은 skip-gram, 0은 CBOW method를 사용

In [89]:
# 모델 정의 하기
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     vector_size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007)

In [90]:
# 문장에 들어있는 각 단어들을 Word2Vec 모델이 인식할 수 있는 형태로 변환
w2v_model.build_vocab(sentences)

In [96]:
# 모델 훈련
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=100)

(19987756, 54001900)

In [68]:
# help(w2v_model.train)

### 단어간 유사도 확인하기

In [107]:
# dir(w2v_model.wv)

- most_similar : 주어진 조건에 가장 적합한 단어 탐색
- similarity : 주어진 단어들의 유사도 계산
- doesnt_match : 주어진 단어들 중 가장 '덜 유사한' 단어

In [104]:
# help(w2v_model.wv.most_similar)

In [105]:
# help(w2v_model.wv.similarity)

In [98]:
w2v_model.wv.most_similar(positive=["homer"])

[('marge', 0.42559731006622314),
 ('simpson', 0.36902758479118347),
 ('bart', 0.32775843143463135),
 ('mr', 0.28152111172676086),
 ('lisa', 0.2735844552516937),
 ('wife', 0.2713277339935303),
 ('people', 0.2501447796821594),
 ('son', 0.21831999719142914),
 ('moe', 0.2168947011232376),
 ('family', 0.20857103168964386)]

In [99]:
w2v_model.wv.most_similar(positive=["bart"])

[('lisa', 0.4613502621650696),
 ('dad', 0.35596969723701477),
 ('boy', 0.3391043245792389),
 ('homer', 0.3277583718299866),
 ('child', 0.30410629510879517),
 ('mom', 0.302735298871994),
 ('milhouse', 0.29639771580696106),
 ('parent', 0.29103147983551025),
 ('mother', 0.28199145197868347),
 ('kid', 0.2726426422595978)]

In [100]:
w2v_model.wv.most_similar(positive=["woman", "homer"], negative=["marge"], topn=3)

[('man', 0.2712000608444214),
 ('guide', 0.2056269347667694),
 ('modern', 0.19630517065525055)]

In [101]:
w2v_model.wv.most_similar(positive=["woman", "bart"], negative=["man"], topn=3)

[('lisa', 0.25555509328842163),
 ('mom', 0.23903565108776093),
 ('embarrassing', 0.22747930884361267)]

In [108]:
w2v_model.wv.doesnt_match(['bart', 'homer', 'marge'])

'bart'

In [109]:
w2v_model.wv.doesnt_match(['bart', 'lisa', 'marge'])

'marge'

### 단어 임베딩의 한계점

In [138]:
bank_vector = w2v_model.wv.get_vector(w2v_model.wv.key_to_index['bank'])

- 단어에 불과하기 때문에 context를 고려하지 못 한다

In [140]:
bank_vector

array([ 0.48501942,  0.43569145, -0.29942   , -0.22898197, -0.9711868 ,
       -0.32046235,  0.5643262 , -0.01186122, -0.5335374 ,  0.27579722,
       -0.18607752,  0.9545646 ,  0.20190725, -0.34718508,  0.6243724 ,
       -0.06537967,  0.12549879,  0.75643426,  0.8152887 ,  0.7127831 ,
       -0.7926212 ,  1.0434473 , -1.4697722 , -0.36695647,  1.1339116 ,
        0.49978697,  0.92579854, -1.8430809 ,  0.05365827, -0.81561154,
       -0.505158  ,  0.03750321,  0.17283091, -0.21995084,  0.6998929 ,
        0.24177593,  0.98734653,  0.1569476 , -0.4521674 , -0.7442057 ,
       -0.13561198,  0.22614641, -0.77001715, -0.88620585,  0.17705469,
       -0.28627655,  0.43960652, -0.29148743, -0.38570356, -1.1539671 ,
       -0.7714859 ,  0.04027198, -0.32321674, -0.22973727,  0.29156774,
        0.6530905 ,  1.428275  ,  0.83454007,  0.38090903,  0.18053168,
        0.35904467,  0.19431151,  0.3605693 ,  0.19162191,  0.2537165 ,
       -1.6577637 , -0.45984957,  0.29928204, -0.70044196, -0.04

### sentence embeddings

In [141]:
from transformers import BertTokenizer, BertModel
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [142]:
# pre-trained model tokenizer와 and bert model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') # smaller & uncased model
model = BertModel.from_pretrained('bert-base-uncased')

tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 3.70MB/s]


In [None]:
# bank가 들어간 유사한 문장 두 개
sentence1 = "I deposited money at the bank."
sentence2 = "The ducks swam to the river bank."

In [143]:
# 문장을 BERT가 인식할 수 있는 형태로 Tokenize
encoded_input1 = tokenizer(sentence1, return_tensors='pt')
encoded_input2 = tokenizer(sentence2, return_tensors='pt')

In [148]:
encoded_input1

{'input_ids': tensor([[  101,  1045, 14140,  2769,  2012,  1996,  2924,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

- `input_ids` : 각 단어별로 매핑된 key. 101은 문장의 시작을, 102는 문장의 끝을 의미
- `token_type_ids` : 문장 번호
- `attention_mask` : attention을 가져야 하는 단어는 1, 그렇지 않은 단어는 0. (만약 input이 실제 단어들이라면 1)

In [147]:
# embedding 생성!
with torch.no_grad():
    output1 = model(**encoded_input1)
    output2 = model(**encoded_input2)

In [144]:
# embedding 내에서 bank라는 단어 찾아오기 (문장의 5번째에 있는 단어)
bank_embedding_sentence1 = output1.last_hidden_state[0, 5, :]
bank_embedding_sentence2 = output2.last_hidden_state[0, 5, :]

In [146]:
# cosine similarity 계산을 통해 얼마나 유사한지 검증

similarity = torch.nn.functional.cosine_similarity(bank_embedding_sentence1, bank_embedding_sentence2, dim=0)

# print("Embedding for 'bank' in sentence 1:", bank_embedding_sentence1)
# print("Embedding for 'bank' in sentence 2:", bank_embedding_sentence2)
print("Cosine similarity between the two embeddings:", similarity)

Cosine similarity between the two embeddings: tensor(0.5922)
