# 2023-2 언어데이터과학 24강 (2023-11-29) 실습 (3) Word2Vec을 사용한 연령별 단어 분포 비교

## 코퍼스 준비

In [1]:
import pandas as pd
from gensim.models import Word2Vec

In [2]:
# Location of the utterance data, relative to the notebook's working directory.
# Judging by the extension, this is a gzip-compressed tar archive wrapping a CSV.
DATA_PATH = '../data/NIKL_OM_form_age_sex.csv.tar.gz'

In [3]:
# Load the utterance table.
# The data file is a gzip-compressed *tar* archive, so it is opened with
# compression='tar' (pandas >= 1.5; the gzip layer is detected automatically).
# Decompressing it as plain gzip would leave the tar member header glued to
# the front of the CSV header row and tar padding after the last row.
# NOTE: pandas requires the archive to contain exactly one member.
utterances = pd.read_csv(DATA_PATH, compression='tar', on_bad_lines='skip')
# Discard rows in which any field is missing.
utterances.dropna(inplace=True)
# The first column carries the utterance identifiers: name it 'id' and use it
# as the index.
utterances.rename(columns={utterances.columns[0]: 'id'}, inplace=True)
utterances.set_index('id', inplace=True)
utterances

Unnamed: 0_level_0,form,speaker_id,age,sex
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MDRW2100000001.1.1,안녕하세요,MDRW2100000001_1,20대,여성
MDRW2100000001.1.4,이거 해봐요><,MDRW2100000001_1,20대,여성
MDRW2100000001.1.7,오 ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ,MDRW2100000001_1,20대,여성
MDRW2100000001.1.8,안챙겨도 잘커요,MDRW2100000001_1,20대,여성
MDRW2100000001.1.9,너무 맞는데요ㅜㅜ?,MDRW2100000001_1,20대,여성
...,...,...,...,...
MMRW2100000241.1.2774,그 낚시대회 전용 투망 있을걸???,MMRW2100000241_2,20대,여성
MMRW2100000241.1.2775,한 번도 안 써봄...?,MMRW2100000241_2,20대,여성
MMRW2100000241.1.2776,그거 개꿀인디,MMRW2100000241_2,20대,여성
MMRW2100000241.1.2780,ㅋㅋㅋㅋㅋㅋ잠수복 개귀여웤ㅋㅋㅋㅋ,MMRW2100000241_2,20대,여성


## 코퍼스 가공

In [4]:
# Tokenize every utterance by splitting its text on whitespace; each element
# of the resulting Series is a list of tokens.
corpus = utterances['form'].map(str.split)
# Preview the first five tokenized utterances.
print(corpus.iloc[:5])

id
MDRW2100000001.1.1            [안녕하세요]
MDRW2100000001.1.4        [이거, 해봐요><]
MDRW2100000001.1.7    [오, ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ]
MDRW2100000001.1.8        [안챙겨도, 잘커요]
MDRW2100000001.1.9      [너무, 맞는데요ㅜㅜ?]
Name: form, dtype: object


## 코퍼스 분할

발화를 발화자의 연령에 따라 분할하기

In [8]:
# Count how many utterances fall under each speaker age label.
utterances['age'].value_counts()

age
20대       1593040
30대       1024097
40대 이상     251095
10대        109608
Name: count, dtype: int64

In [36]:
# Build one whitespace-tokenized corpus per age label by masking on the 'age'
# column. Only the three labels below are selected.
corpus20 = utterances.loc[utterances['age'] == '20대', 'form'].map(str.split)
corpus30 = utterances.loc[utterances['age'] == '30대', 'form'].map(str.split)
corpus40 = utterances.loc[utterances['age'] == '40대 이상', 'form'].map(str.split)
# Display the last corpus as a sanity check.
corpus40

id
MDRW2100000021.1.2                  [왜?]
MDRW2100000021.1.4                   [왜]
MDRW2100000021.1.5               [티브시청중]
MDRW2100000021.1.7                   [왜]
MDRW2100000021.1.8                  [허준]
                              ...       
MMRW2100000225.1.6326         [name30이?]
MMRW2100000225.1.6349        [현금으로, 나와?]
MMRW2100000225.1.6351       [날씨, 진짜, 덥네]
MMRW2100000225.1.6354                [응]
MMRW2100000225.1.6355    [오는동안이, 너무, 더워]
Name: form, Length: 251095, dtype: object

## 코퍼스별 Word2Vec 모델 훈련

In [31]:
# Hyperparameters passed to every Word2Vec training call below.
d = 100 # dimensionality of the word vectors (used as vector_size)
L = 2 # context window size (used as window)
k = 5 # negative samples drawn per positive sample (used as negative)

In [37]:
# Train the model for the 20s corpus: skip-gram (sg=1) with negative sampling,
# ignoring tokens that occur fewer than 5 times, then persist it to disk.
model20 = Word2Vec(
    sentences=corpus20,
    vector_size=d,
    window=L,
    negative=k,
    min_count=5,
    sg=1,
)
model20.save('../models/word2vec-modu-online-age20s')
# To reuse the saved model instead of retraining, run this line instead:
# model20 = Word2Vec.load('../models/word2vec-modu-online-age20s')

In [5]:
# Train the model for the 30s corpus: skip-gram (sg=1) with negative sampling,
# ignoring tokens that occur fewer than 5 times, then persist it to disk.
model30 = Word2Vec(
    sentences=corpus30,
    vector_size=d,
    window=L,
    negative=k,
    min_count=5,
    sg=1,
)
model30.save('../models/word2vec-modu-online-age30s')
# To reuse the saved model instead of retraining, run this line instead:
# model30 = Word2Vec.load('../models/word2vec-modu-online-age30s')

In [6]:
# Train the model for the 40-and-over corpus: skip-gram (sg=1) with negative
# sampling, ignoring tokens that occur fewer than 5 times, then persist it.
model40 = Word2Vec(
    sentences=corpus40,
    vector_size=d,
    window=L,
    negative=k,
    min_count=5,
    sg=1,
)
model40.save('../models/word2vec-modu-online-age40s')
# To reuse the saved model instead of retraining, run this line instead:
# model40 = Word2Vec.load('../models/word2vec-modu-online-age40s')

## '아주'와 '완전'의 연령별 분포 비교

In [38]:
# The two target words whose nearest neighbours are compared across the
# age-group models.
w1 = '아주'
w2 = '완전'

In [39]:
# Nearest neighbours of w1 in the 20s model as (word, similarity) pairs;
# gensim returns the top 10 when topn is not given.
model20.wv.most_similar([w1])

[('매우', 0.7886679172515869),
 ('굉장히', 0.7853104472160339),
 ('편하고', 0.7430997490882874),
 ('훨', 0.7225634455680847),
 ('시원하고', 0.7224810123443604),
 ('넘나', 0.7207802534103394),
 ('너무너무', 0.7156858444213867),
 ('나름', 0.7131124138832092),
 ('좋고', 0.710421621799469),
 ('보기', 0.708667516708374)]

In [40]:
def get_most_similar_words(model, word, topn=30):
    """Return only the words of the nearest neighbours of *word*.

    Queries ``model.wv.most_similar`` with *word* as the single positive
    example and discards the similarity scores, keeping the result order.

    Args:
        model: Object exposing ``wv.most_similar`` (e.g. a trained Word2Vec).
        word: Query word.
        topn: Number of neighbours to request.

    Returns:
        List of neighbour words in the order returned by ``most_similar``.
    """
    neighbours = model.wv.most_similar([word], topn=topn)
    return [token for token, _score in neighbours]

In [41]:
# Top-30 nearest neighbours of w1 ('아주') in each age-group model.
# All three models must exist in the current kernel session (train or load
# them first); otherwise the call on the missing model raises NameError.
print(get_most_similar_words(model20, w1))
print(get_most_similar_words(model30, w1))
print(get_most_similar_words(model40, w1))

['매우', '굉장히', '편하고', '훨', '시원하고', '넘나', '너무너무', '나름', '좋고', '보기', '되게', '훨씬', '무척', '참', '워낙', '상당히', '완전', '은근', '너뮤', '가격도', '보여서', '시원해서', '여러모로', '분위기가', '살짝', '선선하니', '귀엽고', '의외로', '특유의', '예쁘고']


NameError: name 'model30' is not defined

In [11]:
# Top-30 nearest neighbours of w2 ('완전') in each age-group model.
# All three models must exist in the current kernel session (train or load
# them first); otherwise the call on the missing model raises NameError.
print(get_most_similar_words(model20, w2))
print(get_most_similar_words(model30, w2))
print(get_most_similar_words(model40, w2))

['짱', '넘나', '너무너무', '겁나', '되게', '대박', '아주', '넘', '디게', '왕', '연기', '들어도', '너뮤', '생각만해도', '굉장히', '진쨔', '졸라', '은근', 'ㅈㄴ', '왤캐', '!!!', '겁내', '진짜..', '!!!!', '목소리', '귀엽고', '최고', '으악', '진짜진짜', '매우']
['대박', '너무너무', '넘나', '짱', '되게', '겁나', '은근', 'ㅠㅠㅠㅠㅠ', '우왕', '크', '참', '넘', '맛있고', '매우', '무지', '최고', '진짜', '세상', '와우', '생각만해도', '정말', '굉장히', '좋다', 'ㅜㅜㅜ', '!!', 'ㅋㅋㅋㅋ아', '아주', '맛있겠다', '둘다', '증말']
['그쵸', 'ㅋㅋㅋㅋㅋ', '대박', '저런', '역시', '그렇죠', '헐', '그러게요', '저도요', '좋네요', 'ㅎㅎㅎ', '그것도', '헉', '앗', '웃겨', '아하', '웅', '그런가요', 'ㅋㅋㅋㅋㅋㅋ', '벌써', '그건', 'ㅎㅎㅎㅎ', '다들', '그래요?', '좋네', '그래요', '그렇구나', '좋죠', '그러시군요', '우와']


In [21]:
# Neighbours of the two target words taken together as positive examples
# (same literals as w1 and w2), queried against the 20s model.
model20.wv.most_similar(positive=['아주', '완전'])

[('짱', 0.8095357418060303),
 ('넘나', 0.8067042231559753),
 ('너무너무', 0.7877914905548096),
 ('매우', 0.7732204794883728),
 ('굉장히', 0.7706792950630188),
 ('되게', 0.7693825960159302),
 ('겁나', 0.7388333678245544),
 ('너뮤', 0.7318142056465149),
 ('무척', 0.7240220904350281),
 ('넘', 0.7208937406539917)]

In [22]:
# Neighbours of the two target words taken together as positive examples
# (same literals as w1 and w2), queried against the 30s model.
model30.wv.most_similar(positive=['아주', '완전'])

[('넘나', 0.879448413848877),
 ('너무너무', 0.8639437556266785),
 ('매우', 0.8635697364807129),
 ('되게', 0.8561486005783081),
 ('은근', 0.8422956466674805),
 ('참', 0.8250255584716797),
 ('굉장히', 0.8187215924263),
 ('생각보다', 0.8132363557815552),
 ('겁나', 0.8108944892883301),
 ('무지', 0.807982325553894)]

In [23]:
# Neighbours of the two target words taken together as positive examples
# (same literals as w1 and w2), queried against the 40-and-over model.
model40.wv.most_similar(positive=['아주', '완전'])

[('역시', 0.9787756204605103),
 ('그쵸', 0.9763455986976624),
 ('ㅎㅎㅎㅎ', 0.9728397727012634),
 ('그러게요', 0.9717161655426025),
 ('그것도', 0.9711495041847229),
 ('오호', 0.9670016765594482),
 ('저도요', 0.9667932987213135),
 ('그런가요', 0.9662917256355286),
 ('ㅋㅋㅋㅋㅋㅋ', 0.9652441740036011),
 ('좋네', 0.9650589227676392)]

In [39]:
# Top-30 neighbours of a third word, '너무', in the 40-and-over model, for
# comparison with the results for the two target words.
print(get_most_similar_words(model40, '너무'))

['엄청', '넘', '참', '정말', '진짜', '젤', '날씨가', '요즘은', '요새', '가을', '생각보다', '완전', 'ㅠ', '반갑습니다^^', '다들', '요즘', '날씨', '그나마', '헐', '가을이', '겁나', '와', '특히', '좋아서', '제일', '아주', '아무래도', '그런가', '그러게요', '눈이']
