<a href="https://colab.research.google.com/github/daraha76/Gensim_practice/blob/main/gensim_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import gensim.models
from gensim.models import FastText
from gensim.models import KeyedVectors

# FastText 학습하기

In [None]:
# Load the example sentences bundled with gensim.
# `common_texts` is a list of tokenized sentences (a list of lists of strings).

from gensim.test.utils import common_texts
print(*common_texts, sep="\n")

['human', 'interface', 'computer']
['survey', 'user', 'computer', 'system', 'response', 'time']
['eps', 'user', 'interface', 'system']
['system', 'human', 'system', 'eps']
['user', 'response', 'time']
['trees']
['graph', 'trees']
['graph', 'minors', 'trees']
['graph', 'minors', 'survey']


In [None]:
# Define a new (untrained) FastText model.
#   size:      size of each embedded vector
#   min_count: words that occur fewer than min_count times are ignored

model = FastText(size=10, min_count=1)


In [None]:
# Build the model's vocabulary from the example corpus.

model.build_vocab(sentences=common_texts)

In [None]:
# First line printed: number of sentences; second line: total number of words.
print(model.corpus_count, model.corpus_total_words, sep="\n")

9
29


In [None]:
# Train the model on the example corpus.
#   sentences:      list of documents
#   total_examples: length of the `sentences` list

model.train(
    total_examples=len(common_texts),
    sentences=common_texts,
    epochs=20,
)

   

# 학습된 word vector 를 사용하여 계산하기

In [None]:
# model.wv is the trained KeyedVectors object:
# [number of words] x [size of each vector].

# Size of the vocabulary held by the KeyedVectors.
vocab_words = model.wv.index2word
wv_len = len(vocab_words)
print("size of vocabulary:\t%i" %(wv_len))

# Every word stored in the KeyedVectors, with its index.
for position, token in enumerate(vocab_words):
  print("%ith word of %i is\t%s" %(position, wv_len, token))

size of vocabulary:	12
0th word of 12 is	system
1th word of 12 is	user
2th word of 12 is	trees
3th word of 12 is	graph
4th word of 12 is	human
5th word of 12 is	interface
6th word of 12 is	computer
7th word of 12 is	survey
8th word of 12 is	response
9th word of 12 is	time
10th word of 12 is	eps
11th word of 12 is	minors


In [None]:
# Words to look up: 'system' appears in the training corpus, 'systems' does not.
query_words = ('system', 'systems')

# Is each word present in the trained vocabulary?
for token in query_words:
  print(token in model.wv.index2word)

# Vector stored/produced for each word. The recorded output shows a vector
# for 'systems' as well, even though it is not in the vocabulary
# (presumably built from subword n-grams -- confirm in the gensim docs).
for token in query_words:
  print(model.wv[token])


True
False
[-0.01673033 -0.00173604 -0.01406091 -0.00444463 -0.0120219   0.00897863
 -0.0078133   0.02575558  0.01035542  0.00251172]
[-0.01627489 -0.01034656 -0.00327163 -0.00874158 -0.01601067  0.02255329
 -0.01308308  0.02472889  0.01730176 -0.00352729]


In [None]:
def print_similarity_ranking(ranking):
  """Print the best match followed by every (word, score) pair of a ranking.

  ranking: non-empty list of (word, score) tuples as returned by
           `most_similar`, ordered from most to least similar.
  """
  print("most similar word is [%s]\n" %(ranking[0][0]))

  print("similar words in descending order of similarity")
  for word, score in ranking:
    print("%s: %.8f" %(word, score))
  print('\n')


# Request as many neighbours as there are vocabulary words so the whole
# ranking is shown (most_similar returns at most `topn` words, default 10).
# Computed here rather than read from another cell's variable, so this cell
# gives the same result regardless of what ran before it.
vocab_size = len(model.wv.index2word)

# Words most similar to 'human'.
similarity = model.wv.most_similar(positive=['human'], topn=vocab_size)
print_similarity_ranking(similarity)


# Query using vector arithmetic:
# 'human' + 'computer' - 'user' = ?
similarity = model.wv.most_similar(positive=['human', 'computer'], negative=['user'], topn=vocab_size)
print_similarity_ranking(similarity)


# Similarity between two words.
print(model.wv.similarity("human", "humans"))
print(model.wv.similarity("human", "system"))
print(model.wv.similarity("human", "interface"))

most similar word is [system]

similar words in descneding order of similarity
system: 0.68845606
eps: 0.39895320
user: 0.19184408
response: 0.17416205
computer: -0.01064765
graph: -0.03586841
trees: -0.07688204
time: -0.11120591
survey: -0.16889901
interface: -0.29637104
minors: -0.45915908


most similar word is [response]

similar words in descneding order of similarity
response: 0.49505752
interface: 0.22022764
system: 0.19848153
minors: -0.05950591
graph: -0.22420032
trees: -0.23892218
time: -0.28261757
eps: -0.32439154
survey: -0.34125909


0.9048916
0.68845606
-0.296371


# model, wv 저장 및 불러오기

In [None]:
# Save the whole model:        model.save(filepath)
# Save only the word vectors:  model.wv.save(filepath)

import tempfile

# delete=False keeps the file on disk after the `with` block ends,
# so the model can be loaded back from the same path below.
with tempfile.NamedTemporaryFile(delete=False, prefix="gensim_fasttext_model-") as handle:
  temporary_filepath = handle.name
  model.save(temporary_filepath)

print("saved at %s" % temporary_filepath)

# Load the model back: FastText.load(filepath)

new_model = FastText.load(temporary_filepath)

saved at /tmp/gensim_fasttext_model-h2qa2pu0


In [None]:
# List the contents of /tmp before the cleanup.
!ls /tmp
print('\n')
# Remove every file in /tmp whose name starts with "gensim_fasttext_model-"
# (the prefix used when the model was saved to a temporary file).
!rm /tmp/gensim_fasttext_model-*
print('\n')
# List /tmp again to confirm the files are gone.
!ls /tmp

dap_multiplexer.b6554d95a371.root.log.INFO.20210411-153444.53
dap_multiplexer.INFO
debugger_21egpoqpsl
gensim_fasttext_model-h2qa2pu0
initgoogle_syslog_dir.0




dap_multiplexer.b6554d95a371.root.log.INFO.20210411-153444.53
dap_multiplexer.INFO
debugger_21egpoqpsl
initgoogle_syslog_dir.0


In [None]:
# Continue training an already-trained model (new_model).

# New documents to add to new_model.
more_texts = [
    ['computer', 'aided', 'design'],
    ['computer', 'science'],
    ['computational', 'complexity'],
    ['military', 'supercomputer'],
    ['central', 'processing', 'unit'],
    ['onboard', 'car', 'computer'],
]

# Add the new words to new_model's vocabulary.
# update=True is required when adding words to an already-trained model.
new_model.build_vocab(sentences=more_texts, update=True)

wv_len = len(new_model.wv.index2word)
print("size of updated vocabulary: %i" %(wv_len))
for index, word in enumerate(new_model.wv.index2word):
  # The original format string read "i% is", which Python parsed as a
  # space-flagged "% i" conversion and printed e.g. "0th word of i 24s system".
  print("%ith word of %i is %s" %(index, wv_len, word))
print('\n')


# Train on the new sentences.
new_model.train(sentences=more_texts, total_examples=len(more_texts), epochs=10)

# Element-wise comparison of the 'system' vector in the old and the updated model.
print(model.wv['system'] == new_model.wv['system'])
print('car' in model.wv.index2word)
print('car' in new_model.wv.index2word)   # 'car' was added to the updated model's vocabulary

size of updated vocabulary: 24
0th word of i 24s system
1th word of i 24s user
2th word of i 24s trees
3th word of i 24s graph
4th word of i 24s human
5th word of i 24s interface
6th word of i 24s computer
7th word of i 24s survey
8th word of i 24s response
9th word of i 24s time
10th word of i 24s eps
11th word of i 24s minors
12th word of i 24s aided
13th word of i 24s design
14th word of i 24s science
15th word of i 24s computational
16th word of i 24s complexity
17th word of i 24s military
18th word of i 24s supercomputer
19th word of i 24s central
20th word of i 24s processing
21th word of i 24s unit
22th word of i 24s onboard
23th word of i 24s car


[ True  True  True  True  True  True  True  True  True  True]
False
True


# import한 corpus class를 이용하여 학습하기

In [None]:
import inspect
from itertools import islice

import gensim.downloader as api

# Load the 'text8' corpus through the gensim downloader.
corpus_text8 = api.load('text8')

# Print the definition of the class that wraps the 'text8' corpus.
print(inspect.getsource(corpus_text8.__class__))

# Preview the first 4 documents of the corpus. Plain iteration (via islice)
# replaces the explicit __iter__() call and manual counter; the stray
# `corpus_text8.__init__` expression was a no-op and has been removed.
for line in islice(corpus_text8, 4):
  print(line)

# Create a model from the 'text8' corpus and train it.
model_text8 = FastText(size=50, min_count=10, sg=1)
model_text8.build_vocab(sentences=corpus_text8)
model_text8.train(sentences=corpus_text8, epochs=2, total_examples=model_text8.corpus_count)



class Dataset(object):
    def __init__(self, fn):
        self.fn = fn

    def __iter__(self):
        corpus = Text8Corpus(self.fn)
        for doc in corpus:
            yield doc

['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against', 'early', 'working', 'class', 'radicals', 'including', 'the', 'diggers', 'of', 'the', 'english', 'revolution', 'and', 'the', 'sans', 'culottes', 'of', 'the', 'french', 'revolution', 'whilst', 'the', 'term', 'is', 'still', 'used', 'in', 'a', 'pejorative', 'way', 'to', 'describe', 'any', 'act', 'that', 'used', 'violent', 'means', 'to', 'destroy', 'the', 'organization', 'of', 'society', 'it', 'has', 'also', 'been', 'taken', 'up', 'as', 'a', 'positive', 'label', 'by', 'self', 'defined', 'anarchists', 'the', 'word', 'anarchism', 'is', 'derived', 'from', 'the', 'greek', 'without', 'archons', 'ruler', 'chief', 'king', 'anarchism', 'as', 'a', 'political', 'philosophy', 'is', 'the', 'belief', 'that', 'rulers', 'are', 'unneces

- 학습된 모델로 유사도 계산

In [None]:
# Each query is (positive words, negative words); None means no negative words.
analogy_queries = [
    (['theater'], None),
    (['theater'], ['actor']),
    (['actor', 'female'], ['male']),
    (['actor', 'anarchism'], ['theater']),
]

# Run the queries in order, printing each ranking as soon as it is computed.
similarity_results = []
for positive_words, negative_words in analogy_queries:
  ranking = model_text8.wv.most_similar(positive=positive_words, negative=negative_words)
  similarity_results.append(ranking)
  for word_score in ranking:
    print(word_score)
  print('\n')

# Keep the individual results available under their original names.
similarity_0, similarity_1, similarity_2, similarity_3 = similarity_results


('theatre', 0.912081778049469)
('theaters', 0.9039516448974609)
('theatres', 0.8766969442367554)
('studios', 0.8491259813308716)
('bollywood', 0.8413671255111694)
('filmfare', 0.8338989019393921)
('hollywood', 0.8268975019454956)
('entertainment', 0.8229313492774963)
('amphitheatre', 0.817478358745575)
('broadway', 0.816353440284729)


('buildings', 0.5261033177375793)
('building', 0.5084084868431091)
('rebuilding', 0.48145592212677)
('warehouses', 0.4671057462692261)
('underground', 0.4568570554256439)
('builds', 0.45230287313461304)
('venue', 0.4513894319534302)
('theaters', 0.44461295008659363)
('skyscrapers', 0.4439327120780945)
('studios', 0.4428037405014038)


('actress', 0.9452847242355347)
('actresses', 0.8837642669677734)
('screenwriter', 0.8780832886695862)
('songwriter', 0.8712148070335388)
('comedian', 0.8694100379943848)
('singer', 0.86919766664505)
('musician', 0.8627228736877441)
('razzie', 0.8526320457458496)
('comedienne', 0.8524459600448608)
('choreographer', 0.846301

- Gdrive에 모델, wv 저장 및 불러오기

In [None]:
# mount gdrive
from google.colab import drive
drive.mount('/content/gdrive')


save_path_dir = 'gdrive/MyDrive/Colab Notebooks/Gensim_practice/'
save_path_model = save_path_dir + 'model_text8_save'
save_path_wv = save_path_dir + 'wv_text8_save'
model_text8.save(save_path_model) # gdrive에 모델 저장
model_text8.wv.save(save_path_wv) # gdrive에 wv 저장

Mounted at /content/gdrive


In [None]:
# Load the full model saved on Google Drive and query it.
model_text8_loaded = FastText.load(save_path_model)

similarity = model_text8_loaded.wv.most_similar(positive=['theater'])
for word, score in similarity:
  print((word, score))
print('\n')

# Load only the word vectors saved on Google Drive and run the same query.
wv_text8_loaded = KeyedVectors.load(save_path_wv)
similarity = wv_text8_loaded.most_similar(positive=['theater'])
for word, score in similarity:
  print((word, score))
print('\n')

('theatre', 0.912081778049469)
('theaters', 0.9039516448974609)
('theatres', 0.8766969442367554)
('studios', 0.8491259813308716)
('bollywood', 0.8413671255111694)
('filmfare', 0.8338989019393921)
('hollywood', 0.8268975019454956)
('entertainment', 0.8229313492774963)
('amphitheatre', 0.817478358745575)
('broadway', 0.816353440284729)


('theatre', 0.912081778049469)
('theaters', 0.9039516448974609)
('theatres', 0.8766969442367554)
('studios', 0.8491259813308716)
('bollywood', 0.8413671255111694)
('filmfare', 0.8338989019393921)
('hollywood', 0.8268975019454956)
('entertainment', 0.8229313492774963)
('amphitheatre', 0.817478358745575)
('broadway', 0.816353440284729)


