In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import fasttext
from gensim.models import Word2Vec, FastText


# for specific usage of packages, refer to the links below

# FastText
# https://fasttext.cc/docs/en/supervised-tutorial.html
# https://fasttext.cc/docs/en/unsupervised-tutorial.html
# https://fasttext.cc/docs/en/cheatsheet.html

# gensim 
# https://radimrehurek.com/gensim/models/fasttext.html

In [16]:
# Directory that holds the tokenized corpus files (machine-specific absolute path).
direc = "/home/eppen/Documents/YBIGTA/WinterSchool/NLP/tokenized/"

# Corpus file that is split into tokens below.
mecab_direc = direc + "ratings_mecab.txt"

# Build the corpus as a list of token lists, one inner list per line of the file.
# The `with` block closes the file handle deterministically (the previous bare
# `open(...).readlines()` left it to the garbage collector).
# NOTE(review): the file is assumed to be UTF-8 encoded Korean text — confirm.
with open(mecab_direc, encoding="utf-8") as corpus_file:
    corpus = [sent.strip().split(" ") for sent in corpus_file]

In [17]:
# Text Classification

# Training file used by both classifiers.
cooking_train_path = "/home/eppen/fastText/cooking.train"

# Arguments the two runs have in common.
shared_train_args = dict(input=cooking_train_path, lr=0.5, epoch=25, wordNgrams=2, dim=50, loss='hs')

# The two models differ only in the minn/maxn arguments.
model_without_char = fasttext.train_supervised(maxn=0, **shared_train_args)
model_with_char = fasttext.train_supervised(minn=2, maxn=5, **shared_train_args)

In [18]:
# Is the truncated token "creat" in the vocabulary of the model trained with maxn=0?
print("creat" in model_without_char.words)

# Nearest neighbours of that same token in that model.
model_without_char.get_nearest_neighbors("creat")

False


[(0.0, 'the'),
 (0.0, 'to'),
 (0.0, 'How'),
 (0.0, 'of'),
 (0.0, 'I'),
 (0.0, 'for'),
 (0.0, 'What'),
 (0.0, 'explosive?'),
 (0.0, 'savory?'),
 (0.0, 'formal')]

In [19]:
# Same membership check, this time on the model trained with minn=2, maxn=5.
print("creat" in model_with_char.words)

# Nearest neighbours of that same token in that model.
model_with_char.get_nearest_neighbors("creat")

False


[(0.818427324295044, 'creates'),
 (0.8138300180435181, 'create'),
 (0.7946866750717163, 'creating'),
 (0.7774243354797363, 'cream"?'),
 (0.7765219807624817, 'cream,'),
 (0.7743051648139954, 'cream"'),
 (0.7678096890449524, 'creamer-'),
 (0.7646734714508057, 'cream.'),
 (0.7602362036705017, 'creamy?'),
 (0.7580905556678772, 'creams')]

In [20]:
# Classify a misspelled question with the model trained with maxn=0.
model_without_char.predict("Howe creatss chokolate berger?")

(('__label__spicy-hot',), array([0.07255475]))

In [21]:
# Classify the same misspelled question with the model trained with minn=2, maxn=5.
model_with_char.predict("Howe creatss chokolate berger?")

(('__label__chocolate',), array([0.15656056]))

In [22]:
### traditional word2vec
# Trains a Word2Vec model on the token lists in `corpus`.
# NOTE(review): sg=1 presumably selects skip-gram, and `size` looks like the
# gensim 3.x keyword for the vector dimensionality (gensim 4 names it
# `vector_size`) — confirm against the installed gensim version.

w2v_model = Word2Vec(corpus, size=100, workers=4, window = 5, sg = 1)

In [23]:
# Show the vocabulary mapping (token -> Vocab entry).
# NOTE(review): this displays the whole mapping, so the output is long.
w2v_model.wv.vocab

{'어릴': <gensim.models.keyedvectors.Vocab at 0x7fd93266f5d0>,
 '때': <gensim.models.keyedvectors.Vocab at 0x7fd931c93310>,
 '보': <gensim.models.keyedvectors.Vocab at 0x7fd93287ef90>,
 '고': <gensim.models.keyedvectors.Vocab at 0x7fd93287ee90>,
 '지금': <gensim.models.keyedvectors.Vocab at 0x7fd93287ec90>,
 '다시': <gensim.models.keyedvectors.Vocab at 0x7fd931c985d0>,
 '봐도': <gensim.models.keyedvectors.Vocab at 0x7fd931c98610>,
 '재밌': <gensim.models.keyedvectors.Vocab at 0x7fd931c98110>,
 '어요': <gensim.models.keyedvectors.Vocab at 0x7fd93287e990>,
 'ㅋㅋ': <gensim.models.keyedvectors.Vocab at 0x7fd93287ea10>,
 '디자인': <gensim.models.keyedvectors.Vocab at 0x7fd931c98810>,
 '을': <gensim.models.keyedvectors.Vocab at 0x7fd931c987d0>,
 '배우': <gensim.models.keyedvectors.Vocab at 0x7fd931c98a10>,
 '는': <gensim.models.keyedvectors.Vocab at 0x7fd931c98a50>,
 '학생': <gensim.models.keyedvectors.Vocab at 0x7fd931c988d0>,
 '으로': <gensim.models.keyedvectors.Vocab at 0x7fd931c98890>,
 ',': <gensim.models.keyedve

In [24]:
# Embedding vector stored for the token "학생".
w2v_model.wv["학생"]

array([-1.22366361e-01, -4.22951996e-01, -1.40534148e-01, -1.39218107e-01,
        2.01533690e-01, -2.37655371e-01, -3.14339072e-01,  6.18804753e-01,
       -4.26740795e-01, -1.61584556e-01,  2.16992825e-01, -2.48778254e-01,
        7.58886337e-02,  1.09758355e-01, -1.91073984e-01,  1.19586669e-01,
       -2.37545922e-01, -2.36076251e-01, -6.29425824e-01,  9.77123454e-02,
        4.52619903e-02, -3.00871015e-01, -4.42951262e-01,  7.29622468e-02,
       -4.81326461e-01,  1.85424581e-01, -4.01591271e-01, -1.20665580e-01,
        2.93009222e-01, -2.59617895e-01,  1.89215943e-01, -2.64486462e-01,
       -2.76598930e-01,  4.23589617e-01, -9.42701623e-02, -3.68814260e-01,
        5.38994968e-02,  8.37390199e-02, -1.83879901e-02, -3.19145441e-01,
       -1.07608646e-01, -4.35492843e-01,  2.39408150e-01,  1.99872062e-01,
       -2.59281784e-01,  1.55252188e-01,  2.42085814e-01, -3.84904966e-02,
       -7.14441717e-01, -9.62827280e-02,  5.97994030e-01,  3.54680568e-01,
        1.89730749e-01, -

In [25]:
# Tokens most similar to "학생", paired with their similarity scores.
w2v_model.wv.most_similar("학생")

[('초등학생', 0.7217746376991272),
 ('고등학생', 0.7041956186294556),
 ('자녀', 0.6973063945770264),
 ('저학년', 0.6896257400512695),
 ('유치원생', 0.6845433712005615),
 ('대학', 0.6829220652580261),
 ('대학생', 0.6768278479576111),
 ('저희', 0.6766531467437744),
 ('위인', 0.673999547958374),
 ('학부모', 0.6730868816375732)]

In [26]:
# default: 100 dimensions
# default: character n-grams of length 3-6
# default: context window of 5 (effective window sampled from 1~5)

In [27]:
# Trained with fasttext

# fasttext.train_unsupervised(corpus)  --> crashed the computer

In [28]:
# Trained with gensim
# NOTE(review): `FastText` is the name imported from gensim.models at the top of
# the notebook, but a later cell re-imports `FastText` from gensim.models.wrappers.
# Re-running this cell after that one would instantiate a different class.

# Model created with library defaults only.
model_gs = FastText()

# Vocabulary is built first, then the model is trained on the same token lists.
model_gs.build_vocab(sentences = corpus)
model_gs.train(sentences = corpus, total_examples = len(corpus), epochs=10)

In [29]:
# Show the vocabulary mapping (token -> Vocab entry) of the gensim FastText model.
# NOTE(review): this displays the whole mapping, so the output is long.
model_gs.wv.vocab

{'어릴': <gensim.models.keyedvectors.Vocab at 0x7fd9325491d0>,
 '때': <gensim.models.keyedvectors.Vocab at 0x7fd931c93ed0>,
 '보': <gensim.models.keyedvectors.Vocab at 0x7fd9310cb090>,
 '고': <gensim.models.keyedvectors.Vocab at 0x7fd9310cb650>,
 '지금': <gensim.models.keyedvectors.Vocab at 0x7fd9310e6150>,
 '다시': <gensim.models.keyedvectors.Vocab at 0x7fd9310e6310>,
 '봐도': <gensim.models.keyedvectors.Vocab at 0x7fd9310e6350>,
 '재밌': <gensim.models.keyedvectors.Vocab at 0x7fd9310e6390>,
 '어요': <gensim.models.keyedvectors.Vocab at 0x7fd9310cb490>,
 'ㅋㅋ': <gensim.models.keyedvectors.Vocab at 0x7fd9310e62d0>,
 '디자인': <gensim.models.keyedvectors.Vocab at 0x7fd9310e63d0>,
 '을': <gensim.models.keyedvectors.Vocab at 0x7fd9310e6410>,
 '배우': <gensim.models.keyedvectors.Vocab at 0x7fd9310e6450>,
 '는': <gensim.models.keyedvectors.Vocab at 0x7fd9310e6490>,
 '학생': <gensim.models.keyedvectors.Vocab at 0x7fd9310e64d0>,
 '으로': <gensim.models.keyedvectors.Vocab at 0x7fd9310e6510>,
 ',': <gensim.models.keyedve

In [30]:
# Embedding vector for the token "학생" in the gensim FastText model.
model_gs.wv["학생"]

array([-0.52581   , -1.0934186 , -0.9574324 , -0.27513927,  0.20751745,
       -0.73592126, -0.3066942 , -0.03671782,  1.2113864 , -1.5966767 ,
       -0.04829606,  0.6636174 ,  0.71173656, -0.8952629 ,  0.5595318 ,
        0.517037  ,  0.2958585 ,  0.8424259 ,  0.01710206,  0.9094163 ,
       -1.7995136 , -0.14174937,  0.15538885,  0.23469862,  0.0334975 ,
        0.10562485, -0.2833562 , -1.2087464 , -0.37238446,  0.48001927,
        0.34592766,  0.3995207 , -2.4925432 ,  0.87367666,  0.32112178,
        1.0716906 ,  0.76836824, -0.91551554, -1.598063  , -0.18676566,
        0.39577305,  0.8671778 , -1.719949  , -1.6464224 ,  2.0012023 ,
        1.4972422 ,  1.1322541 ,  0.49116156,  0.14903441, -0.03475015,
        1.6114464 , -0.02337382, -0.67867506, -1.1208861 , -0.43157303,
        0.15924348,  1.295212  , -0.28538644, -0.9334858 , -1.1463976 ,
        2.2043018 , -0.22067876, -0.26891482, -0.6873666 ,  0.7413365 ,
        1.458967  ,  1.1029818 ,  0.7285539 ,  0.1808214 , -1.30

In [31]:
# Tokens most similar to "학생", paired with their similarity scores.
model_gs.wv.most_similar("학생")

[('여학생', 0.8855874538421631),
 ('국민학생', 0.8625161051750183),
 ('대학생', 0.8388940095901489),
 ('고등학생', 0.801305890083313),
 ('초등학생', 0.759042501449585),
 ('중학생', 0.7336475253105164),
 ('어린놈', 0.6862475872039795),
 ('서양인', 0.65251225233078),
 ('군인', 0.6464606523513794),
 ('어린애', 0.643879771232605)]

In [32]:
# Trained on bash, imported with fasttext

# Load the binary model file with the fasttext package.
model = fasttext.load_model('/home/eppen/fastText/model.bin')

# NOTE(review): bare expression in the middle of the cell; its value is not
# displayed or stored.
model.words
# NOTE(review): attribute reference without a call, so the cell shows the
# bound-method object rather than a vector. Probably leftover exploration.
model.get_word_vector



<bound method _FastText.get_word_vector of <fasttext.FastText._FastText object at 0x7fd931124110>>

In [33]:
# List the tokens in the loaded model's vocabulary.
# NOTE(review): this displays the whole list, so the output is long.
model.words

['.',
 '</s>',
 '이',
 '는',
 '영화',
 '다',
 '고',
 '하',
 '도',
 '의',
 '가',
 '은',
 '에',
 '을',
 '보',
 '한',
 '..',
 '게',
 ',',
 '들',
 '!',
 '지',
 '를',
 '있',
 '없',
 '?',
 '좋',
 '나',
 '1',
 '었',
 '만',
 '는데',
 '너무',
 '0',
 '봤',
 '안',
 '적',
 '정말',
 '로',
 '음',
 '것',
 '으로',
 '아',
 '재밌',
 '네요',
 '어',
 '점',
 '같',
 '진짜',
 '지만',
 '했',
 '에서',
 '기',
 '네',
 '않',
 '거',
 '았',
 '수',
 '되',
 '면',
 'ㅋㅋ',
 '과',
 '말',
 '연기',
 '인',
 '잘',
 '주',
 '최고',
 '평점',
 '내',
 '~',
 '이런',
 '던',
 '어요',
 '와',
 '2',
 '왜',
 '할',
 '해',
 '겠',
 '습니다',
 'ㅋㅋㅋ',
 '스토리',
 '...',
 '아니',
 '생각',
 '더',
 '드라마',
 '그',
 '싶',
 '사람',
 '듯',
 '때',
 '감동',
 '함',
 '배우',
 '까지',
 '본',
 '좀',
 '볼',
 '내용',
 '보다',
 '뭐',
 '만들',
 '알',
 '감독',
 '라',
 '재미',
 '그냥',
 '중',
 '지루',
 '시간',
 '재미있',
 '3',
 '년',
 '잼',
 '였',
 '재미없',
 '사랑',
 '냐',
 '못',
 '쓰레기',
 '서',
 '라고',
 '번',
 '야',
 '면서',
 '다시',
 '니',
 '나오',
 '작품',
 '이거',
 '하나',
 '해서',
 '줄',
 '개',
 '끝',
 '남',
 '정도',
 '이건',
 '마지막',
 '임',
 '액션',
 '기대',
 'ㅋ',
 '분',
 '라는',
 '입니다',
 '다는',
 '건',
 '완전',
 '많',
 '참',
 '아깝',
 '처음'

In [34]:
# Vector the loaded fasttext model returns for the token "학생".
model.get_word_vector("학생")

array([ 1.43882930e-02, -1.41155377e-01, -2.22145274e-01, -2.94394225e-01,
        2.20219165e-01,  1.47111475e-01,  7.76096508e-02,  8.75757486e-02,
        2.80630231e-01, -1.26182348e-01, -5.33749819e-01, -4.04617861e-02,
       -1.54051334e-01, -4.99048054e-01,  2.03076228e-01, -2.40003280e-02,
        8.25563550e-01, -1.77272752e-01, -1.55364469e-01,  1.07840836e-01,
        8.83810580e-01,  1.52548239e-01, -7.46772066e-02,  1.49235606e-01,
       -1.40668839e-01, -1.30642340e-01, -5.28521776e-01,  2.88921118e-01,
       -3.56071323e-01, -1.44427627e-01,  8.59987140e-02, -1.76830843e-01,
        2.00584650e-01, -3.76861133e-02, -2.57336259e-01, -1.99530393e-01,
        9.41755623e-02, -6.73193038e-02, -1.60313696e-01, -4.06207293e-02,
       -5.44040263e-01,  1.28239319e-02,  1.13327794e-01, -7.95557499e-02,
       -3.03320915e-01, -1.17209360e-01,  6.57771587e-01,  7.07331449e-02,
        7.47766018e-01, -1.77244574e-01, -4.67090487e-01, -2.22572953e-01,
       -7.54924774e-01,  

In [35]:
# Nearest neighbours of "학생" as (score, token) pairs.
model.get_nearest_neighbors("학생")

[(0.7290460467338562, '고등학생'),
 (0.728935956954956, '대학생'),
 (0.7154802083969116, '대학교'),
 (0.6990119218826294, '대학'),
 (0.6916933655738831, '졸업'),
 (0.6820202469825745, '학부모'),
 (0.6735225319862366, '고등'),
 (0.6733785271644592, '여학생'),
 (0.6693524122238159, '국민학생'),
 (0.6642491817474365, '다닐')]

In [36]:
# Trained on bash, imported with gensim

# Getting fasttext models in vec/bin format

# type the command below on bash
#./fasttext skipgram -input ratings_mecab.txt -output model

# NOTE(review): this import rebinds the name `FastText`, which the first cell
# imported from gensim.models. Every later use of `FastText` in the notebook
# refers to the wrappers class once this cell has run.
from gensim.models.wrappers import FastText

# Load the same model.bin file through the gensim wrapper.
ft_model = FastText.load_fasttext_format('/home/eppen/fastText/model.bin')

In [37]:
# Show the vocabulary mapping (token -> Vocab entry) of the wrapper-loaded model.
# NOTE(review): this displays the whole mapping, so the output is long.
ft_model.wv.vocab

{'.': <gensim.models.deprecated.keyedvectors.Vocab at 0x7fd93113f650>,
 '</s>': <gensim.models.deprecated.keyedvectors.Vocab at 0x7fd93113f750>,
 '이': <gensim.models.deprecated.keyedvectors.Vocab at 0x7fd93113f7d0>,
 '는': <gensim.models.deprecated.keyedvectors.Vocab at 0x7fd93113f850>,
 '영화': <gensim.models.deprecated.keyedvectors.Vocab at 0x7fd93113f8d0>,
 '다': <gensim.models.deprecated.keyedvectors.Vocab at 0x7fd93113f9d0>,
 '고': <gensim.models.deprecated.keyedvectors.Vocab at 0x7fd93113fa50>,
 '하': <gensim.models.deprecated.keyedvectors.Vocab at 0x7fd93113fad0>,
 '도': <gensim.models.deprecated.keyedvectors.Vocab at 0x7fd93113fb50>,
 '의': <gensim.models.deprecated.keyedvectors.Vocab at 0x7fd93113f950>,
 '가': <gensim.models.deprecated.keyedvectors.Vocab at 0x7fd93113fbd0>,
 '은': <gensim.models.deprecated.keyedvectors.Vocab at 0x7fd93113fc50>,
 '에': <gensim.models.deprecated.keyedvectors.Vocab at 0x7fd93113fcd0>,
 '을': <gensim.models.deprecated.keyedvectors.Vocab at 0x7fd93113fd50>,
 '

In [38]:
# Vector for "학생", indexed on the model object itself (other cells go through `.wv`).
ft_model["학생"]

array([-0.0330895 , -0.0270534 ,  0.00037072, -0.09784418, -0.00952482,
        0.00571445, -0.00171212,  0.05215607,  0.06713373, -0.00252355,
       -0.09293189, -0.01954213, -0.05718217, -0.09266279, -0.00115381,
        0.01568371,  0.10514381,  0.00435491, -0.04847056,  0.07961261,
        0.19566284, -0.00381837,  0.02261642, -0.03712045, -0.07944328,
       -0.02372825, -0.06165854,  0.06187534, -0.08997138, -0.05017286,
       -0.05437044, -0.0565555 ,  0.04570846, -0.02776191, -0.00513409,
        0.00552711,  0.04365916, -0.08508383, -0.01864137,  0.00766175,
       -0.12852769, -0.02061407,  0.04464441, -0.08951019, -0.0909242 ,
       -0.02846727,  0.01021706, -0.05265512,  0.1163597 , -0.04003516,
       -0.0213875 , -0.01149839, -0.08584761,  0.0685202 , -0.00140243,
       -0.04016709, -0.0203314 ,  0.12113051,  0.0255988 , -0.004652  ,
        0.03553133,  0.06672967,  0.03015299,  0.04488355, -0.06597161,
        0.01558247,  0.01330374,  0.01990565, -0.13588716, -0.07

In [39]:
# Tokens most similar to "학생", paired with their similarity scores.
ft_model.wv.most_similar("학생")

[('가해자', 0.5095288157463074),
 ('자녀', 0.4758945405483246),
 ('학부모', 0.4737958014011383),
 ('개신교', 0.4690389931201935),
 ('선생', 0.4661126136779785),
 ('반정부', 0.4637138545513153),
 ('배부른', 0.4583195745944977),
 ('욕구', 0.4576117992401123),
 ('주민', 0.45282793045043945),
 ('옥한흠', 0.44178345799446106)]

In [59]:
# Load the jamo-level model file through gensim.
# NOTE(review): `FastText` is whichever class was imported last (gensim.models in
# the first cell, gensim.models.wrappers in a later one) — confirm which is intended.
temp_model_gs = FastText.load_fasttext_format('/home/eppen/fastText/jamo_model.bin')

In [60]:
# Load the same jamo_model.bin file via `fasttext.load_model`.
temp_model_ft = fasttext.load_model('/home/eppen/fastText/jamo_model.bin')



In [66]:
# from https://github.com/ratsgo/embedding

from soynlp.hangle import decompose, compose, character_is_korean
import re

# Matches any run of whitespace. Written as a raw string because '\s' inside a
# plain string literal is an invalid escape sequence and triggers a warning.
doublespace_pattern = re.compile(r'\s+')

def jamo_sentence(sent):
    """Rewrite ``sent`` with every Korean character expanded by ``decompose``.

    Each character for which ``character_is_korean`` is true is replaced by the
    elements returned by ``decompose``, joined into one string, with any ``' '``
    element written as ``'-'``. All other characters are copied unchanged.
    Finally every run of whitespace is collapsed to a single space.

    Args:
        sent: the text to convert (iterated character by character).

    Returns:
        The converted string.
    """

    def transform(char):
        # Spaces pass through untouched.
        if char == ' ':
            return char
        cjj = decompose(char)
        # A single-element result is returned as-is, without placeholder substitution.
        if len(cjj) == 1:
            return cjj
        # '-' stands in for an empty slot so the output contains no stray spaces.
        return ''.join('-' if c == ' ' else c for c in cjj)

    pieces = [transform(char) if character_is_korean(char) else char for char in sent]
    return doublespace_pattern.sub(' ', ''.join(pieces))


def jamo_to_word(jamo):
    """Turn a jamo string (the format ``jamo_sentence`` emits) back into text.

    Characters for which ``character_is_korean`` is false are copied through.
    A Korean character starts a three-character chunk (initial, medial, final)
    that is handed to ``compose``; a ``'-'`` in the final slot is turned back
    into the ``' '`` placeholder first.

    Two kinds of input that previously raised are handled explicitly:

    * A chunk of the form ``X--`` is emitted as ``X`` instead of being passed to
      ``compose`` with ``'-'`` as the medial. Presumably this is what
      ``jamo_sentence`` produces for a stand-alone jamo — verify against
      ``decompose``.
    * A Korean character with fewer than two characters after it is copied
      through instead of being indexed past the end of its chunk.

    Args:
        jamo: the jamo string to convert.

    Returns:
        The reassembled string.
    """
    pieces, idx = [], 0
    while idx < len(jamo):
        chunk = jamo[idx:idx + 3]
        if not character_is_korean(jamo[idx]) or len(chunk) < 3:
            # Literal character, or a truncated chunk at the end of the input.
            pieces.append(jamo[idx])
            idx += 1
        elif chunk[1:] == "--":
            # No medial and no final: nothing to compose, keep the lone character.
            pieces.append(chunk[0])
            idx += 3
        else:
            final = " " if chunk[2] == "-" else chunk[2]
            pieces.append(compose(chunk[0], chunk[1], final))
            idx += 3
    # Single join instead of repeated string concatenation.
    return "".join(pieces)

In [87]:
# Print each neighbour of the jamo-encoded query, converted back to text, next to its score.
for jamo_token, similarity in temp_model_gs.wv.most_similar(jamo_sentence("서울특별시")):
    print(jamo_to_word(jamo_token), similarity)

갈트 0.7068468928337097
슈프레발트 0.7039884328842163
폴트 0.7002385854721069
스톨트 0.6914335489273071
놀트 0.6879458427429199
비톨트 0.6848863959312439
어설트 0.664912223815918
안홀트 0.6637964248657227
빌리발트 0.6593713760375977
리볼트 0.6565208435058594


In [72]:
# ./fasttext skipgram -input /home/eppen/Documents/YBIGTA/WinterSchool/NLP/tokenized/corpus_mecab_jamo.txt -output jamo_model_nlp

In [2]:
###################################################################

In [None]:
# Loading pre-trained vectors (noted as not working inside Jupyter).
# Imported under an alias: a plain `from gensim.models import fasttext` would
# rebind the name `fasttext`, which the first cell binds to the standalone
# fasttext package, and break any later `fasttext.load_model(...)` call.
from gensim.models import fasttext as gensim_fasttext
# Note the difference between gensim.models.FastText and gensim.models.wrappers.FastText.
model_pt_gs = gensim_fasttext.load_facebook_vectors("/home/eppen/Downloads/cc.ko.300.bin")

In [None]:
# Vocabulary of the pre-trained vectors.
# NOTE(review): `model_pt_gs` comes from `load_facebook_vectors`; whether the
# returned object exposes `.wv` depends on the gensim version — confirm.
model_pt_gs.wv.vocab