In [1]:
import pandas as pd
import time
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import re

df = pd.read_csv('data/dacon_news_data.csv')
df.tail()

Unnamed: 0.1,Unnamed: 0,index,title,topic_idx
14986,14986,45643,남친과 함께 잔혹하게 동거인 학대 20대 여성 구속영장,2
14987,14987,45648,일왕 부부 16일 동일본대지진 피해지역 위로방문,4
14988,14988,45650,1보 서울시교육청 신종코로나 확산에 개학 연기·휴업 검토,2
14989,14989,45652,답변하는 배기동 국립중앙박물관장,2
14990,14990,45653,2020 한국인터넷기자상 시상식 내달 1일 개최…특별상 김성후,2


## - Tokenization: SoyNLP

In [2]:
# 전처리

# 한글, 영어 제외한 문자 제거
df['title_prep'] = df['title'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣a-zA-z ]"," ") 


In [3]:
df['title_prep'][:5]

0            인천 핀란드 항공기 결항 휴가철 여행객 분통
1      실리콘밸리 넘어서겠다 구글   조원 들여  전역 거점화
2      이란 외무 긴장완화 해결책은 미국이 경제전쟁 멈추는 것
3    NYT 클린턴 측근 기업 특수관계 조명 공과 사 맞물려종합
4           시진핑 트럼프에 중미 무역협상 조속 타결 희망
Name: title_prep, dtype: object

In [4]:
# 데이터 분리
from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(df['title_prep'], df['topic_idx'], test_size=0.2, random_state=1)

print(train_x.shape, test_x.shape)
print(train_y.shape, test_y.shape)

(11992,) (2999,)
(11992,) (2999,)


명사 추출기의 명사 점수와 Cohesion 을 함께 이용할 수도 있다.

=> "Cohesion 점수 + 명사 점수"를 단어 점수로 이용

In [22]:
from soynlp.noun import LRNounExtractor
from soynlp.noun import LRNounExtractor_v2
from soynlp.word import WordExtractor
from soynlp.tokenizer import LTokenizer

# noun_extractor = LRNounExtractor()
text = train_x
# nouns = noun_extractor.train_extract(text)

In [23]:
word_extractor = WordExtractor(
    #min_frequency=100, 
    min_cohesion_forward=0.2,
    #min_right_branching_entropy=0.0
)

word_extractor.train(text)
words = word_extractor.extract()

cohesion_score = {word:score.cohesion_forward for word, score in words.items()}

training was done. used memory 1.370 Gbory 1.271 Gb
all cohesion probabilities was computed. # words = 7491
all branching entropies was computed # words = 10214
all accessor variety was computed # words = 10214


In [24]:
noun_extractor = LRNounExtractor_v2()
nouns = noun_extractor.train_extract(text)

noun_scores = {noun:score.score for noun, score in nouns.items()}
combined_scores = {noun:score + cohesion_score.get(noun, 0)
    for noun, score in noun_scores.items()}
combined_scores.update(
    {subword:cohesion for subword, cohesion in cohesion_score.items()
    if not (subword in combined_scores)}
)

# tokenizer 생성
tokenizer = LTokenizer(scores=combined_scores)

[Noun Extractor] use default predictors
[Noun Extractor] num features: pos=3929, neg=2321, common=107
[Noun Extractor] counting eojeols
[EojeolCounter] n eojeol = 30478 from 11992 sents. mem=1.371 Gb                    
[Noun Extractor] complete eojeol counter -> lr graph
[Noun Extractor] has been trained. #eojeols=86687, mem=1.372 Gb
[Noun Extractor] batch prediction was completed for 6731 words
[Noun Extractor] checked compounds. discovered 1577 compounds
[Noun Extractor] postprocessing detaching_features : 4021 -> 3945
[Noun Extractor] postprocessing ignore_features : 3945 -> 3913
[Noun Extractor] postprocessing ignore_NJ : 3913 -> 3910
[Noun Extractor] 3910 nouns (1577 compounds) with min frequency=1
[Noun Extractor] flushing was done. mem=1.376 Gb                    
[Noun Extractor] 50.98 % eojeols are covered


In [25]:
tok_sent = []
for i in range(len(text)):
    sent = text.iloc[i]
    tok_sent.append(tokenizer.tokenize(sent))

In [27]:
tok_sent[:10]

[['줄리아니', '러시아', '후원행사에', '돈받고', '참가', '제재', '대상도', '배석'],
 ['일왕', '신년인사', '세계', '안녕', '기원'],
 ['국토정보공사', '전남공고에', '억', '상당', '실습', '기자재', '기증'],
 ['퇴직금', '제도', '개선', '광주', '개구청', '공무직노조', '삭발투쟁'],
 ['박기수', '신임', 'TBN', '광주', '교통방송', '본부장'],
 ['또', '예측', '못', '하고', '뒷북만', '차바피해', '긴급', '지원', '해야'],
 ['게시', '판', '광주', '북부소방서', '여름철', '전기화재', '주의', '당부'],
 ['오바마', '내일', '역사', '적', '쿠바', '방문', '쿠바', '국교정상화', '대미종합'],
 ['아프리카', '연합', '수단', '군부', '에', '권력이양', '시간', '더', '주기로'],
 ['공주여고', '학생들', '등교', '안내', '하는', '김지철', '충남교육', '감']]

In [28]:
noun_sent = []
for doc in tok_sent:
    nouns = [word for word in doc if word in sorted_by_score.keys()]
    noun_sent.append(nouns)

In [30]:
noun_sent[:10]

[['제재'],
 ['일왕', '세계', '안녕'],
 ['상당', '기증'],
 ['제도', '개선'],
 ['신임', '본부장'],
 ['예측', '못', '지원'],
 ['게시', '주의'],
 ['오바마', '역사', '쿠바', '방문', '쿠바'],
 ['아프리카', '군부'],
 ['학생들', '등교', '안내']]

In [231]:
len(noun_sent)

11992

## - Word Embedding: FastText
Fasttext의 핵심 아이디어는 단어 단위가 아닌 sub 단어를 단위로 사용합니다. 즉 다음과 같습니다.
word2vec -> "apple" 학습

FastText -> "ap", "pp", "pl", "le" 학습

따라서 미리 학습되지 않은 단어들에 대한 vector도 표현해준다는 장점이 있습니다. 이용은 다음과 같습니다.

In [55]:
from gensim import models

# 1. 훈련시키기
model = models.FastText(noun_sent, vector_size=100, window=3, workers=4, sg=1)

# sg=1: skip gram

In [None]:
# model.wv.key_to_index

# {'명': 0,
#  '트럼프': 1,
#  '사망': 2,
#  '터키': 3,
#  '한국': 4,
#  '년': 5,
#  '코로나': 6,
#  '미국': 7,
#  '시리아': 8,
#  '게시': 9,
#  '정부': 10,
#  '대통령': 11,
#  '지원': 12,
#  ...

In [96]:
# # Pre-trained Korean fasttext
# ko_model = models.fasttext.load_facebook_model('data/cc.ko.300.bin')

In [97]:
# # 추가 학습
# ko_model.build_vocab(noun_sent, update=True)
# ko_model.train(noun_sent, total_examples=len(noun_sent), epochs=ko_model.epochs)

(8061, 43056)

In [98]:
# ko_model.save('data/fasttext_ko_model')
# ko_model.wv.save_word2vec_format('data/fasttext_ko_model_2', binary=False)

In [257]:
#from gensim.models import KeyedVectors

# 2. pre-trained model
wiki_model = models.fasttext.load_facebook_model('data/wiki.ko.bin')

In [258]:
wiki_model.vocab_size

879129

In [260]:
len(wiki_model.wv.vectors)

879129

In [261]:
# 추가 학습
wiki_model.build_vocab(noun_sent, update=True)
wiki_model.train(noun_sent, total_examples=len(noun_sent), epochs=wiki_model.epochs)

(107230, 215280)

In [262]:
wiki_model.vocab_size, len(wiki_model.wv.vectors)

(879129, 879196)

In [367]:
wiki_model.wv.similarity('오바마', '대통령')

0.7105539

In [266]:
wiki_model.wv.save_word2vec_format('data/fasttext_wiki_model', binary=False)

In [285]:
embedding_matrix = []
embeddings = []
for idx, sents in enumerate(noun_sent):
    for word in sents:
        embedding_vector = wiki_model.wv[word]
        embeddings.append(embedding_vector)
        
    embedding_matrix.append(embeddings/len(sents))

In [377]:
embedding_matrix[1]

array([-0.3306199 ,  0.74296415, -0.5438936 ,  0.5470522 ,  0.07490721,
       -0.10757225, -0.22044683, -0.01346713,  0.9021586 , -0.4260371 ,
       -0.02638024, -0.37494206,  0.57350415,  0.17435987, -0.3515242 ,
        0.29651707,  0.32946125,  0.47220796,  0.09709615, -0.72556275,
        0.07250775, -0.05558758,  0.23904957, -0.19166046,  0.16046385,
        0.07894684,  0.11156472, -0.11916998, -0.20707737,  0.23458737,
       -0.17148274, -0.12306082,  0.8589086 , -0.784264  ,  0.01085161,
        0.50145495,  0.11467633,  0.7191736 , -0.28603497,  0.3229578 ,
        0.23478001, -0.7358239 , -0.5134431 , -0.41610467,  0.49158797,
       -0.5763722 , -0.16075386, -0.4024595 ,  0.08873595,  0.16163649,
       -0.06011597,  0.3021502 , -0.39103097, -0.46456277, -0.19158858,
       -0.09553079, -0.40408877,  0.5471783 , -0.03515989,  0.02216857,
        0.6114115 ,  0.08342772, -0.04666803, -0.00756682,  0.1408804 ,
       -0.26962134,  0.48826987, -0.21538441,  0.6456442 , -0.53

In [374]:
# 이 방법이 아냐....
len(embedding_matrix)

43056

문장의 벡터 = Mean(각 단어의 벡터)

단어의 벡터는 알지만, 문장의 벡터는 어떻게 표현할 수 있을까?

이러한 질문의 답은, “문장의 벡터는 해당 문장의 단어들의 벡터 평균”이라고 볼 수 있다.

다른/혹은 더 높은 성능을 위한 방법으로는, 문맥을 이해하는 BERT, 혹은 Doc2Vec와 같은 문장단위 임베딩, 혹은 Word2Vec의 Mean을 취한 뒤 TF-IDF를 취해주는 방법 등이 있다.

In [323]:
train_df = df.loc[train_x.index]
train_df.head()

Unnamed: 0.1,Unnamed: 0,index,title,topic_idx,title_prep
9724,9724,33429,줄리아니 러시아 후원행사에 돈받고 참가…美 제재대상도 배석,4,줄리아니 러시아 후원행사에 돈받고 참가 제재대상도 배석
946,946,3259,일왕 신년인사…세계 안녕 기원,4,일왕 신년인사 세계 안녕 기원
10803,10803,37252,국토정보공사 전남공고에 4억 상당 실습 기자재 기증,2,국토정보공사 전남공고에 억 상당 실습 기자재 기증
7318,7318,23890,퇴직금 제도 개선 광주 5개구청 공무직노조 삭발투쟁,2,퇴직금 제도 개선 광주 개구청 공무직노조 삭발투쟁
9386,9386,31803,박기수 신임 TBN 광주교통방송 본부장,2,박기수 신임 TBN 광주교통방송 본부장


In [326]:
train_df['noun_sent'] = 'nan'

In [327]:
train_df.iloc[0]['noun_sent'] = noun_sent[0]

In [330]:
train_df.index[0]

9724

In [333]:
for idx, sents in enumerate(noun_sent):
    train_df['noun_sent'][train_df.index[idx]] = sents

In [335]:
train_df.tail()

Unnamed: 0.1,Unnamed: 0,index,title,topic_idx,title_prep,noun_sent
905,905,3128,차세대 지휘자 찾아…서울시향 부지휘자 최초로 공개 모집,2,차세대 지휘자 찾아 서울시향 부지휘자 최초로 공개 모집,"[찾아, 서울시, 향, 최초, 공개, 모집]"
5192,5192,17075,서양화 거장 장욱진 화백 기념관 세종시에 들어선다,2,서양화 거장 장욱진 화백 기념관 세종시에 들어선다,"[기념관, 세종시]"
12172,12172,39911,내년도 최저임금 노사 입장차 팽팽,2,내년도 최저임금 노사 입장차 팽팽,"[내년, 최저, 노사, 입장차, 팽팽]"
235,235,553,中 5G 지배하면 세계가 위험 전직 美NSC 장성 주장,4,G 지배하면 세계가 위험 전직 NSC 장성 주장,"[지배, 세계, 위험, 주장]"
13349,13349,42208,경주 동국대 코로나로 4·19 등반대회 취소…나무심기 대체,2,경주 동국대 코로나로 등반대회 취소 나무심기 대체,"[동국대, 코로나, 취소, 나무, 대체]"


In [338]:
def get_sentence_mean_vector(nouns):
    vector = []
    for i in nouns:
        try:
            vector.append(wiki_model.wv[i])
        except KeyError as e:
            pass
    try:
        return np.mean(vector, axis=0)
    except IndexError as e:
        pass

In [339]:
train_df['wv'] = train_df['noun_sent'].map(get_sentence_mean_vector)

In [342]:
train_df.tail()

Unnamed: 0.1,Unnamed: 0,index,title,topic_idx,title_prep,noun_sent,wv
905,905,3128,차세대 지휘자 찾아…서울시향 부지휘자 최초로 공개 모집,2,차세대 지휘자 찾아 서울시향 부지휘자 최초로 공개 모집,"[찾아, 서울시, 향, 최초, 공개, 모집]","[-0.01482521, -0.01304319, -0.62947524, 0.0619..."
5192,5192,17075,서양화 거장 장욱진 화백 기념관 세종시에 들어선다,2,서양화 거장 장욱진 화백 기념관 세종시에 들어선다,"[기념관, 세종시]","[0.09906987, 0.29745674, -0.7595968, -0.226649..."
12172,12172,39911,내년도 최저임금 노사 입장차 팽팽,2,내년도 최저임금 노사 입장차 팽팽,"[내년, 최저, 노사, 입장차, 팽팽]","[-0.14889012, -0.07366431, -0.26153788, -0.203..."
235,235,553,中 5G 지배하면 세계가 위험 전직 美NSC 장성 주장,4,G 지배하면 세계가 위험 전직 NSC 장성 주장,"[지배, 세계, 위험, 주장]","[-0.24608521, -0.2550412, -0.24612239, 0.00817..."
13349,13349,42208,경주 동국대 코로나로 4·19 등반대회 취소…나무심기 대체,2,경주 동국대 코로나로 등반대회 취소 나무심기 대체,"[동국대, 코로나, 취소, 나무, 대체]","[0.07859577, 0.23254462, -0.33864602, -0.23532..."


In [347]:
word_vectors = train_df.wv.to_list() 

In [242]:
len(model.wv.vectors)

1599

In [132]:
# ko_model.wv.similarity('오바마', '한국')

0.13986328

In [168]:
my_dict = dict({})
for key, idx in enumerate(model.wv.key_to_index):
    my_dict[idx] = model.wv[key]
    # Or my_dict[key] = model.wv.get_vector(key)
    # Or my_dict[key] = model.wv.word_vec(key, use_norm=False)

> 1차 시도 후 궁금한 사항들
1. 직접 워드임베딩 훈련시켰을 때 왜 벡터 사이즈가 작아졌냐 (train_df size가 11~~ -> 1599) 엥?
2. pretrained -> 추가 훈련 -> 벡터 사이즈랑 벡터의 총 길이가 왜 다르냐
3. embedding matrix 길이와 train_df 길이가 같도록 만드는 방법 

## - Clustering

### SOM (미완성)

In [193]:
from math import sqrt

from numpy import (array, unravel_index, nditer, linalg, random, subtract,
                   power, exp, pi, zeros, arange, outer, meshgrid, dot)
from collections import defaultdict
from warnings import warn


"""
    Minimalistic implementation of the Self Organizing Maps (SOM).
"""


def fast_norm(x):
    """Returns norm-2 of a 1-D numpy array.

    * faster than linalg.norm in case of 1-D arrays (numpy 1.9.2rc1).
    """
    return sqrt(dot(x, x.T))


class MiniSom(object):
    def __init__(self, x, y, input_len, sigma=1.0, learning_rate=0.5, decay_function=None, random_seed=None):
        """
            Initializes a Self Organizing Maps.

            x,y - dimensions of the SOM

            input_len - number of the elements of the vectors in input

            sigma - spread of the neighborhood function (Gaussian), needs to be adequate to the dimensions of the map.
            (at the iteration t we have sigma(t) = sigma / (1 + t/T) where T is #num_iteration/2)

            learning_rate - initial learning rate
            (at the iteration t we have learning_rate(t) = learning_rate / (1 + t/T) where T is #num_iteration/2)

            decay_function, function that reduces learning_rate and sigma at each iteration
                            default function: lambda x,current_iteration,max_iter: x/(1+current_iteration/max_iter)

            random_seed, random seed to use.
        """
        if sigma >= x/2.0 or sigma >= y/2.0:
            warn('Warning: sigma is too high for the dimension of the map.')
        if random_seed:
            self.random_generator = random.RandomState(random_seed)
        else:
            self.random_generator = random.RandomState(random_seed)
        if decay_function:
            self._decay_function = decay_function
        else:
            self._decay_function = lambda x, t, max_iter: x/(1+t/max_iter)
        self.learning_rate = learning_rate
        self.sigma = sigma
        self.weights = self.random_generator.rand(x,y,input_len)*2-1 # random initialization
        for i in range(x):
            for j in range(y):
                self.weights[i,j] = self.weights[i,j] / fast_norm(self.weights[i,j]) # normalization
        self.activation_map = zeros((x,y))
        self.neigx = arange(x)
        self.neigy = arange(y) # used to evaluate the neighborhood function
        self.neighborhood = self.gaussian

    def _activate(self, x):
        """ Updates matrix activation_map, in this matrix the element i,j is the response of the neuron i,j to x """
        s = subtract(x, self.weights) # x - w
        it = nditer(self.activation_map, flags=['multi_index'])
        while not it.finished:
            self.activation_map[it.multi_index] = fast_norm(s[it.multi_index])  # || x - w ||
            it.iternext()

    def activate(self, x):
        """ Returns the activation map to x """
        self._activate(x)
        return self.activation_map

    def gaussian(self, c, sigma):
        """ Returns a Gaussian centered in c """
        d = 2*pi*sigma*sigma
        ax = exp(-power(self.neigx-c[0], 2)/d)
        ay = exp(-power(self.neigy-c[1], 2)/d)
        return outer(ax, ay)  # the external product gives a matrix

    def diff_gaussian(self, c, sigma):
        """ Mexican hat centered in c (unused) """
        xx, yy = meshgrid(self.neigx, self.neigy)
        p = power(xx-c[0], 2) + power(yy-c[1], 2)
        d = 2*pi*sigma*sigma
        return exp(-p/d)*(1-2/d*p)

    def winner(self, x):
        """ Computes the coordinates of the winning neuron for the sample x """
        self._activate(x)
        return unravel_index(self.activation_map.argmin(), self.activation_map.shape)

    def update(self, x, win, t):
        """
            Updates the weights of the neurons.
            x - current pattern to learn
            win - position of the winning neuron for x (array or tuple).
            t - iteration index
        """
        eta = self._decay_function(self.learning_rate, t, self.T)
        sig = self._decay_function(self.sigma, t, self.T) # sigma and learning rate decrease with the same rule
        g = self.neighborhood(win, sig)*eta # improves the performances
        it = nditer(g, flags=['multi_index'])
        while not it.finished:
            # eta * neighborhood_function * (x-w)
            self.weights[it.multi_index] += g[it.multi_index]*(x-self.weights[it.multi_index])
            # normalization
            self.weights[it.multi_index] = self.weights[it.multi_index] / fast_norm(self.weights[it.multi_index])
            it.iternext()

    def quantization(self, data):
        """ Assigns a code book (weights vector of the winning neuron) to each sample in data. """
        q = zeros(data.shape)
        for i, x in enumerate(data):
            q[i] = self.weights[self.winner(x)]
        return q

    def random_weights_init(self, data):
        """ Initializes the weights of the SOM picking random samples from data """
        it = nditer(self.activation_map, flags=['multi_index'])
        while not it.finished:
            self.weights[it.multi_index] = data[self.random_generator.randint(len(data))]
            self.weights[it.multi_index] = self.weights[it.multi_index]/fast_norm(self.weights[it.multi_index])
            it.iternext()

    def train_random(self, data, num_iteration):
        """ Trains the SOM picking samples at random from data """
        self._init_T(num_iteration)
        for iteration in range(num_iteration):
            rand_i = self.random_generator.randint(len(data)) # pick a random sample
            self.update(data[rand_i], self.winner(data[rand_i]), iteration)

    def train_batch(self, data, num_iteration):
        """ Trains using all the vectors in data sequentially """
        self._init_T(len(data)*num_iteration)
        iteration = 0
        while iteration < num_iteration:
            idx = iteration % (len(data)-1)
            self.update(data[idx], self.winner(data[idx]), iteration)
            iteration += 1

    def _init_T(self, num_iteration):
        """ Initializes the parameter T needed to adjust the learning rate """
        self.T = num_iteration/2  # keeps the learning rate nearly constant for the last half of the iterations

    def distance_map(self):
        """ Returns the distance map of the weights.
            Each cell is the normalised sum of the distances between a neuron and its neighbours.
        """
        um = zeros((self.weights.shape[0], self.weights.shape[1]))
        it = nditer(um, flags=['multi_index'])
        while not it.finished:
            for ii in range(it.multi_index[0]-1, it.multi_index[0]+2):
                for jj in range(it.multi_index[1]-1, it.multi_index[1]+2):
                    if ii >= 0 and ii < self.weights.shape[0] and jj >= 0 and jj < self.weights.shape[1]:
                        um[it.multi_index] += fast_norm(self.weights[ii, jj, :]-self.weights[it.multi_index])
            it.iternext()
        um = um/um.max()
        return um

    def activation_response(self, data):
        """
            Returns a matrix where the element i,j is the number of times
            that the neuron i,j have been winner.
        """
        a = zeros((self.weights.shape[0], self.weights.shape[1]))
        for x in data:
            a[self.winner(x)] += 1
        return a

    def quantization_error(self, data):
        """
            Returns the quantization error computed as the average distance between
            each input sample and its best matching unit.
        """
        error = 0
        for x in data:
            error += fast_norm(x-self.weights[self.winner(x)])
        return error/len(data)

    def win_map(self, data):
        """
            Returns a dictionary wm where wm[(i,j)] is a list with all the patterns
            that have been mapped in the position i,j.
        """
        winmap = defaultdict(list)
        for x in data:
            winmap[self.winner(x)].append(x)
        return winmap

### unit tests
# '''
# from numpy.testing import assert_almost_equal, assert_array_almost_equal, assert_array_equal


# class TestMinisom:
#     def setup_method(self, method):
#         self.som = MiniSom(5, 5, 1)
#         for i in range(5):
#             for j in range(5):
#                 assert_almost_equal(1.0, linalg.norm(self.som.weights[i,j]))  # checking weights normalization
#         self.som.weights = zeros((5, 5))  # fake weights
#         self.som.weights[2, 3] = 5.0
#         self.som.weights[1, 1] = 2.0

#     def test_decay_function(self):
#         assert self.som._decay_function(1., 2., 3.) == 1./(1.+2./3.)

#     def test_fast_norm(self):
#         assert fast_norm(array([1, 3])) == sqrt(1+9)

#     def test_gaussian(self):
#         bell = self.som.gaussian((2, 2), 1)
#         assert bell.max() == 1.0
#         assert bell.argmax() == 12  # unravel(12) = (2,2)

#     def test_win_map(self):
#         winners = self.som.win_map([5.0, 2.0])
#         assert winners[(2, 3)][0] == 5.0
#         assert winners[(1, 1)][0] == 2.0

#     def test_activation_reponse(self):
#         response = self.som.activation_response([5.0, 2.0])
#         assert response[2, 3] == 1
#         assert response[1, 1] == 1

#     def test_activate(self):
#         assert self.som.activate(5.0).argmin() == 13.0  # unravel(13) = (2,3)

#     def test_quantization_error(self):
#         self.som.quantization_error([5, 2]) == 0.0
#         self.som.quantization_error([4, 1]) == 0.5

#     def test_quantization(self):
#         q = self.som.quantization(array([4, 2]))
#         assert q[0] == 5.0
#         assert q[1] == 2.0

#     def test_random_seed(self):
#         som1 = MiniSom(5, 5, 2, sigma=1.0, learning_rate=0.5, random_seed=1)
#         som2 = MiniSom(5, 5, 2, sigma=1.0, learning_rate=0.5, random_seed=1)
#         assert_array_almost_equal(som1.weights, som2.weights)  # same initialization
#         data = random.rand(100,2)
#         som1 = MiniSom(5, 5, 2, sigma=1.0, learning_rate=0.5, random_seed=1)
#         som1.train_random(data,10)
#         som2 = MiniSom(5, 5, 2, sigma=1.0, learning_rate=0.5, random_seed=1)
#         som2.train_random(data,10)
#         assert_array_almost_equal(som1.weights,som2.weights)  # same state after training

#     def test_train_batch(self):
#         som = MiniSom(5, 5, 2, sigma=1.0, learning_rate=0.5, random_seed=1)
#         data = array([[4, 2], [3, 1]])
#         q1 = som.quantization_error(data)
#         som.train_batch(data, 10)
#         assert q1 > som.quantization_error(data)

#     def test_train_random(self):
#         som = MiniSom(5, 5, 2, sigma=1.0, learning_rate=0.5, random_seed=1)
#         data = array([[4, 2], [3, 1]])
#         q1 = som.quantization_error(data)
#         som.train_random(data, 10)
#         assert q1 > som.quantization_error(data)

#     def test_random_weights_init(self):
#         som = MiniSom(2, 2, 2, random_seed=1)
#         som.random_weights_init(array([[1.0, .0]]))
#         for w in som.weights:
#             assert_array_equal(w[0], array([1.0, .0]))



# '''

In [350]:
# Training the SOM
#from minisom import MiniSom
som = MiniSom(x = 10, y = 10, input_len = 300, sigma = 1.0, learning_rate = 0.5)
som.random_weights_init(word_vectors)
som.train_random(data = word_vectors, num_iteration = 100)

In [None]:
#hdbscan -> 밀도기반 
#문서 군집화 -> 밀도기반 
#딥러닝 -> x (문서 분류, 문서 군집화)

In [202]:
embedding_matrix

array([[-0.11236012,  0.02490268,  0.02918391, ..., -0.14387944,
        -0.05629704,  0.11514557],
       [-0.18449359,  0.21177749,  0.02855687, ..., -0.29755563,
        -0.10659172,  0.09030319],
       [-0.17191753,  0.13481405, -0.00542197, ..., -0.26357353,
        -0.07554129,  0.06220521],
       ...,
       [-0.05578983,  0.06594275,  0.00600702, ..., -0.09093149,
        -0.03218354,  0.03526275],
       [-0.07333402,  0.07885507,  0.00681297, ..., -0.10983935,
        -0.03924961,  0.04143555],
       [-0.06536539,  0.07210843,  0.005145  , ..., -0.10444382,
        -0.03277766,  0.03602912]])

### Spherical Kmeans

In [None]:
from soyclustering import SphericalKMeans

spherical_kmeans = SphericalKMeans(
    n_clusters=120,
    max_iter=10,
    verbose=1,
    init='similar_cut'
)

labels = spherical_kmeans.fit_predict(word_vectors)

In [292]:
embedding_csr_matrix.shape

(1599, 100)

In [228]:
df.loc[train_x.index]

Unnamed: 0.1,Unnamed: 0,index,title,topic_idx,title_prep
9724,9724,33429,줄리아니 러시아 후원행사에 돈받고 참가…美 제재대상도 배석,4,줄리아니 러시아 후원행사에 돈받고 참가 제재대상도 배석
946,946,3259,일왕 신년인사…세계 안녕 기원,4,일왕 신년인사 세계 안녕 기원
10803,10803,37252,국토정보공사 전남공고에 4억 상당 실습 기자재 기증,2,국토정보공사 전남공고에 억 상당 실습 기자재 기증
7318,7318,23890,퇴직금 제도 개선 광주 5개구청 공무직노조 삭발투쟁,2,퇴직금 제도 개선 광주 개구청 공무직노조 삭발투쟁
9386,9386,31803,박기수 신임 TBN 광주교통방송 본부장,2,박기수 신임 TBN 광주교통방송 본부장
...,...,...,...,...,...
905,905,3128,차세대 지휘자 찾아…서울시향 부지휘자 최초로 공개 모집,2,차세대 지휘자 찾아 서울시향 부지휘자 최초로 공개 모집
5192,5192,17075,서양화 거장 장욱진 화백 기념관 세종시에 들어선다,2,서양화 거장 장욱진 화백 기념관 세종시에 들어선다
12172,12172,39911,내년도 최저임금 노사 입장차 팽팽,2,내년도 최저임금 노사 입장차 팽팽
235,235,553,中 5G 지배하면 세계가 위험 전직 美NSC 장성 주장,4,G 지배하면 세계가 위험 전직 NSC 장성 주장


In [224]:
from soyclustering import proportion_keywords

vocabs = [vocab for vocab, idx in sorted(vectorizer.vocabulary_.items(), key=lambda x:x[1])]
centers = spherical_kmeans.cluster_centers_

keywords = proportion_keywords(
    centers,
    labels=labels,
    index2word=vocabs)

In [226]:
keywords[0]

[('ms', 0.5470385308104908),
 ('slbm', 0.5203993825826435),
 ('kist', 0.5101978754038107),
 ('강조', 0.50940572826508),
 ('가입자', 0.5080748105626114),
 ('fbi', 0.507249354227293),
 ('강등', 0.5061128334584735),
 ('가방', 0.5060430891619824),
 ('tpp', 0.505481949563363),
 ('가족', 0.5054054806461908),
 ('강경파', 0.5037942838062077),
 ('감사원장', 0.5035189299691097),
 ('가입', 0.5031958049261712),
 ('감사', 0.5027072452435142),
 ('kbs', 0.5022125262856764),
 ('강력', 0.5021358504574223),
 ('가정폭력', 0.5012223704540489),
 ('anc', 0.5008006435278909),
 ('가능성', 0.5001275895294155)]

In [219]:
# sparse matrix 만들기

from scipy.sparse import csr_matrix

embedding_csr_matrix = csr_matrix(embedding_matrix)
embedding_csr_matrix

<1599x100 sparse matrix of type '<class 'numpy.float64'>'
	with 159900 stored elements in Compressed Sparse Row format>

In [220]:
embedding_csr_matrix[0]

<1x100 sparse matrix of type '<class 'numpy.float64'>'
	with 100 stored elements in Compressed Sparse Row format>