# A Simple Review of K-Means Clustering

- https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [None]:
!pip install nltk
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import gensim
# from konlpy.tag import Mecab

In [None]:
# Base directory (on the mounted Drive) that all data paths are built from.
root_path = '/content/drive/My Drive/Colab Notebooks/2020-PoscoICT/Data/'
# mecab = Mecab()

# Load the saved Word2Vec model stored under root_path.
w2v_model_path = root_path + 'ko/ko.bin'
model = gensim.models.Word2Vec.load(w2v_model_path)

In [None]:
# Read the tab-separated ratings file into a DataFrame.
sample_df = pd.read_csv(root_path+'nsmc/ratings_train.txt', sep='\t')

# Keep only the first 500 reviews to save time. Missing documents (NaN) are
# dropped first so that every entry handed to the tokenizer is a string.
sentences = sample_df['document'].dropna().head(500).tolist()
print(sentences[:5])

['아 더빙.. 진짜 짜증나네요 목소리', '흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나', '너무재밓었다그래서보는것을추천한다', '교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정', '사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 던스트가 너무나도 이뻐보였다']


In [None]:
def w2v_predict(model, embedding_size, tokenized_words):
    """Average the word vectors of the in-vocabulary tokens of one sentence.

    Args:
        model: Object whose ``wv`` attribute supports ``token in wv`` and
            ``wv[token]`` (e.g. a loaded gensim ``Word2Vec`` model).
        embedding_size: Length of the returned vector; it must match the
            dimensionality of the vectors stored in ``model.wv``.
        tokenized_words: Iterable of tokens. A plain string is also accepted
            and is split on whitespace first, so a space-joined sentence is
            averaged over its words rather than over its characters.

    Returns:
        A float32 numpy array of shape ``(embedding_size,)`` holding the mean
        of the vectors of all tokens found in the vocabulary, or all zeros
        when no token is found.
    """
    if isinstance(tokenized_words, str):
        # Iterating a str yields single characters, not words.
        tokenized_words = tokenized_words.split()

    # Query the keyed vectors directly: no per-call vocabulary set has to be
    # rebuilt, and the deprecated ``model[word]`` lookup is avoided.
    keyed_vectors = model.wv
    feature_vec = np.zeros((embedding_size,), dtype='float32')
    n_words = 0
    for word in tokenized_words:
        if word in keyed_vectors:
            n_words += 1
            feature_vec = np.add(feature_vec, keyed_vectors[word])
    if n_words > 0:
        # Turn the running sum into a mean; skipped when nothing matched so
        # the zero vector is returned instead of dividing by zero.
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec

In [None]:
# tokenized_sents = [' '.join(mecab.morphs(x)) for x in sentences]
# Tokenize each sentence with nltk and re-join the tokens with single spaces.
tokenized_sents = []
for raw_sentence in sentences:
    sentence_tokens = nltk.word_tokenize(raw_sentence)
    tokenized_sents.append(' '.join(sentence_tokens))

In [None]:
# Display the first ten tokenized sentences joined by newlines.
first_ten_sents = tokenized_sents[:10]
'\n'.join(first_ten_sents)

In [None]:
# Map each sentence index to its space-joined tokens (sent_dict) and to its
# averaged word vector (vector_dict).
# w2v_predict iterates over its third argument, so it is given the token list
# rather than the joined string (iterating a str yields single characters).
# NOTE(review): 200 is presumably the dimensionality of the loaded model's
# vectors; confirm against the model.
sent_dict = dict()
vector_dict = dict()
for idx, s in enumerate(tokenized_sents):
    vec = w2v_predict(model, 200, s.split())
    sent_dict[idx] = s
    vector_dict[idx] = vec

  if __name__ == '__main__':


In [None]:
# Print the entry count and a short preview only; dumping every sentence
# vector floods the cell output.
print(len(vector_dict))
for preview_idx, preview_vec in list(vector_dict.items())[:3]:
    print(preview_idx, preview_vec[:5])

### K-Means Clustering

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Cluster the sentence vectors into 10 groups; random_state is fixed so the
# assignment is reproducible across runs.
sentence_vectors = list(vector_dict.values())
kmeans = KMeans(n_clusters=10, random_state=0)
kmeans.fit(sentence_vectors)


In [None]:
# Map each position in kmeans.labels_ to the cluster label at that position.
label_dict = dict(enumerate(kmeans.labels_))

In [None]:
# Preview the first few assignments instead of dumping every label.
print(dict(list(label_dict.items())[:20]))

{0: 9, 1: 4, 2: 1, 3: 4, 4: 9, 5: 3, 6: 1, 7: 1, 8: 1, 9: 1, 10: 9, 11: 9, 12: 1, 13: 1, 14: 9, 15: 1, 16: 9, 17: 1, 18: 9, 19: 3, 20: 9, 21: 1, 22: 1, 23: 9, 24: 1, 25: 4, 26: 1, 27: 6, 28: 3, 29: 4, 30: 1, 31: 9, 32: 1, 33: 4, 34: 9, 35: 9, 36: 9, 37: 4, 38: 9, 39: 1, 40: 3, 41: 1, 42: 9, 43: 1, 44: 9, 45: 9, 46: 9, 47: 9, 48: 4, 49: 9, 50: 3, 51: 9, 52: 9, 53: 9, 54: 9, 55: 8, 56: 4, 57: 1, 58: 3, 59: 9, 60: 9, 61: 9, 62: 3, 63: 9, 64: 1, 65: 9, 66: 9, 67: 1, 68: 3, 69: 9, 70: 9, 71: 1, 72: 9, 73: 9, 74: 1, 75: 9, 76: 1, 77: 4, 78: 3, 79: 5, 80: 4, 81: 1, 82: 1, 83: 9, 84: 9, 85: 3, 86: 4, 87: 1, 88: 9, 89: 4, 90: 3, 91: 9, 92: 1, 93: 3, 94: 3, 95: 6, 96: 1, 97: 6, 98: 9, 99: 9, 100: 9, 101: 9, 102: 1, 103: 1, 104: 1, 105: 1, 106: 7, 107: 6, 108: 9, 109: 1, 110: 9, 111: 4, 112: 6, 113: 3, 114: 1, 115: 1, 116: 9, 117: 4, 118: 9, 119: 9, 120: 1, 121: 1, 122: 3, 123: 0, 124: 4, 125: 1, 126: 4, 127: 1, 128: 9, 129: 4, 130: 0, 131: 9, 132: 4, 133: 1, 134: 4, 135: 3, 136: 3, 137: 9, 138: 

In [None]:
# One-column frame of cluster labels, indexed by the keys of label_dict.
label_df = pd.DataFrame.from_dict(
    data=label_dict,
    orient='index',
    columns=['cluster_idx'],
)
print(label_df.shape)
print(label_df.head(n=10))

(500, 1)
   cluster_idx
0            9
1            4
2            1
3            4
4            9
5            3
6            1
7            1
8            1
9            1


In [None]:
# One-column frame of tokenized sentences, indexed by the keys of sent_dict.
sent_df = pd.DataFrame.from_dict(
    data=sent_dict,
    orient='index',
    columns=['sentence'],
)
print(sent_df.shape)
print(sent_df.head(n=10))

(500, 1)
                                            sentence
0                                아 더빙.. 진짜 짜증나네요 목소리
1              흠 ... 포스터보고 초딩영화줄 ... .오버연기조차 가볍지 않구나
2                                  너무재밓었다그래서보는것을추천한다
3                      교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정
4  사이몬페그의 익살스런 연기가 돋보였던 영화 ! 스파이더맨에서 늙어보이기만 했던 커스...
5   막 걸음마 뗀 3세부터 초등학교 1학년생인 8살용영화.ㅋㅋㅋ ... 별반개도 아까움 .
6                             원작의 긴장감을 제대로 살려내지못했다 .
7  별 반개도 아깝다 욕나온다 이응경 길용우 연기생활이몇년인지..정말 발로해도 그것보단...
8                             액션이 없는데도 재미 있는 몇안되는 영화
9    왜케 평점이 낮은건데 ? 꽤 볼만한데.. 헐리우드식 화려함에만 너무 길들여져 있나 ?


In [None]:
# Put sentences and cluster labels side by side, aligned on the shared index.
frames_to_join = [sent_df, label_df]
result_df = pd.concat(frames_to_join, axis=1)
print(result_df.shape)
print(result_df.head(n=10))

In [None]:
# Print every cluster label followed by the sentences assigned to it.
# Iterating the GroupBy yields (key, sub-frame) pairs, so rows are selected by
# the grouping itself instead of feeding index labels to the positional .iloc
# (which is only correct while labels happen to equal positions).
grouped = result_df.groupby(by='cluster_idx')
for key, cluster_df in grouped:
    print(key)
    print('\n'.join(cluster_df['sentence']))
    print("\n\n")