# Setting

In [5]:
import numpy as np
import pandas as pd
import os
import csv
import math
import copy

from datetime import datetime
from collections import OrderedDict # Dict. which the order of data adjusted 
from collections import defaultdict

# DBSCAN
from sklearn.cluster import DBSCAN

# Konlpy
from konlpy.tag import Komoran
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords

# Embedding tfidf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Load Data

In [2]:
# Source CSV; the file is read with the CP949 (Korean Windows) encoding.
path = 'data/'
fileName = 'rawData_10000.csv'
data_raw = pd.read_csv(''.join([path, fileName]), encoding='CP949')

# 'no' holds the document numbers, '응답값' the response text to be clustered.
docno_raw, rawdata = data_raw['no'], data_raw['응답값']

# Word Embedding : Tf-idf

In [3]:
# Map document number -> raw response, keeping the order of the input file.
myzip = zip(docno_raw, rawdata)
raw_docs = OrderedDict()
raw_docs.update(myzip)
print('The length of raw_docs: %d' % len(raw_docs))

# Komoran tagger instance shared by tokenize_komoran() below.
pos_tagger = Komoran()

def tokenize_komoran(doc):
    """Return the morphemes of `doc` whose POS tag is in the kept-tag list.

    `doc` is tagged with the module-level `pos_tagger`; each (morpheme, tag)
    pair is kept only when the tag is one of the tags listed below, and the
    morphemes are returned in their original order.
    """
    kept_tags = ['NNG', 'NNP','NNB', 'VV', 'VA','XR','UN','MAG']
    morphemes = []
    for x, t in pos_tagger.pos(doc):
        if t in kept_tags:
            morphemes.append(x)
    return morphemes

# Tokenize every response and join the kept morphemes into one
# space-separated string per document.
token = [' '.join(tokenize_komoran(str(row))) for row in rawdata]

# Count documents whose tokenization came back empty, i.e. none of their
# morphemes carried a kept POS tag. (The printed message says "np.nan", but
# the value being counted is the empty string.)
k = token.count('')
print('The number of np.nan is  %d' %k)

# Fall back to the raw response for those documents.
# - zip() pairs tokens and responses by position, so the fallback does not
#   depend on the Series index labels being 0..n-1.
# - str() guarantees a string even when the raw value is a float NaN
#   (missing response); a non-string document would make the TF-IDF
#   vectorizer fail later.
token = [tok if tok != '' else str(raw) for tok, raw in zip(token, rawdata)]

# result of POS: one list of tokens per document
tokened = [str(row).split(" ") for row in token]

myzip = zip(docno_raw, tokened)
train_docs_tfidf = OrderedDict(myzip)
print('The length of train_docs_tfidf is %d, Check with that of raw_docs' %len(train_docs_tfidf))

The length of raw_docs: 10400
The number of np.nan is  65
The length of train_docs_tfidf is 10400, Check with that of raw_docs


In [6]:
def tokenize(doc):
    """Split a space-joined token string back into its list of tokens.

    Splits on the single-space character only, so consecutive spaces yield
    empty-string tokens and an empty document yields [''].
    """
    pieces = doc.split(" ")
    return list(pieces)

# Corpus for the vectorizer: one space-joined token string per document.
inputbow = token

# Fit TF-IDF on the whole corpus, then vectorize the same corpus into a dense array.
# (Variant with a capped vocabulary, kept for reference:
#  TfidfVectorizer(tokenizer=tokenize, max_features = voca_count).fit(inputbow))
tfidv = TfidfVectorizer(tokenizer=tokenize)
tfidv.fit(inputbow)
tfidf = tfidv.transform(inputbow).toarray()
test_tfidf = copy.copy(tfidf)

# Number of unique terms in the fitted vocabulary (original note: 2151 or 2083).
ae_inputsize = len(tfidv.vocabulary_)
print(ae_inputsize)

2316


In [7]:
# Shape of the dense TF-IDF matrix produced by `.toarray()` above.
tfidf.shape

(10400, 2316)

In [8]:
# Key every TF-IDF row by its document number, keeping the input order.
myzip = zip(docno_raw, tfidf)
tfidf_docs = OrderedDict()
tfidf_docs.update(myzip)
len(tfidf_docs)

10400

# DBSCAN

In [9]:
# Loop bookkeeping
section = 1               # section counter; advanced each time eps is increased
step = 1                  # counter of DBSCAN runs, used in the progress output
cluster_EA_tfidf = 0      # running total of clusters found so far
Loop_switch = True        # set to False to stop the outer loop

# DBSCAN parameters
eps_ini = 0.01            # starting epsilon
min_samples_ini = 50      # starting MinPts
MinPts_checkpoint = min_samples_ini
metric = "cosine"

# Initial input: every TF-IDF row; keylist gives the document number of each row.
input_data = tfidf
keylist = list(raw_docs)

In [10]:
# Reset the search state to its initial values.
eps = eps_ini
min_samples = min_samples_ini

# MinPts candidates in descending steps of 5 (e.g. 50, 45, ..., 5).
# Derived from min_samples_ini rather than by counting MinPts_checkpoint down,
# so that re-running this cell rebuilds the full list instead of an empty one
# (the old countdown left MinPts_checkpoint at 0 after the first run).
MinPts_list = list(range(min_samples_ini, 0, -5))

# Leave MinPts_checkpoint where the original countdown loop would have left it.
MinPts_checkpoint = MinPts_list[-1] - 5 if MinPts_list else min_samples_ini

# the dict of the result: {cluster number: {document number: raw response}}
DBSCAN_result_tfidf={}

In [11]:
# Iterative DBSCAN.
# Outer loop: one "section" per epsilon value.
# Inner loop: for the current epsilon, run DBSCAN with MinPts decreasing by 5
# down to 5; after every run the documents labelled as noise (-1) become the
# input of the next run, and the clusters found are appended to
# DBSCAN_result_tfidf with globally increasing cluster numbers.
# The `else` of the inner `while` runs when MinPts drops below 5 and decides
# whether to raise epsilon and continue, or to stop.
while Loop_switch:
    print('\nSection %d.<Epsilon : %.3f>' % (section, eps))
    print('--------------------------------------------')
    MinPts_check=[]  # number of clusters found by each run of this section
    while min_samples >= 5:

        # step 1. cluster the current input
        db = DBSCAN(eps=eps, min_samples=min_samples, metric=metric).fit(input_data)
        core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True
        labels = db.labels_ # cluster label of every input row; -1 is noise

        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

        # number of clusters
        print('Step%d_Estimated number of clusters: %d, (MinPts= %d)' % (step, n_clusters_, min_samples))

        MinPts_check.append(n_clusters_)

        # clustering result
        temp_labels=list(set(labels))
        # Always process -1 last so the noise bucket is rewritten on every run,
        # even when this run produced no noise at all.
        temp_labels.append(-1)
        for i in temp_labels:
            target=i
            targetlist=[ i for i, x in enumerate(labels) if x == target ]
            clusterdic={}
            for j in targetlist:
                clusterdic[keylist[j]]=raw_docs[keylist[j]]
            if i != -1:                                              # cluster numbering
                clusterNo=i+cluster_EA_tfidf   # offset by the clusters found in earlier runs
            else:
                clusterNo=i
            DBSCAN_result_tfidf[clusterNo] = clusterdic

        # replace input data with noise: only the still-unclustered documents
        # are fed to the next run (keylist must follow the same order)
        tfidf_noise_docs={}
        for i in tfidf_docs.keys():
            if i in DBSCAN_result_tfidf[-1].keys():
                tfidf_noise_docs[i]=tfidf_docs[i]
        tfidf_noise=list(tfidf_noise_docs.values())
        keylist=list(tfidf_noise_docs.keys())
        input_data = tfidf_noise

        step += 1
        min_samples += -5
        cluster_EA_tfidf += n_clusters_

        if not tfidf_noise:
            # Every document has been clustered. DBSCAN.fit raises ValueError
            # on an empty input, so end the inner loop through its normal
            # exit (not `break`) so that the `else` clause below still runs
            # and performs the regular stop logic.
            min_samples = 0

    else :
        if cluster_EA_tfidf == 0:
            if eps == eps_ini:       # no cluster was formed in the first section
                eps+=0.01
                section += 1

                index = next((i for i, x in enumerate(MinPts_check) if x), 0)
                min_samples= MinPts_list[index]

            else:                    # not the first section, and still no cluster after some progress
                Loop_switch = False
                print('--------------------------------------------')
                print('Cluster EA is %d' % (cluster_EA_tfidf))
                print('Noise EA is %d' % len(DBSCAN_result_tfidf[-1]))
                print('\n')

        # Continue while eps is below 0.6 and the noise count is still at
        # least len(raw_docs) * log2(len(raw_docs)) / 100.
        elif eps < 0.6 and len(DBSCAN_result_tfidf[-1]) >= len(raw_docs)*(math.log(len(raw_docs),2)/100) :
            eps += 0.01
            # Start the next section at the first MinPts that produced a
            # cluster in this section (larger values are dropped).
            index = next((i for i, x in enumerate(MinPts_check) if x), 0)
            MinPts_list=MinPts_list[index::]
            index = 0
            min_samples= MinPts_list[index]
            section += 1
        else:
            Loop_switch = False
            print('--------------------------------------------')
            print('Cluster EA is %d' % (cluster_EA_tfidf))
            print('Noise EA is %d' % len(DBSCAN_result_tfidf[-1]))
            print('\n')


Section 1.<Epsilon : 0.010>
--------------------------------------------
Step1_Estimated number of clusters: 10, (MinPts= 50)
Step2_Estimated number of clusters: 2, (MinPts= 45)
Step3_Estimated number of clusters: 0, (MinPts= 40)
Step4_Estimated number of clusters: 2, (MinPts= 35)
Step5_Estimated number of clusters: 3, (MinPts= 30)
Step6_Estimated number of clusters: 4, (MinPts= 25)
Step7_Estimated number of clusters: 9, (MinPts= 20)
Step8_Estimated number of clusters: 4, (MinPts= 15)
Step9_Estimated number of clusters: 23, (MinPts= 10)
Step10_Estimated number of clusters: 70, (MinPts= 5)

Section 2.<Epsilon : 0.020>
--------------------------------------------
Step11_Estimated number of clusters: 0, (MinPts= 50)
Step12_Estimated number of clusters: 0, (MinPts= 45)
Step13_Estimated number of clusters: 0, (MinPts= 40)
Step14_Estimated number of clusters: 0, (MinPts= 35)
Step15_Estimated number of clusters: 0, (MinPts= 30)
Step16_Estimated number of clusters: 0, (MinPts= 25)
Step17_Esti

Step81_Estimated number of clusters: 19, (MinPts= 5)

Section 55.<Epsilon : 0.550>
--------------------------------------------
Step82_Estimated number of clusters: 18, (MinPts= 5)

Section 56.<Epsilon : 0.560>
--------------------------------------------
Step83_Estimated number of clusters: 21, (MinPts= 5)

Section 57.<Epsilon : 0.570>
--------------------------------------------
Step84_Estimated number of clusters: 22, (MinPts= 5)

Section 58.<Epsilon : 0.580>
--------------------------------------------
Step85_Estimated number of clusters: 25, (MinPts= 5)

Section 59.<Epsilon : 0.590>
--------------------------------------------
Step86_Estimated number of clusters: 16, (MinPts= 5)

Section 60.<Epsilon : 0.600>
--------------------------------------------
Step87_Estimated number of clusters: 17, (MinPts= 5)
--------------------------------------------
Cluster EA is 1097
Noise EA is 2273




In [12]:
# Display the full result: {cluster number: {document number: raw response}};
# key -1 holds the documents left as noise.
DBSCAN_result_tfidf

{0: {3: '시원함',
  341: '시원하다',
  775: '시원해서',
  908: '시원함',
  1081: '시원해서',
  1808: '시원해서',
  1954: '시원함',
  2159: '시원함',
  2446: '시원하다',
  2526: '시원함',
  2779: '시원하다',
  2867: '시원하다',
  3123: '시원하다',
  3255: '시원해서',
  3351: '시원하다',
  3432: '시원해서',
  3476: '시원함',
  3755: '시원해보여서',
  3972: '시원하다',
  3977: '시원함.',
  4042: '시원해서',
  4044: '시원해서',
  4060: '시원하다',
  4203: '시원해요~',
  4249: '시원해서',
  4479: '시원함',
  4528: '시원하다',
  4652: '시원함',
  4673: '시원하다',
  4684: '시원하다',
  4745: '시원함',
  4886: '시원함',
  5012: '시원해서',
  5386: '시원함',
  5549: '시원하다',
  5559: '시원함',
  5674: '시원하다',
  5959: '시원',
  6088: '시원하다',
  6231: '시원함',
  6316: '시원하다',
  6368: '시원함',
  6459: '시원하다',
  6570: '시원함',
  6619: '시원해서',
  6809: '시원함',
  6875: '시원함',
  6899: '시원해서',
  7057: '시원해서',
  7080: '시원함',
  7083: '시원함',
  7292: '시원하다',
  7323: '시원함',
  7546: '시원해서',
  7676: '시원하다',
  7793: '시원함',
  8327: '시원하다',
  8340: '시원함',
  8392: '시원해보여서',
  8558: '시원함',
  8596: '시원함',
  8632: '시원',
  8663: '시원해서',
  8762: '시원해서',
  

In [16]:
# Flatten each cluster's {docNo: raw_doc} mapping into a list of
# [docNo, raw_doc] rows, ready to be turned into a DataFrame for CSV export.
DBSCAN_result_tfidf_tolist = {
    cluster_no: [[doc_no, raw_doc] for doc_no, raw_doc in members.items()]
    for cluster_no, members in DBSCAN_result_tfidf.items()
}

In [17]:
# One DataFrame per cluster, stacked into a single frame whose outer index
# level is the cluster number.
dflist_tfidf = [
    pd.DataFrame(rows, columns = ['docNo', 'raw_doc'])
    for rows in DBSCAN_result_tfidf_tolist.values()
]
result_tfidf = pd.concat(dflist_tfidf, keys=list(DBSCAN_result_tfidf_tolist))

In [18]:
# save the result of DBSCAN_tfidf
# Build the dated output directory name once, so the directory that is created
# and the path that is written to cannot differ if the date changes mid-cell.
result_path = 'result/'+ datetime.now().strftime('%Y-%m-%d')+'/'

# makedirs also creates the parent 'result/' directory when it is missing
# (os.mkdir would raise FileNotFoundError) and is a no-op if the path exists.
os.makedirs(result_path, exist_ok=True)

result_tfidf.to_csv(result_path+'DBSCAN_result_Topic_refined_10000.csv', encoding='cp949')