### Opinion Review 데이터 세트를 이용한 문서 군집화 수행하기

In [1]:
import pandas as pd
import glob, os

# Directory that holds the review topic files.
path = '../06.Text분석/OpinosisDataset1.0/topics'
# Collect the path of every *.data file directly under that directory.
data_file_pattern = os.path.join(path, "*.data")
all_files = glob.glob(data_file_pattern)

In [2]:
# Inspect the first collected path to see what glob returned.
all_files[0]

'../06.Text분석/OpinosisDataset1.0/topics\\accuracy_garmin_nuvi_255W_gps.txt.data'

In [3]:
filename_list = []
opinion_text = []

# Read every review file, keeping its short name and its full text in two
# parallel lists.
for file_ in all_files:
    with open(file_, encoding='latin1') as f:
        texts = f.read()

    # Strip the directory part with os.path.basename so the result does not
    # depend on the platform's path separator. Splitting on '\\' only works
    # on Windows; elsewhere it leaves the whole path, and the '.' split below
    # would then return an empty name because the path begins with '..'.
    filename_ = os.path.basename(file_)
    # Keep the text before the first '.', which drops the trailing extensions.
    filename = filename_.split('.')[0]

    # Append the name and the content at the same index of their lists.
    filename_list.append(filename)
    opinion_text.append(texts)

In [4]:
# Assemble the parallel name/content lists into a two-column DataFrame.
review_columns = {'filename': filename_list, 'opinion_text': opinion_text}
df = pd.DataFrame(review_columns)
df.head()

Unnamed: 0,filename,opinion_text
0,accuracy_garmin_nuvi_255W_gps,", and is very, very accurate .\n but for the m..."
1,bathroom_bestwestern_hotel_sfo,"The room was not overly big, but clean and ve..."
2,battery-life_amazon_kindle,After I plugged it in to my USB hub on my com...
3,battery-life_ipod_nano_8gb,short battery life I moved up from an 8gb .\...
4,battery-life_netbook_1005ha,"6GHz 533FSB cpu, glossy display, 3, Cell 23Wh ..."


In [5]:
# Peek at the first 300 characters of the first review document.
df.opinion_text[0][:300]

", and is very, very accurate .\n but for the most part, we find that the Garmin software provides accurate directions, whereever we intend to go .\n This function is not accurate if you don't leave it in battery mode say, when you stop at the Cracker Barrell for lunch and to play one of those trangle "

In [6]:
from nltk import word_tokenize

def simple_tokenizer(text):
    """Tokenize lower-cased *text*, keeping only tokens longer than two characters."""
    kept_tokens = []
    for token in word_tokenize(text.lower()):
        # Skip tokens of one or two characters.
        if len(token) <= 2:
            continue
        kept_tokens.append(token)
    return kept_tokens

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF settings: custom tokenizer, English stop words, unigrams and bigrams,
# and document-frequency bounds of 0.05 (min) and 0.85 (max).
tfidf_options = {
    'tokenizer': simple_tokenizer,
    'stop_words': 'english',
    'ngram_range': (1, 2),
    'min_df': 0.05,
    'max_df': 0.85,
}
tvect = TfidfVectorizer(**tfidf_options)
# Fit the vocabulary and vectorize the review texts in one step.
feature_vect = tvect.fit_transform(df.opinion_text)

In [8]:
from sklearn.cluster import KMeans

# Cluster the documents into 5 groups. random_state=0 keeps the example
# reproducible. n_init is set explicitly because its default differs between
# scikit-learn releases, which would otherwise change the resulting clusters.
km_cluster = KMeans(n_clusters=5, n_init=10, max_iter=10000, random_state=0)
km_cluster.fit(feature_vect)
cluster_label = km_cluster.labels_
cluster_centers = km_cluster.cluster_centers_

In [9]:
# Attach each document's cluster id, then count the documents per cluster.
df['cluster_label'] = cluster_label
label_counts = df['cluster_label'].value_counts()
label_counts

0    16
1    10
2     9
3     9
4     7
Name: cluster_label, dtype: int64

In [10]:
# Preview the first rows of the DataFrame.
df.head()

Unnamed: 0,filename,opinion_text,cluster_label
0,accuracy_garmin_nuvi_255W_gps,", and is very, very accurate .\n but for the m...",2
1,bathroom_bestwestern_hotel_sfo,"The room was not overly big, but clean and ve...",0
2,battery-life_amazon_kindle,After I plugged it in to my USB hub on my com...,1
3,battery-life_ipod_nano_8gb,short battery life I moved up from an 8gb .\...,1
4,battery-life_netbook_1005ha,"6GHz 533FSB cpu, glossy display, 3, Cell 23Wh ...",1


In [11]:
# Rows assigned to cluster 0, ordered by file name.
in_cluster_zero = df['cluster_label'] == 0
df[in_cluster_zero].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
1,bathroom_bestwestern_hotel_sfo,"The room was not overly big, but clean and ve...",0
13,food_holiday_inn_london,The room was packed to capacity with queues a...,0
14,food_swissotel_chicago,The food for our event was delicious .\n The ...,0
15,free_bestwestern_hotel_sfo,The wine reception is a great idea as it is ni...,0
20,location_bestwestern_hotel_sfo,"Good Value good location , ideal choice .\nGr...",0
21,location_holiday_inn_london,Great location for tube and we crammed in a f...,0
24,parking_bestwestern_hotel_sfo,Parking was expensive but I think this is com...,0
28,price_holiday_inn_london,"All in all, a normal chain hotel on a nice loc...",0
32,room_holiday_inn_london,"We arrived at 23,30 hours and they could not r...",0
30,rooms_bestwestern_hotel_sfo,"Great Location , Nice Rooms , Helpless Con...",0


In [12]:
# Re-cluster the same feature matrix into 3 groups. n_init is set explicitly
# because its default differs between scikit-learn releases, which would
# otherwise change the resulting clusters.
km_cluster = KMeans(n_clusters=3, n_init=10, max_iter=10000, random_state=0)
km_cluster.fit(feature_vect)
cluster_label = km_cluster.labels_
# Overwrite the previous labels with the 3-cluster assignment and count them.
df['cluster_label'] = cluster_label
df.cluster_label.value_counts()

0    24
2    16
1    11
Name: cluster_label, dtype: int64

In [13]:
# Show the documents ordered by their cluster label.
df.sort_values(by='cluster_label')

Unnamed: 0,filename,opinion_text,cluster_label
0,accuracy_garmin_nuvi_255W_gps,", and is very, very accurate .\n but for the m...",0
48,updates_garmin_nuvi_255W_gps,Another thing to consider was that I paid $50 ...,0
44,speed_windows7,"Windows 7 is quite simply faster, more stable,...",0
43,speed_garmin_nuvi_255W_gps,Another feature on the 255w is a display of th...,0
42,sound_ipod_nano_8gb,headphone jack i got a clear case for it and ...,0
41,size_asus_netbook_1005ha,A few other things I'd like to point out is t...,0
36,screen_netbook_1005ha,Keep in mind that once you get in a room full...,0
35,screen_ipod_nano_8gb,"As always, the video screen is sharp and brig...",0
34,screen_garmin_nuvi_255W_gps,It is easy to read and when touching the scr...,0
33,satellite_garmin_nuvi_255W_gps,It's fast to acquire satellites .\n If you've...,0


### 군집(Cluster)별 핵심 단어 추출하기


In [14]:
# Fetch the fitted model's cluster centers and print the array's shape
# followed by its values, one per line.
cluster_centers = km_cluster.cluster_centers_
print(f'cluster_centers shape: {cluster_centers.shape}', cluster_centers, sep='\n')

cluster_centers shape: (3, 4154)
[[0.         0.00467086 0.00731702 ... 0.         0.00042877 0.00765451]
 [0.00297642 0.00178587 0.00025015 ... 0.         0.00265687 0.        ]
 [0.         0.00295311 0.00653383 ... 0.00165472 0.         0.        ]]


In [15]:
from cluster import get_cluster_details

In [16]:
# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# get_feature_names_out() when it exists and fall back on older releases.
if hasattr(tvect, 'get_feature_names_out'):
    # Convert the returned array to a plain list, matching the old return type.
    feature_names = tvect.get_feature_names_out().tolist()
else:
    feature_names = tvect.get_feature_names()

cluster_details = get_cluster_details(cluster_model=km_cluster, cluster_data=df,
                                      feature_names=feature_names, clusters_num=3,
                                      top_n_features=10)
# Print each cluster's top features and its first 7 review file names.
for cluster_num, cluster_detail in cluster_details.items():
    print(f'####### Cluster {cluster_num}')
    print('Top features:', cluster_detail['top_features'])
    print('Reviews 파일명:', cluster_detail['filenames'][:7])
    print('==================================================')

####### Cluster 0
Top features: ['screen', 'battery', 'keyboard', 'kindle', 'battery life', 'directions', 'life', 'size', 'voice', 'video']
Reviews 파일명: ['accuracy_garmin_nuvi_255W_gps', 'battery-life_amazon_kindle', 'battery-life_ipod_nano_8gb', 'battery-life_netbook_1005ha', 'buttons_amazon_kindle', 'directions_garmin_nuvi_255W_gps', 'display_garmin_nuvi_255W_gps']
####### Cluster 1
Top features: ['interior', 'performance', 'mileage', 'seats', 'comfortable', 'gas', 'gas mileage', 'transmission', 'car', 'quality']
Reviews 파일명: ['comfort_honda_accord_2008', 'comfort_toyota_camry_2007', 'gas_mileage_toyota_camry_2007', 'interior_honda_accord_2008', 'interior_toyota_camry_2007', 'mileage_honda_accord_2008', 'performance_honda_accord_2008']
####### Cluster 2
Top features: ['hotel', 'service', 'rooms', 'staff', 'room', 'food', 'location', 'clean', 'bathroom', 'parking']
Reviews 파일명: ['bathroom_bestwestern_hotel_sfo', 'food_holiday_inn_london', 'food_swissotel_chicago', 'free_bestwestern_ho