# 문서 군집화

## 실습: UCI의 Opinion Review 데이터 세트 활용
- 51개의 텍스트 데이터로 되어있음
- 각 파일은 tripadvisor.com(호텔), edmunds.com(자동차), amazon.com(전자제품) 사이트에서 가져온 리뷰로, 문서별로 100개 정도의 문장을 가지고 있음

In [4]:
import os
import glob
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_colwidth', 700)

### 폴더에 저장된 각 문서 불러오기

In [10]:
path = './필요 데이터/UCI_datasets/topics'

# 해당하는 문서 데이터 리스트화
all_files = glob.glob(os.path.join(path, '*.data'))
len(all_files), all_files[0]

(51, './필요 데이터/UCI_datasets/topics\\accuracy_garmin_nuvi_255W_gps.txt.data')

In [19]:
# 파일이름 리스트 만들기
filename_list = [file.split('\\')[1].split('.')[0] for file in all_files]
print(filename_list[:2])

# 파일 내용 리스트 만들기
opinion_list = [pd.read_table(file, encoding = 'latin1') for file in all_files]


# 데이터프레임화
opinion_df = pd.DataFrame(pd.Series(filename_list), columns = ['filename'])
opinion_df['opinion'] = opinion_list

opinion_df.head()

['accuracy_garmin_nuvi_255W_gps', 'bathroom_bestwestern_hotel_sfo']


Unnamed: 0,filename,opinion
0,accuracy_garmin_nuvi_255W_gps,", and is very, very accurate . 0 but for the most part, we find that the Garmin software provides accurate directions, whereever we intend to go . 1 This function is not accurate if you don't leave it in battery mode say, when you stop at the Cracker Barrell for lunch and to play one of those trangle games with the tees . 2 It provides immediate alternatives if the route from the online map program was inaccurate or b..."
1,bathroom_bestwestern_hotel_sfo,"The room was not overly big, but clean and very comfortable beds, a great shower and very clean bathrooms . 0 The second room was smaller, with a very inconvenient bathroom layout, but at least it was quieter and we were able to sleep . 1 Large comfortable room, wonderful bathroom . 2 The rooms were nice, very comfy bed and very clean bathroom . 3 Bathroom was spacious too and very clean . 4 ..."
2,battery-life_amazon_kindle,"After I plugged it in to my USB hub on my computer to charge the battery the charging cord design is very clever ! 0 After you have paged tru a 500, page book one, page, at, a, time to get from Chapter 2 to Chapter 15, see how excited you are about a low battery and all the time it took to get there ! 1 ..."
3,battery-life_ipod_nano_8gb,short battery life I moved up from an 8gb . 0 I love this ipod except for the battery life . 1 long battery scratch resistant 2 Battery drains even if I don't use it . 3 I only wonder why t...
4,battery-life_netbook_1005ha,"6GHz 533FSB cpu, glossy display, 3, Cell 23Wh Li, ion Battery , and a 1 . 0 Not to mention that as of now Asus will not sell you a spare 3 or 6, cell Li, Ion battery . 1 It also features a N270 cpu, 6, cell 48Wh Li, ion Battery 8 . 2 3MP webcam, 6, Cell 63Wh Li, ion Battery with a whopping 10 . 3 Realistic battery numbers are between 8 . 4 of battery life if you're using wifi & doing email word processing YouTube web surfing . .. ..."


In [45]:
opinion_df['opinion'] = opinion_df['opinion'].apply(lambda x: x.to_string())

### 피처벡터화

#### TFIDF- 벡터화

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
import nltk
import string

In [47]:
remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)

# 토큰화된 단어리스트 표제어 추출
def LemTokens(tokens):
    lemma = WordNetLemmatizer()
    return [lemma.lemmatize(token) for token in tokens]


# 소문자화 + 특수문자 제거 + 단어 토큰화 사용자함수
def LemNormalize(text):
    word_tokens = nltk.word_tokenize(text.lower().translate(remove_punct_dict))
    return LemTokens(word_tokens)


In [49]:
# 피처벡터화

tfidf_vect = TfidfVectorizer(max_df = 0.85, min_df = 0.05, 
                             ngram_range = (1, 2),
                             stop_words = 'english', 
                             tokenizer = LemNormalize)

# 적용 및 변환
## 변환할 때 무족권,, opinion 컬럼 내용을 string으로 변환해줘야 함! 안그럼 안돌아감
feature_vect = tfidf_vect.fit_transform(opinion_df['opinion'])

feature_vect.shape

(51, 4611)

In [55]:
print(feature_vect[0])

  (0, 1465)	0.023283541665665836
  (0, 385)	0.023283541665665836
  (0, 1470)	0.05108766228354363
  (0, 2652)	0.023283541665665836
  (0, 1357)	0.02182613746339447
  (0, 723)	0.023283541665665836
  (0, 4367)	0.02182613746339447
  (0, 4593)	0.01798716138533979
  (0, 4273)	0.02063535152550281
  (0, 1468)	0.019628556360506585
  (0, 1804)	0.019628556360506585
  (0, 4040)	0.023283541665665836
  (0, 1467)	0.015101445414252302
  (0, 2469)	0.023283541665665836
  (0, 2173)	0.02182613746339447
  (0, 4191)	0.02182613746339447
  (0, 4196)	0.019628556360506585
  (0, 234)	0.023283541665665836
  (0, 4041)	0.023283541665665836
  (0, 4092)	0.04127070305100562
  (0, 1533)	0.04365227492678894
  (0, 1996)	0.023283541665665836
  (0, 158)	0.02182613746339447
  (0, 1164)	0.023283541665665836
  (0, 3130)	0.02182613746339447
  :	:
  (0, 1894)	0.21584593662407742
  (0, 4265)	0.049468827747789774
  (0, 2291)	0.0475061586160301
  (0, 3207)	0.01729902651714019
  (0, 2651)	0.17719064637173385
  (0, 2916)	0.0166765328

### 군집화

In [56]:
from sklearn.cluster import KMeans

In [57]:
# 군집모델 컴파일
kmeans_clu = KMeans(n_clusters = 5,    # 군집 개수
                    max_iter = 1000,     # 최대 중심좌표 이동 횟수
                    random_state = 0)

# 군집화 적용
kmeans_clu.fit(feature_vect)

# 확인
cluster_labels = kmeans_clu.labels_   # 군집 레이블
cluster_centers = kmeans_clu.cluster_centers_  # 군집 중심점

In [58]:
opinion_df['cluster_label'] = cluster_labels

opinion_df.head()

Unnamed: 0,filename,opinion,cluster_label
0,accuracy_garmin_nuvi_255W_gps,", and is very, very accurate .\n0 but for the most part, we find that the Garmin software provides accurate directions, whereever we intend to go .\n1 This functi...",0
1,bathroom_bestwestern_hotel_sfo,"The room was not overly big, but clean and very comfortable beds, a great shower and very clean bathrooms .\n0 The second room was smaller, with a very inconvenient bathroom layout, but at least it was quieter and we were able to sleep .\n1 ...",1
2,battery-life_amazon_kindle,"After I plugged it in to my USB hub on my computer to charge the battery the charging cord design is very clever !\n0 After you have paged tru a 500, page book one, page, at, a, time to get from Chapter 2 to Chapter 15, see how excited you are about a low battery and all the time it took to get there !\n1 ...",3
3,battery-life_ipod_nano_8gb,short battery life I moved up from an 8gb .\n0 I love this ipod except for the battery life .\n1 ...,3
4,battery-life_netbook_1005ha,"6GHz 533FSB cpu, glossy display, 3, Cell 23Wh Li, ion Battery , and a 1 .\n0 Not to mention that as of now...",3


#### 군집화 결과 확인

- cluster 0: <br>
software, programmable(accrurate directions), graphics(wide screens), satellites, GPS, map, speedmeters, USB, GPS 등의 단어가 있는 것을 보니 gps관련 전자기기 인듯

In [64]:
opinion_df[opinion_df['cluster_label'] == 0]

Unnamed: 0,filename,opinion,cluster_label
0,accuracy_garmin_nuvi_255W_gps,", and is very, very accurate .\n0 but for the most part, we find that the Garmin software provides accurate directions, whereever we intend to go .\n1 This functi...",0
8,directions_garmin_nuvi_255W_gps,You also get upscale features like spoken directions including street names and programmable POIs .\n0 I used to hesitate to go out of my directions but no...,0
9,display_garmin_nuvi_255W_gps,"3 quot widescreen display was a bonus .\n0 This made for smoother graphics on the 255w of the vehicle moving along displayed roads, where the 750's display was more of a jerky movement .\n1 ...",0
33,satellite_garmin_nuvi_255W_gps,"It's fast to acquire satellites .\n0 If you've ever had a Brand X GPS take you on some strange route that adds 20 minutes to your trip, has you turn the wrong way down a one way road, tell you to turn AFTER you've passed the street, frequently loses the satellite signal, or has old maps missing streets, you know how important this stuff is .\n1 ...",0
34,screen_garmin_nuvi_255W_gps,It is easy to read and when touching the screen it works great !\n0 and zoom out buttons on the 255w to the same side of the screen which makes it a bit easier .\n1 ...,0
43,speed_garmin_nuvi_255W_gps,Another feature on the 255w is a display of the posted speed limit on the road which you are currently on right above your current displayed speed .\n0 I found myself not even looking at my car speedometer as I could easily see my current speed and the speed limit of my route at a glance .\n1 ...,0
47,transmission_toyota_camry_2007,"After slowing down, transmission has to be kicked to speed up .\n0 ...",0
48,updates_garmin_nuvi_255W_gps,"Another thing to consider was that I paid $50 less for the 750 and it came with the FM transmitter cable and a USB cord to connect it to your computer for updates and downloads .\n0 update and reroute much _more_ quickly than my other GPS .\n1 UPDATE ON THIS , It finally turned out that to see the elevation contours at lowe...",0


cluster1: <br>
room, food, wine reception, sightseeing, parking(expensive), service(disappointed), friendly staff등의 내용으로 보아 hotel 인듯

In [61]:
opinion_df[opinion_df['cluster_label'] == 1]

Unnamed: 0,filename,opinion,cluster_label
1,bathroom_bestwestern_hotel_sfo,"The room was not overly big, but clean and very comfortable beds, a great shower and very clean bathrooms .\n0 The second room was smaller, with a very inconvenient bathroom layout, but at least it was quieter and we were able to sleep .\n1 ...",1
13,food_holiday_inn_london,The room was packed to capacity with queues at the food buffets .\n0 The over zealous staff cleared our unfinished drinks while we were collecting cooked food and movement around the room with plates was difficult in the crowded circumstances .\n1 ...,1
14,food_swissotel_chicago,The food for our event was delicious .\n0 ...,1
15,free_bestwestern_hotel_sfo,The wine reception is a great idea as it is nice to meet other travellers and great having access to the free Internet access in our room .\n0 They also have a computer available with free internet which is a nice bonus but I didn't find that out till the day before we left but was still able to get on there to check our flight to Vegas the next day .\n1 ...,1
20,location_bestwestern_hotel_sfo,"Good Value good location , ideal choice .\n0 Great Location , Nice Rooms , Helpless Concierge\n1 ...",1
21,location_holiday_inn_london,"Great location for tube and we crammed in a fair amount of sightseeing in a short time .\n0 All in all, a normal chain hotel on a nice lo...",1
24,parking_bestwestern_hotel_sfo,Parking was expensive but I think this is common for San Fran .\n0 there is a fee for parking but well worth it seeing no where to park if you do have a car .\n1 ...,1
28,price_holiday_inn_london,"All in all, a normal chain hotel on a nice location , I will be back if I do not find anthing closer to Picadilly for a better price .\n0 ...",1
30,rooms_bestwestern_hotel_sfo,"Great Location , Nice Rooms , H...",1
31,rooms_swissotel_chicago,"The Swissotel is one of our favorite hotels in Chicago and the corner rooms have the most fantastic views in the city .\n0 The rooms look like they were just remodled and upgraded, there was an HD TV and a nice iHome docking station to put my iPod so I could set the alarm to wake up with my music instead of the radio .\n1 ...",1


- cluster 2:<br>
toyota, honda 등을 보니 자동차 관련인 듯

In [60]:
opinion_df[opinion_df['cluster_label'] == 2]

Unnamed: 0,filename,opinion,cluster_label
6,comfort_honda_accord_2008,"Drivers seat not comfortable, the car itself compared to other models of similar class .\n0 ...",2
7,comfort_toyota_camry_2007,"Ride seems comfortable and gas mileage fairly good averaging 26 city and 30 open road .\n0 Seats are fine, in fact of all the smaller sedans this is the most comfortable I found for the price as I am 6', 2 and 250# .\n1 Great gas mileage and comfortable on long trips ...",2
16,gas_mileage_toyota_camry_2007,Ride seems comfortable and gas mileage fairly good averaging 26 city and 30 open road .\n0 ...,2
17,interior_honda_accord_2008,I love the new body style and the interior is a simple pleasure except for the center dash .\n0 ...,2
18,interior_toyota_camry_2007,"First of all, the interior has way too many cheap plastic parts like the cheap plastic center piece that houses the clock .\n0 3 blown struts at 30,000 miles, interior trim coming loose and rattling squeaking, stains on paint, and bug splats taking paint off, premature uneven brake wear, on 3rd windsh...",2
22,mileage_honda_accord_2008,"It's quiet, get good gas mileage and looks clean inside and out .\n0 The mileage is great, and I've had to get used to stopping less for gas .\n1 Thought gas ...",2
25,performance_honda_accord_2008,"Very happy with my 08 Accord, performance is quite adequate it has nice looks and is a great long, distance cruiser .\n0 6, 4, 3 eco engine has poor performance and gas mileage of 22 highway .\n1 Overall performance is good but comfort level is poor .\n2 ...",2
29,quality_toyota_camry_2007,I previously owned a Toyota 4Runner which had incredible build quality and reliability .\n0 I bought the Camry because of Toyota reliability and qua...,2
37,seats_honda_accord_2008,"Front seats are very uncomfortable .\n0 No memory seats, no trip computer, can only display outside temp with trip odometer .\n1 ...",2


- cluster 3:<br>
USB, battery, ipod, CPU, headphone, download music 등을 보니 컴퓨터, 아이팟 등 전자기기 관련인 듯

In [65]:
opinion_df[opinion_df['cluster_label'] == 3]

Unnamed: 0,filename,opinion,cluster_label
2,battery-life_amazon_kindle,"After I plugged it in to my USB hub on my computer to charge the battery the charging cord design is very clever !\n0 After you have paged tru a 500, page book one, page, at, a, time to get from Chapter 2 to Chapter 15, see how excited you are about a low battery and all the time it took to get there !\n1 ...",3
3,battery-life_ipod_nano_8gb,short battery life I moved up from an 8gb .\n0 I love this ipod except for the battery life .\n1 ...,3
4,battery-life_netbook_1005ha,"6GHz 533FSB cpu, glossy display, 3, Cell 23Wh Li, ion Battery , and a 1 .\n0 Not to mention that as of now...",3
26,performance_netbook_1005ha,"The Eee Super Hybrid Engine utility lets users overclock or underclock their Eee PC's to boost performance or provide better battery life depending on their immediate requirements .\n0 In Super Performance mode CPU, Z shows the bus speed to increase up to 169 .\n1 One...",3
42,sound_ipod_nano_8gb,headphone jack i got a clear case for it and it i got a clear case for it and it like prvents me from being able to put the jack all the way in so the sound can b messsed up or i can get it in there and its playing well them go to move or something and it slides out .\n0 Picture and sound quality are excellent for this typ of devic .\n1 ...,3
49,video_ipod_nano_8gb,"I bought the 8, gig Ipod Nano that has the built, in video camera .\n0 Itunes has an on, line store, where you may purchase and download music and videos which will install onto the ipod .\n1 ...",3


- cluster 4:<br>
kindle, electronic device, window 등을 보니 전자기기 관련인 듯

In [66]:
opinion_df[opinion_df['cluster_label'] == 4]

Unnamed: 0,filename,opinion,cluster_label
5,buttons_amazon_kindle,"I thought it would be fitting to christen my Kindle with the Stephen King novella UR, so went to the Amazon site on my computer and clicked on the button to buy it .\n0 As soon as I'd clicked the button to confirm my order it appeared on my Kindle almost immediately !\n1 ...",4
10,eyesight-issues_amazon_kindle,"It feels as easy to read as the K1 but doesn't seem any crisper to my eyes .\n0 the white is really GREY, and to avoid considerable eye, strain I had to refresh pages every other page .\n1 The dream has always been a portable electronic device that could hold a ton of reading material, automate subscriptions and fa...",4
11,features_windows7,"I had to uninstall anti, virus and selected other programs, some of which did not have listings in the Programs and Features Control Panel section .\n0 This review briefly touches upon some of the key features and enhancements of Microsoft's latest OS .\n1 ...",4
12,fonts_amazon_kindle,"Being able to change the font sizes is awesome !\n0 For whatever reason, Amazon decided to make the Font on the Home Screen ...",4
19,keyboard_netbook_1005ha,", I think the new keyboard rivals the great hp mini keyboards .\n0 Since the battery life difference is minimum, the only reason to upgrade would be to get the better keyboard .\n1 The keyboard is now as good as t...",4
23,navigation_amazon_kindle,"In fact, the entire navigation structure has been completely revised , I'm still getting used to it but it's a huge step forward .\n0 ...",4
27,price_amazon_kindle,"If a case was included, as with the Kindle 1, that would have been reflected in a higher price .\n0 lower overall price, with nice leather cover .\n1 ...",4
35,screen_ipod_nano_8gb,"As always, the video screen is sharp and bright .\n0 2, inch screen and a glossy, polished aluminum finish that one CNET editor described as looking like a Christmas tree ornament .\n1 ...",4
36,screen_netbook_1005ha,Keep in mind that once you get in a room full of light or step outdoors screen reflections could become annoying .\n0 I've used mine outsi...,4
41,size_asus_netbook_1005ha,"A few other things I'd like to point out is that you must push the micro, sized right angle end of the ac adapter until it snaps in place or the battery may not charge .\n0 The full size right shift k...",4


- 최종적으로, <br>
자동차: cluster2 <br>
전자기기: cluster0, cluster3, cluster4  <br>
호텔: cluster1 <br>

### 재군집화

In [67]:
kmean_cluster2 = KMeans(n_clusters = 3,
                        max_iter = 1000, 
                        random_state = 0)

kmean_cluster2.fit(feature_vect)


cluster_labels2 = kmean_cluster2.labels_
cluster_centers2 = kmean_cluster2.cluster_centers_

opinion_df['cluster_label2'] = cluster_labels2
opinion_df.head()

Unnamed: 0,filename,opinion,cluster_label,cluster_label2
0,accuracy_garmin_nuvi_255W_gps,", and is very, very accurate .\n0 but for the most part, we find that the Garmin software provides accurate directions, whereever we intend to go .\n1 This functi...",0,0
1,bathroom_bestwestern_hotel_sfo,"The room was not overly big, but clean and very comfortable beds, a great shower and very clean bathrooms .\n0 The second room was smaller, with a very inconvenient bathroom layout, but at least it was quieter and we were able to sleep .\n1 ...",1,1
2,battery-life_amazon_kindle,"After I plugged it in to my USB hub on my computer to charge the battery the charging cord design is very clever !\n0 After you have paged tru a 500, page book one, page, at, a, time to get from Chapter 2 to Chapter 15, see how excited you are about a low battery and all the time it took to get there !\n1 ...",3,0
3,battery-life_ipod_nano_8gb,short battery life I moved up from an 8gb .\n0 I love this ipod except for the battery life .\n1 ...,3,0
4,battery-life_netbook_1005ha,"6GHz 533FSB cpu, glossy display, 3, Cell 23Wh Li, ion Battery , and a 1 .\n0 Not to mention that as of now...",3,0


- cluster 0:<br>
전자기기 관련인 듯

In [70]:
opinion_df[opinion_df['cluster_label2'] == 0]

Unnamed: 0,filename,opinion,cluster_label,cluster_label2
0,accuracy_garmin_nuvi_255W_gps,", and is very, very accurate .\n0 but for the most part, we find that the Garmin software provides accurate directions, whereever we intend to go .\n1 This functi...",0,0
2,battery-life_amazon_kindle,"After I plugged it in to my USB hub on my computer to charge the battery the charging cord design is very clever !\n0 After you have paged tru a 500, page book one, page, at, a, time to get from Chapter 2 to Chapter 15, see how excited you are about a low battery and all the time it took to get there !\n1 ...",3,0
3,battery-life_ipod_nano_8gb,short battery life I moved up from an 8gb .\n0 I love this ipod except for the battery life .\n1 ...,3,0
4,battery-life_netbook_1005ha,"6GHz 533FSB cpu, glossy display, 3, Cell 23Wh Li, ion Battery , and a 1 .\n0 Not to mention that as of now...",3,0
5,buttons_amazon_kindle,"I thought it would be fitting to christen my Kindle with the Stephen King novella UR, so went to the Amazon site on my computer and clicked on the button to buy it .\n0 As soon as I'd clicked the button to confirm my order it appeared on my Kindle almost immediately !\n1 ...",4,0
8,directions_garmin_nuvi_255W_gps,You also get upscale features like spoken directions including street names and programmable POIs .\n0 I used to hesitate to go out of my directions but no...,0,0
9,display_garmin_nuvi_255W_gps,"3 quot widescreen display was a bonus .\n0 This made for smoother graphics on the 255w of the vehicle moving along displayed roads, where the 750's display was more of a jerky movement .\n1 ...",0,0
10,eyesight-issues_amazon_kindle,"It feels as easy to read as the K1 but doesn't seem any crisper to my eyes .\n0 the white is really GREY, and to avoid considerable eye, strain I had to refresh pages every other page .\n1 The dream has always been a portable electronic device that could hold a ton of reading material, automate subscriptions and fa...",4,0
11,features_windows7,"I had to uninstall anti, virus and selected other programs, some of which did not have listings in the Programs and Features Control Panel section .\n0 This review briefly touches upon some of the key features and enhancements of Microsoft's latest OS .\n1 ...",4,0
12,fonts_amazon_kindle,"Being able to change the font sizes is awesome !\n0 For whatever reason, Amazon decided to make the Font on the Home Screen ...",4,0


- cluster 1:<br>
호텔 관련인 듯

In [71]:
opinion_df[opinion_df['cluster_label'] == 1]

Unnamed: 0,filename,opinion,cluster_label,cluster_label2
1,bathroom_bestwestern_hotel_sfo,"The room was not overly big, but clean and very comfortable beds, a great shower and very clean bathrooms .\n0 The second room was smaller, with a very inconvenient bathroom layout, but at least it was quieter and we were able to sleep .\n1 ...",1,1
13,food_holiday_inn_london,The room was packed to capacity with queues at the food buffets .\n0 The over zealous staff cleared our unfinished drinks while we were collecting cooked food and movement around the room with plates was difficult in the crowded circumstances .\n1 ...,1,1
14,food_swissotel_chicago,The food for our event was delicious .\n0 ...,1,1
15,free_bestwestern_hotel_sfo,The wine reception is a great idea as it is nice to meet other travellers and great having access to the free Internet access in our room .\n0 They also have a computer available with free internet which is a nice bonus but I didn't find that out till the day before we left but was still able to get on there to check our flight to Vegas the next day .\n1 ...,1,1
20,location_bestwestern_hotel_sfo,"Good Value good location , ideal choice .\n0 Great Location , Nice Rooms , Helpless Concierge\n1 ...",1,1
21,location_holiday_inn_london,"Great location for tube and we crammed in a fair amount of sightseeing in a short time .\n0 All in all, a normal chain hotel on a nice lo...",1,1
24,parking_bestwestern_hotel_sfo,Parking was expensive but I think this is common for San Fran .\n0 there is a fee for parking but well worth it seeing no where to park if you do have a car .\n1 ...,1,1
28,price_holiday_inn_london,"All in all, a normal chain hotel on a nice location , I will be back if I do not find anthing closer to Picadilly for a better price .\n0 ...",1,1
30,rooms_bestwestern_hotel_sfo,"Great Location , Nice Rooms , H...",1,1
31,rooms_swissotel_chicago,"The Swissotel is one of our favorite hotels in Chicago and the corner rooms have the most fantastic views in the city .\n0 The rooms look like they were just remodled and upgraded, there was an HD TV and a nice iHome docking station to put my iPod so I could set the alarm to wake up with my music instead of the radio .\n1 ...",1,1


- cluster 2:<br>
자동차 관련인 듯

In [72]:
opinion_df[opinion_df['cluster_label'] == 2]

Unnamed: 0,filename,opinion,cluster_label,cluster_label2
6,comfort_honda_accord_2008,"Drivers seat not comfortable, the car itself compared to other models of similar class .\n0 ...",2,2
7,comfort_toyota_camry_2007,"Ride seems comfortable and gas mileage fairly good averaging 26 city and 30 open road .\n0 Seats are fine, in fact of all the smaller sedans this is the most comfortable I found for the price as I am 6', 2 and 250# .\n1 Great gas mileage and comfortable on long trips ...",2,2
16,gas_mileage_toyota_camry_2007,Ride seems comfortable and gas mileage fairly good averaging 26 city and 30 open road .\n0 ...,2,2
17,interior_honda_accord_2008,I love the new body style and the interior is a simple pleasure except for the center dash .\n0 ...,2,2
18,interior_toyota_camry_2007,"First of all, the interior has way too many cheap plastic parts like the cheap plastic center piece that houses the clock .\n0 3 blown struts at 30,000 miles, interior trim coming loose and rattling squeaking, stains on paint, and bug splats taking paint off, premature uneven brake wear, on 3rd windsh...",2,2
22,mileage_honda_accord_2008,"It's quiet, get good gas mileage and looks clean inside and out .\n0 The mileage is great, and I've had to get used to stopping less for gas .\n1 Thought gas ...",2,2
25,performance_honda_accord_2008,"Very happy with my 08 Accord, performance is quite adequate it has nice looks and is a great long, distance cruiser .\n0 6, 4, 3 eco engine has poor performance and gas mileage of 22 highway .\n1 Overall performance is good but comfort level is poor .\n2 ...",2,2
29,quality_toyota_camry_2007,I previously owned a Toyota 4Runner which had incredible build quality and reliability .\n0 I bought the Camry because of Toyota reliability and qua...,2,2
37,seats_honda_accord_2008,"Front seats are very uncomfortable .\n0 No memory seats, no trip computer, can only display outside temp with trip odometer .\n1 ...",2,2


In [68]:
opinion_df[['cluster_label', 'cluster_label2']].value_counts()

cluster_label  cluster_label2
1              1                 16
4              0                 12
2              2                  9
0              0                  7
3              0                  6
0              2                  1
dtype: int64

### 군집별 핵심단어 추출
- cluster_model.cluster_centeres_ 함수는 각 군집을 구성하는 단어 피처가 군집의 중심을 기준으로 얼마나 가깝게 위치해 있는지 알려줌
- 1에 가까울수록 군집과 가깝다는 의미

In [74]:
cluster_centers2.shape  #(군집, 단어피처)

(3, 4611)

In [88]:
# 군집별 top n 핵심단어, 그 단어의 중심 위치 상댓값, 대상 파일명 반환하는 사용자 함수 생성
def get_cluster_details(cluster_model, cluster_df, feature_names, clusters_num, top_n_features = 10):
    cluster_details = {}
    
    # 핵심단어 추출
    cluster_center_idx = cluster_model.cluster_centers_.argsort()[:,::-1]
    for cluster_num in range(clusters_num):
        
        cluster_details[cluster_num] = {}
        
        # 군집 상세정보 채우기
        ## 군집 레이블
        cluster_details[cluster_num]['cluster'] = cluster_num
        ## 핵심단어
        top_n_feature_names = [feature_names[idx] for idx in cluster_center_idx[cluster_num, :top_n_features]]
        cluster_details[cluster_num]['top_features'] = top_n_feature_names
        ## 핵심단어의 중심위치 상댓값
        top_n_feature_centers = cluster_center_idx[cluster_num, :top_n_features]
        cluster_details[cluster_num]['top_features_value'] = top_n_feature_centers
        ## 파일 이름
        filenames = cluster_df[cluster_df['cluster_label2'] == cluster_num]['filename'].tolist()
        cluster_details[cluster_num]['filenames'] = filenames
    
    return cluster_details

In [97]:
cluster_df = opinion_df[['filename', 'opinion', 'cluster_label2']]
feature_names = tfidf_vect.get_feature_names()


cluster_details = get_cluster_details(kmean_cluster2, cluster_df,
                                      feature_names, 3, 
                                      top_n_features = 10)

for key, value in cluster_details.items():
    cluster_num = cluster_details[key]['cluster']
    top_n_features = cluster_details[key]['top_features']
    top_features_value = cluster_details[key]['top_features_value']
    filenames = cluster_details[key]['filenames']
    
    print(f'#### {cluster_num}')
    print(f'Top Features: {top_n_features}')
#     print(f'Review 파일명: {filenames}')
    print()
    

#### 0
Top Features: ['screen', 'battery', 'keyboard', 'battery life', 'life', 'kindle', 'direction', 'video', 'size', 'voice']

#### 1
Top Features: ['room', 'hotel', 'service', 'staff', 'food', 'location', 'bathroom', 'clean', 'price', 'parking']

#### 2
Top Features: ['interior', 'seat', 'mileage', 'comfortable', 'gas', 'gas mileage', 'transmission', 'car', 'performance', 'quality']

