In [1]:
import pandas as pd
import glob, os

path = r'/Users/jiwanhwang/Documents/GitHub/Practice_MachineLearning/Text_Analytics/OpinosisDataset1.0/topics'

In [9]:
all_files = glob.glob(os.path.join(path, "*.data"))
filename_list = []
opinion_text = []
all_files[:5]

['/Users/jiwanhwang/Documents/GitHub/Practice_MachineLearning/Text_Analytics/OpinosisDataset1.0/topics/battery-life_ipod_nano_8gb.txt.data',
 '/Users/jiwanhwang/Documents/GitHub/Practice_MachineLearning/Text_Analytics/OpinosisDataset1.0/topics/gas_mileage_toyota_camry_2007.txt.data',
 '/Users/jiwanhwang/Documents/GitHub/Practice_MachineLearning/Text_Analytics/OpinosisDataset1.0/topics/room_holiday_inn_london.txt.data',
 '/Users/jiwanhwang/Documents/GitHub/Practice_MachineLearning/Text_Analytics/OpinosisDataset1.0/topics/location_holiday_inn_london.txt.data',
 '/Users/jiwanhwang/Documents/GitHub/Practice_MachineLearning/Text_Analytics/OpinosisDataset1.0/topics/staff_bestwestern_hotel_sfo.txt.data']

In [7]:
for file_ in all_files:

    df = pd.read_table(file_, index_col=None, header=0, encoding='latin1')
    
    filename_ = file_.split('/')[-1]
    filename = filename_.split('.')[0]
    
    filename_list.append(filename)
    opinion_text.append(df.to_string())

document_df = pd.DataFrame({'filename':filename_list, 'opinion_text':opinion_text})
document_df.head()

Unnamed: 0,filename,opinion_text
0,battery-life_ipod_nano_8gb,...
1,gas_mileage_toyota_camry_2007,...
2,room_holiday_inn_london,...
3,location_holiday_inn_london,...
4,staff_bestwestern_hotel_sfo,...


## Feature Vectorization - TF-IDF

In [13]:
from nltk.stem import WordNetLemmatizer
import nltk
import string

remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)

lemmar = WordNetLemmatizer()

def LemTokens(tokens):
    return [lemmar.lemmatize(token) for token in tokens]

def LemNormalize(text):
    return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english', 
                             ngram_range=(1,2), min_df=0.05, max_df=0.85)
feature_vect = tfidf_vect.fit_transform(document_df['opinion_text'])

  'stop_words.' % sorted(inconsistent))


In [29]:
from sklearn.cluster import KMeans

km_cluster = KMeans(n_clusters=3, max_iter=10000, random_state=0)
km_cluster.fit(feature_vect)
cluster_label = km_cluster.labels_
cluster_centers = km_cluster.cluster_centers_

In [30]:
document_df['cluster_label'] = cluster_label
document_df.head()

Unnamed: 0,filename,opinion_text,cluster_label
0,battery-life_ipod_nano_8gb,...,0
1,gas_mileage_toyota_camry_2007,...,2
2,room_holiday_inn_london,...,1
3,location_holiday_inn_london,...,1
4,staff_bestwestern_hotel_sfo,...,1


In [32]:
document_df[document_df['cluster_label']==0].sort_values(by='filename').head(10)

Unnamed: 0,filename,opinion_text,cluster_label
135,accuracy_garmin_nuvi_255W_gps,...,0
237,accuracy_garmin_nuvi_255W_gps,...,0
84,accuracy_garmin_nuvi_255W_gps,...,0
33,accuracy_garmin_nuvi_255W_gps,...,0
186,accuracy_garmin_nuvi_255W_gps,...,0
162,battery-life_amazon_kindle,...,0
111,battery-life_amazon_kindle,...,0
9,battery-life_amazon_kindle,...,0
60,battery-life_amazon_kindle,...,0
213,battery-life_amazon_kindle,...,0


In [33]:
document_df[document_df['cluster_label']==1].sort_values(by='filename').head(10)

Unnamed: 0,filename,opinion_text,cluster_label
133,bathroom_bestwestern_hotel_sfo,...,1
235,bathroom_bestwestern_hotel_sfo,...,1
82,bathroom_bestwestern_hotel_sfo,...,1
184,bathroom_bestwestern_hotel_sfo,...,1
31,bathroom_bestwestern_hotel_sfo,...,1
119,food_holiday_inn_london,...,1
17,food_holiday_inn_london,...,1
221,food_holiday_inn_london,...,1
170,food_holiday_inn_london,...,1
68,food_holiday_inn_london,...,1


In [34]:
document_df[document_df['cluster_label']==2].sort_values(by='filename').head(10)

Unnamed: 0,filename,opinion_text,cluster_label
18,comfort_honda_accord_2008,...,2
120,comfort_honda_accord_2008,...,2
222,comfort_honda_accord_2008,...,2
171,comfort_honda_accord_2008,...,2
69,comfort_honda_accord_2008,...,2
145,comfort_toyota_camry_2007,...,2
247,comfort_toyota_camry_2007,...,2
94,comfort_toyota_camry_2007,...,2
43,comfort_toyota_camry_2007,...,2
196,comfort_toyota_camry_2007,...,2


In [35]:
cluster_centers = km_cluster.cluster_centers_
print('cluster_centers shape: ', cluster_centers.shape)
print(cluster_centers)

cluster_centers shape:  (3, 4611)
[[0.00711363 0.         0.         ... 0.00779154 0.         0.        ]
 [0.         0.00103526 0.00180523 ... 0.         0.00189826 0.00150178]
 [0.00729964 0.00079732 0.         ... 0.         0.         0.        ]]


In [46]:
def get_cluster_details(cluster_model, cluster_data, feature_names, clusters_num, top_n_features=10):
    
    cluster_details = {}
    
    centroid_feature_ordered_ind = cluster_model.cluster_centers_.argsort()[:, ::-1]
    
    for cluster_num in range(clusters_num):
        cluster_details[cluster_num] = {}
        cluster_details[cluster_num]['cluster'] = cluster_num
        
        top_feature_indexes = centroid_feature_ordered_ind[cluster_num, :top_n_features]
        top_features = [feature_names[ind] for ind in top_feature_indexes]
        
        top_feature_values = cluster_model.cluster_centers_[cluster_num, top_feature_indexes].tolist()
        
        cluster_details[cluster_num]['top_features'] = top_features
        cluster_details[cluster_num]['top_feature_value'] = top_feature_values
        filenames = cluster_data[cluster_data['cluster_label']==cluster_num]['filename']
        filenames = filenames.values.tolist()
        
        cluster_details[cluster_num]['filenames'] = filenames
        
    return cluster_details

In [47]:
def print_cluster_details(cluster_details):
    for cluster_num, cluster_detail in cluster_details.items():
        print('##### Cluster {0}'.format(cluster_num))
        print('Top features: ', cluster_detail['top_features'])
        print('Reviews file: ', cluster_detail['filenames'][:7])
        print('=================================================')

In [50]:
feature_names = tfidf_vect.get_feature_names()

cluster_details = get_cluster_details(cluster_model=km_cluster, cluster_data=document_df, feature_names=feature_names, clusters_num=3, top_n_features=10)
print_cluster_details(cluster_details)

##### Cluster 0
Top features:  ['screen', 'battery', 'keyboard', 'kindle', 'battery life', 'direction', 'life', 'voice', 'video', 'feature']
Reviews file:  ['battery-life_ipod_nano_8gb', 'voice_garmin_nuvi_255W_gps', 'speed_garmin_nuvi_255W_gps', 'size_asus_netbook_1005ha', 'screen_garmin_nuvi_255W_gps', 'battery-life_amazon_kindle', 'satellite_garmin_nuvi_255W_gps']
##### Cluster 1
Top features:  ['room', 'hotel', 'service', 'staff', 'food', 'location', 'bathroom', 'clean', 'price', 'parking']
Reviews file:  ['room_holiday_inn_london', 'location_holiday_inn_london', 'staff_bestwestern_hotel_sfo', 'service_swissotel_hotel_chicago', 'service_bestwestern_hotel_sfo', 'food_holiday_inn_london', 'staff_swissotel_chicago']
##### Cluster 2
Top features:  ['interior', 'seat', 'performance', 'mileage', 'comfortable', 'gas', 'gas mileage', 'transmission', 'car', 'quality']
Reviews file:  ['gas_mileage_toyota_camry_2007', 'performance_netbook_1005ha', 'comfort_honda_accord_2008', 'interior_toyota