In [12]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np # to use numpy arrays instead of lists
import pandas as pd # DataFrame (table)
import matplotlib.pyplot as plt # to plot

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

In [5]:
# Mount Google Drive at the relative path 'gdrive' so that later cells can
# read files stored there. `google.colab` is only importable inside Colab.
from google.colab import drive
drive.mount('gdrive')

Mounted at gdrive


In [6]:
# Read the zipped CSV from the mounted Drive folder; the first row of the
# file supplies the column names.
dataSet = pd.read_csv(
    "gdrive/MyDrive/Colab Notebooks/Amazon_Unlocked_Mobile.zip",
    compression='zip',
    header=0,
)
print(dataSet)

                                             Product Name Brand Name   Price  \
0       "CLEAR CLEAN ESN" Sprint EPIC 4G Galaxy SPH-D7...    Samsung  199.99   
1       "CLEAR CLEAN ESN" Sprint EPIC 4G Galaxy SPH-D7...    Samsung  199.99   
2       "CLEAR CLEAN ESN" Sprint EPIC 4G Galaxy SPH-D7...    Samsung  199.99   
3       "CLEAR CLEAN ESN" Sprint EPIC 4G Galaxy SPH-D7...    Samsung  199.99   
4       "CLEAR CLEAN ESN" Sprint EPIC 4G Galaxy SPH-D7...    Samsung  199.99   
...                                                   ...        ...     ...   
413835  Samsung Convoy U640 Phone for Verizon Wireless...    Samsung   79.95   
413836  Samsung Convoy U640 Phone for Verizon Wireless...    Samsung   79.95   
413837  Samsung Convoy U640 Phone for Verizon Wireless...    Samsung   79.95   
413838  Samsung Convoy U640 Phone for Verizon Wireless...    Samsung   79.95   
413839  Samsung Convoy U640 Phone for Verizon Wireless...    Samsung   79.95   

        Rating                         

In [8]:
# Drop every row that has a missing value in any column, then keep the first
# 12,000 remaining rows. Written as a chained expression (no inplace=True) so
# the step reads as a single transformation of dataSet.
dataSet = dataSet.dropna().iloc[0:12000, :]

# Keep only the review text. The explicit .copy() makes misDatos an
# independent frame, so adding columns to it later neither triggers pandas'
# SettingWithCopyWarning nor depends on view-vs-copy behaviour.
misDatos = dataSet[['Reviews']].copy()
print(misDatos.head(20))

                                              Reviews
0   I feel so LUCKY to have found this used (phone...
1   nice phone, nice up grade from my pantach revu...
2                                        Very pleased
3   It works good but it goes slow sometimes but i...
4   Great phone to replace my lost phone. The only...
5   I already had a phone with problems... I know ...
6   The charging port was loose. I got that solder...
7   Phone looks good but wouldn't stay charged, ha...
8   I originally was using the Samsung S2 Galaxy f...
9   It's battery life is great. It's very responsi...
10  My fiance had this phone previously, but cause...
11  This is a great product it came after two days...
12  These guys are the best! I had a little situat...
13  I'm really disappointed about my phone and ser...
14  Ordered this phone as a replacement for the sa...
15  Had this phone before and loved it but was not...
16  I was able to get the phone I previously owned...
17  I brought this phone as 

In [10]:
# Turn each review into a TF-IDF vector: English stop words are removed,
# terms appearing in fewer than 5 documents or in more than 80% of them are
# discarded, and term frequencies are scaled sub-linearly.
vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=5,
    max_df=0.8,
    sublinear_tf=True,
    use_idf=True,
)
data_vectors = vectorizer.fit_transform(misDatos['Reviews'])

print(data_vectors)

  (0, 1131)	0.18335316150727016
  (0, 1767)	0.24793868370392552
  (0, 3178)	0.2695002745489852
  (0, 2138)	0.12294461919435358
  (0, 1360)	0.1804884512616642
  (0, 1698)	0.19900487289469765
  (0, 3161)	0.22262705331661028
  (0, 2761)	0.18487977888263166
  (0, 2769)	0.1747148349112169
  (0, 1691)	0.20405403246800308
  (0, 2020)	0.15361523147694917
  (0, 1149)	0.1920389299226406
  (0, 1135)	0.24163740275370474
  (0, 233)	0.24280908648388658
  (0, 3344)	0.18189113373621948
  (0, 857)	0.141297085387887
  (0, 3238)	0.15352511053945575
  (0, 3160)	0.18624422027892598
  (0, 2994)	0.15290075931636377
  (0, 2625)	0.2289613423064304
  (0, 2360)	0.13204038708473567
  (0, 248)	0.24163740275370474
  (0, 1424)	0.27053927172200365
  (0, 2555)	0.14866345633234895
  (0, 2394)	0.14123395959874743
  :	:
  (11995, 377)	0.23907921965523984
  (11996, 2453)	0.2957288436537269
  (11996, 1439)	0.28437112779221707
  (11996, 1045)	0.22922192381241954
  (11996, 3155)	0.2877346987118878
  (11996, 3044)	0.263710215

In [14]:
# Cluster the TF-IDF vectors with k-means; random_state is fixed so repeated
# runs use the same seed.
num_clusters = 6
model = KMeans(
    n_clusters=num_clusters,
    init='k-means++',
    max_iter=150,
    random_state=0,
)
model.fit(data_vectors)

In [16]:
# Attach the cluster label of each review as a new column. The labels come
# from the model fitted on data_vectors, which was built from
# misDatos['Reviews'], and are assigned as a plain list (i.e. by position).
misDatos['Clusters'] = model.labels_.tolist()
print(misDatos.head(20))

                                              Reviews  Clusters
0   I feel so LUCKY to have found this used (phone...         5
1   nice phone, nice up grade from my pantach revu...         5
2                                        Very pleased         5
3   It works good but it goes slow sometimes but i...         2
4   Great phone to replace my lost phone. The only...         0
5   I already had a phone with problems... I know ...         0
6   The charging port was loose. I got that solder...         5
7   Phone looks good but wouldn't stay charged, ha...         5
8   I originally was using the Samsung S2 Galaxy f...         5
9   It's battery life is great. It's very responsi...         5
10  My fiance had this phone previously, but cause...         5
11  This is a great product it came after two days...         5
12  These guys are the best! I had a little situat...         5
13  I'm really disappointed about my phone and ser...         5
14  Ordered this phone as a replacement 

In [17]:
# prompt: give me the most repeated word from each cluster in misDatos

from collections import Counter

def most_frequent_word_per_cluster(misDatos, lowercase=False, stop_words=None):
  """
  Finds the most frequent word in each cluster of the DataFrame.

  Reviews are split on whitespace only, so punctuation stays attached to the
  neighbouring word. With the default arguments the counting is
  case-sensitive and no word is excluded.

  Args:
      misDatos: A DataFrame with 'Reviews' and 'Clusters' columns.
      lowercase: If True, words are lower-cased before counting, so that
          e.g. 'Great' and 'great' are counted together.
      stop_words: Optional iterable of words to leave out of the count
          (e.g. 'the'). When ``lowercase`` is True the stop words are
          lower-cased as well before matching.

  Returns:
      A dictionary mapping cluster number to the most frequent word in that
      cluster, or to None when the cluster has no countable words.
  """
  if stop_words is None:
    excluded = frozenset()
  elif lowercase:
    excluded = frozenset(str(word).lower() for word in stop_words)
  else:
    excluded = frozenset(stop_words)

  cluster_word_counts = {}
  for cluster_num in misDatos['Clusters'].unique():
    # Missing reviews are skipped instead of breaking the string handling.
    reviews = misDatos.loc[misDatos['Clusters'] == cluster_num, 'Reviews'].dropna()
    word_counts = Counter()
    for review in reviews:
      text = str(review)
      if lowercase:
        text = text.lower()
      # Updating review by review keeps first-seen order, which decides ties.
      word_counts.update(word for word in text.split() if word not in excluded)
    most_common_word = word_counts.most_common(1)[0][0] if word_counts else None
    cluster_word_counts[cluster_num] = most_common_word
  return cluster_word_counts


# Compute the most frequent word of every cluster in misDatos and show the
# resulting {cluster: word} mapping.
most_frequent_words = most_frequent_word_per_cluster(misDatos)
print("Most frequent word in each cluster:", most_frequent_words)

Most frequent word in each cluster: {5: 'the', 2: 'good', 0: 'the', 4: 'great', 3: 'Excellent', 1: 'love'}
