In [40]:
import pandas as pd
import numpy as np
import random

def initialize_centroids(data, k):
    """ Initialize k centroids randomly """
    indices = random.sample(range(len(data)), k)
    centroids = data[indices, :]
    return centroids

def assign_clusters(data, centroids):
    """ Assign each data point to the nearest centroid """
    clusters = []
    for point in data:
        distances = np.sqrt(((point - centroids) ** 2).sum(axis=1))
        cluster = np.argmin(distances)
        clusters.append(cluster)
    return clusters

def compute_centroids(data, clusters, k):
    """ Compute the new centroids """
    new_centroids = []
    for i in range(k):
        points = data[np.where(clusters == i)]
        centroid = points.mean(axis=0)
        new_centroids.append(centroid)
    return np.array(new_centroids)

def k_means(data, k, max_iters=1000):
    """ The K-means algorithm """
    data = np.array(data) 
    centroids = initialize_centroids(data, k)
    for i in range(max_iters):
        old_centroids = centroids.copy()
        clusters = assign_clusters(data, centroids)
        centroids = compute_centroids(data, np.array(clusters), k)
        if np.allclose(centroids, old_centroids):
            break
    return np.array(clusters), centroids




In [41]:
# Load the file
df = pd.read_pickle('data/pkl_vector_10k_reviews.pkl')

# Extract only the feature vectors
features = df.vector.to_list()

In [42]:
actual_labels = df.good_rating

In [43]:
# Apply K-means
k = 2  # number of clusters
clusters, centroids = k_means(features, k)


In [44]:
comparison = pd.DataFrame({'Actual Label': actual_labels, 'Cluster': clusters})

misclassified = comparison[comparison['Actual Label'] != comparison['Cluster']]
print((len(df) - len(misclassified)) / len(df))


    Actual Label  Cluster
1              1        0
2              1        0
9              1        0
10             1        0
11             1        0
0.5642


In [45]:
from sklearn.metrics import silhouette_samples, silhouette_score

# Extract only the feature vectors
array_features = np.array(features)
array_labels = np.array(actual_labels)

silhouette_avg = silhouette_score(array_features, array_labels)
sample_silhouette_values = silhouette_samples(array_features, array_labels)

In [46]:
print(silhouette_avg)
print(sample_silhouette_values)

0.05847046242104021
[0.07188645 0.06470027 0.06196227 ... 0.05032201 0.05075456 0.05009708]
