In [2]:
import pandas as pd
import numpy as np
from numpy.random import choice
from math import sqrt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# from gensim.models import Word2Vec
from gensim.utils import simple_preprocess


In [4]:
df = pd.read_csv('reviews_mixed.csv')

def get_training_and_validation_datas(df: pd.DataFrame, training_size = 0.8):
    data_size = df.shape[0]
    indexes = [i for i in range(data_size)]
    training_index = np.random.choice(indexes,int(data_size*training_size))
    validation_index = [i for i in range(data_size) if not i in training_index]
    training_input = [df['Text'].iloc[index] for index in training_index]
    training_output = [df['Sentiment'].iloc[index] for index in training_index]
    validation_input = [df['Text'].iloc[index] for index in validation_index]
    validation_output = [df['Sentiment'].iloc[index] for index in validation_index]
    return training_input, training_output, validation_input, validation_output

training_input,training_output,validation_input, validation_output = get_training_and_validation_datas(df)

In [5]:
vectorizer = CountVectorizer()
training_embeddings = vectorizer.fit_transform(training_input).toarray()
validation_embeddings = vectorizer.transform(validation_input).toarray()

In [6]:
class KMeans:
    def __init__(self, n_clusters=4, max_iter=300, tolerance=0.04):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.tolerance = tolerance
    
    def init_centroids(self, X):
        np.random.seed(42)
        initial_indices = np.random.choice(X.shape[0], self.n_clusters, replace=False)
        return X[initial_indices]
        
    def euclidean(self, X):
        distances = np.zeros((X.shape[0], self.n_clusters))
        for i in range(X.shape[0]):
            for j in range(self.n_clusters):
                distances[i, j] = np.linalg.norm(X[i] - self.centroids[j])
        return distances
       
    def update_centroids(self, X):
        new_centroids = np.zeros((self.n_clusters, X.shape[1]))
        for k in range(self.n_clusters):
            cluster_indexes = X[self.indexes == k]
            if len(cluster_indexes) > 0:
                new_centroids[k] = cluster_indexes.mean(axis=0)
        return new_centroids
       
    def is_converged(self, old_centroids, new_centroids):
        distances = np.linalg.norm(new_centroids - old_centroids, axis=1)
        return np.all(distances <= self.tolerance)

    def train(self, X_train):
        self.centroids = self.init_centroids(X_train)
        for _ in range(self.max_iter):
            distances = self.euclidean(X_train)
            self.indexes = np.argmin(distances, axis=1)
            new_centroids = self.update_centroids(X_train)
            if self.is_converged(self.centroids, new_centroids):
                break
            self.centroids = new_centroids
            
    def predict(self, X):
        distances = self.euclidean(X)
        return np.argmin(distances, axis=1)


In [7]:
kmeans = KMeans(n_clusters=2)
kmeans.train(training_embeddings)

In [8]:
from sklearn.metrics import accuracy_score

label_names = [name for name in set(training_output)]
validation_indexes = kmeans.predict(validation_embeddings)
computed_outputs = [label_names[value] for value in validation_indexes]

accuracy = accuracy_score(validation_output, computed_outputs)
print(f'Accuracy: {accuracy*100}%')


Accuracy: 65.93406593406593%
