In [4]:
import pandas as pd
import numpy as np
from numpy.random import choice
from math import sqrt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
# from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.metrics import accuracy_score, adjusted_rand_score, normalized_mutual_info_score
from sklearn.metrics import accuracy_score
from sklearn.metrics.cluster import contingency_matrix
from scipy.optimize import linear_sum_assignment
from collections import Counter

In [5]:
df = pd.read_csv('reviews_mixed.csv')

def get_training_and_validation_datas(df: pd.DataFrame, training_size = 0.8):
    data_size = df.shape[0]
    indexes = [i for i in range(data_size)]
    training_index = np.random.choice(indexes,int(data_size*training_size))
    validation_index = [i for i in range(data_size) if not i in training_index]
    training_input = [df['Text'].iloc[index] for index in training_index]
    training_output = [df['Sentiment'].iloc[index] for index in training_index]
    validation_input = [df['Text'].iloc[index] for index in validation_index]
    validation_output = [df['Sentiment'].iloc[index] for index in validation_index]
    return training_input, training_output, validation_input, validation_output

training_input,training_output,validation_input, validation_output = get_training_and_validation_datas(df)
vectorizer = CountVectorizer()
text = ["By choosing a bike over a car, I'm reducing my environmental footprint. Cycling promotes eco-friendly transportation, and I'm proud to be part of that movement."]
training_embeddings = vectorizer.fit_transform(training_input).toarray()
validation_embeddings = vectorizer.transform(validation_input).toarray()

## KMEANS

In [6]:
kmeans = KMeans(n_clusters=2, n_init=10)
kmeans.fit(training_embeddings)

label_names = [name for name in set(training_output)]
validation_indexes = kmeans.predict(validation_embeddings)
computed_outputs = [label_names[value] for value in validation_indexes]

accuracy = accuracy_score(validation_output, computed_outputs)
print(f'Accuracy: {accuracy*100}%')

Accuracy: 61.458333333333336%


In [7]:
text = ["By choosing a bike over a car, I'm reducing my environmental footprint. Cycling promotes eco-friendly transportation, and I'm proud to be part of that movement."]
input =  vectorizer.transform(text).toarray()
label = kmeans.predict(input)
print('Label:', label_names[label[0]])

Label: negative


## AgglomerativeClustering

In [15]:
from sklearn.cluster import AgglomerativeClustering

agglomerative = AgglomerativeClustering(n_clusters=2)
agglomerative.fit(training_embeddings)

label_names = [name for name in set(training_output)]
validation_indexes = agglomerative.fit_predict(validation_embeddings)
computed_outputs = [label_names[value] for value in validation_indexes]

accuracy = accuracy_score(validation_output, computed_outputs)
print(f'Accuracy: {accuracy * 100}%')

Accuracy: 29.166666666666668%


## GaussianMixture

In [9]:
from sklearn.mixture import GaussianMixture

gmm = GaussianMixture(n_components=2)
gmm.fit(training_embeddings)

label_names = [name for name in set(training_output)]
validation_indexes = gmm.predict(validation_embeddings)
computed_outputs = [label_names[value] for value in validation_indexes]

accuracy = accuracy_score(validation_output, computed_outputs)
print(f'Accuracy: {accuracy * 100}%')

Accuracy: 27.083333333333332%


##  Topic Modeling using LDA

In [14]:
import gensim
from gensim import corpora
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

nltk.download('punkt')
nltk.download('stopwords')

ImportError: cannot import name 'triu' from 'scipy.linalg' (/home/codespace/.local/lib/python3.10/site-packages/scipy/linalg/__init__.py)

In [13]:
texts = df['Text'].tolist()

stop_words = set(stopwords.words('english'))
texts = [[word for word in word_tokenize(text.lower()) if word.isalnum() and word not in stop_words] for text in texts]

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lda_model = gensim.models.LdaModel(corpus, num_topics=3, id2word=dictionary, passes=12)

topics = lda_model.print_topics(num_words=2)
topicsx = []
for topic in topics:
    _, word_probs = topic
    words = [word.split('*')[1].strip().strip('"') for word in word_probs.split(' + ')]
    for word in words:
        if word not in topicsx:
            topicsx.append(word)
            
print(topicsx)


[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


NameError: name 'stopwords' is not defined