In [63]:
import pandas as pd
from gensim.models import Word2Vec
import gensim
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import spacy
import string
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\nicco\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nicco\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [44]:
df = pd.read_csv("reviews.csv")

In [45]:
df.head()

Unnamed: 0,Time_submitted,Review,Rating,Total_thumbsup,Reply
0,2022-07-09 15:00:00,"Great music service, the audio is high quality...",5,2,
1,2022-07-09 14:21:22,Please ignore previous negative rating. This a...,5,1,
2,2022-07-09 13:27:32,"This pop-up ""Get the best Spotify experience o...",4,0,
3,2022-07-09 13:26:45,Really buggy and terrible to use as of recently,1,1,
4,2022-07-09 13:20:49,Dear Spotify why do I get songs that I didn't ...,1,1,


In [46]:
def clean_text(text):
    tokens = word_tokenize(text)
    lower = [token.lower() for token in tokens]
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in lower]
    words = [word for word in stripped if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    porter = PorterStemmer()
    words = [porter.stem(word) for word in words]
    non_stop_words = [w for w in words if not w in stop_words]
    return " ".join(non_stop_words)

In [47]:
df["Review_cleaned"] = df["Review"].apply(clean_text)

In [48]:
def categorize_rating(rating):
    if rating < 3:
        return "negative"
    elif rating == 3:
        return "neutral"
    else:
        return "positive"

In [49]:
df["Rating_category"] = df["Rating"].apply(categorize_rating)

In [50]:
tokenized_reviews = [review.split() for review in df["Review_cleaned"]]
word2vec_model = Word2Vec(sentences=tokenized_reviews, vector_size=100, window=5, min_count=1, workers=4)

In [58]:
def get_average_word2vec(review, model):
    words = review.split()
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

df['vector'] = df['Review_cleaned'].apply(lambda x: get_average_word2vec(x, word2vec_model))

In [60]:
X = np.vstack(df['vector'].values)
y = df['Rating_category'].values

In [61]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [62]:
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

In [64]:
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.71      0.89      0.79      4909
     neutral       0.16      0.01      0.01      1412
    positive       0.84      0.85      0.84      5998

    accuracy                           0.77     12319
   macro avg       0.57      0.58      0.55     12319
weighted avg       0.71      0.77      0.73     12319

