## Spotify App Reviews Sentiment Analysis

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the app-review dataset; later cells read its 'Review' and 'Rating' columns.
stf = pd.read_csv('reviews.csv')

### Data Preprocessing

In [47]:
!pip install nltk gensim

Defaulting to user installation because normal site-packages is not writeable


In [48]:
# Preview the review text ahead of the cleaning cell below.
# NOTE(review): the saved output already looks lower-cased and stop-word free,
# presumably because `stf` was not reloaded before this run — confirm by
# re-running the notebook from the top.
stf['Review'].head()

0    great music service audio high quality app eas...
1    please ignore previous negative rating app sup...
2    pop get best spotify experience android annoyi...
3                   really buggy terrible use recently
4     dear spotify get songs put playlist shuffle play
Name: Review, dtype: object

In [49]:
import gensim
from gensim.parsing.preprocessing import strip_non_alphanum, strip_multiple_whitespaces, preprocess_string, split_alphanum, strip_short, strip_numeric
import re 
import nltk
from nltk.corpus import stopwords
# Make sure the NLTK stop-word corpus is available locally before reading it.
nltk.download('stopwords')
# English stop words from NLTK; consumed by remove_stopwords() further down.
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /home/canada/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [50]:
def lower_case(docs):
    """Return a new list holding the lower-cased form of every document in `docs`."""
    lowered = []
    for text in docs:
        lowered.append(text.lower())
    return lowered

def remove_punc(docs):
    """Run gensim's strip_non_alphanum over each document and trim surrounding whitespace."""
    cleaned = []
    for text in docs:
        without_punctuation = strip_non_alphanum(text)
        cleaned.append(without_punctuation.strip())
    return cleaned

def separate_num(docs):
    """Apply gensim's split_alphanum to every document and return the results as a list."""
    return list(map(split_alphanum, docs))

def remove_one_letter_word(docs, minsize=3):
    """Drop short words from every document using gensim's strip_short.

    Despite the function name, the default keeps only words of at least three
    characters, because that is strip_short's own default `minsize`. Pass
    `minsize=2` to remove strictly one-letter words.

    Parameters
    ----------
    docs : iterable of str
        Documents to filter.
    minsize : int, default 3
        Minimum length a word must have to be kept.

    Returns
    -------
    list of str
        One filtered document per input document.
    """
    return [strip_short(doc, minsize=minsize) for doc in docs]

def remove_number(docs):
    """Apply gensim's strip_numeric to each document and collect the results in a list."""
    digit_free = []
    for text in docs:
        digit_free.append(strip_numeric(text))
    return digit_free

def replace_multiple_whitespaces(docs):
    """Normalise whitespace in every document via gensim's strip_multiple_whitespaces."""
    return list(map(strip_multiple_whitespaces, docs))

def remove_stopwords(docs):
    """Remove stop words from every document.

    Each document is split on whitespace, words found in the module-level
    `stop_words` collection are dropped, and the remaining words are re-joined
    with single spaces.

    Parameters
    ----------
    docs : iterable of str
        Documents to filter.

    Returns
    -------
    list of str
        One filtered document per input document.
    """
    # Build the set once per call: set membership is O(1), whereas testing
    # against the list would rescan it for every word of every document.
    stop_set = set(stop_words)
    return [" ".join(word for word in doc.split() if word not in stop_set) for doc in docs]

# Cleaning steps, listed in the order they are applied to the 'Review' column.
cleaning_steps = (
    lower_case,
    remove_punc,
    separate_num,
    remove_one_letter_word,
    remove_number,
    replace_multiple_whitespaces,
    remove_stopwords,
)

# Feed the output of each step into the next one.
doc = stf['Review']
for step in cleaning_steps:
    doc = step(doc)

# Overwrite the column with the cleaned text.
stf['Review'] = doc

In [51]:
# Preview the review text after the cleaning steps above.
stf['Review'].head()

0    great music service audio high quality app eas...
1    please ignore previous negative rating app sup...
2    pop get best spotify experience android annoyi...
3                   really buggy terrible use recently
4     dear spotify get songs put playlist shuffle play
Name: Review, dtype: object

In [52]:
# Binarise the star rating on a copy so `stf` keeps the original values:
# ratings 1-2 become 0 (negative), ratings 3-5 become 1 (positive).
stf1 = stf.copy()
rate = [1,2,3,4,5]
sentiment = [0, 0, 1, 1, 1]
# Apply the whole mapping in one dict-based replace and assign the result back.
# Calling replace(..., inplace=True) on the selected column is chained
# assignment, which leaves `stf1` unchanged under pandas Copy-on-Write.
stf1["Rating"] = stf1["Rating"].replace(dict(zip(rate, sentiment)))

In [53]:
# Inspect column dtypes and non-null counts after the rating conversion.
stf1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61594 entries, 0 to 61593
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Time_submitted  61594 non-null  object
 1   Review          61594 non-null  object
 2   Rating          61594 non-null  int64 
 3   Total_thumbsup  61594 non-null  int64 
 4   Reply           216 non-null    object
dtypes: int64(2), object(3)
memory usage: 2.3+ MB


### SVM

In [83]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Hold out 20% of the reviews for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(stf1['Review'], stf1['Rating'], test_size=0.2, random_state=42)

# TfidfVectorizer: fit on the training reviews only, then reuse the fitted
# vectorizer to transform the held-out reviews.
vectorizer = TfidfVectorizer()
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

# NOTE(review): the saved run of this cell ended in KeyboardInterrupt; a kernel
# SVC on this many rows is slow to fit and to score — consider LinearSVC if
# runtime matters.
svm = SVC(kernel='linear', verbose=True)
svm.fit(X_train_vectors, y_train)

# Predict with the SVM itself so the test score below belongs to this model
# rather than to a `y_pred` left in the kernel by another cell.
y_pred = svm.predict(X_test_vectors)

train_accuracy_0 = accuracy_score(y_train, svm.predict(X_train_vectors))
test_accuracy_0 = accuracy_score(y_test, y_pred)

print("Train accuracy:", train_accuracy_0)
print("Test accuracy:", test_accuracy_0)

[LibSVM]................................*..................*.*
optimization finished, #iter = 51535
obj = -17712.833044, rho = 0.451040
nSV = 21613, nBSV = 17734
Total nSV = 21613


KeyboardInterrupt: 

In [None]:
# Imported here because no cell above this one brings f1_score into scope.
from sklearn.metrics import f1_score

# Score the SVM trained in this section; `clf` is only created later, in the
# logistic-regression section.
y_pred = svm.predict(X_test_vectors)

f1 = f1_score(y_test, y_pred, average='binary')
print("F1 score:", f1)

In [18]:
# Classify one ad-hoc review with the trained SVM.
new_data = ["terrible"]

# Reuse the vectorizer fitted on the training reviews so the features line up.
new_data_vectors = vectorizer.transform(new_data)

new_predictions = svm.predict(new_data_vectors)

print("Predictions:", new_predictions)

Predictions: ['negative']


### Logistic Regression

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

# 80/20 split of cleaned review text against the binary sentiment label.
X_train, X_test, y_train, y_test = train_test_split(
    stf1['Review'],
    stf1['Rating'],
    test_size=0.2,
    random_state=42,
)

# TF-IDF features: fitted on the training reviews, applied to both splits.
vectorizer = TfidfVectorizer()
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

# fit() returns the estimator itself, so construction and training can be chained.
clf = LogisticRegression(solver='lbfgs').fit(X_train_vectors, y_train)

y_train_pred = clf.predict(X_train_vectors)
y_pred = clf.predict(X_test_vectors)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_pred)

print("Train accuracy:", train_accuracy)
print("Test accuracy:", test_accuracy)

In [62]:
# Recompute the logistic-regression predictions on the held-out reviews.
y_pred = clf.predict(X_test_vectors)

# Binary F1 of the predictions against the true labels.
f1 = f1_score(y_test, y_pred, average='binary')
print("F1 score:", f1)

F1 score: 0.8573592179540135


In [67]:
# Classify one ad-hoc review with the logistic-regression model.
new_data = ["I feel this app is an exciting app"]

# Reuse the vectorizer fitted on the training reviews so the features line up.
new_data_vectors = vectorizer.transform(new_data)

# predict() returns one label per input row; take the single label explicitly
# instead of relying on the truth value of a one-element array.
predicted_label = clf.predict(new_data_vectors)[0]
new_predictions = 'Positive' if predicted_label == 1 else 'Negative'

print("Predictions:", new_predictions)

Predictions: Positve


### Random Forest & Naive Bayes

In [68]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

In [79]:
# Seed the forest so the reported metrics are reproducible across runs;
# 42 matches the seed already used for train_test_split.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_vectors, y_train)

# Predictions on the held-out reviews, reused for both accuracy and F1.
y_pred = rf.predict(X_test_vectors)

train_accuracy_1 = accuracy_score(y_train, rf.predict(X_train_vectors))
test_accuracy_1 = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='binary')

print("Train accuracy:", train_accuracy_1)
print("Test accuracy:", test_accuracy_1)
print("F1 score:", f1)

Train accuracy: 0.9986605783866058
Test accuracy: 0.8155694455718808
F1 score: 0.8415841584158416


In [80]:
# Multinomial naive Bayes on the same TF-IDF features; fit() returns the estimator.
mnb = MultinomialNB().fit(X_train_vectors, y_train)

mnb_train_pred = mnb.predict(X_train_vectors)
y_pred = mnb.predict(X_test_vectors)

train_accuracy_2 = accuracy_score(y_train, mnb_train_pred)
test_accuracy_2 = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='binary')

print("Train accuracy:", train_accuracy_2)
print("Test accuracy:", test_accuracy_2)
print("F1 score:", f1)

Train accuracy: 0.8343581938102486
Test accuracy: 0.8098871661660849
F1 score: 0.8418422474338195


In [None]:
# Row labels and accuracies are kept in the same order: each classifier
# contributes a train row followed by a test row.
model_names = [
    'RandomForestClassifier-train',
    'RandomForestClassifier-test',
    'MultinomialNBClassifier-train',
    'MultinomialNBClassifier-test',
    'LogisticRegression-train',
    'LogisticRegression-test',
    'SVM-train',
    'SVM-test',
]
accuracies = [
    train_accuracy_1,
    test_accuracy_1,
    train_accuracy_2,
    test_accuracy_2,
    train_accuracy,
    test_accuracy,
    train_accuracy_0,
    test_accuracy_0,
]

model = {'Model': model_names, 'Accuracy': accuracies}
model_df = pd.DataFrame(model)
model_df