In [1]:
from pathlib import Path
import pandas as pd

data = Path('10000_review_data.csv')
df = pd.read_csv(data)
# df = df.drop(columns=["Unnamed: 0"])
df.head()

Unnamed: 0,_id,stars,text,neg,neu,pos,compound
0,{'$oid': '645398bc6165a65397f700b0'},3,"If you decide to eat here, just be aware it is...",0.0,0.888,0.112,0.8597
1,{'$oid': '645398bc6165a65397f700b1'},5,I've taken a lot of spin classes over the year...,0.05,0.705,0.245,0.9858
2,{'$oid': '645398bc6165a65397f700b2'},3,Family diner. Had the buffet. Eclectic assortm...,0.035,0.709,0.257,0.9201
3,{'$oid': '645398bc6165a65397f700b3'},5,"Wow! Yummy, different, delicious. Our favo...",0.0,0.66,0.34,0.9588
4,{'$oid': '645398bc6165a65397f700b4'},4,Cute interior and owner (?) gave us tour of up...,0.017,0.711,0.272,0.9804


In [2]:
print(df.dtypes)

_id          object
stars         int64
text         object
neg         float64
neu         float64
pos         float64
compound    float64
dtype: object


In [3]:
df['stars'] = df['stars'].astype(float)
print(df.dtypes)

_id          object
stars       float64
text         object
neg         float64
neu         float64
pos         float64
compound    float64
dtype: object


In [4]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

corpus = []
for i in range(0, 10000):
    review = re.sub('[^a-zA-Z]', ' ', df['text'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [5]:
corpus

['decid eat awar go take hour begin end tri multipl time want like locat nj never bad experi food good take long time come waitstaff young usual pleasant mani experi spent way long wait usual opt anoth diner restaur weekend order done quicker',
 'taken lot spin class year noth compar class bodi cycl nice clean space amaz bike welcom motiv instructor everi class top notch work anyon struggl fit workout onlin schedul system make easi plan ahead need line way advanc like mani gym make way write review without give russel owner bodi cycl shout russel passion fit cycl evid desir client succeed alway drop class check provid encourag open idea recommend anyon russel alway wear smile face even kick butt class',
 'famili diner buffet eclect assort larg chicken leg fri jalap tamal two roll grape leav fresh melon good lot mexican choic also menu breakfast serv day long friendli attent staff good place casual relax meal expect next clarion hotel',
 'wow yummi differ delici favorit lamb curri korma

In [71]:
# from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 11500)
X = cv.fit_transform(corpus).toarray()
y = df.iloc[:, 1].values

# feature_names = cv.get_feature_names_out()
# print(feature_names)

In [72]:
# threshold_positive = 0.1
# threshold_negative = -0.1

y_categorical = []
for rating in y:
    if rating >= 4:
        y_categorical.append("positive")
    elif rating <= 2:
        y_categorical.append("negative")
    else:
        y_categorical.append("neutral")
y = y_categorical


In [73]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=2, 
                                                    )

In [74]:
# Multinomial NB

# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB(alpha=0.1)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, classification_report
cm = confusion_matrix(y_test, y_pred)
# print ("Confusion Matrix:\n",cm)

# Print the classification report for the model 
names = ["Postive", "Negative", "Neutral"]
print(classification_report(y_test, y_pred, target_names=names))

# Accuracy, Precision, and Recall
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print("\n")
print("Accuracy:", round(accuracy, 2))
print("Precision:", round(precision, 2))
print("Recall:", round(recall, 2))


              precision    recall  f1-score   support

     Postive       0.77      0.74      0.75      1091
    Negative       0.32      0.43      0.37       387
     Neutral       0.80      0.74      0.77      1022

    accuracy                           0.69      2500
   macro avg       0.63      0.63      0.63      2500
weighted avg       0.71      0.69      0.70      2500



Accuracy: 0.69
Precision: 0.71
Recall: 0.69


In [75]:
# Bernoulli NB

# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import BernoulliNB
classifier = BernoulliNB(alpha=0.8)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
# print ("Confusion Matrix:\n",cm)

# Print the classification report for the model 
names = ["Postive", "Negative", "Neutral"]
print(classification_report(y_test, y_pred, target_names=names))

# Accuracy, Precision, and Recall
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print("\n")
print("Accuracy:", round(accuracy, 2))
print("Precision:", round(precision, 2))
print("Recall:", round(recall, 2))

              precision    recall  f1-score   support

     Postive       0.79      0.69      0.74      1091
    Negative       0.33      0.18      0.23       387
     Neutral       0.65      0.85      0.74      1022

    accuracy                           0.68      2500
   macro avg       0.59      0.58      0.57      2500
weighted avg       0.66      0.68      0.66      2500



Accuracy: 0.68
Precision: 0.66
Recall: 0.68


In [76]:
# Logistic Regression

# Fitting Logistic Regression to the Training set
from sklearn import linear_model
classifier = linear_model.LogisticRegression(max_iter=2000)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
# print ("Confusion Matrix:\n",cm)

# Print the classification report for the model 
names = ["Postive", "Negative", "Neutral"]
print(classification_report(y_test, y_pred, target_names=names))

# Accuracy, Precision, and Recall
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print("\n")
print("Accuracy:", round(accuracy, 2))
print("Precision:", round(precision, 2))
print("Recall:", round(recall, 2))

              precision    recall  f1-score   support

     Postive       0.78      0.78      0.78      1091
    Negative       0.37      0.32      0.34       387
     Neutral       0.78      0.82      0.80      1022

    accuracy                           0.72      2500
   macro avg       0.64      0.64      0.64      2500
weighted avg       0.71      0.72      0.72      2500



Accuracy: 0.72
Precision: 0.71
Recall: 0.72
