# Play Store review polarity classification: Naive Bayes and baseline classifiers

In [1]:
import pandas as pd

# Remote location of the Play Store reviews CSV used by every later cell
url = "https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv"

# Download the file and parse it into a DataFrame
df = pd.read_csv(filepath_or_buffer=url)

# Preview the top five rows as a quick sanity check on the parse
df.head(5)


Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


In [2]:
# Discard the app identifier; only the review text and its polarity label are
# used by the cells below.
# errors='ignore' keeps this cell idempotent: because `df` is rebound here, a
# second execution would otherwise raise KeyError (the column is already gone).
df = df.drop(columns=['package_name'], errors='ignore')


In [14]:
# Normalise the review text in two steps: trim surrounding whitespace first,
# then fold everything to lowercase.
reviews_trimmed = df['review'].str.strip()
df['review'] = reviews_trimmed.str.lower()

In [6]:
from sklearn.model_selection import train_test_split

# Feature (raw review text) and target (polarity label)
X, y = df['review'], df['polarity']

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words encoder that filters out the built-in English stop-word list
vec_model = CountVectorizer(stop_words='english')

# Learn the vocabulary from the training reviews only, then encode the test
# reviews with that same vocabulary so no test information leaks into fitting.
train_counts = vec_model.fit_transform(X_train)
test_counts = vec_model.transform(X_test)

# Convert the sparse count matrices to dense arrays, which is the form the
# model cells below are fed.
X_train_vec = train_counts.toarray()
X_test_vec = test_counts.toarray()


In [8]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fit a multinomial Naive Bayes classifier on the word-count features
# (fit returns the estimator itself, so construction and fitting chain).
nb_model = MultinomialNB().fit(X_train_vec, y_train)

# Predict polarity for the held-out reviews
y_pred = nb_model.predict(X_test_vec)

# Report each metric in turn: overall accuracy, per-class report, confusion matrix
for heading, metric in (
    ("Accuracy:", accuracy_score),
    ("Classification Report:\n", classification_report),
    ("Confusion Matrix:\n", confusion_matrix),
):
    print(heading, metric(y_test, y_pred))


Accuracy: 0.8156424581005587
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.90      0.87       126
           1       0.73      0.60      0.66        53

    accuracy                           0.82       179
   macro avg       0.79      0.75      0.77       179
weighted avg       0.81      0.82      0.81       179

Confusion Matrix:
 [[114  12]
 [ 21  32]]


In [9]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB

# Gaussian Naive Bayes on the same count features, scored on the hold-out set
gaussian_nb = GaussianNB().fit(X_train_vec, y_train)
y_pred_gaussian = gaussian_nb.predict(X_test_vec)
gaussian_accuracy = accuracy_score(y_test, y_pred_gaussian)
print("GaussianNB Accuracy:", gaussian_accuracy)

# Bernoulli Naive Bayes, trained and scored the same way for comparison
bernoulli_nb = BernoulliNB().fit(X_train_vec, y_train)
y_pred_bernoulli = bernoulli_nb.predict(X_test_vec)
bernoulli_accuracy = accuracy_score(y_test, y_pred_bernoulli)
print("BernoulliNB Accuracy:", bernoulli_accuracy)


GaussianNB Accuracy: 0.8044692737430168
BernoulliNB Accuracy: 0.770949720670391


In [10]:
from sklearn.ensemble import RandomForestClassifier

# Random forest baseline: 100 trees, seeded so the fitted model is reproducible
rf_model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train_vec, y_train)

# Score the forest on the held-out reviews
y_pred_rf = rf_model.predict(X_test_vec)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", rf_accuracy)


Random Forest Accuracy: 0.7988826815642458


In [16]:
from sklearn.linear_model import LogisticRegression

# Logistic regression baseline; the solver is allowed up to 1000 iterations
lr_model = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)

# Score it on the held-out reviews
y_pred_lr = lr_model.predict(X_test_vec)
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression Accuracy:", lr_accuracy)



Logistic Regression Accuracy: 0.8324022346368715
