In [1]:
#Step 0 Load Libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import (
    accuracy_score, f1_score,
    r2_score, ConfusionMatrixDisplay,
    classification_report, RocCurveDisplay
    )
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.inspection import permutation_importance

In [2]:
# Step 1: Load data
# Read the Play Store reviews CSV directly from the tutorial's GitHub repository
# (re-running this cell requires network access).
url = 'https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv'
df_raw = pd.read_csv(url)
# Summarize the raw frame: column names, dtypes and non-null counts.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   package_name  891 non-null    object
 1   review        891 non-null    object
 2   polarity      891 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 21.0+ KB


In [3]:
# Preview ten randomly drawn rows; the fixed seed keeps the preview reproducible.
df_raw.sample(n=10, random_state=2025)

Unnamed: 0,package_name,review,polarity
622,com.uc.browser.en,good good for slow connection this uc minilit...,1
25,com.facebook.katana,can't install (error code: -505) have samsung...,0
307,com.tencent.mm,"bad new update, sight by swipe in chats gone ...",0
783,org.mozilla.firefox,"all you need, easy and gives you control open...",0
834,com.hamropatro,well done nicely designed .....this app had c...,1
109,com.linkedin.android,organization logo whenever i try to add my or...,0
339,com.viber.voip,issue in last online time it doesn't refresh ...,0
560,jabanaki.todo.todoly,"great, simple, recommend love this app. looki...",1
595,com.evernote,"neat idea, but let-down by no linux support. ...",0
509,com.Slack,free the gifs not reliable on a slower networ...,0


In [4]:
# Step 2: Preprocessing
# Work on a copy of the raw frame and discard `package_name`, leaving only the
# review text and the polarity label.
df_baking = df_raw.copy().drop(columns="package_name")

# `df` is an independent copy of the prepared frame; the split below reads from it.
df = df_baking.copy()

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   review    891 non-null    object
 1   polarity  891 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 14.1+ KB


In [5]:
# Hold out 10% of the rows for testing. Stratifying on `polarity` keeps the
# class proportions comparable across the two splits, and the fixed seed makes
# the split repeatable.
df_train, df_test = train_test_split(
    df,
    test_size=0.1,
    random_state=2025,
    stratify=df['polarity'],
)

df_train.shape, df_test.shape

((801, 2), (90, 2))

In [6]:
# Separate the features from the `polarity` target and renumber the rows from 0,
# because the split keeps the original index labels.
X_train = df_train.drop(columns=['polarity']).reset_index(drop=True)
y_train = df_train['polarity'].reset_index(drop=True)

X_test = df_test.drop(columns=['polarity']).reset_index(drop=True)
y_test = df_test['polarity'].reset_index(drop=True)

X_train

Unnamed: 0,review
0,bug?? i've updated my messenger to the latest...
1,good
2,no upgrades to the game till yet...the devel...
3,"ads, ads and more ads i don't mind having ads..."
4,updated version is down not able to sent conn...
...,...
796,two stars it used to work really well and the...
797,needed to learn the hard way about town hall ...
798,worse ! 1. startup become slow. its mini or n...
799,i met my life here my girlfriend heads off t...


In [7]:
# Turn each review into a bag-of-words count vector, ignoring English stop words.
vec_model = CountVectorizer(stop_words="english")

# Read the text from df_train/df_test rather than X_train/X_test: this cell
# rebinds X_train/X_test to NumPy arrays, so indexing them with ["review"]
# would raise if the cell were executed a second time. Row order is identical
# in both sources, so the resulting matrices are unchanged.
#
# The vocabulary is learned from the training reviews only and then reused for
# the test reviews. .toarray() densifies the sparse count matrices because
# GaussianNB (fitted below) does not accept sparse input.
X_train = vec_model.fit_transform(df_train["review"]).toarray()
X_test = vec_model.transform(df_test["review"]).toarray()

X_test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(90, 3513))

In [8]:
# Display the dense training count matrix (rows are training reviews, columns are vocabulary terms).
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(801, 3513))

In [9]:
# One default-configured instance of each Naive Bayes variant to compare:
# Gaussian, Bernoulli and multinomial.
nb, bnb, mnnb = GaussianNB(), BernoulliNB(), MultinomialNB()

In [10]:
# Baseline 1: Gaussian Naive Bayes, trained on the count matrix and scored on
# the held-out test set.
y_hat = nb.fit(X_train, y_train).predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_hat):.2f}, F1 Score: {f1_score(y_test, y_hat):.2f}')

Accuracy: 0.79, F1 Score: 0.63


In [11]:
# Baseline 2: Bernoulli Naive Bayes, trained on the same matrix and scored on
# the held-out test set.
y_hat2 = bnb.fit(X_train, y_train).predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_hat2):.2f}, F1 Score: {f1_score(y_test, y_hat2):.2f}')

Accuracy: 0.78, F1 Score: 0.55


In [12]:
# Baseline 3: multinomial Naive Bayes, trained on the same matrix and scored on
# the held-out test set.
y_hat3 = mnnb.fit(X_train, y_train).predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_hat3):.2f}, F1 Score: {f1_score(y_test, y_hat3):.2f}')

Accuracy: 0.83, F1 Score: 0.71


In [13]:
# Search space for MultinomialNB: 200 evenly spaced smoothing values in
# [0.01, 10] and both settings of fit_prior.
param_grid = dict(
    alpha=np.linspace(0.01, 10.0, 200),
    fit_prior=[True, False],
)

# Sample 50 candidates from the space and score each by 5-fold cross-validated
# accuracy; the fixed random_state makes the sampled candidates repeatable.
random_search = RandomizedSearchCV(
    estimator=mnnb,
    param_distributions=param_grid,
    n_iter=50,
    scoring="accuracy",
    cv=5,
    random_state=42,
)
random_search

0,1,2
,estimator,MultinomialNB()
,param_distributions,"{'alpha': array([ 0.01 ... 10. ]), 'fit_prior': [True, False]}"
,n_iter,50
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [14]:
# Run the randomized search using the training data only.
random_search.fit(X_train, y_train)

# Report the hyperparameter combination the search selected.
print(f"Best hyperparameters: {random_search.best_params_}")

Best hyperparameters: {'fit_prior': False, 'alpha': np.float64(2.3192462311557787)}


In [15]:
# Build the tuned MultinomialNB from the hyperparameters the search actually
# selected, instead of a hand-copied alpha that goes stale whenever the search
# is re-run (the previously hard-coded value did not match the reported best).
mnnb2 = MultinomialNB(**random_search.best_params_)
# Fit once on the full training set, then score on the held-out test set.
mnnb2.fit(X_train, y_train)
y_hat4 = mnnb2.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_hat4):.2f}, F1 Score: {f1_score(y_test, y_hat4):.2f}')

Accuracy: 0.82, F1 Score: 0.68
