# Naive Bayes sentiment classification of Play Store reviews

In [14]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pickle import dump
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB,GaussianNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier

In [15]:
# Play Store reviews CSV, read directly from the tutorial repository.
url = 'https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv'

# Comma-separated file; preview the first rows to check the columns.
data = pd.read_csv(url, sep=",")
data.head()

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


In [16]:
# Drop the app identifier; only the review text and its polarity are used below.
# errors="ignore" keeps this cell re-runnable: `data` is reassigned from itself,
# so on a second run the column is already gone and a plain drop would raise
# KeyError.
data = data.drop(columns=["package_name"], errors="ignore")

In [17]:
# Normalize the review text: trim surrounding whitespace, then lowercase.
review_normalized = data["review"].str.strip().str.lower()
data["review"] = review_normalized
data.head()

Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,"messenger issues ever since the last update, i...",0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who don'...,0
4,forced reload on uploading pic on replying com...,0


In [18]:
# Features are the review strings; the target is the polarity label.
X, y = data["review"], data["polarity"]

# Hold out 20% of the rows for testing, stratified on the target, with a fixed
# seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [19]:
# Bag-of-words token counts with English stop words removed.
vec_model = CountVectorizer(stop_words="english")

# The vocabulary is fitted on the training reviews only and then reused to
# transform the test reviews. Both results are converted to dense arrays.
# NOTE: X_train / X_test are overwritten here, so re-running this cell on its
# own would feed the vectorizer count arrays instead of text; re-run the
# train/test split cell first.
X_train, X_test = (
    vec_model.fit_transform(X_train).toarray(),
    vec_model.transform(X_test).toarray(),
)

In [20]:
# Baseline: multinomial Naive Bayes with default hyperparameters.
model_MN = MultinomialNB().fit(X_train, y_train)
y_pred_MN = model_MN.predict(X_test)

# Print each test-set metric rounded to three decimals.
for label, metric in (
    ("Accuracy :", accuracy_score),
    ("Precision:", precision_score),
    ("Recall   :", recall_score),
    ("F1 Score :", f1_score),
):
    print(label, round(metric(y_test, y_pred_MN), 3))

Accuracy : 0.855
Precision: 0.891
Recall   : 0.661
F1 Score : 0.759


In [21]:
# Bernoulli Naive Bayes with default hyperparameters, trained on the same
# count matrix as the other models.
model_BR = BernoulliNB()
model_BR.fit(X_train, y_train)
y_pred_BR = model_BR.predict(X_test)

# Pair every printed label with its scoring function and report in order.
metric_labels = ("Accuracy :", "Precision:", "Recall   :", "F1 Score :")
metric_funcs = (accuracy_score, precision_score, recall_score, f1_score)
for label, score_fn in zip(metric_labels, metric_funcs):
    print(label, round(score_fn(y_test, y_pred_BR), 3))

Accuracy : 0.782
Precision: 0.871
Recall   : 0.435
F1 Score : 0.581


In [22]:
# Gaussian Naive Bayes with default hyperparameters, trained on the same
# dense count arrays.
model_GA = GaussianNB()
model_GA.fit(X_train, y_train)
y_pred_GA = model_GA.predict(X_test)

# Insertion order of the dict fixes the order of the printed lines.
metrics_by_label = {
    "Accuracy :": accuracy_score,
    "Precision:": precision_score,
    "Recall   :": recall_score,
    "F1 Score :": f1_score,
}
for label, score_fn in metrics_by_label.items():
    print(label, round(score_fn(y_test, y_pred_GA), 3))

Accuracy : 0.816
Precision: 0.764
Recall   : 0.677
F1 Score : 0.718


In [23]:
# Candidate hyperparameters for MultinomialNB: the smoothing value `alpha`
# and whether class priors are fitted (`fit_prior`).
param = dict(
    alpha=[0.1, 0.5, 1.0, 1.5, 2.0],
    fit_prior=[True, False],
)

# 5-fold cross-validated search scored by F1. The grid has 5 x 2 = 10
# combinations and n_iter is left at its default of 10, so every combination
# is evaluated.
random_search = RandomizedSearchCV(
    model_MN,
    param,
    scoring="f1",
    cv=5,
    verbose=1,
    random_state=42,
)
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


0,1,2
,estimator,MultinomialNB()
,param_distributions,"{'alpha': [0.1, 0.5, ...], 'fit_prior': [True, False]}"
,n_iter,10
,scoring,'f1'
,n_jobs,
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,alpha,0.5
,force_alpha,True
,fit_prior,False
,class_prior,


In [24]:
# Take the best estimator found by the search and score it on the test set.
best_model = random_search.best_estimator_
y_pred_best = best_model.predict(X_test)

# Same four metrics as the baselines, rounded to three decimals.
for label, metric in [
    ("Accuracy :", accuracy_score),
    ("Precision:", precision_score),
    ("Recall   :", recall_score),
    ("F1 Score :", f1_score),
]:
    print(label, round(metric(y_test, y_pred_best), 3))

Accuracy : 0.883
Precision: 0.86
Recall   : 0.79
F1 Score : 0.824


In [25]:
# Comparison model: random forest with default settings and a fixed seed.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Report the same test-set metrics used for the Naive Bayes models.
report_labels = ["Accuracy :", "Precision:", "Recall   :", "F1 Score :"]
report_metrics = [accuracy_score, precision_score, recall_score, f1_score]
for label, score_fn in zip(report_labels, report_metrics):
    print(label, round(score_fn(y_test, y_pred_rf), 3))

Accuracy : 0.821
Precision: 0.788
Recall   : 0.661
F1 Score : 0.719


In [26]:
# Persist the tuned MultinomialNB model with pickle. The context manager
# ensures the file handle is flushed and closed even if pickling fails; the
# original left the handle returned by open() unclosed.
# NOTE: scoring new reviews also needs the fitted `vec_model`, which is not
# saved here.
with open("Nayve.sav", "wb") as model_file:
    dump(best_model, model_file)