# **Sentiment analysis**

### Naive Bayes models are very useful for sentiment analysis, for classifying texts into topics, and for recommendation tasks, because the characteristics of these problems fit the theoretical and methodological assumptions of the model very well.

### In this project you will practice with a dataset to create a review classifier for the Google Play store.

In [38]:
# Import the libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from pickle import dump

### **Step 1:** Loading the dataset

In [2]:
# Fetch the Play Store reviews CSV published in the 4GeeksAcademy tutorial repository
google_play_store_df = pd.read_csv(
    'https://raw.githubusercontent.com/4GeeksAcademy/'
    'naive-bayes-project-tutorial/main/playstore_reviews.csv'
)
google_play_store_df

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0
...,...,...,...
886,com.rovio.angrybirds,loved it i loooooooooooooovvved it because it...,1
887,com.rovio.angrybirds,all time legendary game the birthday party le...,1
888,com.rovio.angrybirds,ads are way to heavy listen to the bad review...,0
889,com.rovio.angrybirds,fun works perfectly well. ads aren't as annoy...,1


### **Step 2:** Study of variables and their content

In [3]:
# Remove the "package_name" column; only the review text is used as a feature later on.
# drop() returns a new frame instead of mutating in place, and errors='ignore'
# keeps this cell idempotent: re-running it after the column is already gone
# no longer raises a KeyError.
google_play_store_df = google_play_store_df.drop(columns=['package_name'], errors='ignore')
google_play_store_df

Unnamed: 0,review,polarity
0,privacy at least put some option appear offli...,0
1,"messenger issues ever since the last update, ...",0
2,profile any time my wife or anybody has more ...,0
3,the new features suck for those of us who don...,0
4,forced reload on uploading pic on replying co...,0
...,...,...
886,loved it i loooooooooooooovvved it because it...,1
887,all time legendary game the birthday party le...,1
888,ads are way to heavy listen to the bad review...,0
889,fun works perfectly well. ads aren't as annoy...,1


In [4]:
# Normalise the review text: trim surrounding whitespace, then lowercase it
google_play_store_df = google_play_store_df.assign(
    review=lambda frame: frame["review"].str.strip().str.lower()
)
google_play_store_df

Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,"messenger issues ever since the last update, i...",0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who don'...,0
4,forced reload on uploading pic on replying com...,0
...,...,...
886,loved it i loooooooooooooovvved it because it ...,1
887,all time legendary game the birthday party lev...,1
888,ads are way to heavy listen to the bad reviews...,0
889,fun works perfectly well. ads aren't as annoyi...,1


In [5]:
# Separate the label from the predictors, then hold out 20% of the rows for testing
y = google_play_store_df['polarity']
X = google_play_store_df.drop(columns=['polarity'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Keep only the raw review text of each split, as plain Python lists
X_train, X_test = X_train['review'].tolist(), X_test['review'].tolist()

In [6]:
# Learn the vocabulary from the training reviews only (English stop words removed),
# then encode both splits as dense word-count arrays
vec_model = CountVectorizer(stop_words='english').fit(X_train)
X_train = vec_model.transform(X_train).toarray()
X_test = vec_model.transform(X_test).toarray()

### **Step 3:** Build a Naive Bayes model

In [13]:
# Fit a Gaussian Naive Bayes classifier on the word-count features
model_GNB = GaussianNB()
model_GNB.fit(X=X_train, y=y_train)

In [14]:
# Predict the polarity of the held-out reviews with the Gaussian model
y_pred = model_GNB.predict(X=X_test)
y_pred

array([0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 0, 0])

In [15]:
# Fraction of test reviews whose polarity GaussianNB predicted correctly
print('The accuracy with GaussianNB is: ', accuracy_score(y_true=y_test, y_pred=y_pred))

The accuracy with GaussianNB is:  0.8044692737430168


In [20]:
# Fit a Multinomial Naive Bayes classifier with its default settings
model_MNB = MultinomialNB()
model_MNB.fit(X=X_train, y=y_train)

In [23]:
# Predict the polarity of the held-out reviews with the Multinomial model
y_pred = model_MNB.predict(X=X_test)
y_pred

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0])

In [24]:
# Fraction of test reviews whose polarity MultinomialNB predicted correctly
print('The accuracy with MultinomialNB is: ', accuracy_score(y_true=y_test, y_pred=y_pred))

The accuracy with MultinomialNB is:  0.8156424581005587


In [25]:
# Fit a Bernoulli Naive Bayes classifier with its default settings
model_BNB = BernoulliNB()
model_BNB.fit(X=X_train, y=y_train)

In [26]:
# Predict the polarity of the held-out reviews with the Bernoulli model
y_pred = model_BNB.predict(X=X_test)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0])

In [27]:
# Fraction of test reviews whose polarity BernoulliNB predicted correctly
print('The accuracy with BernulliNB is: ', accuracy_score(y_true=y_test, y_pred=y_pred))

The accuracy with BernulliNB is:  0.770949720670391


### **Step 4:** Optimize the previous model

In [35]:
# MultinomialNB scored best among the three models above, so tune its hyperparameters
# Candidate values to try: the alpha setting and whether to fit class priors
hiperparameters = dict(
    alpha=[0.1, 0.2, 0.3, 0.5, 0.8, 1.0],
    fit_prior=[True, False],
)

# Set up the grid search, ranking parameter combinations by accuracy
grid_search = GridSearchCV(model_MNB, hiperparameters, scoring='accuracy')

# Run the search on the training split
grid_search.fit(X_train, y_train)

# Show the winning parameter combination
grid_search.best_params_


{'alpha': 0.5, 'fit_prior': False}

In [37]:
# Refit MultinomialNB with the parameters selected by the grid search.
# They are read from grid_search.best_params_ instead of being re-typed as
# literals, so this cell stays in sync if the grid or the data changes.
model_grid = MultinomialNB(**grid_search.best_params_)
model_grid.fit(X_train, y_train)

# Predict the test data
y_pred = model_grid.predict(X_test)

# Accuracy of the tuned model on the held-out test split.
# NOTE(review): the recorded output for this cell (~0.804) is lower than the
# recorded default MultinomialNB test accuracy (~0.816); the grid search selects
# on the training data, not on this test split.
accuracy_score(y_test, y_pred)

0.8044692737430168

### **Step 5:** Save the model

In [39]:
# Save both MultinomialNB variants (default and tuned) as pickle files.
# The context managers close each file handle even if pickling raises;
# the previous bare open(...) calls never closed them.
# NOTE(review): the fitted CountVectorizer (vec_model) is not saved here, yet it
# is needed to encode new text for these models — confirm whether it should be
# persisted as well.
with open('../models/multinomialNB_default_42.pkl', 'wb') as model_file:
    dump(model_MNB, model_file)
with open('../models/MNB_optimized_model.pkl', 'wb') as model_file:
    dump(model_grid, model_file)