In [2]:
import pandas as pd
import numpy as np
import sys
import pickle
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\belen\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [3]:
#Source of the raw Play Store reviews (columns: package_name, review, polarity)
DATA_URL = 'https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews_dataset.csv'
df_raw = pd.read_csv(DATA_URL)

In [4]:
#Column names, non-null counts and dtypes of the raw dataset
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   package_name  891 non-null    object
 1   review        891 non-null    object
 2   polarity      891 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 21.0+ KB


In [5]:
#Random preview of 10 rows; the fixed seed shows the same rows on every
#Restart & Run All, so the displayed sample stays reproducible
df_raw.sample(10, random_state = 25)

Unnamed: 0,package_name,review,polarity
563,jabanaki.todo.todoly,"simply brilliant simply brilliant, no problem...",1
518,com.dropbox.android,please do not stop carousel! google has imp...,0
31,com.facebook.katana,showing old news why is it showing that someo...,0
407,com.facebook.orca,"sometimes, if i turn off wifi, both messenge...",0
372,com.google.android.talk,often painfully slow. needs a useful tablet ...,0
750,com.shirantech.kantipur,virus i think your site ìs infected as it is...,0
218,com.supercell.clashofclans,it was a great game . until the last update ...,0
118,com.linkedin.android,totally diferent from the web page do not use...,0
678,com.hamrokeyboard,i found this app very fruitful.. and m using...,1
623,com.uc.browser.en,i love it i love using uc browser mini becaus...,1


In [6]:
df_raw['polarity'].value_counts()
#Class balance of the target: 0 = negative, 1 = positive

0    584
1    307
Name: polarity, dtype: int64

**Step 1:**

We have three columns: package name, review and polarity (0 = bad, 1 = good). Preprocess the data by dropping the package name column and converting all reviews to lower case.

In [7]:
#New frame without the 'package_name' column; df_raw itself is left untouched
df_trans = df_raw.drop(columns = ["package_name"])

In [8]:
df_trans['review'] = df_trans['review'].str.lower()
#Converts every review string to lower case

In [9]:
df_trans['review'] = df_trans['review'].str.strip()
#Removes leading and trailing whitespace from every review

In [10]:
#Corpus-wide token frequencies before stop-word removal: split each review on
#whitespace (one column per token position), flatten into a single Series,
#count the occurrences and keep the 60 most frequent tokens
(
    df_trans['review']
    .str.split(expand = True)
    .stack()
    .value_counts()
    .head(60)
)

the       1293
to        1159
i         1084
and        853
it         778
a          619
is         555
my         442
this       421
for        410
of         390
but        348
in         340
on         325
not        318
app        317
you        315
that       260
have       252
with       220
so         203
be         194
no         186
when       182
if         180
or         169
all        163
can        158
as         148
me         145
are        139
it's       138
can't      137
update     136
new        136
like       135
fix        133
please     131
good       128
game       121
one        119
use        118
just       117
get        112
its        111
very       109
more       107
was        104
there      104
don't      103
now        102
will       102
even       101
from       101
at         100
up          99
great       99
time        93
your        90
do          89
dtype: int64

In [11]:
#English stop-word list from NLTK, consulted by remove_stopwords
stop = stopwords.words('english')

def remove_stopwords(review, stop_words = None):
    """Return `review` with its stop words removed.

    The text is split on whitespace, every token found in the stop-word
    collection is dropped, and the remaining tokens are re-joined with single
    spaces. Matching is exact: a token with attached punctuation (e.g. "it.")
    is not the same token as "it" and is therefore kept.

    Parameters
    ----------
    review : str or None
        Review text. Missing values (None, NaN, pd.NA) yield None.
    stop_words : iterable of str, optional
        Tokens to drop. Defaults to the module-level `stop` list.

    Returns
    -------
    str or None
        The filtered text, or None when `review` is missing.
    """
    #The previous guard only covered None; a NaN coming from pandas is a float
    #and would have crashed on .strip()
    if pd.isna(review):
        return None
    #A set gives constant-time membership tests instead of scanning the whole
    #list once per token
    blocked = set(stop if stop_words is None else stop_words)
    #split() without arguments already ignores leading/trailing whitespace
    return " ".join(word for word in review.split() if word not in blocked)

In [12]:
#Applies remove_stopwords to each review, replacing the column with the filtered text
df_trans['review'] = df_trans['review'].apply(remove_stopwords)

In [13]:
#Same token-frequency check as before, now on the stop-word-free reviews:
#split on whitespace, flatten, count, keep the 60 most frequent tokens
(
    df_trans['review']
    .str.split(expand = True)
    .stack()
    .value_counts()
    .head(60)
)

app         317
can't       137
new         136
update      136
like        135
fix         133
please      131
good        128
game        121
one         119
use         118
get         112
even        101
great        99
time         93
really       82
would        80
back         78
still        78
love         77
browser      76
make         74
see          69
using        68
work         67
i'm          67
5            67
open         65
want         64
it.          64
google       63
much         61
phone        61
every        61
u            58
option       58
version      57
go           56
way          56
app.         56
used         55
also         55
android      54
download     53
best         52
.            51
give         50
better       50
play         50
send         49
people       48
able         47
message      47
since        46
many         45
keep         45
works        44
i've         44
old          43
find         43
dtype: int64

In [14]:
#Independent copy of the cleaned frame; the modelling cells below read from df
df = df_trans.copy()

**Step 2:**

Separate target from feature, and split your data.

In [15]:
#Feature: the cleaned review text. Target: the 0/1 polarity label.
X, y = df['review'], df['polarity']

#Stratifying on y keeps the class ratio of the unbalanced polarity column
#(584 vs 307) in both partitions; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify = y,
    random_state = 25,
)

**Step 3:**

Vectorize your features and use Naive Bayes to classify the reviews as good or bad. Hyperparameter tuning is not the main focus this time: this is an introductory project on sentiment analysis with Naive Bayes.

Different pipeline options, depending on the preprocessing steps:

1. One preprocessing step (CountVectorizer) and one model step:

In [16]:
#Option 1: raw token counts fed to a multinomial Naive Bayes classifier
clf_1 = Pipeline(steps = [
    ('cont_vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])

2. One preprocessing step (TfidfVectorizer) and one model step:

In [17]:
#Option 2: TF-IDF features computed in a single step, then multinomial Naive Bayes
clf_2 = Pipeline(steps = [
    ('tfidf_vect', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

3. Two preprocessing steps and one model step:

In [18]:
#Option 3: token counts, then TF-IDF re-weighting as a separate step, then
#multinomial Naive Bayes
clf_3 = Pipeline(steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

Model training:

In [19]:
clf_1.fit(X_train, y_train)
#Trains option 1 on the training split

In [20]:
clf_2.fit(X_train, y_train)
#Trains option 2 on the training split

In [21]:
clf_3.fit(X_train, y_train)
#Trains option 3 on the training split

Predictions:

In [22]:
#Predicted polarity of the test reviews, option 1
pred_1 = clf_1.predict(X_test)

In [23]:
#Predicted polarity of the test reviews, option 2
pred_2 = clf_2.predict(X_test)


In [24]:
#Predicted polarity of the test reviews, option 3
pred_3 = clf_3.predict(X_test)

* Pipeline step by step:

In [61]:
#Option 1:
#vect = CountVectorizer() #Count vectorizer
#text_vec = vect.fit_transform(X_train)
#text_vec.toarray() #We see the vectors contain 0s and 1s
#Each row is a review and each column is a word

In [62]:
#vect.get_feature_names_out()
#Shows an array with the whole vocabulary

In [63]:
#Option 2:
#tfidf = TfidfVectorizer()
#text_tfidf = tfidf.fit_transform(X_train)
#np.set_printoptions(threshold = sys.maxsize)
#text_tfidf.toarray()[0] Commented out because the output is too large

In [64]:
#Option 3:
#text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer())])
#text_norm = text_clf.fit_transform(X_train)

In [65]:
#text_norm.toarray()[0] commented bc output is too big

Scores:

In [25]:
#One report per baseline pipeline, in the order clf_1, clf_2, clf_3.
#clf_1 performed best here (0.83 accuracy).
for pred in (pred_1, pred_2, pred_3):
    print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.83      0.93      0.88       146
           1       0.83      0.65      0.73        77

    accuracy                           0.83       223
   macro avg       0.83      0.79      0.81       223
weighted avg       0.83      0.83      0.83       223

              precision    recall  f1-score   support

           0       0.71      0.99      0.83       146
           1       0.90      0.23      0.37        77

    accuracy                           0.73       223
   macro avg       0.80      0.61      0.60       223
weighted avg       0.78      0.73      0.67       223

              precision    recall  f1-score   support

           0       0.71      0.99      0.83       146
           1       0.90      0.23      0.37        77

    accuracy                           0.73       223
   macro avg       0.80      0.61      0.60       223
weighted avg       0.78      0.73      0.67       223



In [26]:
#Plain test accuracy of the three baseline pipelines, one line each
accuracy_labels = ('clf_1 Test Accuracy = ', 'clf_2 Test Accuracy = ', 'clf_3 Test Accuracy = ')
for label, pred in zip(accuracy_labels, (pred_1, pred_2, pred_3)):
    print(label, metrics.accuracy_score(y_test, pred))

clf_1 Test Accuracy =  0.8340807174887892
clf_2 Test Accuracy =  0.726457399103139
clf_3 Test Accuracy =  0.726457399103139


Hyperparameter search

1. For model 1:

In [27]:
#Hyperparameter search for pipeline 1 (CountVectorizer + MultinomialNB).
#The grid has 2 x 2 = 4 combinations and n_iter is 4, so every combination is
#evaluated. They are still drawn in random order, so random_state keeps
#best_params_ stable across runs when cross-validation scores tie.
n_iter_search = 4
parameters = {
    'cont_vect__ngram_range': [(1, 1), (1, 2)],  #unigrams only vs. unigrams + bigrams
    'clf__alpha': (1e-2, 1e-3),                  #Naive Bayes smoothing parameter
}
gs_clf_1 = RandomizedSearchCV(clf_1, parameters, n_iter = n_iter_search, random_state = 25)
gs_clf_1.fit(X_train, y_train)
#predict() delegates to the best pipeline found by the search
pred_1_grid = gs_clf_1.predict(X_test)

In [28]:
#Best parameter combination found by the search for pipeline 1
gs_clf_1.best_params_

{'cont_vect__ngram_range': (1, 2), 'clf__alpha': 0.01}

2. For model 2:

In [29]:
#Hyperparameter search for pipeline 2 (TfidfVectorizer + MultinomialNB).
#Only alpha is tuned: 2 candidates with n_iter = 2, so both are evaluated.
#random_state fixes the order in which they are drawn, which keeps
#best_params_ reproducible if their cross-validation scores tie.
n_iter_search = 2
parameters = {'clf__alpha': (1e-2, 1e-3)}
gs_clf_2 = RandomizedSearchCV(clf_2, parameters, n_iter = n_iter_search, random_state = 25)
gs_clf_2.fit(X_train, y_train)
#predict() delegates to the best pipeline found by the search
pred_2_grid = gs_clf_2.predict(X_test)

In [30]:
#Best parameter combination found by the search for pipeline 2
gs_clf_2.best_params_

{'clf__alpha': 0.01}

3. For model 3:

In [31]:
#Hyperparameter search for pipeline 3 (CountVectorizer + TfidfTransformer + MultinomialNB).
#The grid has 2 x 2 x 2 = 8 combinations but only n_iter = 4 are sampled, so
#without a seed each run could evaluate a different subset and end up selecting
#a different model. random_state makes the sampled subset reproducible.
n_iter_search = 4
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],  #unigrams only vs. unigrams + bigrams
    'tfidf__use_idf': (True, False),        #with or without inverse-document-frequency weighting
    'clf__alpha': (1e-2, 1e-3),             #Naive Bayes smoothing parameter
}
gs_clf_3 = RandomizedSearchCV(clf_3, parameters, n_iter = n_iter_search, random_state = 25)
gs_clf_3.fit(X_train, y_train)
#predict() delegates to the best pipeline found by the search
pred_3_grid = gs_clf_3.predict(X_test)

In [32]:
#Best parameter combination found by the search for pipeline 3
gs_clf_3.best_params_

{'vect__ngram_range': (1, 2), 'tfidf__use_idf': False, 'clf__alpha': 0.01}

In [33]:
#One report per tuned pipeline, in the order 1, 2, 3.
#The last one (tuned pipeline 3) has the highest accuracy (0.84).
#NOTE(review): the winner is picked from test-set scores, which makes the
#reported accuracy optimistic; the cross-validated best_score_ of each search
#would be a safer selection criterion.
for pred in (pred_1_grid, pred_2_grid, pred_3_grid):
    print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.83      0.94      0.88       146
           1       0.84      0.64      0.73        77

    accuracy                           0.83       223
   macro avg       0.84      0.79      0.80       223
weighted avg       0.84      0.83      0.83       223

              precision    recall  f1-score   support

           0       0.80      0.95      0.87       146
           1       0.84      0.56      0.67        77

    accuracy                           0.81       223
   macro avg       0.82      0.75      0.77       223
weighted avg       0.82      0.81      0.80       223

              precision    recall  f1-score   support

           0       0.82      0.97      0.89       146
           1       0.90      0.61      0.73        77

    accuracy                           0.84       223
   macro avg       0.86      0.79      0.81       223
weighted avg       0.85      0.84      0.83       223



In [34]:
#Keeps the best pipeline found by the search on option 3 as the final model
bmodel = gs_clf_3.best_estimator_

In [35]:
#Persists the selected pipeline with pickle. The context manager guarantees the
#file handle is flushed and closed even if serialization fails; the previous
#bare open() call never closed it.
#NOTE(review): the file holds a binary pickle despite its '.csv' extension.
#The path is kept unchanged in case other code loads it from there; consider
#renaming it to '.pkl'.
with open('../models/bmodel.csv', 'wb') as model_file:
    pickle.dump(bmodel, model_file)