In [1]:
import pandas as pd

data = pd.read_csv('googleplaystore_user_reviews.csv')

data.head()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3


# EDA

In [2]:
data.shape

(64295, 5)

In [3]:
data['Sentiment'].value_counts()

Sentiment
Positive    23998
Negative     8271
Neutral      5163
Name: count, dtype: int64

In [4]:
data.describe()

Unnamed: 0,Sentiment_Polarity,Sentiment_Subjectivity
count,37432.0,37432.0
mean,0.182146,0.492704
std,0.351301,0.259949
min,-1.0,0.0
25%,0.0,0.357143
50%,0.15,0.514286
75%,0.4,0.65
max,1.0,1.0


In [5]:
data.describe(include='all')

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
count,64295,37427,37432,37432.0,37432.0
unique,1074,27994,3,,
top,Angry Birds Classic,Good,Positive,,
freq,320,247,23998,,
mean,,,,0.182146,0.492704
std,,,,0.351301,0.259949
min,,,,-1.0,0.0
25%,,,,0.0,0.357143
50%,,,,0.15,0.514286
75%,,,,0.4,0.65


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64295 entries, 0 to 64294
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   App                     64295 non-null  object 
 1   Translated_Review       37427 non-null  object 
 2   Sentiment               37432 non-null  object 
 3   Sentiment_Polarity      37432 non-null  float64
 4   Sentiment_Subjectivity  37432 non-null  float64
dtypes: float64(2), object(3)
memory usage: 2.5+ MB


In [7]:
data.isnull().sum()

App                           0
Translated_Review         26868
Sentiment                 26863
Sentiment_Polarity        26863
Sentiment_Subjectivity    26863
dtype: int64

# PREPROCESAMIENTO

In [8]:
# Eliminar todas las filas que tengan al menos un valor NaN
data = data.dropna()

data.isnull().sum()

App                       0
Translated_Review         0
Sentiment                 0
Sentiment_Polarity        0
Sentiment_Subjectivity    0
dtype: int64

In [9]:
data.shape

(37427, 5)

In [10]:
data = data[['Translated_Review', 'Sentiment']]
data.head()

Unnamed: 0,Translated_Review,Sentiment
0,I like eat delicious food. That's I'm cooking ...,Positive
1,This help eating healthy exercise regular basis,Positive
3,Works great especially going grocery store,Positive
4,Best idea us,Positive
5,Best way,Positive


In [11]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()                                  # minúsculas
    text = re.sub(r'[^a-z\s]', '', text)                 # eliminar signos, números, emojis
    text = ' '.join([w for w in text.split() if w not in stop_words])  # eliminar stopwords
    return text

data['clean_review'] = data['Translated_Review'].apply(clean_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Diego\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
data.head()

Unnamed: 0,Translated_Review,Sentiment,clean_review
0,I like eat delicious food. That's I'm cooking ...,Positive,like eat delicious food thats im cooking food ...
1,This help eating healthy exercise regular basis,Positive,help eating healthy exercise regular basis
3,Works great especially going grocery store,Positive,works great especially going grocery store
4,Best idea us,Positive,best idea us
5,Best way,Positive,best way


In [13]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
data['Sentiment_encoded'] = encoder.fit_transform(data['Sentiment'])

print(encoder.classes_)   # ['Negative', 'Neutral', 'Positive']


['Negative' 'Neutral' 'Positive']


In [14]:
data.head()

Unnamed: 0,Translated_Review,Sentiment,clean_review,Sentiment_encoded
0,I like eat delicious food. That's I'm cooking ...,Positive,like eat delicious food thats im cooking food ...,2
1,This help eating healthy exercise regular basis,Positive,help eating healthy exercise regular basis,2
3,Works great especially going grocery store,Positive,works great especially going grocery store,2
4,Best idea us,Positive,best idea us,2
5,Best way,Positive,best way,2


# VECTORIZAR

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(data['clean_review'])
y = data['Sentiment_encoded']


# ENTRENAR

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.91      0.81      0.86      1653
           1       0.88      0.85      0.86      1049
           2       0.92      0.97      0.94      4784

    accuracy                           0.91      7486
   macro avg       0.91      0.87      0.89      7486
weighted avg       0.91      0.91      0.91      7486



In [17]:
import joblib

# Guardar el modelo
joblib.dump(model, "modelo_sentimientos.pkl")

# guardar el vectorizador TF-IDF
joblib.dump(vectorizer, "vectorizer_tfidf.pkl")

['vectorizer_tfidf.pkl']

In [19]:
y_pred = model.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[1335   60  258]
 [  26  889  134]
 [ 100   60 4624]]
              precision    recall  f1-score   support

           0       0.91      0.81      0.86      1653
           1       0.88      0.85      0.86      1049
           2       0.92      0.97      0.94      4784

    accuracy                           0.91      7486
   macro avg       0.91      0.87      0.89      7486
weighted avg       0.91      0.91      0.91      7486

