In [None]:
import pandas as pd

# Load the Google Play user-review CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer='googleplaystore_user_reviews.csv')

# Preview the first five rows.
data.head(n=5)

# EDA

In [None]:
# Size of the loaded frame as a (rows, columns) tuple.
data.shape

In [None]:
# Number of rows per distinct sentiment label (value_counts skips NaN by default).
data['Sentiment'].value_counts()

In [None]:
# Summary statistics; with default arguments describe() reports numeric columns only.
data.describe()

In [None]:
# Summary statistics for every column, regardless of dtype.
data.describe(include='all')

In [None]:
# Column dtypes, non-null counts and memory usage.
data.info()

In [None]:
# Count missing values per column.
data.isna().sum()

# PREPROCESAMIENTO

In [None]:
# Discard every row that contains at least one missing value.
data = data.dropna(axis='index', how='any')

# Sanity check: every per-column NaN count should now be zero.
data.isna().sum()

In [None]:
# Frame size after the rows with missing values were dropped.
data.shape

In [None]:
# Keep only the review text and its sentiment label. The explicit .copy()
# detaches the result from the frame it was sliced from, so the columns added
# to `data` in later cells are written to an independent DataFrame and cannot
# trigger pandas' SettingWithCopyWarning.
data = data[['Translated_Review', 'Sentiment']].copy()
data.head()

In [None]:
import re
import nltk
from nltk.corpus import stopwords
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

# A set gives constant-time membership checks when filtering tokens.
stop_words = {*stopwords.words('english')}

def clean_text(text, stopword_set=None):
    """Normalise one review for TF-IDF vectorisation.

    Steps: lowercase, drop every character that is not a-z or whitespace
    (punctuation, digits, emojis, accented letters), then drop stop words.

    Parameters
    ----------
    text : str
        Raw review. Any non-string value (``None``, a float ``NaN`` coming
        from pandas, ...) is treated as an empty review.
    stopword_set : collection of str, optional
        Words to remove. Defaults to the notebook-level ``stop_words`` set.

    Returns
    -------
    str
        Remaining tokens joined by single spaces; ``''`` if nothing is left.
    """
    # Missing values arrive as None/NaN; calling .lower() on them would raise.
    if not isinstance(text, str):
        return ''

    # Resolve the default at call time so the global set is only required
    # when the caller does not supply their own.
    blocked = stop_words if stopword_set is None else stopword_set

    lowered = text.lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)   # strip punctuation, digits, emojis
    kept = [token for token in letters_only.split() if token not in blocked]
    return ' '.join(kept)

# Add a column holding the normalised version of each review.
data['clean_review'] = data['Translated_Review'].map(clean_text)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping first, then encode the column with it.
encoder = LabelEncoder().fit(data['Sentiment'])
data['Sentiment_encoded'] = encoder.transform(data['Sentiment'])

# Position in classes_ is the integer code assigned to that label.
print(encoder.classes_)   # ['Negative', 'Neutral', 'Positive']


# VECTORIZAR

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Target vector: the integer-encoded sentiment labels.
y = data['Sentiment_encoded']

# Feature matrix: TF-IDF weights over the cleaned reviews, vocabulary capped
# at 5000 terms.
# NOTE(review): the vectorizer is fitted on every review here, i.e. before
# any train/test split is made.
vectorizer = TfidfVectorizer(
    max_features=5000,
)
X = vectorizer.fit_transform(data['clean_review'])


# ENTRENAR

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Split the cleaned text rather than the pre-computed matrix X: X was built by
# fitting TF-IDF on every review, so its vocabulary and IDF weights already
# contain information from the rows that end up in the test set (data leakage).
# stratify=y keeps the class proportions the same in both partitions.
text_train, text_test, y_train, y_test = train_test_split(
    data['clean_review'], y,
    test_size=0.2, random_state=42, stratify=y,
)

# Re-fit the vectorizer on the training reviews only; the test reviews are
# merely transformed with the vocabulary learned from the training side.
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Evaluate on reviews the vectorizer and the model have never seen.
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))


In [None]:
import joblib

# Save the trained classifier.
joblib.dump(model, "modelo_sentimientos.pkl")

# Save the fitted TF-IDF vectorizer (needed to featurise new reviews).
joblib.dump(vectorizer, "vectorizer_tfidf.pkl")

# Save the fitted label encoder as well: without it the model's integer
# predictions cannot be mapped back to sentiment labels at inference time.
joblib.dump(encoder, "label_encoder.pkl")