In [28]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline  # Importar correctamente el pipeline de imblearn
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.base import BaseEstimator, TransformerMixin
import nltk

# Download the lexicon resource that NLTK's VADER sentiment analyzer needs
nltk.download('vader_lexicon')


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\evely\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [29]:

# Sentiment-analysis transformer
class SentimentAnalyzer(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that labels texts with VADER.

    Each input text is mapped to one of the strings 'positive', 'negative'
    or 'neutral', based on the VADER compound polarity score.
    """

    def __init__(self):
        # The analyzer takes no hyper-parameters; it only wraps a VADER instance.
        self.sia = SentimentIntensityAnalyzer()

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, returns ``self``."""
        return self

    def transform(self, X):
        """Return the sentiment label of every text in ``X``.

        Parameters
        ----------
        X : pandas.Series or any iterable of str
            Texts to label.

        Returns
        -------
        pandas.Series
            One label per input text. For a Series input the original index
            is preserved; for any other iterable a default integer index is
            used.
        """
        if isinstance(X, pd.Series):
            return X.apply(self.get_sentiment_label)
        # Lists, tuples and ndarrays have no .apply(); label them element-wise.
        labels = [self.get_sentiment_label(text) for text in X]
        return pd.Series(labels, dtype=object)

    def get_sentiment_label(self, text):
        """Map a single text to 'positive', 'negative' or 'neutral'.

        The compound score is compared against +/-0.05: scores at or above
        0.05 are 'positive', at or below -0.05 are 'negative', everything in
        between is 'neutral'. Missing text (None / NaN) has nothing to score
        and is labelled 'neutral' instead of raising.
        """
        if text is None or (isinstance(text, float) and np.isnan(text)):
            return 'neutral'
        sentiment_score = self.sia.polarity_scores(text)['compound']
        if sentiment_score >= 0.05:
            return 'positive'
        if sentiment_score <= -0.05:
            return 'negative'
        return 'neutral'

In [30]:
# Read the reviews dataset from the CSV file in the working directory
reviews = pd.read_csv(filepath_or_buffer='Reviews.csv')

# Preview the first three rows to confirm the file was parsed as expected
reviews.head(n=3)

Unnamed: 0,business_id,name,Address,city,state,postal_code,latitude,longitude,stars_x,review_count,categories,review_id,stars_y,useful,funny,cool,text
0,Zwwgjqlueils7Hcxdkbgzg,Greek House Cafe,5 W Haley St,Santa Barbara,Fl,93101.0,34.416471,-119.695671,4.0,40,Delis Restaurants,Zoioivexpwmki600Nfrwpw,3,1,0,0,My Husband And I Enjoyed A Quick Light Lunch H...
1,Hj5K3Fmo8Dog7X8Xalgisg,Del Taco,12490 Old Virginia Rd,Reno,Fl,89521.0,39.420854,-119.75369,2.5,52,Fast Food Restaurants Mexican,4Gawhktjysydnjuru8Xcq,1,0,0,0,I Dont Go To A Drive Thru Of Any Kind With Hig...
2,8Vo6Ln9Gqulhxzoxj5K6Kq,The Black Sheep,26 E Ortega St,Santa Barbara,Az,93101.0,34.419246,-119.697039,4.5,983,American New Beer Wine Spirits Food Japanese ...,N_Vbrgbtoxedoz2Xfasoa,4,0,0,0,Ambiance 55 Food 455 Service 45 Value 35


In [31]:
# Summarize the loaded frame: column names, non-null counts and dtypes
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227695 entries, 0 to 227694
Data columns (total 17 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   business_id   227695 non-null  object 
 1   name          227695 non-null  object 
 2   Address       227695 non-null  object 
 3   city          227695 non-null  object 
 4   state         227695 non-null  object 
 5   postal_code   227695 non-null  float64
 6   latitude      227695 non-null  float64
 7   longitude     227695 non-null  float64
 8   stars_x       227695 non-null  float64
 9   review_count  227695 non-null  int64  
 10  categories    227695 non-null  object 
 11  review_id     227695 non-null  object 
 12  stars_y       227695 non-null  int64  
 13  useful        227695 non-null  int64  
 14  funny         227695 non-null  int64  
 15  cool          227695 non-null  int64  
 16  text          227695 non-null  object 
dtypes: float64(4), int64(5), object(8)
memory usage:

In [32]:

# Label every review with its VADER sentiment
sentiment_analyzer = SentimentAnalyzer()
reviews['sentiment_label'] = sentiment_analyzer.transform(reviews['text'])

# Keep only the negative reviews. The explicit .copy() makes this an
# independent frame, so adding columns to it later neither triggers
# SettingWithCopyWarning nor depends on pandas' view/copy semantics.
negative_reviews = reviews[reviews['sentiment_label'] == 'negative'].copy()


In [33]:
# Build the target: 1 when the review text is longer than `threshold`
# characters, 0 when it is shorter or equal.
threshold = 50
# Vectorized string length + comparison instead of two Python-level .apply()
# passes; the index and the Series name ('text') are preserved.
y = negative_reviews['text'].str.len().gt(threshold).astype('int64')

# Define the classifier (fixed seed so results are reproducible)
rf_model = RandomForestClassifier(random_state=42)


In [34]:
# Build the pipeline with imblearn's Pipeline, which accepts resampling steps
# such as SMOTE.
#
# NOTE: SentimentAnalyzer is deliberately NOT a step here. Its transform()
# replaces every review with a label string ('positive' / 'negative' /
# 'neutral'), so TF-IDF would be fitted on those labels instead of on the
# review text. The sentiment filtering has already been applied upstream when
# `negative_reviews` was built, so the pipeline must receive the raw text.
pipeline_with_sentiment = ImbPipeline([
    ('tfidf', TfidfVectorizer(max_features=1000)),  # TF-IDF features from the review text
    ('smote', SMOTE(random_state=42)),  # Oversample the minority class with SMOTE
    ('classifier', rf_model)  # Classifier
])

# Fit the pipeline on the training data
pipeline_with_sentiment.fit(negative_reviews['text'], y)

In [35]:
# Persist the reviews frame. The previous code referenced `df`, which is not
# defined anywhere in this notebook and fails on a fresh kernel; `reviews` is
# the frame loaded and labelled above.
# NOTE(review): on a case-insensitive filesystem (e.g. Windows) 'reviews.csv'
# is the same file as the input 'Reviews.csv' and will overwrite it - confirm
# this is intended or pick a distinct output name.
reviews.to_csv('reviews.csv', index=False)

['reviews.pkl']

In [38]:
# Drop unwanted columns: only the business name/location fields, the review
# text, its sentiment label and the model prediction are kept.
columns_to_keep = ['name', 'Address', 'city', 'latitude', 'longitude', 'text', 'sentiment_label', 'predicted_label']

# 'predicted_label' is not created by any earlier cell of this notebook, so
# compute it from the fitted pipeline when it is missing. The guard keeps the
# cell safe to re-run; .assign() builds a new frame instead of mutating a
# possible slice of `reviews`.
if 'predicted_label' not in negative_reviews.columns:
    negative_reviews = negative_reviews.assign(
        predicted_label=pipeline_with_sentiment.predict(negative_reviews['text'])
    )

negative_reviews = negative_reviews[columns_to_keep]

# Save the reviews together with their predictions to a CSV file
negative_reviews.to_csv('negative_reviews.csv', index=False)