In [39]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Machine Learning

## Sentiment Analysis

Se desarrollará un sistema de machine learning de análisis de sentimiento para clasificar las reviews. Para esto se encontró un dataset con 10,000 reviews de restaurantes en Kaggle (https://www.kaggle.com/datasets/joebeachcapital/restaurant-reviews?resource=download). Primeramente vamos a explorar nuestro dataset y posteriormente se desarrollará el sistema de ML mediante NLP.

In [40]:
# Load the raw restaurant reviews and preview the first rows.
df_reviews = pd.read_csv('Restaurant reviews.csv')
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated in pandas and, with
# copy-on-write enabled, modifies a temporary object and leaves df_reviews as is.
df_reviews['Review'] = df_reviews['Review'].fillna('NoReview')
df_reviews.head()

Unnamed: 0,Restaurant,Reviewer,Review,Rating,Metadata,Time,Pictures,7514
0,Beyond Flavours,Rusha Chakraborty,"The ambience was good, food was quite good . h...",5,"1 Review , 2 Followers",5/25/2019 15:54,0,2447.0
1,Beyond Flavours,Anusha Tirumalaneedi,Ambience is too good for a pleasant evening. S...,5,"3 Reviews , 2 Followers",5/25/2019 14:20,0,
2,Beyond Flavours,Ashok Shekhawat,A must try.. great food great ambience. Thnx f...,5,"2 Reviews , 3 Followers",5/24/2019 22:54,0,
3,Beyond Flavours,Swapnil Sarkar,Soumen das and Arun was a great guy. Only beca...,5,"1 Review , 1 Follower",5/24/2019 22:11,0,
4,Beyond Flavours,Dileep,Food is good.we ordered Kodi drumsticks and ba...,5,"3 Reviews , 2 Followers",5/24/2019 21:37,0,


Podemos observar que tenemos el nombre del restaurante, el usuario que generó la reseña, la reseña y un rating. En este caso, en Kaggle nos recomiendan que si el rating es mayor a 3 la clasifiquemos como positiva, y si es menor a 3, como negativa. Así que consideraremos 3 como neutral. Vamos a generar una nueva columna donde 0 sea para las reseñas malas, 1 para las neutrales y 2 para las positivas.

Nuestra columna de ratings es de tipo str, por lo que tendremos que convertir nuestros str a float (hay valores como 4.5) para su análisis.

In [41]:
def _rating_a_sentimiento(valor):
    """Map one raw rating to a sentiment label.

    Args:
        valor: Raw rating as read from the CSV (typically a numeric string).

    Returns:
        2 when the rating is above 3, 1 when it equals 3, 0 when it is below 3.
        Ratings that cannot be parsed as a number, and missing (NaN) ratings,
        are labelled 1 (neutral).
    """
    try:
        puntaje = float(valor)
    except (TypeError, ValueError):
        # Not a number (e.g. free text in the rating column): treat as neutral.
        return 1

    # float() accepts NaN without raising; without this guard a missing rating
    # would fail both comparisons below and be labelled negative.
    if pd.isna(puntaje):
        return 1

    if puntaje > 3.0:
        return 2
    if puntaje == 3.0:
        return 1
    return 0


ratings = df_reviews['Rating']
# One label per row, in the same order as df_reviews.
sentimiento = [_rating_a_sentimiento(valor) for valor in ratings]

In [42]:
# Build the labelled dataset: restaurant, review text, raw rating and sentiment.
# .copy() makes it an independent frame, so later edits never touch df_reviews.
df_reviews_sentiment = df_reviews[['Restaurant', 'Review', 'Rating']].copy()
df_reviews_sentiment['Sentiment'] = sentimiento
# Assign the result back rather than using fillna(inplace=True) on the selected
# column; the chained in-place form is deprecated and a no-op under copy-on-write.
df_reviews_sentiment['Review'] = df_reviews_sentiment['Review'].fillna('NaN')
df_reviews_sentiment


Unnamed: 0,Restaurant,Review,Rating,Sentiment
0,Beyond Flavours,"The ambience was good, food was quite good . h...",5,2
1,Beyond Flavours,Ambience is too good for a pleasant evening. S...,5,2
2,Beyond Flavours,A must try.. great food great ambience. Thnx f...,5,2
3,Beyond Flavours,Soumen das and Arun was a great guy. Only beca...,5,2
4,Beyond Flavours,Food is good.we ordered Kodi drumsticks and ba...,5,2
...,...,...,...,...
9995,Chinese Pavilion,Madhumathi Mahajan Well to start with nice cou...,3,1
9996,Chinese Pavilion,This place has never disappointed us.. The foo...,4.5,2
9997,Chinese Pavilion,"Bad rating is mainly because of ""Chicken Bone ...",1.5,0
9998,Chinese Pavilion,I personally love and prefer Chinese Food. Had...,4,2


Ya tenemos nuestro dataset clasificado, por lo tanto, primero lo vamos a exportar para utilizarlo en nuestro sistema en GCP.

In [43]:
# Persist the labelled reviews for the downstream training job.
# The row index is written too (pandas' default), as the first, unnamed column.
ruta_csv_entrenamiento = 'Reviews_MLtraining.csv'
df_reviews_sentiment.to_csv(ruta_csv_entrenamiento)

Ahora vamos a entrenar nuestro sistema para poder clasificar las reviews

In [44]:
# Split the labelled reviews into train and test sets; the fixed seed keeps the
# split reproducible.
# NOTE(review): only 1% of the rows are held out, so per-class metrics rest on
# very few samples -- consider a larger, stratified split before trusting them.
textos = df_reviews_sentiment['Review']
etiquetas = df_reviews_sentiment['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    textos,
    etiquetas,
    test_size=0.01,
    random_state=42,
)

# TF-IDF features: the vectorizer is fitted on the training texts only and then
# reused for the test texts. max_features caps the vocabulary size (tunable).
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Linear-kernel SVM classifier; fit() returns the fitted estimator itself.
svm_model = SVC(kernel='linear').fit(X_train_tfidf, y_train)

# Evaluate on the held-out reviews.
y_pred = svm_model.predict(X_test_tfidf)
informe = classification_report(y_test, y_pred)
print(informe)

              precision    recall  f1-score   support

           0       0.92      0.85      0.88        27
           1       0.40      0.29      0.33         7
           2       0.91      0.97      0.94        66

    accuracy                           0.89       100
   macro avg       0.74      0.70      0.72       100
weighted avg       0.88      0.89      0.88       100



Logramos obtener una precisión decente de nuestro modelo, por lo que ahora vamos a hacer la prueba con un archivo de reviews de California para ponerlo a prueba.

In [54]:
# Load one file of California reviews (lines=True: one JSON record per line).
# A forward slash is used as the separator because the previous hard-coded
# backslash only resolves on Windows; '/' works on Windows, Linux and macOS.
df_cal_test = pd.read_json('DataCalifornia/1.json', lines=True)
df_cal_test

Unnamed: 0,user_id,name,time,rating,text,pics,resp,gmap_id
0,1.089912e+20,Song Ro,1609909927056,5,Love there korean rice cake.,,,0x80c2c778e3b73d33:0xbdc58662a4a97d49
1,1.112903e+20,Rafa Robles,1612849648663,5,Good very good,,,0x80c2c778e3b73d33:0xbdc58662a4a97d49
2,1.126404e+20,David Han,1583643882296,4,They make Korean traditional food very properly.,,,0x80c2c778e3b73d33:0xbdc58662a4a97d49
3,1.174403e+20,Anthony Kim,1551938216355,5,Short ribs are very delicious.,,,0x80c2c778e3b73d33:0xbdc58662a4a97d49
4,1.005808e+20,Mario Marzouk,1494910901933,5,Great food and prices the portions are large,,,0x80c2c778e3b73d33:0xbdc58662a4a97d49
...,...,...,...,...,...,...,...,...
149995,1.081939e+20,Raffi Khatchadourian,1583646462958,5,,,,0x80c299484c7709cd:0x89510434231b14d4
149996,1.075762e+20,Noah Vincent Ford,1605055631573,5,,,,0x80c299484c7709cd:0x89510434231b14d4
149997,1.008113e+20,David Karwaski,1597443113544,5,,,,0x80c299484c7709cd:0x89510434231b14d4
149998,1.177078e+20,Reynalda Vargaz,1573479297751,5,,,,0x80c299484c7709cd:0x89510434231b14d4


Vamos a extraer las columnas 'text' y 'rating' para poderlas procesar.

In [57]:
# Keep only the review text and its rating, renamed to match the column names
# used for the training data.
df_test = df_cal_test[['text', 'rating']].rename(
    columns={'text': 'Review', 'rating': 'Rating'}
)

# Fill empty reviews. The result is assigned back instead of using
# fillna(inplace=True) on the selected column, which is deprecated and has no
# effect on df_test under pandas copy-on-write.
df_test['Review'] = df_test['Review'].fillna('NoReview')

# Show the resulting frame.
df_test

Unnamed: 0,Review,Rating
0,Love there korean rice cake.,5
1,Good very good,5
2,They make Korean traditional food very properly.,4
3,Short ribs are very delicious.,5
4,Great food and prices the portions are large,5
...,...,...
149995,NoReview,5
149996,NoReview,5
149997,NoReview,5
149998,NoReview,5


Ahora ya tenemos nuestro dataset más limpio, por lo que vamos a pasárselo a una función de nuestro modelo de NLP para ver cómo funciona.

In [58]:
# NOTE(review): this def rebinds the module-level name `sentimiento`, which an
# earlier cell uses for the list of training labels; re-running the cells that
# consume that list after this one would pick up the function instead.
def sentimiento(x):
    """Predict the sentiment label for a single review text.

    Args:
        x: Review text. It is wrapped in a one-element list before being
            vectorized with the fitted ``tfidf_vectorizer``.

    Returns:
        Whatever ``svm_model.predict`` returns for that single sample; callers
        read the label from position 0.
    """
    return svm_model.predict(tfidf_vectorizer.transform([x]))

In [59]:
# Score every review in one batched call. Vectorizing and predicting row by row
# repeats the per-call overhead for each of the ~150k reviews; a single
# transform + predict over the whole column yields the same labels much faster.
predicciones = svm_model.predict(tfidf_vectorizer.transform(df_test['Review']))

# Kept as a list of one-element arrays, i.e. the same shape obtained when
# collecting the result of one predict() call per review.
Sentiment_Analysis = list(predicciones.reshape(-1, 1))

# Plain Python ints, one label per review, in the row order of df_test.
sentimientos = [int(etiqueta) for etiqueta in predicciones]

In [67]:
# Attach the predicted labels to the test frame, aligned row by row on its index.
df_test['Sentiment'] = pd.Series(sentimientos, index=df_test.index)

# Show reviews, ratings and predicted sentiment side by side.
df_test

Unnamed: 0,Review,Rating,Sentiment
0,Love there korean rice cake.,5,2
1,Good very good,5,2
2,They make Korean traditional food very properly.,4,0
3,Short ribs are very delicious.,5,2
4,Great food and prices the portions are large,5,2
...,...,...,...
149995,NoReview,5,2
149996,NoReview,5,2
149997,NoReview,5,2
149998,NoReview,5,2
