In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from wordcloud import WordCloud

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

import zipfile

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Cistelsa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


>**`Importar el dataframe con las opiniones de los usuarios en Yelp y Google Maps`**

In [4]:
# Ruta del archivo ZIP
zip_file_path = '../2_Datasets/launch/df_sent_yelp_gm.zip'

# Abre el archivo ZIP
with zipfile.ZipFile(zip_file_path, 'r') as zip_file:
    # Lista los archivos en el ZIP (puede haber múltiples archivos, no necesariamente uno)
    zip_file_contents = zip_file.namelist()
    
    # Supongamos que queremos leer el primer archivo CSV en el ZIP
    csv_file_name = zip_file_contents[0]
    
    # Extrae el archivo CSV del ZIP
    with zip_file.open(csv_file_name) as csv_file:
        # Lee el archivo CSV con Pandas
        df_reviews = pd.read_csv(csv_file)

In [16]:
#df_reviews = pd.read_csv("2_Datasets/launch/df_sent_yelp_gm.csv")
df_reviews = df_reviews.iloc[:,:2]
df_reviews.head(3)

Unnamed: 0,hotel_id,review
0,6488,the sheraton new orleans hotel is conveniently...
1,12892,hotel was great room was clean access to the...
2,12918,decent place clean nice staff but wifi does ...


>**`Agrupar los comentarios de los usuarios por el id del hotel`**

In [6]:
df_reviews = df_reviews.groupby(['hotel_id'])['review'].sum().reset_index(name='review')

>**`Importar el dataframe de los hoteles (ya cruzado/unido con lo necesario: estado, ciudad, etc) y quedarse con los registros únicos`**

In [7]:
# Ruta del archivo ZIP
zip_file_path = '../2_Datasets/beta/Hotelbeds/df_hotels.zip'

# Abre el archivo ZIP
with zipfile.ZipFile(zip_file_path, 'r') as zip_file:
    # Lista los archivos en el ZIP (puede haber múltiples archivos, no necesariamente uno)
    zip_file_contents = zip_file.namelist()
    
    # Supongamos que queremos leer el primer archivo CSV en el ZIP
    csv_file_name = zip_file_contents[0]
    
    # Extrae el archivo CSV del ZIP
    with zip_file.open(csv_file_name) as csv_file:
        # Lee el archivo CSV con Pandas
        df_hotels = pd.read_csv(csv_file)

In [8]:
#df_hotels = pd.read_csv("/lakehouse/default/Files/data/beta/Hotelbeds/df_hotels.csv")
df_hotels = df_hotels.iloc[:,[0,1,2,3,8,9,10,11,12,15]]
df_hotels.drop_duplicates(['hotel_id'], inplace=True)
df_hotels.head(4)

Unnamed: 0,hotel_id,name,address,description,phones,web,email,state_name,city_name,stars
0,40206,"The Eliza Jane, in The Unbound Collection by H...",315 Magazine St,The Eliza Jane Hotel is a 3.5 star hotel that ...,+5042075071,,,LOUISIANA,New,4.2
1,160358,Holiday Inn Express Voorhees - Mt. Laurel,121 Laurel Oak Rd,This hotel enjoys a strategic setting just a s...,1-856-346-4500,,,NEW JERSEY,Lindenwold,4.0
2,167791,Best Western Plus South Coast Inn,5620 Calle Real,Welcome to Best Western Plus South Coast Inn!\...,36113167559,,,CALIFORNIA,Goleta,3.5
5,168054,Best Western Plus Meridian,1019 S Progress Avenue,"Stay at this Meridian, Idaho hotel and enjoy a...",0012083441200,,,IDAHO,Meridian,3.5


In [9]:
df_hotels.shape

(41627, 10)

>**`Unir ambos dataframes a través de la columna 'hotel_id'`**

In [10]:
df_hotels = pd.merge(df_hotels, df_reviews, on='hotel_id')
df_hotels.head(4)

Unnamed: 0,hotel_id,name,address,description,phones,web,email,state_name,city_name,stars,review
0,40206,"The Eliza Jane, in The Unbound Collection by H...",315 Magazine St,The Eliza Jane Hotel is a 3.5 star hotel that ...,+5042075071,,,LOUISIANA,New,4.2,on every one of my dozen trips to new orleans ...
1,160358,Holiday Inn Express Voorhees - Mt. Laurel,121 Laurel Oak Rd,This hotel enjoys a strategic setting just a s...,1-856-346-4500,,,NEW JERSEY,Lindenwold,4.0,thanks holiday inn express \n\nhad a great sta...
2,167791,Best Western Plus South Coast Inn,5620 Calle Real,Welcome to Best Western Plus South Coast Inn!\...,36113167559,,,CALIFORNIA,Goleta,3.5,i had such high hopes with this best western ...
3,168054,Best Western Plus Meridian,1019 S Progress Avenue,"Stay at this Meridian, Idaho hotel and enjoy a...",0012083441200,,,IDAHO,Meridian,3.5,needed a place for a few days while visiting f...


In [11]:
df_hotels.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3138 entries, 0 to 3137
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   hotel_id     3138 non-null   int64  
 1   name         3138 non-null   object 
 2   address      3137 non-null   object 
 3   description  1853 non-null   object 
 4   phones       1464 non-null   object 
 5   web          445 non-null    object 
 6   email        760 non-null    object 
 7   state_name   3138 non-null   object 
 8   city_name    3138 non-null   object 
 9   stars        3138 non-null   float64
 10  review       3138 non-null   object 
dtypes: float64(1), int64(1), object(9)
memory usage: 294.2+ KB


In [12]:
df_hotels.to_csv("../2_Datasets/beta/Hotelbeds/df_hotels_.csv", index=False)

# <h1 align=center>**`Endpoint`**</h1>

>**`Endpoint 'recommendation system'`**

In [22]:
def cargar_datos_y_generar_recomendaciones():
    # Ruta del archivo ZIP
    zip_file_path = '../2_Datasets/beta/Hotelbeds/df_hotels_.zip'

    # Abre el archivo ZIP
    with zipfile.ZipFile(zip_file_path, 'r') as zip_file:
        # Lista los archivos en el ZIP (puede haber múltiples archivos, no necesariamente uno)
        zip_file_contents = zip_file.namelist()
        
        # Supongamos que queremos leer el primer archivo CSV en el ZIP
        csv_file_name = zip_file_contents[0]
        
        # Extrae el archivo CSV del ZIP
        with zip_file.open(csv_file_name) as csv_file:
            # Lee el archivo CSV con Pandas
            df_hotels = pd.read_csv(csv_file)

    # Instanciamos el CountVectorizer
    vectorizer = CountVectorizer()

    # Eliminamos las "stop words", palabras comunes no informativas
    stop = list(stopwords.words('english'))
    tfidf = TfidfVectorizer(stop_words=stop)

    # Calculamos los features para cada ítem (texto)
    tfidf_matrix = tfidf.fit_transform(df_hotels['review'])

    # Calculamos las similitudes entre todos los documentos
    cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

    # Cantidad de hoteles a recomendar
    n = 6

    # Creamos un diccionario para guardar el resultado
    results = {} 

    for idx, row in df_hotels.iterrows():
        # Guardamos los índices similares basados en la similitud coseno.
        # Los ordenamos en modo ascendente, siendo 0 nada de similitud y 1 total.
        similar_indices = cosine_similarities[idx].argsort()[:-n-2:-1] 

        # Guardamos los N más cercanos
        similar_items = [i for i in similar_indices]
        results[f"{row['name']}"] = df_hotels.iloc[similar_items[1:], [1, 9, 2, 8, 7, 3, 4, 5, 6]]

    def limitar_palabras(texto, limite=20):
        if isinstance(texto, str):
            palabras = texto.split()
            if len(palabras) <= limite:
                return texto
            else:
                return ' '.join(palabras[:limite])
        else:
            return texto  # Devuelve el valor tal como está si no es una cadena

    # Definimos una función de recomendación
    def recommend(hotel):
        df_recommend = pd.DataFrame(results[hotel])
        df_recommend['description'] = df_recommend['description'].apply(lambda x: limitar_palabras(x, 20))
        return df_recommend.to_html(index=False, border=0, justify="center")

    return recommend

# Cargar datos y generar la función de recomendación
recommendation_function = cargar_datos_y_generar_recomendaciones()

In [20]:
# Obtener recomendaciones para un hotel específico
recomendaciones = recommendation_function("Holiday Inn Express Voorhees - Mt. Laurel")

In [21]:
recomendaciones

'<table class="dataframe">\n  <thead>\n    <tr style="text-align: center;">\n      <th>name</th>\n      <th>stars</th>\n      <th>address</th>\n      <th>city_name</th>\n      <th>state_name</th>\n      <th>description</th>\n      <th>phones</th>\n      <th>web</th>\n      <th>email</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <td>Holiday Inn Express South</td>\n      <td>3.0</td>\n      <td>5151 South East Street</td>\n      <td>Indianapolis</td>\n      <td>INDIANA</td>\n      <td>This cosy hotel is in Downtown. A total of 118 units are available for guests\' convenience. Holiday Inn Express South</td>\n      <td>0013177835151</td>\n      <td>NaN</td>\n      <td>EXPRESSSOUTH@GENHOTELS.COM</td>\n    </tr>\n    <tr>\n      <td>Hampton Inn Philadelphia-Northeast Bensalem</td>\n      <td>4.0</td>\n      <td>3660 Street Road</td>\n      <td>Bensalem</td>\n      <td>PENNSYLVANIA</td>\n      <td>The following services and amenities are available, but with **reduced service**: Brea

In [5]:
recommend('Holiday Inn Express Voorhees - Mt. Laurel')

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>name</th>\n      <th>stars</th>\n      <th>address</th>\n      <th>city_name</th>\n      <th>state_name</th>\n      <th>description</th>\n      <th>phones</th>\n      <th>web</th>\n      <th>email</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>2804</th>\n      <td>Holiday Inn Express South</td>\n      <td>3.0</td>\n      <td>5151 South East Street</td>\n      <td>Indianapolis</td>\n      <td>INDIANA</td>\n      <td>This cosy hotel is in Downtown. A total of 118 units are available for guests\' convenience. Holiday Inn Express South offers Wi-Fi internet connection in communal areas. The reception desk is open all day long. Holiday Inn Express South understands that accessibility is important to all guests. For this reason, it features wheelchair accessible accomodation units and is fully adapted for easy access. Travellers arriving by car will appreciate parkin