In [36]:
import pandas as pd
import numpy as np
import re
from collections import Counter
import spacy
import nltk
from nltk.corpus import stopwords

In [None]:
try:
    df = pd.read_csv("Hotel_Reviews.csv")
    print(f"Dataset Original carregado: {df.shape[0]} linhas, {df.shape[1]} colunas")
except FileNotFoundError:
    print("ERRO: O ficheiro 'Hotel_Reviews.csv' não foi encontrado.")

A carregar o dataset original
Dataset Original carregado: 515738 linhas, 17 colunas


In [None]:
df["Negative_Review"] = df["Negative_Review"].replace("No Negative", "")
df["Positive_Review"] = df["Positive_Review"].replace("No Positive", "")

df["Negative_Review"] = df["Negative_Review"].fillna("")
df["Positive_Review"] = df["Positive_Review"].fillna("")


A limpar dados ruidosos
Limpeza de textos padrão concluída.


In [None]:
positivas = df[df['Positive_Review'].str.strip() != ''].groupby('Hotel_Name').size()
negativas = df[df['Negative_Review'].str.strip() != ''].groupby('Hotel_Name').size()

hoteis_validos = positivas[(positivas >= 15) & (negativas.get(positivas.index, 0) >= 15)].index

df = df[df['Hotel_Name'].isin(hoteis_validos)]

df = df[(df['Review_Total_Positive_Word_Counts'] > 10) & (df['Review_Total_Negative_Word_Counts'] > 10)]

df['word_count'] = df['Review_Total_Positive_Word_Counts'] + df['Review_Total_Negative_Word_Counts']

df_sorted = df.sort_values(['Hotel_Name', 'word_count'], ascending=[True, False])

def sample_hotel_reviews(group):
    if len(group) <= 40:
        return group
    
    top_20 = group.nlargest(20, 'Reviewer_Score')
    bottom_20 = group.nsmallest(20, 'Reviewer_Score')
    
    return pd.concat([top_20, bottom_20]).drop_duplicates()


df_final = df_sorted.groupby('Hotel_Name', group_keys=False).apply(sample_hotel_reviews)

df_final = df_final.sample(frac=1, random_state=42).reset_index(drop=True)

df_final = df_final.drop(columns=['word_count'])

print(f"Número de hotéis únicos: {df_final['Hotel_Name'].nunique()}")
print(f"Total de reviews final: {len(df_final)}")

Amostragem concluída!
Número de hotéis únicos: 1460
Total de reviews final: 46537


  df_final = df_sorted.groupby('Hotel_Name', group_keys=False).apply(sample_hotel_reviews)


In [None]:
def limpar_tags(tag_str):
    if pd.isna(tag_str) or tag_str == "":
        return ""
    
    clean_str = tag_str.replace("[", "").replace("]", "").replace("'", "")
    
    lista_tags = clean_str.split(",")
    
    tags_limpas = [t.strip() for t in lista_tags if t.strip()]
    
    return ", ".join(tags_limpas)

df_final['Tags_Clean'] = df_final['Tags'].apply(limpar_tags)

In [None]:
def processar_texto(row):
    pos = str(row['Positive_Review']).strip()
    texto_positivo = f"O QUE OS CLIENTES ADORAM: {pos}" if pos else "Sem comentários positivos destacados."
    
    neg = str(row['Negative_Review']).strip()
    texto_negativo = f"PONTOS A MELHORAR: {neg}" if neg else "Sem queixas relevantes registadas."
    
    texto_final = (
        f"Hotel Name: {row['Hotel_Name']}. "
        f"Location: {row['Hotel_Address']}. "
        f"{texto_positivo} "
        f"{texto_negativo}"
    )
    
    texto_final = " ".join(texto_final.split())
    
    return texto_final

print("Função de processamento definida.")

Função de processamento definida.


In [None]:
df_final['review'] = df_final.apply(processar_texto, axis=1)

print(f"Linhas válidas restantes: {len(df_final)}")

A criar a coluna de Texto Enriquecido
Processamento concluído. Linhas válidas restantes: 46537


In [None]:
nlp = spacy.load("en_core_web_sm")
nltk.download('stopwords', quiet=True)
stop_words_nltk = set(stopwords.words('english'))
extra_stops = {
    'hotel', 'room', 'staff', 'stay', 'location', 'would', 'could', 'also', 'get', 'us',
    'loved', 'liked', 'amazing', 'good', 'nice', 'excellent', 'great', 'really', 'bit',
    'everything', 'nothing', 'breakfast', 'egg', 'eggs', 'even', 'next', 'one', 'back', 
    'front', 'desk', 'facilities', 'nearest'
}
ALL_STOPWORDS = stop_words_nltk.union(extra_stops)

# Palavras que nunca devem ser entidades
STOP_NER = {
    'breakfast', 'room', 'rooms', 'bed', 'beds', 'shower', 'bathroom', 
    'staff', 'hotel', 'stay', 'everything', 'nothing', 'thing', 'bit',
    'nice', 'great', 'small', 'tiny', 'excellent', 'door', 'window',
    'front', 'desk','facilities', 'nearest', 'egg', 'eggs', 'loved', 
    'really', 'helpful', 'liked'
}

def extrair_conhecimento_hibrido(row):
    texto_pos = str(row['Positive_Review'])
    texto_neg = str(row['Negative_Review'])
    texto_total = (texto_pos + " " + texto_neg).strip()
    
    if len(texto_total) < 10:
        return "", "", ""

    # Palavras mais frequentes
    palavras = re.findall(r'\b[a-zA-Z]{3,}\b', texto_total.lower())
    palavras_uteis = [p for p in palavras if p not in ALL_STOPWORDS]
    keywords = ", ".join([w for w, f in Counter(palavras_uteis).most_common(8)])

    doc = nlp(texto_total[:1500])

    entidades_geral = []
    pois = []

    for ent in doc.ents:
        ent_text = ent.text.strip()
        ent_lower = ent_text.lower()
        
        # Ignora se a base for Adjetivo/Verbo ou se estiver na lista STOP_NER
        if ent.root.pos_ in ['ADJ', 'VERB'] or ent_lower in STOP_NER:
            continue
        
        # Ignora se for o próprio nome do hotel 
        if ent_lower in str(row['Hotel_Name']).lower():
            continue

        if ent.label_ in ['FAC', 'LOC']:
            pois.append(ent_text)
            entidades_geral.append(f"{ent_text} ({ent.label_})")

        elif ent.label_ in ['ORG', 'GPE']:
            entidades_geral.append(f"{ent_text} ({ent.label_})")

    return (
        keywords,
        ", ".join(list(dict.fromkeys(entidades_geral))[:5]),
        ", ".join(list(dict.fromkeys(pois))[:4])
    )


def identificar_cidade(address):
    for city in ["London", "Paris", "Amsterdam", "Barcelona", "Milan", "Vienna"]:
        if city in address or (city == "London" and "United Kingdom" in address):
            return city
    return "Other"

print("A processar Inteligência Artificial (Keywords, NER, POIs e Cidades)...")

df_final[['keywords', 'entidades_ner', 'POI']] = df_final.apply(
    lambda row: pd.Series(extrair_conhecimento_hibrido(row)), axis=1
)

#Cria a coluna da Cidade
df_final['City'] = df_final['Hotel_Address'].apply(identificar_cidade)

print("Concluído!")

A processar Inteligência Artificial (Keywords, NER, POIs e Cidades)...
Concluído!


In [None]:
pd.set_option('display.max_columns', None)
# Linhas com POI 
hotels_with_poi = df_final[df_final['POI'].str.strip() != ''].head(5)
hotels_with_poi

Unnamed: 0,Hotel_Name,Hotel_Address,City,Average_Score,Total_Number_of_Reviews,Reviewer_Nationality,review,Review_Date,Positive_Review,Negative_Review,Reviewer_Score,Tags_Clean,keywords,entidades_ner,POI,lat,lng
1,Charlotte Street Hotel,15 17 Charlotte Street Hotel Westminster Borou...,London,9.5,319,United Kingdom,Hotel Name: Charlotte Street Hotel. Location: ...,8/28/2015,Being upgraded Lovely furnishings Candle welc...,The first room had scaffolding up on the next...,10.0,"Leisure trip, Couple, Superior Room, Stayed 1 ...","bar, upgraded, lovely, furnishings, candle, we...","Location (ORG), Charlotte Street Great Bar (FAC)",Charlotte Street Great Bar,51.518416,-0.134851
5,Chambiges Elys es,8 rue Chambiges 8th arr 75008 Paris France,Paris,8.9,858,South Korea,Hotel Name: Chambiges Elys es. Location: 8 rue...,10/15/2016,Beautiful elegant interior Just enjoy traditi...,Sometimes Wi fi doesn t work but in one hour ...,10.0,"Leisure trip, Family with young children, Delu...","menu, beautiful, elegant, interior, enjoy, tra...","Paris (GPE), Metro (FAC), Kind Staff Shower (ORG)",Metro,48.866805,2.303946
18,Sofitel Vienna Stephansdom,Praterstra e 1 02 Leopoldstadt 1020 Vienna Aus...,Vienna,9.0,1148,Netherlands,Hotel Name: Sofitel Vienna Stephansdom. Locati...,8/13/2016,Cool architecture and design Rooms were very ...,The arm chair in the room should be changed i...,9.6,"Business trip, Solo traveler, Superior Queen R...","white, cool, architecture, design, rooms, mode...","WHITE Great (FAC), Hermes (GPE)",WHITE Great,48.212857,16.37986
26,Best Western Antares Hotel Concorde,Viale Monza 132 Distretto Viale Monza 20127 Mi...,Milan,8.1,984,India,Hotel Name: Best Western Antares Hotel Concord...,7/18/2017,Good location just 150 mts from metro station...,The quality of central air conditioning wasn ...,8.3,"Leisure trip, Family with young children, Trip...","station, quite, provided, bed, mts, metro, tur...","metro (FAC), Duomo (ORG), Carrefour (ORG), the...",metro,45.502453,9.221836
41,Glam Milano,Piazza Duca D Aosta 4 6 Central Station 20124 ...,Milan,8.8,7371,Spain,Hotel Name: Glam Milano. Location: Piazza Duca...,6/29/2017,The location is good you have metro train sta...,On the first night there was a fire alarm 2 t...,4.2,"Business trip, Solo traveler, Standard Double ...","water, away, night, times, reception, bathtub,...",metro (FAC),metro,45.48385,9.203407


In [49]:
colunas_uteis = [
    'Hotel_Name', 
    'Hotel_Address', 
    'City',
    'Average_Score',
    'Total_Number_of_Reviews',
    'Reviewer_Nationality',
    'review',
    'Review_Date',
    'Positive_Review', 
    'Negative_Review',
    'Reviewer_Score',
    'Tags_Clean',
    'keywords',
    'entidades_ner',
    'POI',
    'lat', 
    'lng'
]

df_final = df_final[colunas_uteis]

In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

print("A gerar embeddings...")
embeddings = model.encode(df_final['review'].tolist(), show_progress_bar=True)

df_final['embeddings'] = list(embeddings)

df_final.to_pickle("Hotel_Reviews_processed.pkl")


A gerar embeddings...


Batches:   0%|          | 0/1455 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['embeddings'] = list(embeddings)


Sucesso! Agora usa o ficheiro .pkl na tua app.py


In [None]:
df_final.to_csv("Hotel_Reviews_processed.csv", index=False)
print("Csv criado")

A guardar ficheiro final...
Csv criado
