In [1]:
import pandas as pd
import re
import emoji
from langdetect import detect
import os

## Data Retrieving

rev 1 -> hrs  
rev2 -> hotelsone  
rev 3 -> galahotel

In [2]:
df_hrs = pd.read_csv('./scrapped_reviews/trustpilot_reviews_1.csv')
df_sone = pd.read_csv('./scrapped_reviews/trustpilot_reviews_2.csv')
df_gala = pd.read_csv('./scrapped_reviews/trustpilot_reviews_3.csv')

In [3]:
df_hrs

Unnamed: 0,title,body,date
0,A great finding,Not so much known like other OTAs but really w...,"Date of experience: December 17, 2024"
1,Good choice!,"Easy booking process, wide selection of hotels...","Date of experience: December 16, 2024"
2,Its a really good site when you need to…,Its a really good site when you need to book s...,"Date of experience: December 16, 2024"
3,This was a great hotel,This was a great hotel. very comfortable and p...,"Date of experience: November 18, 2024"
4,Friendly staff,"Friendly staff, good location, clean and spaci...","Date of experience: December 11, 2024"
...,...,...,...
4995,Best Rate,I made a booking at a Barcelona hotel using Bo...,"Date of experience: June 23, 2024"
4996,special offers,This site has often special offers for high qu...,"Date of experience: October 14, 2024"
4997,latest arrival time/reception hours…,on HRS there is seldom information on the late...,"Date of experience: July 15, 2024"
4998,I am very satisfied you can improve…,I am very satisfied you can improve your web p...,"Date of experience: September 04, 2024"


In [4]:
# 1 missing value in titlt col in df_gala
df_gala['title'] = df_gala['title'].fillna('No Title')
df_gala.isnull().sum()

title    0
body     0
date     0
dtype: int64

## Preprocessing & Traduction

In [5]:
#------Function for removing english stopwords------#

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
def remove_stopwords(text, lang='english'):
    stop_words = set(stopwords.words(lang))
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\thiba\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
#------Function for translating to english------#

from deep_translator import GoogleTranslator

def translate_avis(avis, lang):
    return GoogleTranslator(source=lang, target='en').translate(avis)



In [7]:
import re
from symspellpy import SymSpell, Verbosity

sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
sym_spell.load_dictionary("./frequency_dictionary_en_82_765.txt", term_index=0, count_index=1)

def correct_spelling_fast(text, lang='en'):

    words = text.split()

    corrected_words = []
    for word in words:
        correction = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
        corrected_words.append(correction[0].term if correction else word)
    corrected_text = ' '.join(corrected_words)
    return corrected_text

#### HRS data

In [8]:
df_hrs['body'] = df_hrs['body'].str.lower()
df_hrs['body'] = df_hrs['body'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df_hrs['body'] = df_hrs['body'].apply(lambda x: emoji.replace_emoji(x, replace=''))
df_hrs['language'] = df_hrs['body'].apply(detect)

In [9]:
df_hrs['language'].unique() #only english no need to translate

array(['en'], dtype=object)

In [10]:
df_hrs['body'] = df_hrs['body'].apply(remove_stopwords)

In [11]:
df_hrs['body'] = df_hrs['body'].apply(lambda x: correct_spelling_fast(x, lang='en'))

In [12]:
df_hrs.to_csv('./cleaned_rev/hrs_clean.csv',index=False)

#### Hotelsone data

In [13]:
df_sone['body'] = df_sone['body'].str.lower()
df_sone['body'] = df_sone['body'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df_sone['body'] = df_sone['body'].apply(lambda x: emoji.replace_emoji(x, replace=''))
df_sone = df_sone[df_sone['body'].str.strip().astype(bool)] # remove reviews that only contains whitespaces
df_sone['language'] = df_sone['body'].apply(detect)

In [14]:
df_sone['language'].unique()

array(['en', 'da', 'fr', 'sk', 'fi', 'af', 'tl', 'so', 'it', 'cs', 'pl',
       'ro', 'es', 'cy', 'pt', 'no', 'nl', 'ca', 'lv', 'sl'], dtype=object)

In [15]:
print(df_sone['language'].value_counts()) #Many reviews in other languages, need to translate

language
en    3798
sk    1086
fr      32
af      12
ro      12
da      11
tl      10
it       8
ca       6
es       6
so       4
cy       2
pt       2
sl       2
cs       2
fi       2
no       1
nl       1
lv       1
pl       1
Name: count, dtype: int64


In [16]:
df_sone = df_sone[df_sone['language'] == 'en'] #Garder uniquement les review en anglais

In [17]:
df_sone['body'] = df_sone['body'].apply(remove_stopwords)

In [18]:
df_sone['body'] = df_sone['body'].apply(lambda x: correct_spelling_fast(x, lang='en'))

In [19]:
df_sone.to_csv('./cleaned_rev/HotelSone_clean.csv',index=False)

#### Gala Hotel Data

In [20]:
df_gala['body'] = df_gala['body'].str.lower()
df_gala['body'] = df_gala['body'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
df_gala['body'] = df_gala['body'].apply(lambda x: emoji.replace_emoji(x, replace=''))
df_gala = df_gala[df_gala['body'].str.strip().astype(bool)] # remove reviews that only contains whitespaces
df_gala['language'] = df_gala['body'].apply(detect)

In [21]:
df_gala['language'].unique()

array(['en', 'sk', 'sl', 'it', 'es', 'af', 'ca', 'fr', 'da', 'pl', 'so',
       'de', 'nl', 'ro', 'cy', 'hr', 'pt', 'no', 'tl'], dtype=object)

In [22]:
df_gala = df_gala[df_gala['language'] == 'en'] #Garder uniquement les review en anglais

In [23]:
df_gala['body'] = df_gala['body'].apply(remove_stopwords)

In [24]:
df_gala['body'] = df_gala['body'].apply(lambda x: correct_spelling_fast(x, lang='en'))

In [25]:
df_gala.to_csv('./cleaned_rev/GalaHotel_clean.csv',index=False)

#### Concatenation for training

In [26]:
df_combined = pd.concat([df_hrs, df_sone, df_gala], ignore_index=True)
df_combined.to_csv('./cleaned_rev/concat_rev.csv')

## Model Setup & Training

#### Topic Classification (LDA)

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)

X_tfidf = vectorizer.fit_transform(df_combined['body'])


In [28]:
from sklearn.decomposition import LatentDirichletAllocation


n_topics = 7
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)

lda.fit(X_tfidf)

# Extract the topics (words associated with each topic)
feature_names = vectorizer.get_feature_names_out()
n_top_words = 20  # Number of words to display per topic

# Print topics
for topic_idx, topic in enumerate(lda.components_):
    print(f"Topic {topic_idx}:")
    print(", ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))


Topic 0:
friendly, good, staff, clean, location, great, room, spacious, excellent, live, helpful, nice, stay, service, offers, hotel, site, comfortable, best, sure
Topic 1:
valid, easily, card, book, using, credit, hotel, hrs, rates, issues, reliable, search, point, willing, able, rating, shows, course, consistently, end
Topic 2:
hotel, room, booking, best, prices, hrs, breakfast, avoid, process, easy, night, use, good, old, bad, included, nice, staff, desk, clean
Topic 3:
really, good, business, site, pricing, rooms, wants, need, platform, limited, past, recently, people, significant, perfect, number, wonderful, experience, book, options
Topic 4:
easy, simple, fast, rates, hotels, use, booking, process, choice, convenient, selection, bookings, particular, germany, interesting, la, reward, wide, cities, small
Topic 5:
improve, information, site, hotels, user, hrs, satisfied, quality, high, friendliness, bit, web, worth, automatically, regards, italian, blast, kind, applied, page
Topic 

Results are not satisfying enough, We will try using zero-shot model

In [29]:
# Get topic distribution for each review
topic_distribution = lda.transform(X_tfidf)

# Assign the review to the topic with the highest probability
df_combined['assigned_topic'] = topic_distribution.argmax(axis=1)

In [30]:
df_combined['assigned_topic']

0        5
1        4
2        3
3        3
4        0
        ..
13343    3
13344    0
13345    0
13346    6
13347    0
Name: assigned_topic, Length: 13348, dtype: int64

#### Topic classification (Zero-shot Model)

In [34]:
from transformers import pipeline
import json
import torch
from tqdm import tqdm
from datasets import Dataset

labels = ['service', 'cleanliness', 'overall', 'value', 'location', 'sleep_quality', 'rooms']
classifier = pipeline("zero-shot-classification", model="knowledgator/comprehend_it-base", device=0)
result = []
dataset = Dataset.from_pandas(df_combined[['body']])
for review in tqdm(dataset['body'], total=len(dataset['body']), desc="Classifying Reviews", unit="review"):
    classification = classifier(review, candidate_labels=labels)
    top_label = classification['labels'][0]
    result.append(top_label)
df_combined['classif_result'] = result

# Check the updated DataFrame
print(df_combined[['body', 'classif_result']].head())


Classifying Reviews: 100%|██████████| 13348/13348 [26:41<00:00,  8.34review/s]

                                                body classif_result
0  much known like otis really worth book rooms w...          value
1  easy booking process wide selection hotels eve...          rooms
2  really good site need book several rooms wants...          rooms
3  great hotel comfortable perfect short staysthe...          rooms
4   friendly staff good location clean spacious room    cleanliness





In [35]:
df_combined.to_csv('./nlp_results/rev_topic.csv',index=False)

#### Sentiment Analysis

In [36]:
from transformers import pipeline
import json
import torch
from tqdm import tqdm
from datasets import Dataset

labels_st = ['Positive', 'Negative', 'Neutral']
classifier_st = pipeline("zero-shot-classification", model="FacebookAI/roberta-base", device=0)
result_st = []
for review in tqdm(dataset['body'], total=len(dataset['body']), desc="Classifying Reviews", unit="review"):
    classification = classifier(review, candidate_labels=labels_st)
    top_label = classification['labels'][0]
    result_st.append(top_label)
df_combined['sentiment_result'] = result_st

print(df_combined[['body', 'sentiment_result']].head())

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.
Classifying Reviews: 100%|██████████| 13348/13348 [11:12<00:00, 19.86review/s]

                                                body sentiment_result
0  much known like otis really worth book rooms w...         Positive
1  easy booking process wide selection hotels eve...         Positive
2  really good site need book several rooms wants...         Positive
3  great hotel comfortable perfect short staysthe...         Positive
4   friendly staff good location clean spacious room         Positive





In [37]:
df_combined.to_csv('./nlp_results/rev_topic_sentiment.csv',index=False)