In [1]:
import pandas as pd
import json
import unidecode
import re

In [2]:
# Read the scraped hotel records from the UTF-8 encoded JSON file
with open('../../Extract/hotels/data/hotels_descriptions.json', encoding='utf-8') as json_file:
    hotels_data = json.loads(json_file.read())

# Build the working DataFrame from the parsed records and preview the first rows
df_hotels = pd.DataFrame(hotels_data)
df_hotels.head()

Unnamed: 0,location,name,rating,url,description,coordinates
0,Rouen,Sous les Etoiles - dans le plus haut gratte-ci...,92.0,https://www.booking.com/hotel/fr/sous-les-etoi...,L’hébergement Sous les Etoiles - dans le plus ...,"49.4400384,1.0856876"
1,Rouen,L'Oasis Rouennais - Hypercentre - Gare rive Ga...,10.0,https://www.booking.com/hotel/fr/oasis-rouenna...,L’hébergement L'Oasis Rouennais - Hypercentre ...,"49.4480098,1.0955547"
2,Rouen,"Radisson Blu Hotel, Rouen Centre",89.0,https://www.booking.com/hotel/fr/radisson-blu-...,"L’établissement Radisson Blu Hotel, Rouen Cent...","49.446441,1.09412"
3,Paris,Studio à Montmartre,80.0,https://www.booking.com/hotel/fr/studio-a-mont...,"Offrant une vue sur le jardin, l’hébergement S...","48.8915065,2.3385854999999998"
4,Paris,Apartment Cosy 3P Quartier République,,https://www.booking.com/hotel/fr/apartment-cos...,"Doté d’une connexion Wi-Fi gratuite, l’héberge...","48.8681307,2.3586776"


In [3]:
# Give each raw column an explicit, prefixed name (raw name -> final name)
df_hotels = df_hotels.rename(
    {
        'location': 'city_name',
        'name': 'hotel_name',
        'rating': 'hotel_rating',
        'url': 'hotel_url',
        'description': 'hotel_description',
        'coordinates': 'hotel_coordinates',
    },
    axis='columns',
)

RATING

In [4]:
# Correct hotel rating values.
# Ratings written with a decimal comma ("9,2") are rewritten with a dot and the
# column is converted to float. Only string values are rewritten, so:
#   - values that are already numeric (or missing) pass through unchanged;
#   - the cell can be re-run safely after the column has become float
#     (the `.str` accessor would raise on a numeric column).
# `astype(float)` still raises on text that is not a number.
df_hotels['hotel_rating'] = (
    df_hotels['hotel_rating']
    .map(lambda value: value.replace(',', '.') if isinstance(value, str) else value)
    .astype(float)
)

In [5]:
# Count the hotels whose rating is missing (NaN) and report the total
missing_ratings = df_hotels["hotel_rating"].isna().sum()
print(f"Number of hotels without rating: {missing_ratings}")

Number of hotels without rating: 18


CITIES

In [6]:
# Map percent-encoded city names to their readable spelling
city_name_corrections = {'N%C3%AEmes': 'Nîmes'}

# Whole-value replacement: only cells equal to a key above are rewritten
df_hotels['city_name'] = df_hotels['city_name'].replace(to_replace=city_name_corrections)

In [7]:
# Build a technical city key: lowercase, accents removed, snake_case.
# NOTE: this is derived from the city names as they are at this point, i.e.
# before the spelling corrections applied in the next cell.
lowercase_ascii = df_hotels['city_name'].str.lower().apply(unidecode.unidecode)

# Spaces and hyphens both become underscores
df_hotels['city_name_clean'] = lowercase_ascii.str.replace(r'[ -]', '_', regex=True)

# Show each distinct (display name, technical key) pair
display(df_hotels[['city_name', 'city_name_clean']].drop_duplicates())

Unnamed: 0,city_name,city_name_clean
0,Rouen,rouen
3,Paris,paris
8,Le Mont-Saint-Michel,le_mont_saint_michel
11,Chateau du Haut Koenigsbourg,chateau_du_haut_koenigsbourg
15,Dijon,dijon
16,Annecy,annecy
40,Besancon,besancon
110,Eguisheim,eguisheim
135,Grenoble,grenoble
160,Colmar,colmar


In [8]:
# Dictionary of corrections for inconsistent city names (raw spelling -> target spelling).
# NOTE(review): the target spellings (e.g. 'Nimes', 'Sainte-Marie de la Mer') appear
# chosen to match the weather dataset's city list compared further down — do not
# "fix" them here without changing the weather side as well.
hotel_city_corrections = {
    'Mont Saint Michel': 'Le Mont-Saint-Michel',
    'St Malo': 'Saint-Malo',
    'Bormes les Mimosas': 'Bormes-les-Mimosas',
    'Chateau du Haut Koenigsbourg': 'Château du Haut-Koenigsbourg',
    'Aix en Provence': 'Aix-en-Provence',
    'Besancon': 'Besançon',
    'Nîmes': 'Nimes',
    'Ariege': 'Ariège',
    'Saintes Maries de la mer': 'Sainte-Marie de la Mer',
    'Aigues Mortes': 'Aigues-Mortes'
}

# Remove surrounding spaces FIRST: `replace` only matches whole values, so a
# padded name such as ' Besancon' would otherwise be left uncorrected.
df_hotels['city_name'] = df_hotels['city_name'].str.strip()

# Apply corrections to city names
df_hotels['city_name'] = df_hotels['city_name'].replace(hotel_city_corrections)

# Verify final list of unique city names
print("Hotel city names corrected:")
print(sorted(df_hotels['city_name'].unique()))

Hotel city names corrected:
['Aigues-Mortes', 'Aix-en-Provence', 'Amiens', 'Annecy', 'Ariège', 'Avignon', 'Bayeux', 'Bayonne', 'Besançon', 'Biarritz', 'Bormes-les-Mimosas', 'Carcassonne', 'Cassis', 'Château du Haut-Koenigsbourg', 'Collioure', 'Colmar', 'Dijon', 'Eguisheim', 'Gorges du Verdon', 'Grenoble', 'La Rochelle', 'Le Havre', 'Le Mont-Saint-Michel', 'Lille', 'Lyon', 'Marseille', 'Montauban', 'Nimes', 'Paris', 'Rouen', 'Saint-Malo', 'Sainte-Marie de la Mer', 'Strasbourg', 'Toulouse', 'Uzes']


In [10]:
# Load the weather forecast CSV (the file name suggests a 7-day forecast).
# Its 'city' column is cleaned and compared with the hotel city names below.
df_weather = pd.read_csv("../../Extract/meteo/weather_7_day_forecast.csv")

In [11]:
# Normalise the weather city labels:
#   1. drop the literal ", France" text (plain-text match, not a regex)
#   2. trim leading/trailing whitespace
weather_city = df_weather['city'].str.replace(', France', '', regex=False)
df_weather['city'] = weather_city.str.strip()

# Print the distinct cleaned names, sorted, for a visual check
print("Weather city names corrected:")
print(sorted(df_weather['city'].unique()))

Weather city names corrected:
['Aigues-Mortes', 'Aix-en-Provence', 'Amiens', 'Annecy', 'Ariège', 'Avignon', 'Bayeux', 'Bayonne', 'Besançon', 'Biarritz', 'Bormes-les-Mimosas', 'Carcassonne', 'Cassis', 'Château du Haut-Koenigsbourg', 'Collioure', 'Colmar', 'Dijon', 'Eguisheim', 'Gorges du Verdon', 'Grenoble', 'La Rochelle', 'Le Havre', 'Le Mont-Saint-Michel', 'Lille', 'Lyon', 'Marseille', 'Montauban', 'Nimes', 'Paris', 'Rouen', 'Saint-Malo', 'Sainte-Marie de la Mer', 'Strasbourg', 'Toulouse', 'Uzes']


In [12]:
# Collect the distinct city names of each dataset as sets so they can be
# compared with set arithmetic in the next cell
cities_weather, cities_hotels = (
    set(column.unique())
    for column in (df_weather['city'], df_hotels['city_name'])
)

In [13]:
# Report the cities that appear in only one of the two datasets.

# Present in the hotels data, absent from the weather data
print("Cities in hotels not in weather:")
print(sorted(cities_hotels.difference(cities_weather)))

# Present in the weather data, absent from the hotels data
print("Cities in weather not in hotels:")
print(sorted(cities_weather.difference(cities_hotels)))

Cities in hotels not in weather:
[]
Cities in weather not in hotels:
[]


In [14]:
# Reference list of technical city keys (snake_case, no accents, no spaces),
# written as whitespace-separated tokens and split into a list.
cities = """
    le_mont_saint_michel saint_malo bayeux le_havre rouen
    paris amiens lille strasbourg chateau_du_haut_koenigsbourg
    colmar eguisheim besancon dijon annecy
    grenoble lyon gorges_du_verdon bormes_les_mimosas cassis
    marseille aix_en_provence avignon uzes nimes
    aigues_mortes saintes_maries_de_la_mer collioure carcassonne ariege
    toulouse montauban biarritz bayonne la_rochelle
""".split()

# Alphabetical copy: IDs are assigned from this order, so it must be stable
cities_sorted = list(cities)
cities_sorted.sort()

In [15]:
# Number the alphabetically sorted city keys from 1 upwards
city_id_mapping = dict(zip(cities_sorted, range(1, len(cities_sorted) + 1)))

# Look up each hotel's technical city key; keys absent from the mapping yield NaN
df_hotels['city_id'] = df_hotels['city_name_clean'].map(city_id_mapping)

df_hotels.head()

Unnamed: 0,city_name,hotel_name,hotel_rating,hotel_url,hotel_description,hotel_coordinates,city_name_clean,city_id
0,Rouen,Sous les Etoiles - dans le plus haut gratte-ci...,9.2,https://www.booking.com/hotel/fr/sous-les-etoi...,L’hébergement Sous les Etoiles - dans le plus ...,"49.4400384,1.0856876",rouen,30
1,Rouen,L'Oasis Rouennais - Hypercentre - Gare rive Ga...,10.0,https://www.booking.com/hotel/fr/oasis-rouenna...,L’hébergement L'Oasis Rouennais - Hypercentre ...,"49.4480098,1.0955547",rouen,30
2,Rouen,"Radisson Blu Hotel, Rouen Centre",8.9,https://www.booking.com/hotel/fr/radisson-blu-...,"L’établissement Radisson Blu Hotel, Rouen Cent...","49.446441,1.09412",rouen,30
3,Paris,Studio à Montmartre,8.0,https://www.booking.com/hotel/fr/studio-a-mont...,"Offrant une vue sur le jardin, l’hébergement S...","48.8915065,2.3385854999999998",paris,29
4,Paris,Apartment Cosy 3P Quartier République,,https://www.booking.com/hotel/fr/apartment-cos...,"Doté d’une connexion Wi-Fi gratuite, l’héberge...","48.8681307,2.3586776",paris,29


In [16]:
# Verify the city_id mapping.
# The 'city_id' column is assigned in the previous cell; it is deliberately NOT
# recomputed here so that this cell checks the column as it actually is.

# A city key missing from the mapping gives a NaN id — surface those explicitly
unmapped_cities = df_hotels.loc[df_hotels['city_id'].isna(), 'city_name'].unique()
if len(unmapped_cities) > 0:
    print(f"WARNING - cities without city_id: {list(unmapped_cities)}")

# One row per (city_name, city_id) pair, ordered by id, for a visual check
city_id_list = df_hotels[['city_name', 'city_id']].drop_duplicates().sort_values('city_id')
print(city_id_list)

                        city_name  city_id
410                 Aigues-Mortes        1
566               Aix-en-Provence        2
345                        Amiens        3
16                         Annecy        4
465                        Ariège        5
680                       Avignon        6
349                        Bayeux        7
800                       Bayonne        8
40                       Besançon        9
804                      Biarritz       10
403            Bormes-les-Mimosas       11
413                   Carcassonne       12
404                        Cassis       13
11   Château du Haut-Koenigsbourg       14
514                     Collioure       15
160                        Colmar       16
15                          Dijon       17
110                     Eguisheim       18
400              Gorges du Verdon       19
135                      Grenoble       20
816                   La Rochelle       21
215                      Le Havre       22
8          

In [17]:
# Rows that repeat an earlier row in every column (the first occurrence is not flagged)
duplicates_full = df_hotels.loc[df_hotels.duplicated()]
print(f"Full duplicates found: {len(duplicates_full)}")
display(duplicates_full)


Full duplicates found: 0


Unnamed: 0,city_name,hotel_name,hotel_rating,hotel_url,hotel_description,hotel_coordinates,city_name_clean,city_id


In [18]:
# Rows whose (hotel_name, city_name) pair already appeared in an earlier row
duplicates_hotels = df_hotels.loc[
    df_hotels.duplicated(subset=['hotel_name', 'city_name'])
]
print(f"Duplicates based on hotel name and city found: {len(duplicates_hotels)}")
display(duplicates_hotels)


Duplicates based on hotel name and city found: 0


Unnamed: 0,city_name,hotel_name,hotel_rating,hotel_url,hotel_description,hotel_coordinates,city_name_clean,city_id


HOTELS

In [19]:
# Clean hotel names:
#   1. strip whitespace at the beginning and end
#   2. replace hidden control characters (newline \n, tab \t, carriage return \r)
#      with a single space
# The vectorised `.str` methods leave missing names as missing instead of raising
# a TypeError, matching the non-string guard used for the description cleanup.
df_hotels['hotel_name'] = (
    df_hotels['hotel_name']
    .str.strip()
    .str.replace(r'[\n\t\r]', ' ', regex=True)
)


DESCRIPTION

In [20]:
# Store the character count of every description in a helper column
df_hotels['description_length'] = df_hotels['hotel_description'].str.len()

# Keep only the rows whose description is shorter than 50 characters
short_descriptions = df_hotels.loc[df_hotels['description_length'].lt(50)]
print(f"Number of hotels with short descriptions (<50 chars): {len(short_descriptions)}")
display(short_descriptions[['hotel_name', 'hotel_description', 'description_length']])


Number of hotels with short descriptions (<50 chars): 0


Unnamed: 0,hotel_name,hotel_description,description_length


In [21]:
# Select the hotels whose description is missing (NaN / None)
missing_descriptions = df_hotels.loc[df_hotels['hotel_description'].isnull()]

print(f"Number of missing descriptions: {len(missing_descriptions)}")
display(missing_descriptions[['city_name', 'hotel_name', 'hotel_url']])


Number of missing descriptions: 1


Unnamed: 0,city_name,hotel_name,hotel_url
452,Montauban,Montauban City Zen & Cosy,https://www.booking.com/hotel/fr/montauban-cit...


In [22]:
def _tidy_description(value):
    """Return a cleaned description; non-string values are returned unchanged.

    For strings: strip leading/trailing whitespace, then replace each remaining
    newline, tab or carriage return with a space.
    """
    if not isinstance(value, str):
        return value
    return re.sub(r'[\n\t\r]', ' ', value.strip())


# Clean every description in a single pass
df_hotels['hotel_description'] = df_hotels['hotel_description'].apply(_tidy_description)

In [23]:
# Order the hotels by city, then by hotel name, and renumber the index from 0
df_hotels = df_hotels.sort_values(by=['city_name', 'hotel_name']).reset_index(drop=True)

# hotel_id is the 1-based position of each row in that sorted order
df_hotels['hotel_id'] = range(1, len(df_hotels) + 1)

COORDINATES

In [24]:
# Split and clean coordinates.
# 'hotel_coordinates' is expected to hold "latitude,longitude" text.
coordinates = df_hotels['hotel_coordinates']

# Only values with exactly one comma are split. A value with extra commas would
# otherwise make `split(expand=True)` return more than two columns and the
# two-column assignment would raise; such values (and missing ones) become NaN.
well_formed = coordinates.str.count(',').eq(1)
coordinate_parts = (
    coordinates.where(well_formed)
    .str.split(',', expand=True)
    .reindex(columns=[0, 1])   # always exactly two columns, even if nothing was split
)

# Convert to numeric values; anything that cannot be parsed is set to NaN (errors='coerce')
df_hotels['latitude'] = pd.to_numeric(coordinate_parts[0], errors='coerce')
df_hotels['longitude'] = pd.to_numeric(coordinate_parts[1], errors='coerce')

In [25]:
# List the columns currently in df_hotels before choosing which ones to export
df_hotels.columns

Index(['city_name', 'hotel_name', 'hotel_rating', 'hotel_url',
       'hotel_description', 'hotel_coordinates', 'city_name_clean', 'city_id',
       'description_length', 'hotel_id', 'latitude', 'longitude'],
      dtype='object')

In [26]:
# Columns exported in the final dataset, in output order
columns_to_keep = [
    # hotel identity
    'hotel_id',
    'hotel_name',
    'hotel_url',
    # hotel content
    'hotel_rating',
    'hotel_description',
    # position, split out of 'hotel_coordinates'
    'latitude',
    'longitude',
    # city reference: numeric id plus corrected display name
    'city_id',
    'city_name',
]

In [27]:
# Keep all rows, restricted to the export columns (in the order listed in columns_to_keep)
df_hotels_clean = df_hotels.loc[:, columns_to_keep]

In [28]:
# Keep only the hotels that have a rating, then count them per city.
# NOTE(review): the result is stored in `df_hotels_cleaned` (built from df_hotels,
# all columns). The frame written to CSV below is `df_hotels_clean`, which is not
# filtered here — confirm whether unrated hotels are meant to be exported.
has_rating = df_hotels["hotel_rating"].notna()
df_hotels_cleaned = df_hotels[has_rating]
hotels_per_city = df_hotels_cleaned["city_name"].value_counts()

# Report how many hotels remain and their distribution per city
print(f"Remaining hotels after cleaning: {len(df_hotels_cleaned)}")
print(f"Number of hotels per city: {hotels_per_city}")


Remaining hotels after cleaning: 857
Number of hotels per city: city_name
Aigues-Mortes                   25
Colmar                          25
Toulouse                        25
Strasbourg                      25
Saint-Malo                      25
Rouen                           25
Nimes                           25
Montauban                       25
Marseille                       25
Lille                           25
Le Mont-Saint-Michel            25
Le Havre                        25
Grenoble                        25
Gorges du Verdon                25
Eguisheim                       25
Collioure                       25
Biarritz                        25
Amiens                          25
Annecy                          25
Avignon                         25
Bayeux                          25
Bayonne                         25
Besançon                        25
Carcassonne                     25
Uzes                            25
Bormes-les-Mimosas              24
Dijon           

In [29]:
# Write the export columns to CSV (UTF-8, without the DataFrame index).
# NOTE(review): this exports `df_hotels_clean`, which still contains hotels
# without a rating; the rating filter above produced `df_hotels_cleaned` instead.
df_hotels_clean.to_csv(path_or_buf='data/hotels.csv', encoding='utf-8', index=False)

In [31]:
# Keep the cities whose count of rated hotels is below 25
few_hotels = hotels_per_city.loc[hotels_per_city.lt(25)]
print(f"Cities with less than 25 hotels: {few_hotels}")


Cities with less than 25 hotels: city_name
Bormes-les-Mimosas              24
Dijon                           24
La Rochelle                     24
Cassis                          24
Sainte-Marie de la Mer          24
Château du Haut-Koenigsbourg    24
Lyon                            23
Ariège                          23
Aix-en-Provence                 23
Paris                           19
Name: count, dtype: int64


In [32]:
# List each distinct (city_id, city_name) pair of the export frame, ordered by id
cities_with_id = (
    df_hotels_clean.loc[:, ['city_id', 'city_name']]
    .drop_duplicates()
    .sort_values(by='city_id')
)
print(cities_with_id)

     city_id                     city_name
0          1                 Aigues-Mortes
25         2               Aix-en-Provence
50         3                        Amiens
75         4                        Annecy
100        5                        Ariège
125        6                       Avignon
150        7                        Bayeux
175        8                       Bayonne
200        9                      Besançon
225       10                      Biarritz
250       11            Bormes-les-Mimosas
275       12                   Carcassonne
300       13                        Cassis
325       14  Château du Haut-Koenigsbourg
350       15                     Collioure
375       16                        Colmar
400       17                         Dijon
425       18                     Eguisheim
450       19              Gorges du Verdon
475       20                      Grenoble
500       21                   La Rochelle
525       22                      Le Havre
550       2