In [1]:
import pandas as pd

In [2]:
# Load the three input tables (coordinates, hotels, weather) from CSV.
df_coord, df_hotels, df_weather = (
    pd.read_csv(csv_path)
    for csv_path in (
        "coordGPS/coordinates_clean.csv",
        "hotels/data/hotels.csv",
        "meteo/weather_clean.csv",
    )
)

#### Contrôle des correspondances de villes

Avant de fusionner les jeux de données (coordonnées, hôtels, météo), 
on vérifie que chaque `city_id` renvoie bien au même `city_name` dans toutes les sources.  
Cette étape permet de détecter des incohérences
et d’assurer une fusion correcte des données complètes.

In [3]:
# Reduce each source to its distinct (city_id, city_name) pairs.
df_coord_clean, df_hotels_clean, df_weather_clean = (
    source.loc[:, ["city_id", "city_name"]].drop_duplicates()
    for source in (df_coord, df_hotels, df_weather)
)

In [4]:
# Align the 3 sources on city_id.
# Outer joins keep a city_id that is absent from one of the sources: its name
# is then NaN on that side and, since NaN never compares equal, the row is
# reported below. An inner join would drop such a city_id without any trace,
# letting the check print 0 mismatches while a source is incomplete.
merged = (
    df_coord_clean
    .merge(df_hotels_clean, on="city_id", how="outer", suffixes=("_1", "_2"))
    .merge(df_weather_clean, on="city_id", how="outer")
    .rename(columns={"city_name": "city_name_3"})
)

# Compare the coordinates name (source 1) with the hotels (2) and weather (3) names
merged["match_1_2"] = merged["city_name_1"].eq(merged["city_name_2"])
merged["match_1_3"] = merged["city_name_1"].eq(merged["city_name_3"])

# Keep the rows where at least one comparison fails
errors = merged.loc[
    ~(merged["match_1_2"] & merged["match_1_3"]),
    ["city_id", "city_name_1", "city_name_2", "city_name_3"],
]

print(f"Nombre de mismatches : {len(errors)}")
display(errors.head(10))


Nombre de mismatches : 0


Unnamed: 0,city_id,city_name_1,city_name_2,city_name_3


#### Harmonisation des noms de villes dans le dataset des hôtels

Même si la cohérence des `city_id` et `city_name` a déjà été vérifiée,  
on force ici l’utilisation de la table de référence (`coordGPS/coordinates_clean.csv`).  

L’objectif est de garantir que chaque hôtel soit associé à un nom de ville standardisé,  
ce qui rend le pipeline plus robuste face à d’éventuelles différences d’écriture ou à de futures mises à jour des données.


In [5]:
# Drop any existing 'city_name' column in hotels (to avoid conflicts)
df_hotels = df_hotels.drop(columns=["city_name"], errors="ignore")

# Attach the reference city name to every hotel (left join = keep all hotels).
# validate="many_to_one" makes pandas raise a MergeError when the reference
# table holds several rows for the same city_id; without it the join would
# silently duplicate the hotels of that city.
df_hotels = df_hotels.merge(
    df_coord_clean,
    on="city_id",
    how="left",
    validate="many_to_one",
)

# Check: display unique city_id/city_name pairs after the merge
print(df_hotels[["city_id", "city_name"]].drop_duplicates())


     city_id                     city_name
0          1                 Aigues-Mortes
25         2               Aix-en-Provence
50         3                        Amiens
75         4                        Annecy
100        5                        Ariège
125        6                       Avignon
150        7                        Bayeux
175        8                       Bayonne
200        9                      Besançon
225       10                      Biarritz
250       11            Bormes-les-Mimosas
275       12                   Carcassonne
300       13                        Cassis
325       14  Château du Haut-Koenigsbourg
350       15                     Collioure
375       16                        Colmar
400       17                         Dijon
425       18                     Eguisheim
450       19              Gorges du Verdon
475       20                      Grenoble
500       21                   La Rochelle
525       22                      Le Havre
550       2

In [6]:
# Write the harmonized hotels table to CSV (UTF-8, index column omitted).
df_hotels.to_csv(
    'data/hotels_bis.csv',
    encoding='utf-8',
    index=False,
)

In [7]:
# Read the harmonized hotels file back from disk.
df_hotels_bis = pd.read_csv("data/hotels_bis.csv")

# Distinct (city_id, city_name) pairs found in the reloaded table.
df_hotel_bis_clean = (
    df_hotels_bis
    .loc[:, ["city_id", "city_name"]]
    .drop_duplicates()
)

In [8]:
# Re-run the city-name consistency check, this time with the harmonized hotels.
# Outer joins keep a city_id that is missing from one of the tables: the
# corresponding name is NaN, never compares equal, and is therefore listed
# below. With inner joins such a city_id (e.g. a hotel whose city_id has no
# reference name) would be dropped and the check would pass silently.
merged = (
    df_coord_clean
    .merge(df_hotel_bis_clean, on="city_id", how="outer", suffixes=("_1", "_2"))
    .merge(df_weather_clean, on="city_id", how="outer")
    .rename(columns={"city_name": "city_name_3"})
)

# Compare the coordinates name (source 1) with the hotels (2) and weather (3) names
merged["match_1_2"] = merged["city_name_1"] == merged["city_name_2"]
merged["match_1_3"] = merged["city_name_1"] == merged["city_name_3"]

# Rows where at least one of the two comparisons fails
errors = merged[(~merged["match_1_2"]) | (~merged["match_1_3"])]
print("🔍 Incohérences trouvées :")
print(errors[["city_id", "city_name_1", "city_name_2", "city_name_3"]])

🔍 Incohérences trouvées :
Empty DataFrame
Columns: [city_id, city_name_1, city_name_2, city_name_3]
Index: []


#### Fusion finale des données

À cette étape, on construit le jeu de données complet :

1. Fusion des coordonnées et des données météo par `city_id`.  
   (les deux viennent de la même source JSON, donc la correspondance est fiable).  
2. Fusion des hôtels corrigés avec ce jeu coord + météo, en conservant tous les hôtels.  
3. Export du résultat final dans un fichier CSV, puis vérification en affichant les premières lignes.


In [9]:
# 1. Merge coordinates and weather data on city_id
# Inner join: keep only cities that exist in both datasets
df_cities = pd.merge(df_coord, df_weather, on="city_id", how="inner")

# 2. Merge hotels (already harmonized) with cities + weather
# Left join: keep all hotels, add geo/weather info when available.
# validate="many_to_one" makes pandas raise a MergeError if df_cities holds
# several rows for one city_id; without it every hotel of that city would be
# silently duplicated in the final dataset.
df_final = pd.merge(
    df_hotels_bis,
    df_cities,
    on="city_id",
    how="left",
    validate="many_to_one",
)

# 3. Export the final dataset to CSV (without index column)
df_final.to_csv("data/hotels_weather_combined.csv", index=False)

# 4. Reload and preview the combined dataset
df = pd.read_csv("data/hotels_weather_combined.csv")
display(df.head())


Unnamed: 0,hotel_id,hotel_name,hotel_url,hotel_rating,hotel_description,latitude_x,longitude_x,city_id,city_name,city_name_x,latitude_y,longitude_y,city_id_expected,city_name_y,temp_max,humidity,wind_speed,clouds,pop,good_weather_score
0,1,Aigues Marines,https://www.booking.com/hotel/fr/aigues-marine...,9.5,L’établissement Aigues Marines vous accueille ...,43.55972,4.218698,1,Aigues-Mortes,Aigues-Mortes,43.566152,4.19154,1,Aigues-Mortes,19.544286,53.714286,5.901429,29.571429,1.0,6.610714
1,2,Appartements 3 étoiles terrasse ou patio intra...,https://www.booking.com/hotel/fr/appartement-3...,9.6,L’établissement Appartements 3 étoiles terrass...,43.565358,4.19275,1,Aigues-Mortes,Aigues-Mortes,43.566152,4.19154,1,Aigues-Mortes,19.544286,53.714286,5.901429,29.571429,1.0,6.610714
2,3,Artemia Aigues-Mortes - Hotel avec piscine,https://www.booking.com/hotel/fr/le-royal-hote...,9.1,Featuring free WiFi and a seasonal outdoor swi...,43.576396,4.197818,1,Aigues-Mortes,Aigues-Mortes,43.566152,4.19154,1,Aigues-Mortes,19.544286,53.714286,5.901429,29.571429,1.0,6.610714
3,4,Au Cœur des Remparts,https://www.booking.com/hotel/fr/au-coeur-des-...,9.9,L’hébergement Au Cœur des Remparts se trouve à...,43.565401,4.192973,1,Aigues-Mortes,Aigues-Mortes,43.566152,4.19154,1,Aigues-Mortes,19.544286,53.714286,5.901429,29.571429,1.0,6.610714
4,5,Chez Céline et Sébastien,https://www.booking.com/hotel/fr/chez-celine-e...,9.4,L’hébergement Chez Céline et Sébastien se situ...,43.570192,4.195081,1,Aigues-Mortes,Aigues-Mortes,43.566152,4.19154,1,Aigues-Mortes,19.544286,53.714286,5.901429,29.571429,1.0,6.610714


In [10]:
# Print the dtype of every column of the combined dataset reloaded from CSV
print(df.dtypes)

hotel_id                int64
hotel_name             object
hotel_url              object
hotel_rating          float64
hotel_description      object
latitude_x            float64
longitude_x           float64
city_id                 int64
city_name              object
city_name_x            object
latitude_y            float64
longitude_y           float64
city_id_expected        int64
city_name_y            object
temp_max              float64
humidity              float64
wind_speed            float64
clouds                float64
pop                   float64
good_weather_score    float64
dtype: object


In [11]:
# Define the final set of columns to keep in the combined dataset
columns_to_keep = [
    'hotel_id',
    'hotel_name',
    'hotel_url',
    'hotel_rating',
    'hotel_description',
    'latitude_x',      # hotel latitude
    'longitude_x',     # hotel longitude
    'city_id',
    'city_name',
    'latitude_y',      # city latitude
    'longitude_y',     # city longitude
    'temp_max',
    'humidity',
    'wind_speed',
    'clouds',
    'pop',
    'good_weather_score'
]


In [12]:
# Keep only the selected columns
df_clean = df[columns_to_keep]

# Rename coordinate columns for clarity
df_clean = df_clean.rename(columns={
    'latitude_x': 'hotel_latitude',
    'longitude_x': 'hotel_longitude',
    'latitude_y': 'city_latitude',
    'longitude_y': 'city_longitude'
})


In [13]:
# Extract top destinations based on weather score
top_destinations = (
    df_clean[['city_id', 'city_name', 'good_weather_score']]
    .drop_duplicates()                              # keep unique cities
    .sort_values(by='good_weather_score', ascending=False)  # best weather first
    .head(5)                                        # select top 5
)

# Display the top 5 destinations
display(top_destinations)


Unnamed: 0,city_id,city_name,good_weather_score
575,24,Lille,34.593571
700,29,Paris,33.266429
250,11,Bormes-les-Mimosas,20.407143
150,7,Bayeux,14.297143
125,6,Avignon,13.882143


In [14]:
# Extract the top-rated hotels
top_hotels = (
    df_clean[['hotel_name', 'hotel_rating', 'city_name', 'hotel_latitude', 'hotel_longitude']]
    .sort_values(by='hotel_rating', ascending=False)  # sort hotels by rating (best first)
    .head(20)                                         # keep the top 20
)

# Display the top 20 hotels
display(top_hotels)


Unnamed: 0,hotel_name,hotel_rating,city_name,hotel_latitude,hotel_longitude
378,Appartement proche centre ville avec parking,10.0,Colmar,48.080807,7.352334
125,3 clés Avignon centre,10.0,Avignon,43.948473,4.801408
641,Studio JAZZ au Vieux port Opéra,10.0,Marseille,43.29364,5.374919
777,"Appartement T2, proche plage, climatisé et par...",10.0,Sainte-Marie de la Mer,43.454108,4.42944
136,Lavande Dorée,10.0,Avignon,43.944341,4.812729
804,Charmant studio II - accès privé,10.0,Strasbourg,48.578978,7.771875
830,Cujas,10.0,Toulouse,43.601254,1.440944
739,L'Oasis Rouennais - Hypercentre - Gare rive Ga...,10.0,Rouen,49.44801,1.095555
870,Secret d'Uzès - Le Duché - Piscine chauffée et...,10.0,Uzes,44.008578,4.325228
856,La DAME de FLAUX,9.9,Uzes,44.01205,4.419244


In [15]:
# Export the cleaned and enriched dataset to CSV
df_clean.to_csv("data/hotels_weather_final.csv", index=False, encoding='utf-8')

# Preview the first rows of the final dataset
df_clean.head()

Unnamed: 0,hotel_id,hotel_name,hotel_url,hotel_rating,hotel_description,hotel_latitude,hotel_longitude,city_id,city_name,city_latitude,city_longitude,temp_max,humidity,wind_speed,clouds,pop,good_weather_score
0,1,Aigues Marines,https://www.booking.com/hotel/fr/aigues-marine...,9.5,L’établissement Aigues Marines vous accueille ...,43.55972,4.218698,1,Aigues-Mortes,43.566152,4.19154,19.544286,53.714286,5.901429,29.571429,1.0,6.610714
1,2,Appartements 3 étoiles terrasse ou patio intra...,https://www.booking.com/hotel/fr/appartement-3...,9.6,L’établissement Appartements 3 étoiles terrass...,43.565358,4.19275,1,Aigues-Mortes,43.566152,4.19154,19.544286,53.714286,5.901429,29.571429,1.0,6.610714
2,3,Artemia Aigues-Mortes - Hotel avec piscine,https://www.booking.com/hotel/fr/le-royal-hote...,9.1,Featuring free WiFi and a seasonal outdoor swi...,43.576396,4.197818,1,Aigues-Mortes,43.566152,4.19154,19.544286,53.714286,5.901429,29.571429,1.0,6.610714
3,4,Au Cœur des Remparts,https://www.booking.com/hotel/fr/au-coeur-des-...,9.9,L’hébergement Au Cœur des Remparts se trouve à...,43.565401,4.192973,1,Aigues-Mortes,43.566152,4.19154,19.544286,53.714286,5.901429,29.571429,1.0,6.610714
4,5,Chez Céline et Sébastien,https://www.booking.com/hotel/fr/chez-celine-e...,9.4,L’hébergement Chez Céline et Sébastien se situ...,43.570192,4.195081,1,Aigues-Mortes,43.566152,4.19154,19.544286,53.714286,5.901429,29.571429,1.0,6.610714
