In [1]:
import json
import pandas as pd

In [2]:
# Read the raw GPS coordinates: a JSON object whose top-level keys are place names
with open("../../Extract/coordGPS/coordinates.json", "r", encoding="utf-8") as f:
    raw_data = json.load(f)

# One row per top-level key; reset_index() moves the key into the first column.
# Columns are named positionally -- assumes each entry lists latitude before
# longitude (TODO confirm against the extraction step).
df_coord = (
    pd.DataFrame.from_dict(raw_data, orient="index")
    .reset_index()
    .set_axis(["city_name", "latitude", "longitude"], axis=1)
)

# Strip the literal ", France" suffix and force both coordinates to float
df_coord = df_coord.assign(
    city_name=df_coord["city_name"].str.replace(", France", "", regex=False),
    latitude=df_coord["latitude"].astype(float),
    longitude=df_coord["longitude"].astype(float),
)

# Show the complete table as plain text, without the index column
print(df_coord.to_string(index=False))

                   city_name  latitude  longitude
                    Besançon 47.238022   6.024362
          Bormes-les-Mimosas 43.150697   6.341928
                       Nimes 43.837425   4.360069
                       Dijon 47.321581   5.041470
                       Lille 50.636565   3.063528
                      Bayeux 49.276462  -0.702474
        Le Mont-Saint-Michel 48.635523  -1.510257
                    Toulouse 43.604464   1.444243
                      Cassis 43.214036   5.539632
                  Strasbourg 48.584614   7.750713
               Aigues-Mortes 43.566152   4.191540
                   Montauban 44.017584   1.354999
                  Saint-Malo 48.649518  -2.026041
                      Annecy 45.899235   6.128885
Château du Haut-Koenigsbourg 48.249411   7.344320
                    Le Havre 49.493898   0.107973
Les Saintes-Maries-de-la-Mer 43.520778   4.401775
                   Marseille 43.296174   5.369953
                    Biarritz 43.483252  -1.559278


In [3]:
# Whole-value renames for city_name (old spelling -> new spelling).
# NOTE(review): the target spelling presumably matches the name used in the
# weather dataset so the later city_id lookup succeeds -- confirm.
name_corrections = {"Les Saintes-Maries-de-la-Mer": "Sainte-Marie de la Mer"}

# Only values that are exactly equal to a key are replaced; others are untouched
df_coord["city_name"] = df_coord["city_name"].replace(to_replace=name_corrections)

In [4]:
# Render the whole coordinates table as plain text (index column omitted)
full_table_text = df_coord.to_string(index=False)
print(full_table_text)

                   city_name  latitude  longitude
                    Besançon 47.238022   6.024362
          Bormes-les-Mimosas 43.150697   6.341928
                       Nimes 43.837425   4.360069
                       Dijon 47.321581   5.041470
                       Lille 50.636565   3.063528
                      Bayeux 49.276462  -0.702474
        Le Mont-Saint-Michel 48.635523  -1.510257
                    Toulouse 43.604464   1.444243
                      Cassis 43.214036   5.539632
                  Strasbourg 48.584614   7.750713
               Aigues-Mortes 43.566152   4.191540
                   Montauban 44.017584   1.354999
                  Saint-Malo 48.649518  -2.026041
                      Annecy 45.899235   6.128885
Château du Haut-Koenigsbourg 48.249411   7.344320
                    Le Havre 49.493898   0.107973
      Sainte-Marie de la Mer 43.520778   4.401775
                   Marseille 43.296174   5.369953
                    Biarritz 43.483252  -1.559278


In [5]:
# Cleaned weather table; it carries the city_name / city_id pairs used below
df_weather = pd.read_csv("../../Transform/meteo/weather_clean.csv")

# city_name -> city_id lookup. drop_duplicates keeps the first row seen for
# each city_name, so that row's city_id is the one retained.
city_to_id = (
    df_weather
    .drop_duplicates(subset=["city_name"])
    .set_index("city_name")["city_id"]
    .to_dict()
)

# Attach the id to every coordinate row; names absent from the lookup get NaN
df_coord["city_id"] = df_coord["city_name"].map(city_to_id)

# Quick preview of the first rows
print(df_coord.head())

            city_name   latitude  longitude  city_id
0            Besançon  47.238022   6.024362        9
1  Bormes-les-Mimosas  43.150697   6.341928       11
2               Nimes  43.837425   4.360069       28
3               Dijon  47.321581   5.041470       17
4               Lille  50.636565   3.063528       24


In [6]:
# Render the whole coordinates table as plain text (index column omitted)
full_table_text = df_coord.to_string(index=False)
print(full_table_text)

                   city_name  latitude  longitude  city_id
                    Besançon 47.238022   6.024362        9
          Bormes-les-Mimosas 43.150697   6.341928       11
                       Nimes 43.837425   4.360069       28
                       Dijon 47.321581   5.041470       17
                       Lille 50.636565   3.063528       24
                      Bayeux 49.276462  -0.702474        7
        Le Mont-Saint-Michel 48.635523  -1.510257       23
                    Toulouse 43.604464   1.444243       34
                      Cassis 43.214036   5.539632       13
                  Strasbourg 48.584614   7.750713       33
               Aigues-Mortes 43.566152   4.191540        1
                   Montauban 44.017584   1.354999       27
                  Saint-Malo 48.649518  -2.026041       31
                      Annecy 45.899235   6.128885        4
Château du Haut-Koenigsbourg 48.249411   7.344320       14
                    Le Havre 49.493898   0.107973       

In [11]:
# Distinct city names found in each dataset
cities_weather = set(df_weather["city_name"].unique())
cities_coord = set(df_coord["city_name"].unique())

# Names that have coordinates but never appear in the weather data
missing_in_weather = cities_coord.difference(cities_weather)

# An empty set means every coordinate row has a matching weather city
print("Villes manquantes dans df_weather:", missing_in_weather)

Villes manquantes dans df_weather: set()


In [8]:
# Render the whole coordinates table as plain text (index column omitted)
full_table_text = df_coord.to_string(index=False)
print(full_table_text)

                   city_name  latitude  longitude  city_id
                    Besançon 47.238022   6.024362        9
          Bormes-les-Mimosas 43.150697   6.341928       11
                       Nimes 43.837425   4.360069       28
                       Dijon 47.321581   5.041470       17
                       Lille 50.636565   3.063528       24
                      Bayeux 49.276462  -0.702474        7
        Le Mont-Saint-Michel 48.635523  -1.510257       23
                    Toulouse 43.604464   1.444243       34
                      Cassis 43.214036   5.539632       13
                  Strasbourg 48.584614   7.750713       33
               Aigues-Mortes 43.566152   4.191540        1
                   Montauban 44.017584   1.354999       27
                  Saint-Malo 48.649518  -2.026041       31
                      Annecy 45.899235   6.128885        4
Château du Haut-Koenigsbourg 48.249411   7.344320       14
                    Le Havre 49.493898   0.107973       

In [12]:
# Reference lookup city_name -> city_id built over every weather row; when a
# name occurs several times, the last row's city_id is the one kept.
city_id_mapping = dict(zip(df_weather["city_name"], df_weather["city_id"]))

# Store the id the reference lookup gives for each coordinate row
df_coord["city_id_expected"] = df_coord["city_name"].map(city_id_mapping)

# Rows whose stored id differs from the reference id. A NaN on either side
# compares as "different", so unmapped rows are reported as well.
mismatch = df_coord.loc[df_coord["city_id"].ne(df_coord["city_id_expected"])]

# Report the outcome of the check
if not mismatch.empty:
    print("Des erreurs dans les city_id :")
    print(mismatch)
else:
    print("Toutes les valeurs de city_id sont correctes.")

Toutes les valeurs de city_id sont correctes.


In [10]:
# Render the whole coordinates table as plain text (index column omitted)
full_table_text = df_coord.to_string(index=False)
print(full_table_text)

                   city_name  latitude  longitude  city_id  city_id_expected
                    Besançon 47.238022   6.024362        9                 9
          Bormes-les-Mimosas 43.150697   6.341928       11                11
                       Nimes 43.837425   4.360069       28                28
                       Dijon 47.321581   5.041470       17                17
                       Lille 50.636565   3.063528       24                24
                      Bayeux 49.276462  -0.702474        7                 7
        Le Mont-Saint-Michel 48.635523  -1.510257       23                23
                    Toulouse 43.604464   1.444243       34                34
                      Cassis 43.214036   5.539632       13                13
                  Strasbourg 48.584614   7.750713       33                33
               Aigues-Mortes 43.566152   4.191540        1                 1
                   Montauban 44.017584   1.354999       27                27

In [11]:
# Write the cleaned coordinates to a CSV file in the current directory.
# "city_id_expected" is a helper column added only by the city_id validation
# step; it duplicates city_id and is not part of the cleaned dataset, so it is
# dropped from the export. errors="ignore" keeps this cell working when the
# validation step has not been run and the column does not exist.
# drop() returns a new frame, so df_coord itself is left unchanged.
df_coord.drop(columns=["city_id_expected"], errors="ignore").to_csv(
    "coordinates_clean.csv", index=False
)
print("Fichier coordinates_clean.csv enregistré localement.")

Fichier coordinates_clean.csv enregistré localement.
