# Transformation des jeux de données

On modifie ici nos dataset sur les stations de recharge pour avoir des données par heure et par zone que l'on définit plus bas. On pourra ainsi étudier leur corrélation avec les données de comptage routier, également agrégées par heure et par zone.


### Importation des librairies

In [2]:
import folium
import pandas as pd
from folium.plugins import TimestampedGeoJson
from shapely.geometry import Point
import numpy as np
from math import radians, sin, cos, sqrt, atan2
from branca.colormap import linear
import matplotlib.pyplot as plt
import geopandas as gpd
from shapely.geometry import box
from scipy.spatial import cKDTree

### Importation des données sur les stations transformées dans main.ipynb et converties en .csv

In [3]:
#Données des stations
data = pd.read_csv("csv/hourly_df_semaine2229.csv")
df_real = pd.DataFrame(data)
df_real['taux_dispo'] =  (df_real['Available'] + df_real['Offline']/3)/3
df_real = df_real[['timestamp','latitude','longitude','taux_occup','taux_dispo']]
df_real['timestamp'] = pd.to_datetime(df_real['timestamp']).dt.tz_convert('UTC')
df_real.head(2)

Unnamed: 0,timestamp,latitude,longitude,taux_occup,taux_dispo
0,2020-01-22 00:00:00+00:00,48.855667,2.354089,0.666667,0.333333
1,2020-01-22 00:00:00+00:00,48.86424,2.397724,0.666667,0.333333


On définit les zones de notre découpage (appelées 'carrés') ainsi : chaque carré est délimité par les valeurs des latitudes uniformément réparties de 48.81 à 48.92 avec un pas de latitude de 0.005 (48.810, 48.815, 48.820,...,48.910, 48.915, 48.920). De même les longitudes sont réparties entre 2.255 et 2.420 avec le même pas (2.255, 2.260,...,2.415, 2.420).


In [4]:
#Données de comptage routier
data_traffic= pd.read_csv("csv/moyennes_par_carre_et_t1h2229.csv")
df_traffic = pd.DataFrame(data_traffic)

df_traffic.rename(columns={'t_1h': 'timestamp'}, inplace=True)
df_traffic['timestamp'] = pd.to_datetime(df_traffic['timestamp'], errors='coerce')
df_traffic['timestamp'] = df_traffic['timestamp'].dt.tz_localize('UTC')
df_traffic.head(5)


Unnamed: 0,timestamp,carre_id,moyenne_k,somme_k,count_k,lati,long
0,2020-01-22 01:00:00+00:00,0_16,0.325,0.65,2,48.8125,2.3375
1,2020-01-22 01:00:00+00:00,0_20,0.0,0.0,2,48.8125,2.3575
2,2020-01-22 01:00:00+00:00,10_1,1.478148,8.86889,6,48.8625,2.2625
3,2020-01-22 01:00:00+00:00,10_10,1.640304,18.04334,11,48.8625,2.3075
4,2020-01-22 01:00:00+00:00,10_11,2.367567,49.7189,21,48.8625,2.3125


In [5]:
# Fonction haversine, donnant la distance entre 2 points à partir de leurs coordonnées
def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # Rayon de la Terre en kilomètres
    dlat = radians(lat2 - lat1)
    dlon = radians(lon2 - lon1)
    a = sin(dlat / 2) ** 2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon / 2) ** 2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return R * c

In [6]:
# Définir les limites de la grille (basé sur Paris)
lat_min, lat_max = 48.81, 48.92
lon_min, lon_max = 2.255, 2.42
step = 0.005  # Taille d'un carré (grille de 0.005°)

# Créer les carrés de la grille
latitudes = np.arange(lat_min, lat_max, step)
longitudes = np.arange(lon_min, lon_max, step)

grille_data = []

In [7]:
# Charger le fichier GeoJSON des arrondissements de Paris
arrondissements_geojson = "arrondissements.geojson"  
arrondissements_gdf = gpd.read_file(arrondissements_geojson)  # Charger les arrondissements

In [8]:
# Coordonnées de Charenton-le-Pont et du Jardin d'Acclimatation
charenton_lat, charenton_lon = 48.8337, 2.4149
jardin_lat, jardin_lon = 48.8716, 2.2611


# Distance d'exclusion (en kilomètres)
exclusion_radius = 1  # 2 km autour de ces points

# Fonction pour vérifier si un carré est proche d'un des lieux d'exclusion
def is_near_exclusion(lat, lon):
    distance_to_charenton = haversine(lat, lon, charenton_lat, charenton_lon)
    distance_to_jardin = haversine(lat, lon, jardin_lat, jardin_lon)
    return distance_to_charenton < exclusion_radius or distance_to_jardin < exclusion_radius


In [9]:
def creer_grille(lat_min, lat_max, lon_min, lon_max, step):
    grille = []
    latitudes = np.arange(lat_min, lat_max, step)
    longitudes = np.arange(lon_min, lon_max, step)
    
    for i in range(len(latitudes) - 1):
        for j in range(len(longitudes) - 1):
            carre = {
                "id": f"{i}_{j}",  # Identifiant unique pour chaque carré
                "lat_min": latitudes[i],
                "lat_max": latitudes[i + 1],
                "lon_min": longitudes[j],
                "lon_max": longitudes[j + 1],
                "center_lat": latitudes[i] + step / 2,  # Centre du carré
                "center_lon": longitudes[j] + step / 2  # Centre du carré
            }
            grille.append(carre)
    return grille

In [10]:
from shapely.geometry import Point
from scipy.spatial import cKDTree
import numpy as np
import pandas as pd

def calculate_distances_multiple_metrics_optimized_occup(
    time_series, df_stations, arrondissements_gdf, lat_min, lat_max, lon_min, lon_max, step=0.005
):
    """
    Calcule les distances et métriques pour les stations les plus proches à chaque point de la grille,
    en utilisant des opérations optimisées et en se basant sur la fonction `creer_grille`.
    
    :param time_series: Liste ou tableau des timestamps.
    :param df_stations: DataFrame contenant les données des stations (avec un 'timestamp').
    :param arrondissements_gdf: GeoDataFrame des arrondissements de Paris.
    :param lat_min: Latitude minimale pour créer la grille.
    :param lat_max: Latitude maximale pour créer la grille.
    :param lon_min: Longitude minimale pour créer la grille.
    :param lon_max: Longitude maximale pour créer la grille.
    :param step: Pas de la grille en degrés.
    :return: DataFrame contenant les distances et métriques pour chaque carré à chaque timestamp.
    """
    # Création de la grille à partir des limites et du pas
    grille = creer_grille(lat_min, lat_max, lon_min, lon_max, step)
    
    # Conversion de la grille en DataFrame
    grille_df = pd.DataFrame(grille)
    
    # Filtrer les points de la grille en fonction de Paris
    grille_df['geometry'] = grille_df.apply(lambda row: Point(row['center_lon'], row['center_lat']), axis=1)
    grille_df = grille_df[grille_df['geometry'].apply(lambda p: arrondissements_gdf.geometry.contains(p).any())]

    # Centres valides de la grille
    valid_grid_centers = grille_df[['center_lat', 'center_lon', 'id']].values

    results = []

    for timestamp in time_series:
        # Stations disponibles pour l'instant donné
        df_stations_at_t = df_stations[df_stations['timestamp'] == timestamp].copy()

        if df_stations_at_t.empty:
            continue

        # Construire un arbre k-d pour rechercher efficacement les distances
        station_coords = df_stations_at_t[['latitude', 'longitude']].values
        kdtree = cKDTree(station_coords)

        # Trouver les 3 plus proches stations pour chaque point de la grille
        distances, indices = kdtree.query(valid_grid_centers[:, :2], k=3)  # distances: (N, 3), indices: (N, 3)

        # Calcul des métriques
        for i, (lat, lon, carre_id) in enumerate(valid_grid_centers):
            
            closest_1 = haversine(lat, lon, station_coords[indices[i, 0], 0], station_coords[indices[i, 0], 1])
            closest_2 = haversine(lat, lon, station_coords[indices[i, 1], 0], station_coords[indices[i, 1], 1])
            closest_3 = haversine(lat, lon, station_coords[indices[i, 2], 0], station_coords[indices[i, 2], 1])


            taux_occup_1 = df_stations_at_t.iloc[indices[i, 0]]['taux_occup']
            taux_occup_2 = df_stations_at_t.iloc[indices[i, 1]]['taux_occup']
            taux_occup_3 = df_stations_at_t.iloc[indices[i, 2]]['taux_occup']

            results.append({
                'timestamp': timestamp,
                'carre_id': carre_id,
                'center_lat': lat,
                'center_lon': lon,
                'distance_to_closest_1': closest_1,
                'taux_occup_closest_1': taux_occup_1,
                'distance_to_closest_2': closest_2,
                'taux_occup_closest_2': taux_occup_2,
                'distance_to_closest_3': closest_3,
                'taux_occup_closest_3': taux_occup_3,
                'normalized_distance': closest_1 / distances[:, 0].max() if distances[:, 0].max() > 0 else 0
            })

    return pd.DataFrame(results)


In [11]:
time_series = df_real['timestamp'].unique()

# Appeler la fonction avec les bons paramètres
result_df_occup = calculate_distances_multiple_metrics_optimized_occup(
    time_series=time_series,
    df_stations=df_real,
    arrondissements_gdf=arrondissements_gdf,
    lat_min=lat_min,
    lat_max=lat_max,
    lon_min=lon_min,
    lon_max=lon_max,
    step=step
)
result_df_occup['timestamp']=result_df_occup['timestamp']+ pd.Timedelta(hours=1) #décaler les données des stations d'une heure pour correspondre aux données de trafic

In [12]:
result_df_occup.head(10)

Unnamed: 0,timestamp,carre_id,center_lat,center_lon,distance_to_closest_1,taux_occup_closest_1,distance_to_closest_2,taux_occup_closest_2,distance_to_closest_3,taux_occup_closest_3,normalized_distance
0,2020-01-22 01:00:00+00:00,1_15,48.8175,2.3325,0.829681,0.666667,0.89572,0.333333,1.278762,0.333333,42.012806
1,2020-01-22 01:00:00+00:00,1_16,48.8175,2.3375,0.767356,0.666667,0.58303,0.333333,1.089418,0.333333,38.856816
2,2020-01-22 01:00:00+00:00,1_17,48.8175,2.3425,0.381517,0.333333,0.870231,0.666667,1.003205,0.333333,19.318955
3,2020-01-22 01:00:00+00:00,1_18,48.8175,2.3475,0.468196,0.333333,1.045944,0.333333,1.092598,0.666667,23.708146
4,2020-01-22 01:00:00+00:00,1_20,48.8175,2.3575,0.979265,0.166667,1.081931,0.333333,1.758763,0.333333,49.587318
5,2020-01-22 01:00:00+00:00,1_21,48.8175,2.3625,0.735509,0.166667,1.712055,0.333333,1.431185,0.333333,37.244163
6,2020-01-22 01:00:00+00:00,1_22,48.8175,2.3675,0.625296,0.166667,1.742692,0.333333,1.787184,0.333333,31.663296
7,2020-01-22 01:00:00+00:00,2_12,48.8225,2.3175,1.012358,0.666667,1.114363,0.333333,1.615456,0.25,51.263073
8,2020-01-22 01:00:00+00:00,2_13,48.8225,2.3225,0.962238,0.333333,1.010981,0.666667,1.071818,0.666667,48.725127
9,2020-01-22 01:00:00+00:00,2_14,48.8225,2.3275,0.936985,0.333333,0.716461,0.666667,1.134564,0.666667,47.446375


In [13]:
df_traffic.head(10)

Unnamed: 0,timestamp,carre_id,moyenne_k,somme_k,count_k,lati,long
0,2020-01-22 01:00:00+00:00,0_16,0.325,0.65,2,48.8125,2.3375
1,2020-01-22 01:00:00+00:00,0_20,0.0,0.0,2,48.8125,2.3575
2,2020-01-22 01:00:00+00:00,10_1,1.478148,8.86889,6,48.8625,2.2625
3,2020-01-22 01:00:00+00:00,10_10,1.640304,18.04334,11,48.8625,2.3075
4,2020-01-22 01:00:00+00:00,10_11,2.367567,49.7189,21,48.8625,2.3125
5,2020-01-22 01:00:00+00:00,10_12,2.206557,33.09835,15,48.8625,2.3175
6,2020-01-22 01:00:00+00:00,10_13,2.294744,29.83167,13,48.8625,2.3225
7,2020-01-22 01:00:00+00:00,10_14,1.412666,7.06333,5,48.8625,2.3275
8,2020-01-22 01:00:00+00:00,10_15,2.18628,21.8628,10,48.8625,2.3325
9,2020-01-22 01:00:00+00:00,10_16,1.849908,11.09945,6,48.8625,2.3375


In [14]:
merged_df = pd.merge(result_df_occup, df_traffic, left_on=['carre_id', 'timestamp'], right_on=['carre_id', 'timestamp'], how='inner')
merged_df.drop(['somme_k','count_k','center_lat','center_lon'],axis =1, inplace= True)
merged_df['moyenne_k'] = merged_df['moyenne_k'].fillna(0)
merged_df.head(20)

Unnamed: 0,timestamp,carre_id,distance_to_closest_1,taux_occup_closest_1,distance_to_closest_2,taux_occup_closest_2,distance_to_closest_3,taux_occup_closest_3,normalized_distance,moyenne_k,lati,long
0,2020-01-22 01:00:00+00:00,1_15,0.829681,0.666667,0.89572,0.333333,1.278762,0.333333,42.012806,2.515874,48.8175,2.3325
1,2020-01-22 01:00:00+00:00,1_16,0.767356,0.666667,0.58303,0.333333,1.089418,0.333333,38.856816,2.9625,48.8175,2.3375
2,2020-01-22 01:00:00+00:00,1_17,0.381517,0.333333,0.870231,0.666667,1.003205,0.333333,19.318955,0.758611,48.8175,2.3425
3,2020-01-22 01:00:00+00:00,1_18,0.468196,0.333333,1.045944,0.333333,1.092598,0.666667,23.708146,0.157334,48.8175,2.3475
4,2020-01-22 01:00:00+00:00,1_20,0.979265,0.166667,1.081931,0.333333,1.758763,0.333333,49.587318,1.810644,48.8175,2.3575
5,2020-01-22 01:00:00+00:00,1_21,0.735509,0.166667,1.712055,0.333333,1.431185,0.333333,37.244163,0.586459,48.8175,2.3625
6,2020-01-22 01:00:00+00:00,1_22,0.625296,0.166667,1.742692,0.333333,1.787184,0.333333,31.663296,3.083333,48.8175,2.3675
7,2020-01-22 01:00:00+00:00,2_12,1.012358,0.666667,1.114363,0.333333,1.615456,0.25,51.263073,1.124872,48.8225,2.3175
8,2020-01-22 01:00:00+00:00,2_13,0.962238,0.333333,1.010981,0.666667,1.071818,0.666667,48.725127,0.743269,48.8225,2.3225
9,2020-01-22 01:00:00+00:00,2_14,0.936985,0.333333,0.716461,0.666667,1.134564,0.666667,47.446375,2.056635,48.8225,2.3275


On définit un taux d'occupation moyen par carré, selon le nombre de stations voisines comptabilisés. 

Le taux d'occupation moyen $occup_{moy,p}$ d'un carré à l'instant $t$ en comptant ses $p$ stations les plus proches est 
 $$ occup_{moy,p}(t)= \frac{1}{p} \sum_{i=1}^p occup_i(t)  \frac{distance_i}{l_{carré}} $$
où $distance_i$ est la distance du centre du carré à la $i$ème station la plus proche et $l_{carré}$ est la demie-largeur d'un carré. On calcule $l_{carré}$ ci-dessous.

In [15]:
l_carre = haversine (48.8225,2.3725,48.8225,2.375)
print('La demie-largeur d\'un carré est égale à environ {} m.'.format(int(l_carre*1000)))


La demie-largeur d'un carré est égale à environ 183 m.


In [16]:
#Calcul des occupations moyennes par la formule précédente pour 1,2,3 voisins

merged_df['occup_moy_1'] = merged_df['taux_occup_closest_1'] * (merged_df['distance_to_closest_1'] / l_carre)

merged_df['occup_moy_2'] = (1 / 2) * (merged_df['taux_occup_closest_1'] * (merged_df['distance_to_closest_1'] / l_carre) 
                                      + merged_df['taux_occup_closest_2'] * (merged_df['distance_to_closest_2'] / l_carre))

merged_df['occup_moy_3'] = (1 / 3) * (merged_df['taux_occup_closest_1'] * (merged_df['distance_to_closest_1'] / l_carre) 
                                      + merged_df['taux_occup_closest_2'] * (merged_df['distance_to_closest_2'] / l_carre) 
                                      + merged_df['taux_occup_closest_3'] * (merged_df['distance_to_closest_3'] / l_carre))




merged_df=merged_df[['timestamp','carre_id','lati','long','moyenne_k','occup_moy_1','occup_moy_2','occup_moy_3']]
merged_df.head(20)



Unnamed: 0,timestamp,carre_id,lati,long,moyenne_k,occup_moy_1,occup_moy_2,occup_moy_3
0,2020-01-22 01:00:00+00:00,1_15,48.8175,2.3325,2.515874,3.022103,2.326714,2.327455
1,2020-01-22 01:00:00+00:00,1_16,48.8175,2.3375,2.9625,2.795084,1.928462,1.947006
2,2020-01-22 01:00:00+00:00,1_17,48.8175,2.3425,0.758611,0.694834,1.932319,1.897239
3,2020-01-22 01:00:00+00:00,1_18,48.8175,2.3475,0.157334,0.852698,1.378808,2.245796
4,2020-01-22 01:00:00+00:00,1_20,48.8175,2.3575,1.810644,0.89174,1.4311,2.021778
5,2020-01-22 01:00:00+00:00,1_21,48.8175,2.3625,0.586459,0.66977,1.893919,2.131458
6,2020-01-22 01:00:00+00:00,1_22,48.8175,2.3675,3.083333,0.569408,1.871637,2.332724
7,2020-01-22 01:00:00+00:00,2_12,48.8225,2.3175,1.124872,3.687502,2.858514,2.641211
8,2020-01-22 01:00:00+00:00,2_13,48.8225,2.3225,0.743269,1.75247,2.717478,3.113013
9,2020-01-22 01:00:00+00:00,2_14,48.8225,2.3275,2.056635,1.706478,2.158089,2.81627


On a le dataframe final contenant les données de comptage et d'occupation des stations, agrégées par `timestamp`et par carré.

In [17]:
merged_df.to_csv("Final.csv")