**Purpose of the notebook: Calculate the numbers of stations in a 5km radius.**

In [1]:
import os
os.chdir("C:/Users/zetru/OneDrive/Mémoire python/")
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
df=pd.read_hdf('df_v4.h5','df_v4')
df

Unnamed: 0,cp,id,nom_carburant,id_carburant,maj,valeur,litre_brut,type,latitude,longitude,ville
0,01,1000001,Gazole,1,2022-01-03,0.702500,0.435626,R,46.201,5.198,SAINT-DENIS-LèS-BOURG
1,01,1000001,Gazole,1,2022-01-05,0.692500,0.447949,R,46.201,5.198,SAINT-DENIS-LèS-BOURG
2,01,1000001,Gazole,1,2022-01-10,0.728333,0.452874,R,46.201,5.198,SAINT-DENIS-LèS-BOURG
3,01,1000001,Gazole,1,2022-01-17,0.739167,0.484043,R,46.201,5.198,SAINT-DENIS-LèS-BOURG
4,01,1000001,Gazole,1,2022-01-21,0.769167,0.497616,R,46.201,5.198,SAINT-DENIS-LèS-BOURG
...,...,...,...,...,...,...,...,...,...,...,...
4537444,95,95870010,SP98,6,2022-12-23,0.856233,0.488413,R,48.936,2.206,Bezons
4537445,95,95870010,SP98,6,2022-12-27,0.874567,0.487324,R,48.936,2.206,Bezons
4537446,95,95870010,SP98,6,2022-12-28,0.874567,0.484175,R,48.936,2.206,Bezons
4537447,95,95870010,SP98,6,2022-12-29,0.879567,0.477600,R,48.936,2.206,Bezons


In [2]:
from scipy.spatial import cKDTree

def degrees_to_radians(df):
    df.loc[:, 'latitude_rad'] = np.radians(df['latitude'])
    df.loc[:, 'longitude_rad'] = np.radians(df['longitude'])
    return df

def calculate_competition(unique_df, all_df, radius_km, station_type=['A', 'R']):
    radius_in_radians = radius_km / 6371.0088  # Rayon moyen de la terre en km
    
    # Filtrer les données uniques pour ne garder que les types spécifiés
    filtered_unique_df = unique_df[unique_df['type'].isin(station_type)]
    
    # Conversion des degrés en radians
    filtered_unique_df = degrees_to_radians(filtered_unique_df)
    coordinates = filtered_unique_df[['latitude_rad', 'longitude_rad']].values
    
    # Création de l'arbre KD à partir des coordonnées filtrées
    tree = cKDTree(coordinates)
    
    # Préparation des données uniques pour l'ensemble des stations
    unique_df = degrees_to_radians(unique_df)
    all_coordinates = unique_df[['latitude_rad', 'longitude_rad']].values
    
    # Recherche des voisins dans le rayon pour chaque point dans l'ensemble filtré
    counts = tree.query_ball_point(all_coordinates, r=radius_in_radians)
    
    # Calcul du nombre de stations du même type dans le rayon
    unique_df[f'stations_rayon_{radius_km}km'] = [len(count) - 1 for count in counts]

    all_df = all_df.merge(unique_df[['id', f'stations_rayon_{radius_km}km']], on='id', how='left')
    
    return all_df

unique_stations = df[['id', 'latitude', 'longitude', 'type']].drop_duplicates(subset='id')

df_stations = calculate_competition(unique_stations, df, 5)


In [3]:
df_stations

Unnamed: 0,cp,id,nom_carburant,id_carburant,maj,valeur,litre_brut,type,latitude,longitude,ville,stations_rayon_5km
0,01,1000001,Gazole,1,2022-01-03,0.702500,0.435626,R,46.201,5.198,SAINT-DENIS-LèS-BOURG,5
1,01,1000001,Gazole,1,2022-01-05,0.692500,0.447949,R,46.201,5.198,SAINT-DENIS-LèS-BOURG,5
2,01,1000001,Gazole,1,2022-01-10,0.728333,0.452874,R,46.201,5.198,SAINT-DENIS-LèS-BOURG,5
3,01,1000001,Gazole,1,2022-01-17,0.739167,0.484043,R,46.201,5.198,SAINT-DENIS-LèS-BOURG,5
4,01,1000001,Gazole,1,2022-01-21,0.769167,0.497616,R,46.201,5.198,SAINT-DENIS-LèS-BOURG,5
...,...,...,...,...,...,...,...,...,...,...,...,...
4537444,95,95870010,SP98,6,2022-12-23,0.856233,0.488413,R,48.936,2.206,Bezons,15
4537445,95,95870010,SP98,6,2022-12-27,0.874567,0.487324,R,48.936,2.206,Bezons,15
4537446,95,95870010,SP98,6,2022-12-28,0.874567,0.484175,R,48.936,2.206,Bezons,15
4537447,95,95870010,SP98,6,2022-12-29,0.879567,0.477600,R,48.936,2.206,Bezons,15


In [6]:
df_stations['stations_rayon_5km'].quantile(0.75)

9.0

In [27]:
labels = [1, 2, 3, 4]
df_stations['competition_geo'] = pd.qcut(df_stations['stations_rayon_5km'], q=4, labels=labels)

In [28]:
df_stations

Unnamed: 0,cp,id,nom_carburant,id_carburant,maj,valeur,litre_brut,type,latitude,longitude,ville,stations_rayon_5km,competition_geo
0,01,1000001,Gazole,1,2022-01-03,0.702500,0.435626,R,46.201,5.198,SAINT-DENIS-LèS-BOURG,5,3
1,01,1000001,Gazole,1,2022-01-05,0.692500,0.447949,R,46.201,5.198,SAINT-DENIS-LèS-BOURG,5,3
2,01,1000001,Gazole,1,2022-01-10,0.728333,0.452874,R,46.201,5.198,SAINT-DENIS-LèS-BOURG,5,3
3,01,1000001,Gazole,1,2022-01-17,0.739167,0.484043,R,46.201,5.198,SAINT-DENIS-LèS-BOURG,5,3
4,01,1000001,Gazole,1,2022-01-21,0.769167,0.497616,R,46.201,5.198,SAINT-DENIS-LèS-BOURG,5,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4537444,95,95870010,SP98,6,2022-12-23,0.856233,0.488413,R,48.936,2.206,Bezons,15,4
4537445,95,95870010,SP98,6,2022-12-27,0.874567,0.487324,R,48.936,2.206,Bezons,15,4
4537446,95,95870010,SP98,6,2022-12-28,0.874567,0.484175,R,48.936,2.206,Bezons,15,4
4537447,95,95870010,SP98,6,2022-12-29,0.879567,0.477600,R,48.936,2.206,Bezons,15,4


In [29]:
df_stations['stations_rayon_5km'].idxmax()

376568

In [30]:
df_a=df_stations[df_stations['type']=='A']
df_r=df_stations[df_stations['type']=='R']

In [31]:
df_a['stations_rayon_5km'].describe()

count    380270.000000
mean          3.636705
std           3.967912
min           0.000000
25%           1.000000
50%           2.000000
75%           4.000000
max          23.000000
Name: stations_rayon_5km, dtype: float64

In [32]:
df_r['stations_rayon_5km'].describe()

count    4.157179e+06
mean     6.420914e+00
std      6.229963e+00
min      0.000000e+00
25%      2.000000e+00
50%      4.000000e+00
75%      9.000000e+00
max      3.600000e+01
Name: stations_rayon_5km, dtype: float64

In [33]:
df_stations.sample(10)

Unnamed: 0,cp,id,nom_carburant,id_carburant,maj,valeur,litre_brut,type,latitude,longitude,ville,stations_rayon_5km,competition_geo
3125462,69,69330006,Gazole,1,2022-12-08,0.999967,0.452857,R,45.756,5.067,PUSIGNAN,2,1
541296,14,14100007,E10,5,2022-04-29,0.903967,0.646327,R,49.133,0.2245,LISIEUX,3,2
3661539,78,78230002,E85,3,2022-07-28,0.780867,0.676483,R,48.881,2.097,Le Pecq,11,4
1784459,40,40700004,SP95,2,2022-06-03,1.2673,0.737303,R,43.657,-0.58,Hagetmau,2,1
2177595,51,51000001,E85,3,2022-01-22,0.4642,0.497616,R,48.942,4.389,CHâLONS-EN-CHAMPAGNE,3,2
2384551,56,56860004,E85,3,2022-04-12,0.672533,0.606473,R,47.65,-2.722,SENE,6,3
4015320,85,85150001,Gazole,1,2022-01-04,0.638333,0.442323,R,46.607,-1.664,LES ACHARDS,1,1
774670,21,21490003,E85,3,2022-05-07,0.697533,0.678488,A,47.422,5.17,BROGNON,1,1
2031041,45,45520003,E10,5,2022-03-10,1.2248,0.65574,A,47.98,1.856,GIDY,3,2
2531846,59,59310001,E85,3,2022-10-13,0.880867,0.612185,R,50.47,3.246,Orchies,3,2


In [34]:
df_stations['competition_geo']=df_stations['competition_geo'].astype(object)

In [35]:
df_stations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4537449 entries, 0 to 4537448
Data columns (total 13 columns):
 #   Column              Dtype         
---  ------              -----         
 0   cp                  object        
 1   id                  object        
 2   nom_carburant       object        
 3   id_carburant        object        
 4   maj                 datetime64[ns]
 5   valeur              float64       
 6   litre_brut          float64       
 7   type                object        
 8   latitude            float64       
 9   longitude           float64       
 10  ville               object        
 11  stations_rayon_5km  int64         
 12  competition_geo     object        
dtypes: datetime64[ns](1), float64(4), int64(1), object(7)
memory usage: 450.0+ MB


In [36]:
df_stations.to_hdf('df_v5.h5',key='df_v5',mode='w')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block3_values] [items->Index(['cp', 'id', 'nom_carburant', 'id_carburant', 'type', 'ville',
       'competition_geo'],
      dtype='object')]

  df_stations.to_hdf('df_v5.h5',key='df_v5',mode='w')
