Auteur: Michaël Leroy

# Contenu:

[+] Données

    + Chargement des open datas + sauvegarde locale

    + Agrégation des tables:

        + Communes:
            + Communes France métropolitaine, données démographiques, géométrie, nb véhicules, nb VE, code insee, région, département, pourcentage VE/habitants
        * Bornes de charge:
            + dédoublonnage des pdc ayant les mêmes géolocalisations et caractéristiques
            + toutes les caractéristiques regroupées dans popup_html
            + puissance_nominale et nb_pdc par bornes
            
        * toutes les agrégations sur le code commune insee, 
        * Zone france métro + corse (compatible avec trajet routier)

[+] Communes:

    date_arrete:
            2020-12-31
            2021-03-31
            2021-06-30
            2021-09-30
            2021-12-31
            2022-03-31
            2022-06-30
            2022-09-30
            2022-12-31


    <class 'geopandas.geodataframe.GeoDataFrame'>
    Int64Index: 34821 entries, 0 to 34954
    Data columns (total 8 columns):
    #   Column        Non-Null Count  Dtype   
    ---  ------        --------------  -----   
    0   insee         34821 non-null  object  
    1   nom           34821 non-null  object  
    2   geometry      34821 non-null  geometry
    3   dep           34821 non-null  object  
    4   dep_name      34821 non-null  object  
    5   region_name   34821 non-null  object  
    6   VE_per_inhab  34815 non-null  float64 
    7   html_popup    34821 non-null  object  
    dtypes: float64(1), geometry(1), object(6)
    memory usage: 2.4+ MB

[+] Bornes

    <class 'geopandas.geodataframe.GeoDataFrame'>
    RangeIndex: 17623 entries, 0 to 17622
    Data columns (total 4 columns):
    #   Column              Non-Null Count  Dtype   
    ---  ------              --------------  -----   
    0   nbre_pdc            17623 non-null  float64 
    1   puissance_nominale  17623 non-null  float64 
    2   geometry            17623 non-null  geometry
    3   html_popup          17623 non-null  object  
    dtypes: float64(2), geometry(1), object(1)
    memory usage: 550.8+ KB

In [None]:
# from tqdm.notebook import tqdm
# from tqdm import tqdm

import os
os.environ['USE_PYGEOS'] = '0'

# Data management
import pandas as pd
import geopandas as gpd
import numpy as np


# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from folium.features import Choropleth
from folium.plugins import MarkerCluster

# Preprocessing
from sklearn.preprocessing import StandardScaler
import umap


# I/O
import gc
import io, requests
import zipfile, shutil
import joblib

# tqdm().pandas()

# data_path = 'C:/Users/demo/Desktop/Lattitude/datas/'
data_path = 'datas'
os.makedirs(data_path, exist_ok=True)

# Datas

## Bornes de recharge

In [None]:
# Charging-point dataset published on data.gouv.fr, cached locally as feather.
file_name = 'consolidation-etalab-schema-irve-statique-v-2.2.0-20230327'
ext = '.json'
url = 'https://www.data.gouv.fr/fr/datasets/r/7eee8f09-5d1b-4f48-a304-5e99e8da1e26'

try:
    print('Loading data from local file...')
    bornes = gpd.read_feather(os.path.join(data_path, file_name + '.feather'))
except Exception:
    # No usable local cache: fall back to the remote file.
    # `Exception` (not a bare except) so that Ctrl-C / SystemExit still
    # interrupt the cell instead of silently triggering a download.
    print('Loading data from url...')
    bornes = gpd.read_file(url)

    # Cache the result so the next run does not download again.
    print('Saving data to local file...')
    bornes.to_feather(os.path.join(data_path, file_name + '.feather'))

# info() prints its report itself and returns None, so there is nothing to
# pass to display().
bornes.info()

In [None]:
# Preview map centred on the first row of `bornes`.
# The coordinate pair is reversed before being handed to folium
# (presumably stored as lon/lat, while folium expects [lat, lon] - confirm).
first_geometry = bornes.iloc[0].geometry
centroid = list(first_geometry.centroid.coords[0])
lat_lon = centroid[::-1]
display(centroid, lat_lon)

m = folium.Map(location=list(lat_lon), zoom_start=7)
display(m)
del m, first_geometry, lat_lon

In [None]:
def convert_strings_bools(val):
    """Map the strings 'true' / 'false' (any casing) to real booleans.

    Args:
        val: A single cell value of any type.

    Returns:
        ``True`` or ``False`` when *val* is a string spelling a boolean,
        otherwise *val* unchanged. Non-string values (None, NaN, numbers,
        booleans) are passed through instead of raising ``AttributeError``.
    """
    # Only strings have .lower(); everything else is returned untouched.
    if not isinstance(val, str):
        return val

    lowered = val.lower()
    if lowered == 'true':
        return True
    if lowered == 'false':
        return False
    return val
    
# Find every column holding at least one string equal to 'true' / 'false'
# (case-insensitive), convert those strings to booleans and cast the whole
# column to bool.
# `bool_columns` is a boolean Series indexed by column name (result of .any()).
bool_columns = bornes.applymap(lambda x: isinstance(x, str) and x.lower() in ['true', 'false']).any()
# NOTE(review): a column is selected as soon as ONE cell matches; astype(bool)
# then goes by truthiness, so other strings / missing values in such a column
# presumably end up as True or False rather than staying missing - confirm.
bornes.loc[:, bool_columns] = bornes.loc[:, bool_columns].applymap(convert_strings_bools).astype(bool)
bornes.info()
   

In [None]:
# Promote object columns to float whenever every value parses as a number;
# columns for which the cast raises ValueError are left as they are.
for name in bornes.select_dtypes(include='object'):
    try:
        bornes[name] = bornes[name].astype('float')
    except ValueError:
        pass

bornes.info()

## Types de voitures

In [None]:
# Cars per commune and per energy type (GeoJSON export), cached as feather.
file_name = 'voitures-par-commune-par-energie'
ext = '.geojson'
url = 'http://opendata.agenceore.fr/explore/dataset/voitures-par-commune-par-energie/download?format=geojson&timezone=Europe/Berlin&use_labels_for_header=false'

try:
    print('Loading data from local file...')
    cars = gpd.read_feather(os.path.join(data_path, file_name + '.feather'))
except Exception:
    # No usable local cache: download. `Exception` rather than a bare except
    # so that Ctrl-C / SystemExit are not swallowed.
    print('Loading data from url...')
    response = requests.get(url)
    # Fail here on an HTTP error instead of parsing (and caching) an error page.
    response.raise_for_status()
    cars = gpd.read_file(io.StringIO(response.content.decode('utf-8')))

    # Cache locally to avoid repeated downloads (lower greenhouse-gas footprint).
    print('Saving data to local file...')
    cars.to_feather(os.path.join(data_path, file_name + '.feather'))

# info() prints its report and returns None; printing the return value would
# only add a stray "None".
cars.info()

## Découpage  Administratif


In [None]:


# Administrative boundaries file from data.gouv.fr, cached as feather.
file_name = 'decoupage_administratif'
ext = '.json'
url ='https://www.data.gouv.fr/fr/datasets/r/fb3580f6-e875-408d-809a-ad22fc418581'


try:
    print('Loading data from local file...')
    decoupage = gpd.read_feather(os.path.join(data_path, file_name + '.feather'))
except Exception:
    # No usable local cache: download. `Exception` rather than a bare except
    # so that Ctrl-C / SystemExit are not swallowed.
    print('Loading data from url...')

    response = requests.get(url)
    # Fail here on an HTTP error instead of parsing (and caching) an error page.
    response.raise_for_status()
    decoupage = gpd.read_file(io.StringIO(response.content.decode('utf-8')))

    print('Saving data to local file...')
    decoupage.to_feather(os.path.join(data_path, file_name + '.feather'))

decoupage.info()




## Départements et régions

In [None]:
# Department / region lookup table (CSV) from data.gouv.fr, cached as feather.
file_name = 'depatements-regions'
ext = '.csv'
url ='https://www.data.gouv.fr/fr/datasets/r/987227fb-dcb2-429e-96af-8979f97c9c84'


try:
    print('Loading data from local file...')
    regions = pd.read_feather(os.path.join(data_path, file_name + '.feather'))
except Exception:
    # No usable local cache: download. `Exception` rather than a bare except
    # so that Ctrl-C / SystemExit are not swallowed.
    print('Loading data from url...')

    response = requests.get(url)
    # Fail here on an HTTP error instead of parsing (and caching) an error page.
    response.raise_for_status()
    regions = pd.read_csv(io.StringIO(response.content.decode('utf-8')))

    print('Saving data to local file...')
    regions.to_feather(os.path.join(data_path, file_name + '.feather'))

regions.info()

In [None]:
# Commune polygons (zipped shapefile) from data.gouv.fr, cached as feather.
file_name = 'communes-20220101'
ext = '.shp'
url ='https://www.data.gouv.fr/fr/datasets/r/0e117c06-248f-45e5-8945-0e79d9136165'
temp_path = 'temp_unzip'


try:
    print('Loading data from local file...')
    communes = gpd.read_feather(os.path.join(data_path, file_name + '.feather'))
except Exception:
    # No usable local cache: download. `Exception` rather than a bare except
    # so that Ctrl-C / SystemExit are not swallowed.
    print('Loading data from url...')
    # Zip file from url
    zip_file = requests.get(url)
    # Fail here on an HTTP error instead of trying to unzip an error page.
    zip_file.raise_for_status()
    os.makedirs(temp_path, exist_ok=True)
    try:
        with zipfile.ZipFile(io.BytesIO(zip_file.content)) as archive:
            archive.extractall(temp_path)
        communes = gpd.read_file(os.path.join(temp_path, file_name + ext))
    finally:
        # Always remove the scratch directory, even when unzip/read fails.
        shutil.rmtree(temp_path)

    print('Saving data to local file...')
    communes.to_feather(os.path.join(data_path, file_name + '.feather'))

communes.info()


# Données démographiques

In [None]:



# Population per commune (zipped CSV from insee.fr), cached as feather.
file_name = 'Communes'
ext = '.csv'
url ='https://www.insee.fr/fr/statistiques/fichier/4265429/ensemble.zip'
temp_path = 'temp_unzip'


try:
    print('Loading data from local file...')
    pop_communes = pd.read_feather(os.path.join(data_path, file_name + '.feather'))
except Exception:
    # No usable local cache: download. `Exception` rather than a bare except
    # so that Ctrl-C / SystemExit are not swallowed.
    print('Loading data from url...')
    # Zip file from url
    zip_file = requests.get(url)
    # Fail here on an HTTP error instead of trying to unzip an error page.
    zip_file.raise_for_status()
    os.makedirs(temp_path, exist_ok=True)
    try:
        with zipfile.ZipFile(io.BytesIO(zip_file.content)) as archive:
            archive.extractall(temp_path)
        pop_communes = pd.read_csv(os.path.join(temp_path, file_name + ext), sep=';')
    finally:
        # Always remove the scratch directory, even when unzip/read fails.
        shutil.rmtree(temp_path)

    # Rename the commune-code column before caching, so the cached file
    # already carries the `insee` key used by the joins.
    pop_communes.rename(columns={'DEPCOM': 'insee'}, inplace=True)

    print('Saving data to local file...')
    pop_communes.to_feather(os.path.join(data_path, file_name + '.feather'))

pop_communes.info()


--------------------

# Join

## Communes and cars

In [None]:
display(cars.head(2), cars.shape, len(set(cars.codgeo)))

In [None]:
# Right-hand tables, both indexed by the `insee` commune code.
pop_by_insee = pop_communes.drop('COM', axis=1).set_index('insee')
cars_by_insee = (
    cars.rename(columns={'codgeo': 'insee'})
    .drop('geometry', axis=1)
    .set_index('insee')
)

# communes <- population (one row per commune) <- cars (several rows per
# commune allowed); `validate` makes pandas check those cardinalities.
datas = (
    communes
    .join(pop_by_insee, on='insee', how='left', validate='1:1')
    .join(cars_by_insee, on='insee', how='left', validate='1:m')
)

del pop_communes, pop_by_insee, cars_by_insee
datas.info()

## regions

Création de la colonne dep en prenant les deux premiers caractères du code insee

In [None]:
# `dep` = first two characters of the `insee` code.
datas['dep'] = [code[:2] for code in datas['insee']]

dep_codes = set(datas['dep'])
display(regions.head(2), regions.shape, len(dep_codes))

# Department numbers present in `regions` but absent from `datas`.
set(regions['num_dep']) - dep_codes

In [None]:
# Attach the columns of `regions` to each commune through its `dep` code
# (many communes per department, hence validate='m:1').
regions_by_dep = regions.set_index('num_dep').rename_axis('dep')
datas = datas.join(regions_by_dep, on='dep', how='left', validate='m:1')
del regions_by_dep

datas.info()

# Create some metrics on electric vehicles

In [None]:
# Ratios of nb_vp_rechargeables_el to nb_vp, PMUN and surf_ha
# (presumably total cars, municipal population and area in hectares - confirm).
rechargeable = datas['nb_vp_rechargeables_el']
datas['VE_pct'] = rechargeable / datas['nb_vp']
datas['VE_per_inhab'] = rechargeable / datas['PMUN']
datas['VE_per_ha'] = rechargeable / datas['surf_ha']
del rechargeable


# Create html popup column

In [None]:
def single_popup(df, info_cols=None):
    """Build one HTML snippet per row, listing ``column: value`` pairs.

    Args:
        df: DataFrame whose rows are rendered.
        info_cols: Column names to show, in order. When omitted, the
            module-level ``communes_info_cols`` list is used, which keeps
            existing ``single_popup(df)`` calls working unchanged.

    Returns:
        A list of HTML strings, one per row of *df*, in row order. Each
        string is a sequence of ``<b>col:</b> value<br>`` fragments.
        Values are inserted as-is (not HTML-escaped).
    """
    if info_cols is None:
        # Backward-compatible default: the global list is looked up at call time.
        info_cols = communes_info_cols

    return [
        ''.join(f'<b>{col}:</b> {row[col]}<br>' for col in info_cols)
        for _, row in df.iterrows()
    ]

# Columns shown in the commune popup: every column of `datas` except the
# ones listed below.
communes_info_cols = list(datas.columns)
for hidden in ('geometry', 'wikipedia', 'libgeo', 'nb_vp_rechargeables_gaz'):
    communes_info_cols.remove(hidden)

datas['html_popup'] = single_popup(datas)
datas['html_popup'].head(2)

In [None]:
datas.columns

## Save communes par date_arrete

In [None]:
# Export the commune table as one feather file per (date_arrete, region) and
# record the produced file names in a catalog dumped with joblib.
file_name = 'dataset_communes'

# Maps each date_arrete value to the list of file names written for it.
file_name_dict = dict()

# Iterate over the distinct date_arrete values in sorted order.
for date in datas.date_arrete.value_counts().sort_index().index:
    print(date)
    tmp = datas.query("date_arrete == @date").copy()
    # Drop the listed columns; the commented-out names are the ones kept.
    tmp.drop(
        [
        # 'insee',
        # 'nom', 
        'wikipedia', 
        'surf_ha', 
        # 'geometry', 
        'PMUN', 
        'PCAP',
        'PTOT', 
        'nb_vp', 
        'libepci', 
        'libgeo', 
        'nb_vp_rechargeables_gaz',
        'date_arrete', 
        'epci', 
        'nb_vp_rechargeables_el', 
        # 'dep', 
        # 'dep_name',
        # 'region_name', 
        'VE_pct', 
        # 'VE_per_inhab', 
        'VE_per_ha', 
        # 'html_popup'
        ], 
        axis=1, 
        inplace=True
    )

    # Rows without a region_name (no match in the regions join) are discarded.
    tmp.dropna( subset='region_name', axis=0, inplace=True) 

    # NOTE(review): fillna() returns a new frame and the result is not
    # assigned, so this line currently has no effect on `tmp`.
    tmp.fillna('n.r.') 
    
    # Save by region to feather and keep a file catalog for future use
    list_files = []
    for region in tmp.region_name.unique():
        # print(region)
        # Only the first 5 characters of the region name go into the file name.
        region_cut = region[:5]
        file_ = f'{file_name}_{region_cut}_{date}.feather'
        list_files.append(file_)
        tmp.query("region_name == @region")\
            .to_feather(os.path.join(data_path, file_)
                    # compression='zstd'
        )
    file_name_dict[date] = list_files   
# dump catalog to disk
joblib.dump(file_name_dict, os.path.join(data_path,'file_catalog.joblib'))     
# From here on `tmp` is the subset of the LAST date processed by the loop.
tmp.plot()
display(tmp.info())
del tmp

# bornes de charge

In [None]:
bornes.info()

In [None]:
bornes.sort_values(by='date_mise_en_service', inplace=True)
display(bornes.head(2), bornes.shape, len(set(bornes.consolidated_code_postal)))

En reprenant les codes insee et les polygones de datas, création de la colonne insee dans bornes

In [None]:
# Keep only the columns needed for the spatial join
# (plain GeoDataFrame slices - no dask object is created here).
com_ = communes[['insee','geometry']]
bor_ = bornes[['consolidated_code_postal','geometry']]


# display(com_.shape[0], bor_.shape[0])

# Right spatial join: for each charging point, look up the commune polygon
# that contains it and take that commune's `insee` code. The assignment
# relies on index alignment between the join result and `bornes`.
# NOTE(review): .astype(str) turns a missing match (NaN) into the string
# 'nan', so a later `bornes.insee.isna()` test cannot detect unmatched points.
bornes['insee'] = gpd.sjoin(com_, bor_,  how='right', predicate='contains',lsuffix='_com', rsuffix='_bor')['insee'].astype(str)
bornes.info()

In [None]:
# Add dep, dep_name, region_name

bornes['dep'] = bornes['insee'].apply(lambda s : s[0:2])

bornes = bornes.join(regions.rename(columns={'num_dep': 'dep'}).set_index('dep'), on='dep', how='left', validate='m:1')



# del regions, communes, com_, bor_
bornes.info()

In [None]:
# Check the rows that received no commune code.
# `insee` was built with `.astype(str)`, so a missing match is stored as the
# literal string 'nan' and `isna()` alone never flags it; test for both.
missing_insee = bornes.insee.isna() | (bornes.insee == 'nan')
bornes[missing_insee].consolidated_is_lon_lat_correct.value_counts()

## Dédoublonnage bornes de charge

In [None]:
# bornes.columns

In [None]:
tmp = bornes.drop(columns=[
    'coordonneesXY',
    'observations',
    'date_maj', 
    'last_modified', 'datagouv_dataset_id',
       'datagouv_resource_id', 'datagouv_organization_or_owner',
       'consolidated_longitude', 'consolidated_latitude',
       'consolidated_code_postal', 'consolidated_commune',
       'consolidated_is_lon_lat_correct',
       'consolidated_is_code_insee_verified',
])

In [None]:
tmp.info()

In [None]:
tmp.dropna( subset='region_name', inplace=True) 

In [None]:
tmp = tmp.cx[-5:10, 41:54]
tmp.plot()

In [None]:
# tmp.info()

In [None]:
# Get nbre_pdc by geometry
tmp.dissolve(by='insee',
             aggfunc={
            'nbre_pdc': list
             },
)

In [None]:
# View on TAVERNY pdcs
tmp.query("insee == '95607'")[['insee','nom_station','nbre_pdc','geometry','puissance_nominale', 'prise_type_ef', 'prise_type_2',
       'prise_type_combo_ccs', 'prise_type_chademo', 'prise_type_autre']].T

In [None]:
# Group charging points (pdc) by coordinates

# Get x, y from the geometry column
tmp['X'] = tmp.geometry.x
tmp['Y'] = tmp.geometry.y

# Group by (X, Y) and keep the first occurrence of each column.
# NOTE: groupby 'first' takes the first non-null value PER COLUMN, so one
# output row may combine values coming from several rows sharing a location.
# reset_index(drop=True) then discards the X / Y group keys.
tmp = tmp.groupby(by=['X','Y']).agg('first').reset_index(drop=True)
tmp

In [None]:
# Check aggregation
tmp.query("insee == '95607'")[['insee','nom_station','nbre_pdc','geometry','puissance_nominale', 'prise_type_ef', 'prise_type_2',
       'prise_type_combo_ccs', 'prise_type_chademo', 'prise_type_autre']].T

In [None]:
tmp.plot()

# Create html popup column

In [None]:
tmp.columns

In [None]:
tmp.drop(columns=['raccordement','code_insee_commune'], inplace=True)

In [None]:
tmp= tmp.loc[ :,   
    [
      'nom_amenageur', 'id_pdc_itinerance', 'gratuit',
      'siren_amenageur', 'id_pdc_local', 'paiement_acte',
      'contact_amenageur','nbre_pdc', 'paiement_cb',
      'nom_operateur', 'puissance_nominale','paiement_autre',
      'contact_operateur', 'prise_type_ef','tarification',
      'telephone_operateur','prise_type_2','condition_acces',
      'nom_enseigne','prise_type_combo_ccs', 'reservation',
      'id_station_itinerance', 'prise_type_chademo','horaires',
      'id_station_local',  'prise_type_autre','accessibilite_pmr',
      'nom_station', 'cable_t2_attache','restriction_gabarit', 
      'implantation_station','num_pdl','station_deux_roues',
      'adresse_station','insee','dep',
      'date_mise_en_service', 'dep_name',  'region_name'  , 
       
        
      
         
        
       
      'geometry' 
    ]
]

In [None]:
# Column names generator
def split_list(list_a, chunk_size):
    """Yield consecutive slices of *list_a* holding at most *chunk_size* items."""
    for i in range(0, len(list_a), chunk_size):
        yield list_a[i:i + chunk_size]


def column_popup(df, info_cols, num_cols=3, width=20):
    """Build one HTML table per row, laying the columns out on a grid.

    Args:
        df: DataFrame whose rows are rendered.
        info_cols: Column names to show, in order. This argument is now
            actually used (the global ``bornes_info_cols`` was read before).
        num_cols: Number of cells per table row. Previously overwritten
            with 3 inside the function; the default keeps that layout.
        width: Table width in percent. Previously overwritten with 20;
            the default keeps that value.

    Returns:
        A list of HTML strings, one per row of *df*, in row order. Each is a
        ``<table>`` with a header row numbered ``0..num_cols-1`` followed by
        rows of ``<b>col:</b><br> value`` cells. Values are inserted as-is
        (not HTML-escaped).

    Raises:
        ValueError: If *num_cols* is smaller than 1.
    """
    if num_cols < 1:
        raise ValueError('num_cols must be at least 1')

    # Layout: each cell gets half of an even share of the table width (percent).
    px = np.ceil(width / num_cols / 2)

    # The header does not depend on the row, so build it once. The header
    # <tr> is now closed; it used to be left open before the data rows.
    header_cells = ''.join(
        f'<td style="font-weight:bold">{n}</td>' for n in range(num_cols)
    )
    header = f'<table style="width:{width}%"><tr>{header_cells}</tr>'

    # Snapshot of the column layout, shared by every row.
    col_chunks = list(split_list(list(info_cols), num_cols))

    texts = []
    for _, row in df.iterrows():
        parts = [header]
        for cols in col_chunks:
            parts.append('<tr>')
            parts.extend(
                f'<th style="width:{px}%"><b>{col}:</b><br> {row[col]}</th>'
                for col in cols
            )
            parts.append('</tr>')
        parts.append('</table>')
        texts.append(''.join(parts))

    return texts

 # list of columns from datas_ to be displayed
bornes_info_cols = tmp.columns.tolist()
bornes_info_cols.remove('geometry')





tmp['html_popup'] = column_popup(tmp, info_cols=bornes_info_cols)   
tmp.html_popup.head(2)

In [None]:
bornes_info_cols.remove('puissance_nominale')

bornes_info_cols.remove('nbre_pdc')

In [None]:
set(tmp.columns) - set ( bornes_info_cols)

In [None]:
tmp.drop(bornes_info_cols, axis=1, inplace=True)

In [None]:
tmp.info()

In [None]:
tmp.plot(column='puissance_nominale', legend=True, figsize=(10,10))

------------------------------------------------

### Save to local

In [None]:
# save bornes
file_name = 'dataset_charge_points'
print('Saving bornes to local file...')
tmp.to_feather(os.path.join(data_path,file_name + '.feather'),
                # compression='zstd'
                )
print('Done.')