In [1]:
import requests
import pandas as pd
import zipfile
import io
import os

import xml.etree.ElementTree as ET
import html

import numpy as np
# Directory that receives the XML files unpacked from each yearly archive.
os.makedirs("results/inputs_extracted_xml", exist_ok=True)

# Every yearly dataset to process, as strings: '2007' through '2025' inclusive.
years = [str(y) for y in range(2007, 2026)]

In [2]:
def _child_text(node, tag):
    """Return the text of the first `tag` child of `node`.

    Returns None when the child element is absent or its text is empty.
    """
    child = node.find(tag)
    if child is None or not child.text:
        return None
    return child.text


# Timestamp layouts tried, in priority order, for the `maj` attribute.
# The two space-separated layouts were used between 2007 and 2013; the
# 'T'-separated layout has been used from 2014 onwards.
_MAJ_FORMATS = (
    "%Y-%m-%d %H:%M:%S.%f",
    "%Y-%m-%d %H:%M:%S",
    "%Y-%m-%dT%H:%M:%S",
)

# Loop-invariant: create the CSV output directory once, before iterating.
os.makedirs("results/inputs_csv", exist_ok=True)

for year in years:
    print("LOAD", year)

    ###### DOWNLOAD AND EXTRACT THE YEARLY ARCHIVE ##########

    # Source = "https://www.prix-carburants.gouv.fr/rubrique/opendata/" (government site)
    url = f"https://donnees.roulez-eco.fr/opendata/annee/{year}"

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=60)
    print(response)
    # Fail here with a clear HTTP error instead of a confusing BadZipFile below.
    response.raise_for_status()

    # Open the archive once; the context manager guarantees it is closed.
    with zipfile.ZipFile(io.BytesIO(response.content), 'r') as zip_ref:
        members = zip_ref.namelist()
        file_name = members[0]
        print(file_name)
        zip_ref.extractall("results/inputs_extracted_xml")
    print(file_name, "extracted")

    # Parse the conventionally named file when the archive contains it,
    # otherwise fall back to the first member that was just extracted.
    expected_name = f"PrixCarburants_annuel_{year}.xml"
    xml_name = expected_name if expected_name in members else file_name

    ###### LOAD XML DATA INTO A DATAFRAME ##########

    tree = ET.parse(os.path.join("results/inputs_extracted_xml", xml_name))
    root = tree.getroot()

    # One output row per <prix> element, repeating the attributes of its
    # parent <pdv> element.
    data = []
    for pdv in root.findall("pdv"):
        # Station-level fields are looked up once per <pdv>, not once per price.
        adresse = _child_text(pdv, "adresse")
        if adresse is not None:
            # Strip the characters that would collide with CSV delimiters.
            adresse = adresse.replace(",", " ").replace(";", " ")
        station = {
            "id": pdv.get("id"),
            "latitude": pdv.get("latitude"),
            "longitude": pdv.get("longitude"),
            "cp": pdv.get("cp"),
            "ville": _child_text(pdv, "ville"),
            "adresse": adresse,
        }
        for p in pdv.findall("prix"):
            data.append({
                **station,
                "nom": p.get("nom"),
                "maj": p.get("maj"),
                "valeur": p.get("valeur"),
            })

    df = pd.DataFrame(data)

    ###### CLEAN DF AND ADD TYPES ##############

    print(df.dtypes)

    df["id"] = df["id"].astype(int)
    # Empty strings and "0" are treated as missing coordinates.
    df["latitude"] = df["latitude"].replace(["", "0"], np.nan).astype(float)
    df["longitude"] = df["longitude"].replace(["", "0"], np.nan).astype(float)
    df["valeur"] = df["valeur"].astype(float)

    # One 2008 record has cp '35***' and its id "35200004" matches no station
    # in any other year, so that record is dropped.
    # .copy() makes the filtered frame independent, so the column assignment
    # below does not raise SettingWithCopyWarning.
    df = df[df["cp"] != "35***"].copy()
    df["cp"] = df["cp"].replace("", np.nan).astype(int)

    # Parse `maj` with each known layout; unmatched layouts yield NaT and the
    # first successful parse (in _MAJ_FORMATS order) wins.
    attempts = [
        pd.to_datetime(df["maj"], format=fmt, errors="coerce")
        for fmt in _MAJ_FORMATS
    ]
    maj_parsed = attempts[0].fillna(attempts[1]).fillna(attempts[2])
    df["maj"] = maj_parsed.dt.strftime('%Y_%m_%d')

    # Values between 2007 and 2021 range from 500 to 2000.
    # Values from 2022 onwards range from 0.5 to 2.0 and are rescaled to the
    # 500 to 2000 range.
    if int(year) > 2021:
        df["valeur"] = df["valeur"] * 1000

    print(df.dtypes)

    ########### SAVE DF TO CSV ###################

    df.to_csv(f"results/inputs_csv/PrixCarburants_annuel_{year}.csv", index=False)

LOAD 2007
<Response [200]>
PrixCarburants_annuel_2007.xml
PrixCarburants_annuel_2007.xml extracted
id           object
latitude     object
longitude    object
cp           object
ville        object
adresse      object
nom          object
maj          object
valeur       object
dtype: object
id             int32
latitude     float64
longitude    float64
cp             int32
ville         object
adresse       object
nom           object
maj           object
valeur       float64
dtype: object
LOAD 2008
<Response [200]>
PrixCarburants_annuel_2008.xml
PrixCarburants_annuel_2008.xml extracted
id           object
latitude     object
longitude    object
cp           object
ville        object
adresse      object
nom          object
maj          object
valeur       object
dtype: object
id             int32
latitude     float64
longitude    float64
cp             int32
ville         object
adresse       object
nom           object
maj           object
valeur       float64
dtype: object
LOAD 200