In [1]:
import requests
import pandas as pd
import zipfile
import io
import os
# Download the yearly fuel-price archive and unpack it locally.
os.makedirs("results/inputs_extracted_xml", exist_ok=True)
year = "2025"

# Source = "https://www.prix-carburants.gouv.fr/rubrique/opendata/" (government site)
url = f"https://donnees.roulez-eco.fr/opendata/annee/{year}"

# The timeout keeps the notebook from hanging forever on a stalled connection.
response = requests.get(url, timeout=60)
print(response)
# Fail here with a clear HTTP error instead of a confusing BadZipFile below.
response.raise_for_status()

# Open the in-memory archive once; the context manager closes the handle.
with zipfile.ZipFile(io.BytesIO(response.content), 'r') as zip_file:
    # Only the first archive member is reported, but every member is extracted.
    file_name = zip_file.namelist()[0]
    print(file_name)
    zip_file.extractall("results/inputs_extracted_xml")
print(file_name, "extracted")

<Response [200]>
PrixCarburants_annuel_2025.xml
PrixCarburants_annuel_2025.xml extracted


In [2]:
import xml.etree.ElementTree as ET
import html

# Parse the extracted yearly XML and flatten it into one record per <prix>
# element, repeating the parent <pdv> (station) fields on every record.
tree = ET.parse(f"results/inputs_extracted_xml/PrixCarburants_annuel_{year}.xml")
root = tree.getroot()

data = []
for pdv in root.findall("pdv"):
    # Station-level fields are the same for every price of this station, so
    # they are resolved once here rather than once per <prix> row.
    # findtext() returns None when the child is absent and "" when it has no
    # text; both cases are normalised to None.
    ville = pdv.findtext("ville") or None
    adresse = pdv.findtext("adresse")
    # Commas and semicolons are blanked out of the address
    # (presumably so they cannot clash with CSV delimiters later on).
    adresse = adresse.replace(",", " ").replace(";", " ") if adresse else None

    station = {
        "id": pdv.get("id"),
        "latitude": pdv.get("latitude"),
        "longitude": pdv.get("longitude"),
        "cp": pdv.get("cp"),
        "ville": ville,
        "adresse": adresse,
    }

    for p in pdv.findall("prix"):
        # Attributes missing on a <prix> element come back as None.
        data.append({
            **station,
            "nom": p.get("nom"),
            "maj": p.get("maj"),
            "valeur": p.get("valeur"),
        })

df = pd.DataFrame(data)

# os.makedirs("results/inputs_csv", exist_ok=True)
# df.to_csv(f"results/inputs_csv/PrixCarburants_annuel_{year}_original.csv", index=False)
# df.head(5)

In [3]:
import numpy as np
# Normalise column types, dates and price units, then export the result as CSV.
# NOTE: this cell rewrites `df` in place (dates are reformatted and, for
# year > 2021, prices are rescaled), so it is not idempotent: re-run the XML
# parsing cell before running this one a second time.
print(df.dtypes)

df["id"] = df["id"].astype(int)
# Empty strings and "0" are treated as missing coordinates.
df["latitude"] = df["latitude"].replace(["", "0"], np.nan).astype(float)
df["longitude"] = df["longitude"].replace(["", "0"], np.nan).astype(float)
# Literal newline -> space; no regular expression is needed for this.
df["adresse"] = df["adresse"].str.replace("\n", " ", regex=False)
df["valeur"] = df["valeur"].astype(float)

# One 2008 record has the postal code '35***' and its id "35200004" matches no
# station in any other year, so that record is dropped.
# .copy() makes the filtered frame independent of the original, so the column
# assignments below cannot trigger SettingWithCopyWarning.
df = df[df["cp"] != "35***"].copy()
# NOTE: astype(int) raises if any postal code is empty/missing, because NaN
# cannot be cast to a plain integer dtype.
df["cp"] = df["cp"].replace("", np.nan).astype(int)

# The timestamp layout of 'maj' changed over the years. Each known layout is
# tried in turn and the first one that parses wins; only the day is kept,
# rendered as YYYY_MM_DD. Values matching none of the layouts become NaN.
maj_formats = [
    "%Y-%m-%d %H:%M:%S.%f",  # with microseconds
    "%Y-%m-%d %H:%M:%S",     # format used between 2007 and 2013
    "%Y-%m-%dT%H:%M:%S",     # format used from 2014 to now
]
maj_raw = df["maj"]
maj_day = None
for maj_format in maj_formats:
    parsed_day = pd.to_datetime(maj_raw, format=maj_format, errors="coerce").dt.strftime("%Y_%m_%d")
    maj_day = parsed_day if maj_day is None else maj_day.fillna(parsed_day)
df["maj"] = maj_day

# values format used between 2007 and 2021 = 500 to 2000
# values format used between 2022 and now = 0.5 to 2.0 (needs to be adapted to 500 to 2000)
if int(year) > 2021:
    # round(3) strips the binary floating-point noise introduced by the scaling
    # (e.g. 1.001 * 1000 == 1000.9999999999999) so the CSV holds clean values.
    df["valeur"] = (df["valeur"] * 1000).round(3)

print(df.dtypes)

os.makedirs("results/inputs_csv", exist_ok=True)
df.to_csv(f"results/inputs_csv/PrixCarburants_annuel_{year}.csv", index=False)
df.head(5)

id           object
latitude     object
longitude    object
cp           object
ville        object
adresse      object
nom          object
maj          object
valeur       object
dtype: object
id             int32
latitude     float64
longitude    float64
cp             int32
ville         object
adresse       object
nom           object
maj           object
valeur       float64
dtype: object


Unnamed: 0,id,latitude,longitude,cp,ville,adresse,nom,maj,valeur
0,1000001,4620100.0,519800.0,1000,SAINT-DENIS-LèS-BOURG,596 AVENUE DE TREVOUX,Gazole,2025_01_02,1707.0
1,1000001,4620100.0,519800.0,1000,SAINT-DENIS-LèS-BOURG,596 AVENUE DE TREVOUX,Gazole,2025_01_02,1707.0
2,1000001,4620100.0,519800.0,1000,SAINT-DENIS-LèS-BOURG,596 AVENUE DE TREVOUX,Gazole,2025_01_03,1725.0
3,1000001,4620100.0,519800.0,1000,SAINT-DENIS-LèS-BOURG,596 AVENUE DE TREVOUX,Gazole,2025_01_07,1705.0
4,1000001,4620100.0,519800.0,1000,SAINT-DENIS-LèS-BOURG,596 AVENUE DE TREVOUX,Gazole,2025_01_09,1688.0


In [4]:
# Report how many distinct station ids and how many price rows the frame holds.
station_count = df["id"].nunique(dropna=False)
row_count = df.shape[0]

print("id unique", station_count)

print("all values", row_count)

id unique 14080
all values 859899


In [5]:
# Number of missing values in each column.
df.isna().sum()

id              0
latitude       73
longitude      73
cp              0
ville           0
adresse        41
nom          4549
maj          4549
valeur       4549
dtype: int64

In [6]:
# check bad string
# print(df[df['adresse'].str.startswith("ZI La Laure", na=False)])

In [7]:
# check which IDs have missing (None) data
# df2 = df.copy()
# df2 = df2[df2['adresse'].isna()]

# df2 = df2[["id", "adresse"]].drop_duplicates()
# df2.head(25)