In [1]:
import os
import warnings
from datetime import datetime

import pandas as pd
import pymongo
# pd.set_option('display.max_rows', None)

# Setup MongoDB connection (local)
# Every setting defaults to the local development instance but can be
# overridden through an environment variable, so real credentials never have
# to be written into this file.
mongo_host = os.environ.get("MONGO_HOST", "localhost")
mongo_port = int(os.environ.get("MONGO_PORT", "27017"))
mongo_user = os.environ.get("MONGO_USER", "admin")
mongo_password = os.environ.get("MONGO_PASSWORD", "password")
auth_db = os.environ.get("MONGO_AUTH_DB", "admin")
client_mongo = pymongo.MongoClient(
    host=mongo_host,
    port=mongo_port,
    username=mongo_user,
    password=mongo_password,
    authSource=auth_db,
)
# Database that holds the denormalized collections.
db_mongo_denorm = client_mongo.get_database("denormalization")
# GET transportation_fees_gas_eur_liter (yearly data)
url = "https://www.ecologie.gouv.fr/politiques-publiques/prix-produits-petroliers"

# Header label in the source table -> column name stored in MongoDB.
column_renames = {
    'En c€/l': 'Date',
    'Gazole': 'real_transp_fees_GAZOLE_eur_liter',
    'Eurosuper sp95-e5': 'real_transp_fees_SP95_eur_liter',
    'Eurosuper sp95-e10': 'real_transp_fees_E10_eur_liter',
}
expected_headers = set(column_renames)

# Find all tables in the HTML page and keep the first one whose first row
# contains every expected header label.
tables = pd.read_html(url)
for table in tables:
    if table.empty:
        # An empty table has no first row to inspect.
        continue
    header_row = table.iloc[0]
    if expected_headers.issubset(set(header_row)):
        # Data rows: everything below the header row, re-indexed from 0.
        df_transportation_fees = table.iloc[1:].reset_index(drop=True)
        # Header: the first row of the table supplies the column labels.
        df_transportation_fees.columns = header_row
        break
else:
    # Fail here with a clear message instead of a KeyError on 'Date' below.
    raise ValueError(
        f"No table with headers {sorted(expected_headers)} found at {url}"
    )

df_transportation_fees = df_transportation_fees.rename(columns=column_renames)
df_transportation_fees['Date'] = pd.to_datetime(df_transportation_fees['Date'])

# NOTE(review): the source header says c€/l, for which the cents-to-euros
# factor would be 100. The factor 1000 presumably compensates for read_html's
# default thousands="," stripping a French decimal comma (e.g. "8,6" -> 86)
# -- confirm against the live page before changing it.
fee_columns = [name for name in column_renames.values() if name != 'Date']
for fee_column in fee_columns:
    df_transportation_fees[fee_column] = (
        df_transportation_fees[fee_column].astype(float) / 1000
    )
print('df_transportation_fees\n', df_transportation_fees)

# Drop the previous load only now that the replacement data has been scraped
# and parsed, so a network or parsing failure above leaves the existing
# collection untouched.
db_mongo_denorm.drop_collection("real_transp_fees_gas_eur_liter")


# ----- Push to MongoDB -----
# One document per DataFrame row, keyed by column name.
records = df_transportation_fees.to_dict(orient="records")
if not records:
    # insert_many rejects an empty list; stop before touching the collection
    # so the failure is explicit and no empty indexed collection is created.
    raise ValueError(
        "df_transportation_fees has no rows; nothing to load into "
        "real_transp_fees_gas_eur_liter"
    )

collection_mongo = db_mongo_denorm.get_collection("real_transp_fees_gas_eur_liter")
# Ascending index on the "Date" field.
collection_mongo.create_index([("Date", pymongo.ASCENDING)])
collection_mongo.insert_many(records)
print("correctly loaded real_transp_fees_gas_eur_liter to denormalized collection MongoDB")

# Last expression of the cell: displays the DataFrame in the notebook.
df_transportation_fees
 0       Date  real_transp_fees_GAZOLE_eur_liter  \
0 2015-01-01                              0.086   
1 2016-01-01                              0.103   
2 2017-01-01                              0.107   
3 2018-01-01                              0.124   
4 2019-01-01                              0.140   
5 2020-01-01                              0.162   
6 2021-01-01                              0.164   
7 2022-01-01                              0.195   
8 2023-01-01                              0.238   
9 2024-01-01                              0.208   

0  real_transp_fees_SP95_eur_liter  real_transp_fees_E10_eur_liter  
0                            0.102                           0.096  
1                            0.110                           0.111  
2                            0.109                           0.109  
3                            0.131                           0.131  
4                            0.147                           0.147  
