In [2]:
import pandas as pd
import os
import warnings
import pymongo
import json
import datetime

# Disable pandas' chained-assignment check (the option is set to None).
pd.set_option("mode.chained_assignment", None)
# Ignore every RuntimeWarning for the rest of the session.
warnings.filterwarnings(action="ignore", category=RuntimeWarning)

# MongoDB connection settings. Each value can be overridden through an
# environment variable so that credentials do not have to live in the
# notebook; the fallbacks are the original local-development values, so the
# behaviour is unchanged when no variable is set.
mongo_host = os.environ.get("MONGO_HOST", "localhost")
mongo_port = int(os.environ.get("MONGO_PORT", "27017"))
mongo_user = os.environ.get("MONGO_USER", "admin")
mongo_password = os.environ.get("MONGO_PASSWORD", "password")
auth_db = os.environ.get("MONGO_AUTH_DB", "admin")
client_mongo = pymongo.MongoClient(
    host=mongo_host,
    port=mongo_port,
    username=mongo_user,
    password=mongo_password,
    authSource=auth_db,
)
db_mongo = client_mongo.get_database("datalake")

# Drop the collection if it exists, so every run of this cell reloads the
# data into an empty collection instead of appending duplicates.
db_mongo.drop_collection("french_gas_station")

collection_mongo = db_mongo.get_collection("french_gas_station")
# Compound ascending index on (Date, Nom); the key names match the
# capitalized column names produced by the loading loop.
collection_mongo.create_index([("Date", pymongo.ASCENDING), ("Nom", pymongo.ASCENDING)])

# ALL YEARS (2007 through 2025, as strings because they are spliced into the
# CSV file name).
years = [str(y) for y in range(2007, 2026)]
#years = ['2007', '2008', '2009', '2010', '2011', '2012']
# ONE YEAR
#years = ['2007']

# Number of rows converted to dicts and sent per insert_many call. Converting
# a whole multi-million-row year at once keeps every document in memory
# simultaneously; batching bounds that peak.
INSERT_BATCH_SIZE = 100_000


def prepare_frame(frame):
    """Normalize one yearly price DataFrame for insertion into MongoDB.

    The frame is modified in place and also returned.

    - ``date`` is parsed from ``%Y_%m_%d`` strings into datetimes.
    - ``heuremin`` is reformatted from ``%H:%M`` to ``%H:%M:%S`` strings.
    - ``valeur`` is multiplied by 0.001 (presumably thousandths of a euro to
      euros -- TODO confirm the unit against the CSV producer).
    - Every column name is capitalized (``date`` -> ``Date``).
    """
    frame['date'] = pd.to_datetime(frame['date'], format='%Y_%m_%d')
    frame['heuremin'] = pd.to_datetime(frame['heuremin'], format='%H:%M').dt.strftime('%H:%M:%S')
    frame['valeur'] = frame['valeur'] * 0.001
    frame.columns = [col.capitalize() for col in frame.columns]
    return frame


def iter_record_batches(frame, batch_size):
    """Yield the rows of *frame* as lists of dicts, *batch_size* rows at a time.

    An empty frame yields nothing, so callers never pass an empty list to
    ``insert_many`` (which rejects it).

    Raises:
        ValueError: if *batch_size* is not a positive integer.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    for start in range(0, len(frame), batch_size):
        yield frame.iloc[start:start + batch_size].to_dict(orient="records")


for year in years:
    print("YEAR: ", year)

    file_path = f"results/inputs_csv_zscore_last/PrixCarburants_annuel_filtered_{year}.csv"

    # Guard clause: a missing year is reported and skipped, not fatal.
    if not os.path.exists(file_path):
        print(f"Le fichier {file_path} n'existe pas.")
        continue

    df = pd.read_csv(file_path)
    print("len = ", len(df))

    # NOTE(review): the captured output shows the 2007 file carrying a
    # `Gas_station_id` column while 2015 and 2023 carry `Id`; both end up in
    # the same collection under different keys -- confirm whether they should
    # be unified upstream.
    df = prepare_frame(df)
    print(df.head(5))
    print(df.dtypes)

    # A header-only CSV has nothing to insert; skip it instead of letting
    # insert_many fail on an empty document list and abort the whole run.
    if df.empty:
        print("no rows in", file_path, "- nothing inserted")
        continue

    for records in iter_record_batches(df, INSERT_BATCH_SIZE):
        collection_mongo.insert_many(records)
    print("correctly loaded", year, "csv to MongoDB")

YEAR:  2007
len =  651116
        Date     Nom   Latitude  Longitude    Cp                  Ville  \
0 2007-01-04  Gazole  4620114.0   519791.0  1000  SAINT-DENIS-LèS-BOURG   
1 2007-01-04    SP95  4620114.0   519791.0  1000  SAINT-DENIS-LèS-BOURG   
2 2007-01-09  Gazole  4620114.0   519791.0  1000  SAINT-DENIS-LèS-BOURG   
3 2007-01-09    SP95  4620114.0   519791.0  1000  SAINT-DENIS-LèS-BOURG   
4 2007-01-12  Gazole  4620114.0   519791.0  1000  SAINT-DENIS-LèS-BOURG   

           Adresse  Valeur  Heuremin  Gas_station_id  
0  ROUTE NATIONALE   0.999  18:43:00         1000001  
1  ROUTE NATIONALE   1.170  18:43:00         1000001  
2  ROUTE NATIONALE   0.999  08:32:00         1000001  
3  ROUTE NATIONALE   1.160  08:32:00         1000001  
4  ROUTE NATIONALE   0.989  08:42:00         1000001  
Date              datetime64[ns]
Nom                       object
Latitude                 float64
Longitude                float64
Cp                         int64
Ville                     ob

correctly loaded 2014 csv to MongoDB
YEAR:  2015
len =  2270285
        Id       Date     Nom   Latitude  Longitude    Cp  \
0  1000001 2015-01-02  Gazole  4620114.0   519791.0  1000   
1  1000001 2015-01-02    SP95  4620114.0   519791.0  1000   
2  1000001 2015-01-02    SP98  4620114.0   519791.0  1000   
3  1000001 2015-01-03  Gazole  4620114.0   519791.0  1000   
4  1000001 2015-01-03    SP95  4620114.0   519791.0  1000   

                   Ville                Adresse  Valeur  Heuremin  
0  SAINT-DENIS-LèS-BOURG  596 AVENUE DE TREVOUX   1.141  11:01:00  
1  SAINT-DENIS-LèS-BOURG  596 AVENUE DE TREVOUX   1.311  11:01:00  
2  SAINT-DENIS-LèS-BOURG  596 AVENUE DE TREVOUX   1.327  11:01:00  
3  SAINT-DENIS-LèS-BOURG  596 AVENUE DE TREVOUX   1.141  09:01:00  
4  SAINT-DENIS-LèS-BOURG  596 AVENUE DE TREVOUX   1.311  09:01:00  
Id                    int64
Date         datetime64[ns]
Nom                  object
Latitude            float64
Longitude           float64
Cp                   

correctly loaded 2022 csv to MongoDB
YEAR:  2023
len =  3348441
        Id       Date     Nom   Latitude  Longitude    Cp  \
0  1000001 2023-01-02  Gazole  4620100.0   519800.0  1000   
1  1000001 2023-01-02    SP95  4620100.0   519800.0  1000   
2  1000001 2023-01-05  Gazole  4620100.0   519800.0  1000   
3  1000001 2023-01-05    SP95  4620100.0   519800.0  1000   
4  1000001 2023-01-05    SP98  4620100.0   519800.0  1000   

                   Ville                Adresse  Valeur  Heuremin  
0  SAINT-DENIS-LèS-BOURG  596 AVENUE DE TREVOUX   1.867  07:53:00  
1  SAINT-DENIS-LèS-BOURG  596 AVENUE DE TREVOUX   1.800  07:53:00  
2  SAINT-DENIS-LèS-BOURG  596 AVENUE DE TREVOUX   1.877  09:33:00  
3  SAINT-DENIS-LèS-BOURG  596 AVENUE DE TREVOUX   1.832  09:33:00  
4  SAINT-DENIS-LèS-BOURG  596 AVENUE DE TREVOUX   1.851  09:33:00  
Id                    int64
Date         datetime64[ns]
Nom                  object
Latitude            float64
Longitude           float64
Cp                   