### Concaténation des différents dataframes

In [2]:
import os
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [3]:
# Streams every CSV found in `dossier` into one Parquet file, chunk by chunk.
# Each chunk is written to disk as soon as it has been read, so the full
# dataset is never held in memory (no giant pd.concat, hence no MemoryError).

dossier = r"02_resultats_commune_agregation"   # folder containing the CSV files
fichier_sortie = "mon_dataframe.parquet"       # final Parquet file

writer = None   # created lazily, once the first chunk has fixed the schema
cols = None     # reference column order, taken from the first chunk
schema = None   # reference Arrow schema, taken from the first chunk

try:
    # sorted() makes the processing order (and thus the row order) deterministic
    for f in sorted(os.listdir(dossier)):
        if not f.endswith(".csv"):
            continue
        chemin = os.path.join(dossier, f)
        for chunk in pd.read_csv(chemin, chunksize=1_000_000, low_memory=True):
            if writer is None:
                # The first chunk defines both the column list and the dtypes
                # that every later chunk must follow.
                cols = chunk.columns.tolist()
                table = pa.Table.from_pandas(chunk, preserve_index=False)
                schema = table.schema
                writer = pq.ParquetWriter(fichier_sortie, schema)
            else:
                # Align on the initial columns (missing ones are added as NaN,
                # extra ones are dropped) ...
                chunk = chunk.reindex(columns=cols)
                # ... and convert with the reference schema: pandas may infer a
                # different dtype for a later chunk (e.g. int64 vs float64, or
                # an all-NaN column), which write_table would reject.
                table = pa.Table.from_pandas(chunk, schema=schema, preserve_index=False)

            writer.write_table(table)
finally:
    # Always release the file handle, even if a read or write failed midway.
    # The exception still propagates, so a failed run stays visible.
    if writer is not None:
        writer.close()

print(" Terminé :", fichier_sortie)

 Terminé : mon_dataframe.parquet


In [4]:
# Path to the Parquet file to load
fichier_sortie = "mon_dataframe.parquet"

# Read the whole Parquet file into a DataFrame
df_vent = pd.read_parquet(fichier_sortie)

# Preview the first rows. A bare last expression uses the notebook's rich
# table display instead of the wrapped plain-text dump that print() produces.
df_vent.head()

   INSEE_COM        date  wind_speed_10m_mean  wind_speed_10m_max  \
0       1002  2012-01-01             2.437408            3.518439   
1       1002  2012-01-02             2.770377            4.397294   
2       1002  2012-01-03             2.324536            3.704427   
3       1002  2012-01-04             3.139477            5.018902   
4       1002  2012-01-05             4.137015            6.588655   

   wind_speed_10m_min  wind_speed_10m_std  wind_speed_100m_mean  \
0            1.665976            0.433766              4.391337   
1            0.927427            1.266565              5.452989   
2            0.572585            0.970641              4.418672   
3            1.201510            1.110434              5.539307   
4            3.085253            0.845917              7.738133   

   wind_speed_100m_max  wind_speed_100m_min  wind_speed_100m_std  \
0             7.427219             3.083515             1.052151   
1             8.389891             1.449097   

### Agrégation au trimestre

In [7]:
# Quarterly aggregation of the wind table, one row per (commune, quarter).

# Step 1 - tag every row with the first day of the quarter its date falls in.
df_vent['date_trimestre'] = (
    pd.to_datetime(df_vent['date'])
    .dt.to_period('Q')
    .dt.to_timestamp()
)

# Step 2 - every column that is not an identifier or a date is aggregated.
colonnes_exclues = ('date', 'INSEE_COM', 'date_trimestre')
colonnes_meteo = [nom for nom in df_vent.columns if nom not in colonnes_exclues]

# Step 3 - keep the maximum of each variable within each (commune, quarter) group.
# NOTE(review): max() is applied to every variable, including the *_min, *_mean
# and *_std columns - confirm that this is the intended statistic for them.
cles_groupe = ['INSEE_COM', 'date_trimestre']
df_vent_trimestriel_final = (
    df_vent
    .groupby(cles_groupe)[colonnes_meteo]
    .max()
    .reset_index()
)

In [8]:
# Sanity check: (number of rows, number of columns) of the quarterly table.
df_vent_trimestriel_final.shape

(1822230, 18)

In [10]:
# Number of distinct INSEE_COM values (communes) present in the quarterly table.
df_vent_trimestriel_final["INSEE_COM"].nunique()

33745

In [12]:
# Display the quarterly table (the notebook shows a truncated head/tail view).
df_vent_trimestriel_final

Unnamed: 0,INSEE_COM,date_trimestre,wind_speed_10m_mean,wind_speed_10m_max,wind_speed_10m_min,wind_speed_10m_std,wind_speed_100m_mean,wind_speed_100m_max,wind_speed_100m_min,wind_speed_100m_std,wind_speed_10m_neutral_mean,wind_speed_10m_neutral_max,wind_speed_10m_neutral_min,wind_speed_10m_neutral_std,gust_mean,gust_max,gust_min,gust_std
0,1001,2012-01-01,6.653448,8.014953,5.795943,1.908511,10.606186,11.952362,8.821354,2.734180,6.653501,8.057697,5.812607,2.041728,14.079675,17.397026,11.980936,3.624302
1,1001,2012-04-01,7.096284,10.117747,4.554617,2.633052,12.062060,16.517200,8.214607,3.696472,7.027391,10.094688,4.471093,2.716491,15.064568,20.859613,8.981316,6.131558
2,1001,2012-07-01,6.058658,7.707341,4.860898,2.524474,9.582786,12.879599,8.232708,3.662876,6.063623,7.667396,4.802291,2.561111,12.394590,17.126976,9.969259,4.991399
3,1001,2012-10-01,7.805512,10.307178,6.070050,2.557302,12.692831,15.403713,10.463328,3.825374,7.811988,10.341628,6.018473,2.659688,15.817590,21.099740,13.088001,5.165870
4,1001,2013-01-01,6.269657,7.856739,4.597571,2.519039,10.674157,12.786068,7.254874,3.954022,6.191749,7.843233,4.615259,2.612139,13.437818,19.924583,8.306178,5.615470
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1822225,95690,2024-04-01,6.603777,10.398830,5.126576,2.297402,10.546861,15.681480,8.790747,3.237000,6.566730,10.418272,4.999872,2.330773,13.361029,23.334358,9.513707,4.918053
1822226,95690,2024-07-01,6.613493,8.887031,5.290490,1.672723,10.652470,12.877749,9.190878,2.561212,6.555574,8.952482,5.179672,1.864182,12.981605,18.273138,10.132625,3.612286
1822227,95690,2024-10-01,8.854355,10.862511,7.815808,2.087152,14.614110,16.560163,13.559045,3.140699,8.751109,10.847654,7.656476,2.202967,17.895367,22.492900,15.353423,4.498697
1822228,95690,2025-01-01,8.360559,11.352139,6.560353,2.745049,13.423950,17.943642,10.927737,4.282795,8.285457,11.291422,6.506045,2.897186,15.958603,24.977722,12.860877,5.290032


In [13]:
# Persist the quarterly table as CSV; index=False keeps the row index out of the file.
# NOTE(review): INSEE_COM appears as integers in the displayed outputs (e.g. 1001),
# so any leading zero of the commune code is not written - confirm that downstream
# joins expect integer codes.
df_vent_trimestriel_final.to_csv('df_vent_trimestriel_final.csv', index=False)