In [None]:
#=====================================
# Libraries
#=====================================
import xarray as xr
import pandas as pd

In [None]:
#========================================
# Preparing the preprocessing
#========================================
# Root data directory of the project. Every input and output path is built from
# this single value, so relocating the data only requires changing this line.
basedir = "C:/Users/ilung/Documents/Jedha_bootcamp/capstone_project/data"

# Spreadsheet containing the name and geographic coordinates of the cities.
# Only its "name", "longitude" and "latitude" columns are read here.
cities = pd.read_excel(f"{basedir}/clean/france_communes_wgs84.xlsx")
cities_list = cities["name"].to_list()
cities_long = cities["longitude"].to_list()
cities_lat = cities["latitude"].to_list()

# Variables to be used in the preprocessing
pollutants_list = ["no2", "pm2p5", "pm10", "o3", "so2"]  # Also the names used in the folders, file names and netCDF variables
months_list = [f"{month:02d}" for month in range(1, 13)]  # "01" .. "12", used in file names
years_list = [str(year) for year in range(2013, 2023)]  # "2013" .. "2022"

In [36]:
#===============================
# Summarising the raw data
#===============================

def _yearly_mean_field(data_dir, year, pollutant, months):
    """Return the yearly mean field of one pollutant.

    One netCDF file is read per entry of `months`. Each file is averaged over
    its "time" dimension, and the resulting monthly fields are then averaged
    with equal weight.

    NOTE(review): this is an unweighted mean of monthly means, so months with
    fewer time steps count as much as longer ones. Behaviour is kept as in the
    original code; confirm whether a time-step-weighted mean is wanted.

    Parameters
    ----------
    data_dir : str
        Root data directory containing the "raw" folder.
    year, pollutant : str
        Year and pollutant name, used in the folder, file and variable names.
    months : list of str
        Month labels as they appear in the file names.

    Returns
    -------
    xarray.DataArray
        The pollutant field without the "time" dimension.
    """
    monthly_means = []
    for month in months:
        file = f"{data_dir}/raw/{year}/{pollutant}/cams.eaq.vra.ENSa.{pollutant}.l0.{year}-{month}.nc"
        # The context manager closes the file handle as soon as the monthly mean
        # is computed; .load() makes sure the result is in memory before that.
        with xr.open_dataset(file, engine="netcdf4") as dataset:
            monthly_means.append(dataset[pollutant].mean(dim="time").load())

    # Stack the monthly fields along a new dimension and average over it
    return xr.concat(monthly_means, dim="month").mean(dim="month")


def _concentrations_at_cities(field, latitudes, longitudes):
    """Return the value of `field` at the grid cell nearest to each city.

    All cities are looked up in a single pointwise (vectorised) selection
    instead of one `.sel` call per city.

    Parameters
    ----------
    field : xarray.DataArray
        Field with "lat" and "lon" coordinates.
    latitudes, longitudes : list of float
        City coordinates, in matching order.

    Returns
    -------
    numpy.ndarray
        One float64 value per city, in the order of the input coordinates.

    Raises
    ------
    ValueError
        If the selection does not yield exactly one value per city (for
        example because the field has an extra dimension longer than 1).
    """
    # Indexers sharing the same new dimension select points pairwise (one per
    # city) rather than the full latitude x longitude cross product.
    lat_points = xr.DataArray(latitudes, dims="city")
    lon_points = xr.DataArray(longitudes, dims="city")
    selected = field.sel(lon=lon_points, lat=lat_points, method="nearest")

    # Flatten any leftover length-1 dimensions; float64 matches the Python
    # floats that a per-city `.item()` call would have produced.
    values = selected.values.reshape(-1).astype("float64")
    if values.size != len(latitudes):
        raise ValueError(
            f"Expected one value per city ({len(latitudes)}), got {values.size}"
        )
    return values


for year in years_list:
    # One column of city concentrations per pollutant, in pollutants_list order
    concentrations = {}
    for pollutant in pollutants_list:
        yearly_avg = _yearly_mean_field(basedir, year, pollutant, months_list)
        concentrations[pollutant] = _concentrations_at_cities(yearly_avg, cities_lat, cities_long)

    # Append the pollutant columns to the city table and write one file per year
    final_df = pd.DataFrame(concentrations)
    df_final_merged = pd.concat([cities, final_df], axis=1)
    df_final_merged.to_excel(f"{basedir}/clean/france/{year}.xlsx", index=False)
