# In [None]:
##VERSION ONE###

import os
import pandas as pd
from loguru import logger
import geopandas as gpd
from glob import glob
import json
from rasterstats import zonal_stats
from datetime import datetime
import xarray as xr
import rasterio
import numpy as np
from shapely.geometry import mapping
import rioxarray as rxr
from datetime import datetime
from datetime import timezone

# Root folder holding the NUTS shapefile; outputs go to its 'data' subfolder.
workflow_folder = 'C:/Users/artuso/Documents/CLIMAXX/preprocessing/New-folder'
# Fail early with a readable message: otherwise the makedirs call below would
# silently create the folder tree and the run would only fail later on the
# missing shapefile.
if not os.path.isdir(workflow_folder):
    raise FileNotFoundError(
        f'Workflow folder not found, check the path: {workflow_folder}'
    )

# Directory scanned for NetCDF inputs and the file-name stem shared by the
# files of one variable ({} is replaced by the variable name).
nc_directory = "P:/watxene/ISIMIP/ISIMIP3a/InputData/climate/obsclim_updated/GSWP3-W5E5"
file_starts = 'gswp3-w5e5_obsclim_{}_global_daily_'

# create outputs folder (derived from workflow_folder so the two cannot drift apart)
out_dir = os.path.join(workflow_folder, 'data')
os.makedirs(out_dir, exist_ok=True)

nc_vars = ['pr']
# The file list holds paths relative to nc_directory, so the working directory
# is switched before globbing and has to stay there while the files are opened.
os.chdir(nc_directory)
# NOTE(review): without recursive=True, '**' behaves like '*', so only files
# exactly one folder below nc_directory are matched - confirm this is intended.
ncs = glob('**/*.nc')

logger.info(f'Found {len(ncs)} files.')

# globals - TEMPORARY FOR THE DEMO
regions = ['ITF1', 'ITF2', 'ITF3']  # Set sample regions - ITF1-3 (Italy)
rcolors = ['#579fe1', '#E19957', '#57e199']  # Set region colors - ITF1-3 (Italy)

# load nuts2 shape data
print('Load NUTS2 map')
shapefile = os.path.join(workflow_folder, "NUTS_RG_20M_2021_4326.shp")
nuts = gpd.read_file(shapefile)
# Select 3 polygons in IT to use as an output example - TEMPORARY FOR THE DEMO
nuts = nuts.loc[nuts['NUTS_ID'].isin(regions)]
print(nuts)

# prepare an output table: one row per selected region, holding only NUTS_ID.
# The explicit copy keeps later column assignments from targeting a slice of `nuts`.
output_tbl = pd.DataFrame(nuts).loc[:, ['NUTS_ID']].copy()

for nc_var in nc_vars:
    # if this does not work, install the dependencies netcdf4, h5netcdf and dask
    nc_file_pattern = file_starts.format(nc_var)
    varfiles = [f for f in ncs if nc_file_pattern in f]
    if not varfiles:
        raise FileNotFoundError(
            f'No NetCDF files matching "{nc_file_pattern}" found in {nc_directory}'
        )

    # The context manager closes the NetCDF file handles once every time step
    # has been read.
    with xr.open_mfdataset(varfiles) as source:
        # Monthly means of the daily fields.
        # NOTE(review): newer pandas versions deprecate the 'M' alias in favour
        # of 'ME' - switch once the minimum supported pandas version allows it.
        ds = source.resample(time='M').mean()
        if ds.rio.crs is None:
            ds.rio.write_crs(4326, inplace=True)
        # Keep only the part of the grid covered by the selected NUTS polygons.
        ds_c = ds.rio.clip(nuts.geometry.apply(mapping), nuts.crs)

        times = ds_c['time'].values
        lats = ds_c['lat'].values
        lons = ds_c['lon'].values
        height, width = ds_c[nc_var].shape[1:]

        # Assumes lat/lon hold cell-centre coordinates on a regular grid
        # (TODO confirm for other inputs). from_bounds expects the OUTER edges
        # of the raster, so the centre extent is widened by half a cell on each
        # side; using the centres directly shrinks and shifts every pixel.
        dx = (lons.max() - lons.min()) / (lons.size - 1) if lons.size > 1 else None
        dy = (lats.max() - lats.min()) / (lats.size - 1) if lats.size > 1 else None
        # A single-cell axis borrows the spacing of the other axis.
        dx = dx or dy
        dy = dy or dx
        if not dx:
            raise ValueError('Cannot infer the grid cell size from a single-cell raster.')
        # The transform does not depend on the time step, so build it once.
        transform = rasterio.transform.from_bounds(
            lons.min() - dx / 2,
            lats.min() - dy / 2,
            lons.max() + dx / 2,
            lats.max() + dy / 2,
            width,
            height,
        )
        # from_bounds yields a north-up raster (row 0 at the northern edge);
        # rows stored south-to-north have to be reversed before writing.
        flip_rows = lats.size > 1 and lats[0] < lats[-1]

        for t, t_step in enumerate(times):
            # Get the data for this time step
            data = ds_c[nc_var][t, :, :].values
            if flip_rows:
                data = np.ascontiguousarray(data[::-1, :])

            # pd.Timestamp accepts numpy datetime64 values of any precision, so
            # no assumption about the string form of the time stamp is needed.
            stamp = pd.Timestamp(t_step)
            # Zero-padded MMYYYY column label, same as in the function-based version.
            monthyear = stamp.strftime('%m%Y')
            filename = os.path.join(out_dir, f"{stamp.strftime('%Y-%m')}_{nc_var}.tif")

            # writes the data of this time step to a new GeoTIFF
            with rasterio.open(
                filename,
                'w',
                driver='GTiff',
                height=data.shape[0],
                width=data.shape[1],
                count=1,
                dtype=data.dtype,
                crs='epsg:4326',
                transform=transform,
            ) as dst:
                dst.write(data, 1)

            # Read the raster back so the statistics use exactly what is on disk.
            with rasterio.open(filename) as src:
                affine = src.transform
                array = src.read(1)

            ds_zonal_stats = zonal_stats(nuts, array, affine=affine, stats=['sum'])
            # One column per month: rows are the NUTS ids, columns the month-year
            # labels (the table is transposed after the loop).
            output_tbl[monthyear] = [i['sum'] for i in ds_zonal_stats]
                      
            
# Transpose so that rows are the month-year labels and columns are the NUTS ids.
print(output_tbl)

# NUTS_ID has to become the index first; otherwise the transposed table keeps
# the original integer row labels as column names and the region ids end up in
# a data row.
output_tbl_T = output_tbl.set_index('NUTS_ID').transpose()
print(output_tbl_T)

# and last we write the table to csv inside the outputs folder
output_tbl_T.to_csv(os.path.join(out_dir, 'precip_table.csv'), index=True)



##VERSION TWO###
import os
import pandas as pd
from loguru import logger
import geopandas as gpd
from glob import glob
from rasterstats import zonal_stats
from datetime import datetime
import xarray as xr
import rasterio
import numpy as np
from shapely.geometry import mapping
import rioxarray as rxr

def process_variable(nc_var, varfiles, nuts, out_dir):
    """Compute monthly zonal sums of one NetCDF variable for each NUTS region.

    The files are opened as one dataset, averaged per month and clipped to the
    region polygons. Every monthly field is written to
    ``<out_dir>/<YYYY-MM>_<nc_var>.tif`` and the per-region sum is then taken
    from that raster.

    Args:
        nc_var: Name of the data variable inside the NetCDF files.
        varfiles: Paths of the NetCDF files to combine.
        nuts: GeoDataFrame with a ``NUTS_ID`` column and the region geometries.
        out_dir: Directory that receives the monthly GeoTIFFs (not created here).

    Returns:
        DataFrame indexed like ``nuts`` with a ``NUTS_ID`` column followed by
        one ``MMYYYY`` column per month holding the zonal sums.

    Raises:
        FileNotFoundError: If ``varfiles`` is empty.
    """
    if not varfiles:
        # Give a clear message instead of the generic error raised when an
        # empty list is opened.
        raise FileNotFoundError(f'No NetCDF files given for variable "{nc_var}".')

    logger.info(f'Loading {len(varfiles)} {nc_var} files...')

    monthly_sums = {}
    # The context manager closes the NetCDF file handles once all time steps
    # have been read.
    with xr.open_mfdataset(varfiles) as source:
        logger.info(f'Resampling {nc_var} by month...')
        # NOTE(review): newer pandas versions deprecate the 'M' alias in favour
        # of 'ME' - switch once the minimum supported pandas version allows it.
        ds = source.resample(time='M').mean()

        if ds.rio.crs is None:
            ds.rio.write_crs(4326, inplace=True)

        # Keep only the part of the grid covered by the region polygons.
        ds_c = ds.rio.clip(nuts.geometry.apply(mapping), nuts.crs)
        lats, lons = ds_c['lat'].values, ds_c['lon'].values

        for t, time_step in enumerate(ds_c['time'].values):
            data = ds_c[nc_var][t, :, :].values
            # pd.Timestamp accepts numpy datetime64 values of any precision, so
            # no assumption about the string form of the time stamp is needed.
            stamp = pd.Timestamp(time_step)

            filename = os.path.join(out_dir, f"{stamp.strftime('%Y-%m')}_{nc_var}.tif")
            save_raster(filename, data, lons, lats)

            # Read the raster back so the statistics use exactly what is on disk.
            with rasterio.open(filename) as src:
                affine = src.transform
                array = src.read(1)

            monthly_sums[stamp.strftime('%m%Y')] = calculate_zonal_stats(nuts, array, affine)

    # Start from the ids only, so the result never carries geometry or other
    # attribute columns, even when the dataset has no time steps.
    ids = pd.DataFrame({'NUTS_ID': nuts['NUTS_ID']})
    return pd.concat([ids, pd.DataFrame(monthly_sums, index=ids.index)], axis=1)

def _axis_cell_size(centers):
    """Return the mean spacing of the coordinates, or None for fewer than two."""
    if centers.size < 2:
        return None
    return float(centers.max() - centers.min()) / (centers.size - 1)


def save_raster(filename, data, lons, lats, resolution=None):
    """Write a 2-D array to a single-band EPSG:4326 GeoTIFF.

    ``lons`` and ``lats`` are treated as cell-centre coordinates of a regular
    grid (assumption - confirm for new inputs). The raster bounds are therefore
    widened by half a cell on every side, because ``from_bounds`` expects the
    outer edges of the raster; passing the centre extent directly shrinks and
    shifts every pixel.

    Args:
        filename: Path of the GeoTIFF to create.
        data: 2-D array with rows along latitude and columns along longitude.
        lons: 1-D longitude coordinates of the columns.
        lats: 1-D latitude coordinates of the rows.
        resolution: Cell size in degrees, used only when it cannot be inferred
            because both axes hold a single coordinate.

    Raises:
        ValueError: If the cell size can neither be inferred nor was given.
    """
    lons = np.asarray(lons)
    lats = np.asarray(lats)

    dx = _axis_cell_size(lons)
    dy = _axis_cell_size(lats)
    # A single-cell axis borrows the spacing of the other axis (or `resolution`).
    dx = dx or dy or resolution
    dy = dy or dx
    if not dx:
        raise ValueError(
            'Cannot infer the grid cell size from a single-cell raster; '
            'pass `resolution` explicitly.'
        )

    # from_bounds yields a north-up raster (row 0 at the northern edge); rows
    # stored south-to-north have to be reversed before writing.
    if lats.size > 1 and lats[0] < lats[-1]:
        data = np.ascontiguousarray(data[::-1, :])

    transform = rasterio.transform.from_bounds(
        lons.min() - dx / 2,
        lats.min() - dy / 2,
        lons.max() + dx / 2,
        lats.max() + dy / 2,
        data.shape[1],
        data.shape[0],
    )

    with rasterio.open(
        filename,
        'w',
        driver='GTiff',
        height=data.shape[0],
        width=data.shape[1],
        count=1,
        dtype=data.dtype,
        crs='epsg:4326',
        transform=transform,
    ) as dst:
        dst.write(data, 1)

def calculate_zonal_stats(nuts, array, affine):
    """Return the zonal ``sum`` statistic of ``array`` for each zone in ``nuts``.

    Args:
        nuts: Zones passed to ``zonal_stats`` as the vector layer.
        array: Raster values passed to ``zonal_stats``.
        affine: Affine transform that georeferences ``array``.

    Returns:
        List with the ``'sum'`` entry of every result returned by
        ``zonal_stats``, in the order the results are returned.
    """
    per_zone = zonal_stats(nuts, array, affine=affine, stats=['sum'])
    sums = []
    for zone_stats in per_zone:
        sums.append(zone_stats['sum'])
    return sums

# Root folder holding the NUTS shapefile; outputs go to its 'data' subfolder.
workflow_folder = 'C:/Users/artuso/Documents/CLIMAXX/preprocessing/New-folder'
# Fail early with a readable message: otherwise the makedirs call below would
# silently create the folder tree and the run would only fail later on the
# missing shapefile.
if not os.path.isdir(workflow_folder):
    raise FileNotFoundError(
        f'Workflow folder not found, check the path: {workflow_folder}'
    )

# Directory scanned for NetCDF inputs.
nc_directory = "P:/watxene/ISIMIP/ISIMIP3a/InputData/climate/obsclim_updated/GSWP3-W5E5"

# create outputs folder (derived from workflow_folder so the two cannot drift apart)
out_dir = os.path.join(workflow_folder, 'data')
os.makedirs(out_dir, exist_ok=True)
nc_vars = ['pr']

# The file list holds paths relative to nc_directory, so the working directory
# is switched before globbing and has to stay there while the files are opened.
os.chdir(nc_directory)
# NOTE(review): without recursive=True, '**' behaves like '*', so only files
# exactly one folder below nc_directory are matched - confirm this is intended.
ncs = glob('**/*.nc')

logger.info(f'Found {len(ncs)} files.')

# Sample regions (NUTS ids) kept from the shapefile.
regions = ['ITF1', 'ITF2', 'ITF3']
shapefile = os.path.join(workflow_folder, "NUTS_RG_20M_2021_4326.shp")
nuts = gpd.read_file(shapefile)
nuts = nuts.loc[nuts['NUTS_ID'].isin(regions)]

# Output table: one row per selected region, holding only NUTS_ID. The explicit
# copy keeps the table independent of `nuts`.
output_tbl = pd.DataFrame(nuts).loc[:, ['NUTS_ID']].copy()

# Process every variable and join its monthly columns onto the region table.
for nc_var in nc_vars:
    # Files of one variable share this file-name stem.
    nc_file_pattern = f'gswp3-w5e5_obsclim_{nc_var}_global_daily_'
    varfiles = [f for f in ncs if nc_file_pattern in f]

    result_tbl = process_variable(nc_var, varfiles, nuts, out_dir)

    if len(nc_vars) > 1:
        # Different variables can yield the same month labels; prefix them with
        # the variable name so the merge keeps one distinct column per variable
        # and month. Single-variable runs keep the plain labels.
        result_tbl = result_tbl.rename(
            columns={
                col: f'{nc_var}_{col}'
                for col in result_tbl.columns
                if col != 'NUTS_ID'
            }
        )

    output_tbl = pd.merge(output_tbl, result_tbl, on='NUTS_ID')

# Rows become the month labels, columns the NUTS ids.
output_tbl_T = output_tbl.set_index('NUTS_ID').transpose()
output_tbl_T.to_csv(os.path.join(out_dir, 'precip_table.csv'), index=True)