Notebook for preprocessing the climate data, before further manipulating it with QGIS, in order to reduce its size

In [1]:
import netCDF4 as nc
import numpy as np

## Version for mean temperature

## Version for precipitation sum

In [2]:
# Input/output files and variable names for the precipitation run
var_name, out_var_name = 'rr', 'precipitation_sum'
path_to_climate_data = './rr_ens_mean_0.1deg_reg_v21.0e.nc'
output_file = 'yearly_precipitation_sum_port.nc'

# Reduction applied to the daily values of each year
func_over_year = np.sum

## Read file and create a new one

In [3]:
# Open the source file read-only; the bare name below displays its summary
climate_data = nc.Dataset(path_to_climate_data, mode='r')
climate_data

<class 'netCDF4._netCDF4.Dataset'>
root group (NETCDF4 data model, file format HDF5):
    CDI: Climate Data Interface version 1.6.3 (http://code.zmaw.de/projects/cdi)
    CDO: Climate Data Operators version 1.6.3 (http://code.zmaw.de/projects/cdo)
    E-OBS_version: 21.0e
    Conventions: CF-1.4
    References: http://surfobs.climate.copernicus.eu/dataaccess/access_eobs.php
    dimensions(sizes): longitude(705), latitude(465), time(25567)
    variables(dimensions): float64 longitude(longitude), float64 latitude(latitude), float64 time(time), int16 rr(time, latitude, longitude)
    groups: 

In [4]:
# Open the output file for writing; it will receive the cropped, yearly data
climate_data_by_year = nc.Dataset(output_file, mode="w")

In [5]:
# Transfer every global (file-level) attribute of the source to the output file
global_attributes = {key: climate_data.getncattr(key) for key in climate_data.ncattrs()}
climate_data_by_year.setncatts(global_attributes)

#### For the longitude and latitude, let's restrict the map just to around Portugal

In [6]:
# Bounding box as [min, max] pairs; the selection further down uses strict inequalities
lon_lim = [-10.0, -6.0]
lat_lim = [36.0, 42.5]

In [7]:
# Read the complete longitude and latitude axes of the source grid
lons = climate_data['longitude'][:]
lats = climate_data['latitude'][:]

In [8]:
# Positions along each axis that fall strictly inside the bounding box
lon_min, lon_max = lon_lim
lat_min, lat_max = lat_lim
where_lon = np.where((lons > lon_min) & (lons < lon_max))[0]
where_lat = np.where((lats > lat_min) & (lats < lat_max))[0]

In [9]:
# Coordinate values of the cropped grid
port_lons, port_lats = lons[where_lon], lats[where_lat]

In [10]:
# Lookup from dimension name to the cropped coordinate values
coordinates = dict(longitude=port_lons, latitude=port_lats)

In [11]:
# Recreate every source dimension except 'time', sized to the cropped axes
for name in climate_data.dimensions:
    if name == 'time':
        continue
    climate_data_by_year.createDimension(name, len(coordinates[name]))

In [12]:
# Recreate the longitude and latitude variables with the cropped values and
# the same datatype, dimensions and attributes as in the source file
for name, variable in climate_data.variables.items():
    if name not in ('longitude', 'latitude'):
        continue
    var = climate_data_by_year.createVariable(name, variable.datatype, variable.dimensions)
    var[:] = coordinates[name]
    var.setncatts(variable.__dict__)

In [13]:
# Display the output dataset summary to check the dimensions and variables created so far
climate_data_by_year

<class 'netCDF4._netCDF4.Dataset'>
root group (NETCDF4 data model, file format HDF5):
    CDI: Climate Data Interface version 1.6.3 (http://code.zmaw.de/projects/cdi)
    CDO: Climate Data Operators version 1.6.3 (http://code.zmaw.de/projects/cdo)
    E-OBS_version: 21.0e
    Conventions: CF-1.4
    References: http://surfobs.climate.copernicus.eu/dataaccess/access_eobs.php
    dimensions(sizes): longitude(40), latitude(65)
    variables(dimensions): float64 longitude(longitude), float64 latitude(latitude)
    groups: 

## Modification of time dimension to years

In [14]:
# Handle on the source (daily) time variable
time_var = climate_data.variables['time']

In [15]:
# Calendar-year range of the data to keep
initial_year, final_year = 1995, 2019
# arange excludes its stop value, so the year labels run from 1995 through 2018
years = np.arange(initial_year, final_year)
num_of_years = years.size

In [16]:
# Display the year labels that will populate the output time axis
years

array([1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005,
       2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016,
       2017, 2018])

### Create the dimension and variable 'time' restricting to the wanted years

In [17]:
# Fixed-size time dimension: one entry per year label
climate_data_by_year.createDimension('time', size=num_of_years)

<class 'netCDF4._netCDF4.Dimension'>: name = 'time', size = 24

In [18]:
# Integer time variable holding the year labels; it starts with the source
# time attributes, some of which are overwritten in the next cell
year_var = climate_data_by_year.createVariable('time', 'i4', ('time',))
year_var.setncatts(time_var.__dict__)
year_var[:] = years

In [19]:
# Replace the daily-axis metadata copied from the source with yearly metadata,
# then display the variable to verify the result
year_axis = climate_data_by_year['time']
year_axis.setncatts({'units': 'year', 'long_name': 'Time in years'})
year_axis

<class 'netCDF4._netCDF4.Variable'>
int32 time(time)
    standard_name: time
    long_name: Time in years
    units: year
    calendar: standard
unlimited dimensions: 
current shape = (24,)
filling on, default _FillValue of -2147483647 used

### Restrict the original file days to the years 1995-2019

In [20]:
# Decode the numeric time axis of the source file into calendar dates
days_date = nc.num2date(time_var, units=time_var.units, calendar=time_var.calendar)[:]
# Keep only the days inside the calendar years initial_year..final_year (both
# inclusive). The upper bound keeps the selection, and the day-count check
# further down, correct even if the source file extends beyond final_year.
where_days = np.where([initial_year <= d.year <= final_year for d in days_date])[0]

In [21]:
# Dates of the selected days, in the same order as the where_days indexes
days_sel = days_date[where_days]

In [22]:
# Display the selected dates to check the first and last day
days_sel

masked_array(data=[cftime.DatetimeGregorian(1995, 1, 1, 0, 0, 0, 0),
                   cftime.DatetimeGregorian(1995, 1, 2, 0, 0, 0, 0),
                   cftime.DatetimeGregorian(1995, 1, 3, 0, 0, 0, 0), ...,
                   cftime.DatetimeGregorian(2019, 12, 29, 0, 0, 0, 0),
                   cftime.DatetimeGregorian(2019, 12, 30, 0, 0, 0, 0),
                   cftime.DatetimeGregorian(2019, 12, 31, 0, 0, 0, 0)],
             mask=False,
       fill_value='?',
            dtype=object)

In [23]:
# Number of selected days, to be compared with the expected count computed next
len(days_sel)

9131

In [24]:
# Expected number of days in the calendar years initial_year..final_year
# (both inclusive), computed from real calendar dates so that leap days are
# counted correctly whatever the first year is.
int((np.datetime64(f'{final_year + 1}-01-01') - np.datetime64(f'{initial_year}-01-01'))
    / np.timedelta64(1, 'D'))

9131

The number of selected days matches the expected number of days for the selected years

# Calculate aggregated values of the variable (using `func_over_year`) over each year (for each lon and lat)

In [25]:
# Handle on the source data variable selected in the configuration; show its full shape
var = climate_data.variables[var_name]
var.shape

(25567, 465, 705)

#### Apply masks by longitude, latitude and year

In [None]:
# Read the subset given by the selected days, latitudes and longitudes.
# NOTE(review): this relies on netCDF4 applying each index array independently
# along its own axis (unlike numpy fancy indexing) - confirm against the netCDF4 docs.
var_sel = var[where_days, where_lat, where_lon]

In [None]:
# Shape of the subset just read
var_sel.shape

In [None]:
# Sanity check: aggregate of the whole subset (all selected days and grid cells),
# kept as a reference for the checks after the yearly split
func_over_year(var_sel)

### Create a different array for each agricultural year and aggregate over it

**Note:** the agricultural year starts on September 1st. By the year 1995, we mean the agricultural year running from 1st September 1995 to 31st August 1996.

In [None]:
import cftime

In [None]:
# Build, for every agricultural year, the positions (within days_sel) of its days.
# The dates are first converted to a plain array because comparisons on the
# masked array caused problems.
days_sel_array = np.array(days_sel)
where_years = []
for year in years:
    # An agricultural year runs from 1 September (included) to the next 1 September (excluded)
    season_start = cftime.datetime(year, 9, 1, calendar='gregorian')
    season_end = cftime.datetime(year + 1, 9, 1, calendar='gregorian')
    in_season = (days_sel_array >= season_start) & (days_sel_array < season_end)
    where_years.append(np.where(in_season)[0])

In [None]:
# Slice the subset along its first (day) axis into one array per agricultural year
yearly_arrays = [var_sel[day_idx, :, :] for day_idx in where_years]

yearly_arrays[0].shape

In [None]:
# Sanity check: aggregate each yearly array to a single value, then aggregate those values
func_over_year([func_over_year(arr) for arr in yearly_arrays])

It makes sense that the aggregated value changes, since we are not considering all the days of the selected years

In [None]:
# Collapse the first (day) axis of every yearly array with the chosen aggregation
av_yearly_arrays = [func_over_year(year_block, axis=0) for year_block in yearly_arrays]

In [None]:
# Sanity check: same nested aggregation as before, now on the per-year aggregated maps
func_over_year([func_over_year(arr) for arr in av_yearly_arrays])

### Restack the yearly arrays to form the final array

In [None]:
# Pile the per-year maps along a new leading (year) axis and show the resulting shape
var_sel_over_year = np.ma.stack(av_yearly_arrays)
var_sel_over_year.shape

In [None]:
# Sanity check: aggregate of the final stacked array
func_over_year(var_sel_over_year)

# Add variable per year

In [None]:
# 32-bit float output variable laid out as (time, latitude, longitude)
climate_data_by_year.createVariable(out_var_name, 'f4', dimensions=('time', 'latitude', 'longitude'))

In [None]:
# Handle on the freshly created output variable
yearly_temp = climate_data_by_year[out_var_name]
# Copy the attributes of the source variable, leaving out '_FillValue': the
# original code removed it before copying as well.
# NOTE(review): presumably because netCDF4 only accepts a fill value at variable
# creation time - confirm against the netCDF4 docs.
# Filtering (instead of pop) also works when the source variable has no '_FillValue'.
attributes = {key: value for key, value in var.__dict__.items() if key != '_FillValue'}
yearly_temp.setncatts(attributes)

In [None]:
# Write the yearly aggregated data into the output variable
climate_data_by_year[out_var_name][:, :, :] = var_sel_over_year

# Close both files so that the output is fully written to disk before it is
# opened in another program, and the handle on the source file is released
climate_data_by_year.close()
climate_data.close()