Notebook that pre-processes the climate data to reduce its size before it is further manipulated with QGIS

In [47]:
import calendar

import netCDF4 as nc
import numpy as np

## Version for mean temperature

### Version for mean temperature above 20°C

### Version for mean temperature above 25°C

## Version for minimum temperature

### Version for minimum temperature below 0°C

## Version for maximum temperature

In [48]:
# Input: source NetCDF file holding the daily values of the variable to aggregate
path_to_climate_data = './tx_ens_mean_0.1deg_reg_v21.0e.nc'
# Output: NetCDF file that receives the per-year aggregated values
# NOTE(review): the name ends in '.nc.nc' — looks like an accidentally doubled extension; confirm before renaming
output_file = 'yearly_average_maximum_temperatures_port.nc.nc'
# Name of the variable to read from the source file
var_name = 'tx'
# Name of the aggregated variable created in the output file
out_var_name = 'average_max_temperature'

# Function used to aggregate the days of each year (later called with axis=0 on each yearly array)
func_over_year = np.average

### Version for maximum temperature above 30°C

## Version for precipitation sum

### Version for number of days without precipitation

### Version for longest period without rain

## Read file and create a new one

In [49]:
# Open the source climate file read-only; the bare name below displays its summary
climate_data = nc.Dataset(path_to_climate_data, mode='r')
climate_data

<class 'netCDF4._netCDF4.Dataset'>
root group (NETCDF4 data model, file format HDF5):
    E-OBS_version: 21.0e
    Conventions: CF-1.4
    References: http://surfobs.climate.copernicus.eu/dataaccess/access_eobs.php
    dimensions(sizes): longitude(705), latitude(465), time(25567)
    variables(dimensions): float64 longitude(longitude), float64 latitude(latitude), int32 time(time), int16 tx(time, latitude, longitude)
    groups: 

In [50]:
# Create the output file (write mode) that will hold the yearly values
climate_data_by_year = nc.Dataset(output_file, mode="w")

In [51]:
# Carry the file-level (global) attributes of the source over to the output file
source_global_attributes = climate_data.__dict__
climate_data_by_year.setncatts(source_global_attributes)

#### For the longitude and latitude, let's restrict the map just to around Portugal

In [52]:
# Bounding box used to crop the grid: [lower, upper] limits for longitude and latitude.
# Both limits are exclusive, because the index selection compares with strict > and <.
lon_lim = [-10., -6.]
lat_lim = [36., 42.5]

In [53]:
# Read the full longitude and latitude coordinate arrays from the source file
lons = climate_data['longitude'][:]
lats = climate_data['latitude'][:]

In [54]:
# Positions of the grid points lying strictly inside the longitude / latitude limits
lon_min, lon_max = lon_lim
lat_min, lat_max = lat_lim
where_lon = np.where((lons > lon_min) & (lons < lon_max))[0]
where_lat = np.where((lats > lat_min) & (lats < lat_max))[0]

In [55]:
# Coordinate values of the cropped grid
port_lons, port_lats = lons[where_lon], lats[where_lat]

In [56]:
# Cropped coordinate arrays, keyed by the name of the dimension they belong to
coordinates = dict(longitude=port_lons, latitude=port_lats)

In [57]:
# Recreate every dimension except 'time', sized to the cropped coordinates
for name in climate_data.dimensions:
    if name == 'time':
        continue
    climate_data_by_year.createDimension(name, len(coordinates[name]))

In [58]:
# Copy the longitude and latitude variables (cropped values plus their attributes)
for name in ('longitude', 'latitude'):
    source_variable = climate_data.variables[name]
    cropped_variable = climate_data_by_year.createVariable(
        name, source_variable.datatype, source_variable.dimensions)
    cropped_variable[:] = coordinates[name]
    cropped_variable.setncatts(source_variable.__dict__)

In [59]:
# Display the output dataset summary to check the cropped dimensions and copied variables
climate_data_by_year

<class 'netCDF4._netCDF4.Dataset'>
root group (NETCDF4 data model, file format HDF5):
    E-OBS_version: 21.0e
    Conventions: CF-1.4
    References: http://surfobs.climate.copernicus.eu/dataaccess/access_eobs.php
    dimensions(sizes): longitude(40), latitude(65)
    variables(dimensions): float64 longitude(longitude), float64 latitude(latitude)
    groups: 

## Modification of time dimension to years

In [60]:
# Handle to the source 'time' variable; its values, units and calendar are used further down
time_var = climate_data.variables['time']

In [61]:
initial_year, final_year = 1995, 2019
# np.arange excludes the stop value, so the labels run from initial_year to final_year - 1
years = np.arange(initial_year, final_year)
num_of_years = years.size

In [62]:
# Display the year labels that will index the output 'time' axis
years

array([1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005,
       2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016,
       2017, 2018])

### Create the dimension and variable 'time' restricting to the wanted years

In [63]:
# Fixed-size 'time' dimension with one entry per selected year
climate_data_by_year.createDimension('time', size=num_of_years)

<class 'netCDF4._netCDF4.Dimension'>: name = 'time', size = 24

In [64]:
# Create the 32-bit integer 'time' variable, copy the source time attributes, store the years
year_variable = climate_data_by_year.createVariable('time', 'i4', ('time',))
year_variable.setncatts(time_var.__dict__)
year_variable[:] = years

In [65]:
# Override the copied attributes so the axis is described in years, then display the variable
year_axis = climate_data_by_year['time']
year_axis.setncatts({'units': 'year', 'long_name': 'Time in years'})
year_axis

<class 'netCDF4._netCDF4.Variable'>
int32 time(time)
    units: year
    long_name: Time in years
    calendar: standard
    standard_name: time
unlimited dimensions: 
current shape = (24,)
filling on, default _FillValue of -2147483647 used

### Restrict the original file days to the years 1995-2019

In [66]:
# Convert the raw time values of the source file into calendar dates
days_date = nc.num2date(time_var, units=time_var.units, calendar=time_var.calendar)[:]
# Keep only the days inside [initial_year, final_year]. The upper bound matters when the
# source file extends past final_year: without it those extra days would be selected too.
where_days = np.where([initial_year <= d.year <= final_year for d in days_date])[0]

In [67]:
# Dates of the selected days, in the same order as the where_days indexes
days_sel = days_date[where_days]

In [68]:
# Display the selected dates to check the first and last day of the selection
days_sel

masked_array(data=[cftime.DatetimeGregorian(1995, 1, 1, 0, 0, 0, 0),
                   cftime.DatetimeGregorian(1995, 1, 2, 0, 0, 0, 0),
                   cftime.DatetimeGregorian(1995, 1, 3, 0, 0, 0, 0), ...,
                   cftime.DatetimeGregorian(2019, 12, 29, 0, 0, 0, 0),
                   cftime.DatetimeGregorian(2019, 12, 30, 0, 0, 0, 0),
                   cftime.DatetimeGregorian(2019, 12, 31, 0, 0, 0, 0)],
             mask=False,
       fill_value='?',
            dtype=object)

In [69]:
# Number of selected days, to be compared with the expected count computed in the next cell
len(days_sel)

9131

In [70]:
# Expected number of days in the calendar years initial_year..final_year (both included).
# calendar.leapdays counts the leap years in [y1, y2), so the result stays correct for any
# pair of years, unlike the former `num_of_years//4` shortcut.
num_calendar_years = final_year - initial_year + 1
num_calendar_years*365 + calendar.leapdays(initial_year, final_year + 1)

9131

The number of days corresponds to the number of years

# Calculate values of the variable over each year (for each lon and lat)

In [71]:
# Source variable to aggregate; the bare expression shows its (time, latitude, longitude) shape
var = climate_data.variables[var_name]
var.shape

(25567, 465, 705)

### Apply masks by longitude, latitude and year

In [72]:
# Crop the variable to the selected days, latitudes and longitudes in a single read.
# NOTE(review): this relies on netCDF4 applying the three index arrays independently per axis
# (unlike NumPy fancy indexing); the (366, 65, 40) yearly shape shown later is consistent with that.
var_sel = var[where_days, where_lat, where_lon]

In [73]:
# count() is a method of masked arrays that returns the number of non-masked values in the array.
# Here: how many non-masked values are exactly 0 before any transformation.
var_sel[var_sel == 0].count()

66

## Transformations to calculate different values

In [74]:
# Number of non-masked values at or below -20: expected to be 0, i.e. no valid value lies that low
var_sel[var_sel<=-20].count()

0

In [75]:
# Total number of elements picked by the same condition: .size also counts the masked entries
var_sel[var_sel<=-20].size

9039690

**NOTE:** if you select values with `<` a certain value, the masked entries will be included too! Therefore, always add a lower bound as well to avoid it.

In [76]:
# Reference count of non-masked values: it has to stay the same after the transformation
var_sel.count()

14700910

In [77]:
# Version counting the days with average temperature over 20°C:
# values in (-20, 20) become 0 and values >= 20 become 1, editing var_sel in place.
# The > -20 lower bound keeps masked entries out of the selection (see the NOTE markdown cell).
if out_var_name == 'days_mean_temperature_over_20':
    var_sel[(var_sel < 20) & (var_sel > -20)] = 0
    var_sel[var_sel >= 20] = 1

In [78]:
# Version counting the days with average temperature over 25°C:
# values in (-20, 25) become 0 and values >= 25 become 1, editing var_sel in place.
# The > -20 lower bound keeps masked entries out of the selection (see the NOTE markdown cell).
if out_var_name == 'days_mean_temperature_over_25':
    var_sel[(var_sel < 25) & (var_sel > -20)] = 0
    var_sel[var_sel >= 25] = 1

In [79]:
# Version counting the days with minimum temperature at or below 0°C, editing var_sel in place.
# Positive values are first parked on the sentinel 10, so that the 1 written for the
# (-20, 0] values is not caught by a "> 0" test; the sentinel is then turned into 0.
if out_var_name == 'days_min_temperature_below_0':
    var_sel[var_sel > 0] = 10
    var_sel[(var_sel <= 0) & (var_sel > -20)] = 1
    var_sel[var_sel == 10] = 0

In [80]:
# Version counting the days with maximum temperature over 30°C:
# values in (-20, 30) become 0 and values >= 30 become 1, editing var_sel in place.
# The > -20 lower bound keeps masked entries out of the selection (see the NOTE markdown cell).
if out_var_name == 'days_max_temperature_over_30':
    var_sel[(var_sel < 30) & (var_sel > -20)] = 0
    var_sel[var_sel >= 30] = 1

In [81]:
# Versions based on the days without precipitation: days with a value of 0 become 1 and all
# other valid days become 0, editing var_sel in place.
# Non-zero values are first parked on the sentinel 10 so they are not mixed up with the 1
# written for the dry days; the sentinel is then turned into 0.
if (out_var_name == 'days_without_precipitation') or (out_var_name == 'max_days_without_precipitation'):
    var_sel[(var_sel != 0)  & (var_sel > -20)] = 10
    var_sel[var_sel == 0] = 1
    var_sel[var_sel == 10] = 0

In [82]:
# Aggregate over the whole selection at once, for comparison with the per-year results below
func_over_year(var_sel)

21.50194620605119

In [83]:
# Non-masked count after the transformation: must equal the reference count taken before it
var_sel.count()

14700910

### Create a different array for each agricultural year and average over it

**Note:** the agricultural year starts on September 1st. With the year 1995, we mean the weather in agricultural year going from 1st September 1995 to 31st August 1996. 

In [84]:
import cftime

In [85]:
# Build, for every agricultural year, the indexes of its days (1st September of `year`
# up to, but excluding, 1st September of the following year).
days_sel_array = np.array(days_sel)  # plain array: comparing the masked array directly gave problems
where_years = []
for year in years:
    season_start = cftime.datetime(year, 9, 1, calendar='gregorian')
    season_end = cftime.datetime(year + 1, 9, 1, calendar='gregorian')
    in_season = (days_sel_array >= season_start) & (days_sel_array < season_end)
    where_years.append(np.where(in_season)[0])

In [86]:
# Slice var_sel into one array per agricultural year (time axis restricted to that year's days)
yearly_arrays = [var_sel[year_days, :, :] for year_days in where_years]

yearly_arrays[0].shape

(366, 65, 40)

In [87]:
# Check: aggregate each yearly array to a single number, then aggregate those numbers
func_over_year([func_over_year(arr) for arr in yearly_arrays])

21.4820613201962

It makes sense that the value changes, since we are not considering all the days of the years

In [88]:
# Collapse the time axis (axis 0) of every yearly array, leaving one value per grid cell
av_yearly_arrays = [func_over_year(one_year, axis=0) for one_year in yearly_arrays]

av_yearly_arrays[0].shape

(65, 40)

In [89]:
# Check: aggregate each per-year map to a single number, then aggregate those numbers
func_over_year([func_over_year(arr) for arr in av_yearly_arrays])

21.482062949702538

### Restack to form the final one

In [90]:
# Stack the per-year maps along a new leading axis (the default axis 0): (year, latitude, longitude)
var_sel_over_year = np.ma.stack(av_yearly_arrays)
var_sel_over_year.shape

(24, 65, 40)

In [91]:
 # Check: the aggregate of the stacked array should match the value of the previous check
func_over_year(var_sel_over_year)

21.482062949702538

# Add variable per year

In [92]:
# Output variable: 32-bit floats on the (time, latitude, longitude) grid
climate_data_by_year.createVariable(
    out_var_name, 'f4', dimensions=('time', 'latitude', 'longitude'))

<class 'netCDF4._netCDF4.Variable'>
float32 average_max_temperature(time, latitude, longitude)
unlimited dimensions: 
current shape = (24, 65, 40)
filling on, default _FillValue of 9.969209968386869e+36 used

In [93]:
yearly_temp = climate_data_by_year[out_var_name]
# Copy the attributes of the source variable, except '_FillValue': the fill value was already
# fixed when the output variable was created and cannot be assigned afterwards.
attributes = var.__dict__
# Default of None: a source variable without an explicit '_FillValue' no longer raises KeyError
attributes.pop('_FillValue', None)
# NOTE(review): any packing attributes of the source (e.g. scale_factor / add_offset) and its
# units / long_name are copied as well — confirm they still describe the aggregated variable.
yearly_temp.setncatts(attributes)

In [94]:
# Store the yearly values in the output variable
climate_data_by_year[out_var_name][:, :, :] = var_sel_over_year
# Flush the buffered data to disk so the file is complete for other programs. The dataset
# stays open here (NOTE(review): close it once no later cell needs it).
climate_data_by_year.sync()