## Load datasets

In [None]:
import xarray as xr
import pandas as pd
import numpy as np

# Opt in to xarray's new combine/concat keyword defaults to resolve the FutureWarning.
# The trailing semicolon keeps the returned options object out of the cell output.
xr.set_options(use_new_combine_kwarg_defaults=True);

<xarray.core.options.set_options at 0x20e606ea270>

In [2]:
# Load the table of European cities; later cells read its `name`, `latitude` and `longitude` columns.
eu_capitals = pd.read_csv("datasets/european_cities.csv")

In [3]:
# Period labels as they appear in the ERA5 file names of the historical baseline.
past = ['1940-1949', '1950-1959', '1960-1970']
# present = ['1994-2003', '2004-2013', '2014-2024']

# Opened datasets keyed by their period label.
datasets = {}

for period in past:
    print(f"Loading {period}...")
    # Open the dataset for this period
    ds = xr.open_dataset(rf"era5/t2m/{period}_t2m.nc")
    datasets[period] = ds

# Concatenate all datasets along the time axis; the dict keeps insertion order,
# so the files are joined in the order listed in `past`.
ds_all = xr.concat(datasets.values(), dim='time')

ds_all

Loading 1940-1949...
Loading 1950-1959...
Loading 1960-1970...


In [4]:
# Inspect the concatenated time axis of the `2t` variable to check the covered span and time step.
ds_all['2t'].time.values

array(['1940-01-01T00:00:00.000000000', '1940-01-01T03:00:00.000000000',
       '1940-01-01T06:00:00.000000000', ...,
       '1970-12-31T15:00:00.000000000', '1970-12-31T18:00:00.000000000',
       '1970-12-31T21:00:00.000000000'],
      shape=(90584,), dtype='datetime64[ns]')

## Keep grid points within ±0.45° (roughly 50 km) of the selected cities

In [5]:
# Half-width of the selection box in degrees, applied to both latitude and longitude.
# NOTE(review): 0.45 deg is roughly 50 km north-south, but the east-west extent shrinks
# with cos(latitude), so this is a lat/lon box rather than a true 50 km radius.
radius = 0.45

# Sort the cities by position
eu_cap = eu_capitals.sort_values(["latitude", "longitude"])

# Spatial extent of the dataset
lat_min_ds = ds_all.lat.min().item()
lat_max_ds = ds_all.lat.max().item()
lon_min_ds = ds_all.lon.min().item()
lon_max_ds = ds_all.lon.max().item()

# Keep only the cities located inside the dataset extent
eu_cap = eu_cap[
    (eu_cap['latitude'] >= lat_min_ds) & (eu_cap['latitude'] <= lat_max_ds) &
    (eu_cap['longitude'] >= lon_min_ds) & (eu_cap['longitude'] <= lon_max_ds)
].reset_index(drop=True)

n_cities = len(eu_cap)
print(f"Processing {n_cities} cities")

# A label-based slice has to follow the storage order of the coordinate; with the wrong
# order the selection is silently empty and every city mean becomes NaN. Detect the
# orientation once instead of assuming that latitude is stored in descending order.
lat_values = ds_all.lat.values
lat_descending = bool(lat_values.size > 1 and lat_values[0] > lat_values[-1])

# Process each city one at a time to avoid memory issues
city_data_list = []

for count, (lat, lon) in enumerate(zip(eu_cap['latitude'], eu_cap['longitude']), start=1):
    # Bounding box around the city
    lat_min = lat - radius
    lat_max = lat + radius
    lon_min = lon - radius
    lon_max = lon + radius

    lat_slice = slice(lat_max, lat_min) if lat_descending else slice(lat_min, lat_max)

    # Use xarray selection to get the subset of grid points inside the box
    ds_subset = ds_all['2t'].sel(
        lat=lat_slice,
        lon=slice(lon_min, lon_max)
    )

    # Unweighted mean over the spatial dimensions, leaving one value per time step
    city_mean = ds_subset.mean(dim=['lat', 'lon']).values

    city_data_list.append(city_mean)

    # Progress is counted from the loop position, not from the DataFrame index
    if count % 10 == 0:
        print(f"  Processed {count}/{n_cities} cities")

# Stack all city series column-wise
city_data_array = np.column_stack(city_data_list)  # (time, n_cities)

# Create DataArray with proper coordinates; latitude/longitude ride along the city dimension
ds_cities = xr.DataArray(
    city_data_array,
    dims=("time", "city"),
    coords={
        "time": ds_all['2t'].time.values,
        "city": eu_cap['name'].values,
        "latitude": ("city", eu_cap['latitude'].values),
        "longitude": ("city", eu_cap['longitude'].values)
    }
)

print("\n✅ Processing completed!")
print(f"Final shape: {ds_cities.shape}")
print(f"Cities: {n_cities}")
print(f"Time steps: {len(ds_cities.time)}")

Processing 292 cities
  Processed 10/292 cities
  Processed 20/292 cities
  Processed 30/292 cities
  Processed 40/292 cities
  Processed 50/292 cities
  Processed 60/292 cities
  Processed 70/292 cities
  Processed 80/292 cities
  Processed 90/292 cities
  Processed 100/292 cities
  Processed 110/292 cities
  Processed 120/292 cities
  Processed 130/292 cities
  Processed 140/292 cities
  Processed 150/292 cities
  Processed 160/292 cities
  Processed 170/292 cities
  Processed 180/292 cities
  Processed 190/292 cities
  Processed 200/292 cities
  Processed 210/292 cities
  Processed 220/292 cities
  Processed 230/292 cities
  Processed 240/292 cities
  Processed 250/292 cities
  Processed 260/292 cities
  Processed 270/292 cities
  Processed 280/292 cities
  Processed 290/292 cities

‚úÖ Processing completed!
Final shape: (90584, 292)
Cities: 292
Time steps: 90584


In [6]:
# Display the per-city series built above (dimensions: time x city).
ds_cities

## Compute daily max and min temperature per city

In [7]:
def compute_t2m_daily_max_min(da):
    """Reduce a sub-daily temperature series to its daily extremes.

    Args:
        da (xr.DataArray): Temperature values carrying a ``time`` dimension, e.g. with
            dimensions ('time', 'city') or ('time', 'latitude', 'longitude').
    Returns:
        xr.Dataset: Two variables, 'max_t2m' and 'min_t2m', holding the highest and the
            lowest value found in each '1D' resampling bin.
    """
    per_day = {
        'max_t2m': da.resample(time='1D').max(),
        'min_t2m': da.resample(time='1D').min(),
    }
    return xr.Dataset(per_day)

In [8]:
# Reduce the per-city series to one daily maximum and one daily minimum per city

ds_cities_daily = compute_t2m_daily_max_min(ds_cities)
ds_cities_daily

In [9]:
# Sanity check: the daily maximum must never fall below the daily minimum of the same day.
# A NaN on either side compares False, so missing values would also trip the assertion.

max_not_below_min = ds_cities_daily['max_t2m'] >= ds_cities_daily['min_t2m']
assert np.all(max_not_below_min), "Error: max_t2m is less than min_t2m somewhere!"
print("Verification passed: max_t2m is always greater than or equal to min_t2m.")

Verification passed: max_t2m is always greater than or equal to min_t2m.


## Compute the number of days per year with a maximum temperature above 30 degrees Celsius

In [10]:
# Preview the daily maximum temperature, which the hot-day statistics below are computed from.
ds_cities_daily.max_t2m

In [11]:
def find_nb_days_above_threshold_per_year(da, threshold):
    """Count, for every calendar year, the time steps whose value exceeds a threshold.

    Args:
        da: xarray DataArray with a ``time`` dimension; values are expected to be in
            Kelvin, because the Celsius threshold is shifted by 273.15 before comparing.
        threshold: temperature threshold in degrees Celsius.
    Returns:
        xarray DataArray of counts in which ``time`` is replaced by a ``year`` dimension.
    """
    # The comparison is strict: a value exactly at the threshold is not counted.
    exceeds = da > (threshold + 273.15)
    return exceeds.groupby('time.year').sum(dim='time')

def compute_avg_hot_days_per_city(ds_reduced, threshold=30):
    """
    Compute the average number of days above threshold per year for each city over the entire period.

    Args:
        ds_reduced: xarray Dataset exposing a ``max_t2m`` variable with ``time`` and
            ``city`` dimensions (values expected in Kelvin).
        threshold: temperature threshold in Celsius

    Returns:
        dict: {city_name: avg_days_above_threshold}, with plain ``str`` keys and
        plain ``float`` values.
    """
    # Daily maximum temperature
    tasmax = ds_reduced.max_t2m

    # Number of days above the threshold for each year and city
    days_per_year = find_nb_days_above_threshold_per_year(tasmax, threshold)

    # Mean across all years, leaving one value per city
    avg_days_per_city = days_per_year.mean(dim='year')

    # Read labels and values positionally in one pass instead of one label lookup per city.
    return {
        str(city): float(avg)
        for city, avg in zip(avg_days_per_city.city.values, avg_days_per_city.values)
    }

In [12]:
# Average yearly number of days above 30 degrees Celsius for every city, then spot-check one city
avg_days_above_30C_per_city = compute_avg_hot_days_per_city(ds_cities_daily, threshold=30)
avg_days_above_30C_per_city['Bern']

0.12903225806451613

In [13]:
# Example: spot-check two cities (the degree sign in the messages was mis-encoded)

print(f"Average number of days above 30°C per year in Lausanne: {avg_days_above_30C_per_city['Lausanne']}")
print(f"Average number of days above 30°C per year in Madrid: {avg_days_above_30C_per_city['Madrid']}")

Average number of days above 30¬∞C per year in Lausanne: 0.12903225806451613
Average number of days above 30¬∞C per year in Madrid: 45.096774193548384


## Compute the longest run of consecutive days above 30 degrees Celsius per year

In [14]:
def max_consecutive_days(arr):
    """Return the length of the longest unbroken run of truthy values in a 1D sequence.

    An empty sequence, or one without any truthy value, yields 0.
    """
    longest = 0
    streak = 0

    for flag in arr:
        # A falsy entry ends the current run; a truthy one extends it.
        streak = streak + 1 if flag else 0
        if streak > longest:
            longest = streak

    return longest

def find_max_consecutive_days_per_year(da, threshold):
    """Longest yearly run of consecutive time steps above a threshold, for each city.

    Runs are measured inside each calendar year separately, so a run that crosses
    31 December is split between the two years.

    Args:
        da: xarray DataArray with dimensions (time, city); values are expected to be in
            Kelvin, because the Celsius threshold is shifted by 273.15 before comparing.
        threshold: temperature threshold in Celsius
    Returns:
        xarray DataArray with dimensions (year, city) containing max consecutive days per year
    """
    exceeds = da > (threshold + 273.15)

    years = []
    longest_runs = []

    # One row per year, one column per city (cities addressed by position).
    for year, yearly in exceeds.groupby('time.year'):
        years.append(year)
        longest_runs.append([
            max_consecutive_days(yearly.isel(city=position).values)
            for position in range(yearly.sizes['city'])
        ])

    return xr.DataArray(
        longest_runs,
        dims=['year', 'city'],
        coords={'year': years, 'city': da.city.values},
    )

def compute_avg_consec_hot_days_per_city(ds_reduced, threshold=30):
    """
    Compute the average, over the years, of the longest yearly run of consecutive days
    above threshold for each city.

    Args:
        ds_reduced: xarray Dataset exposing a ``max_t2m`` variable with ``time`` and
            ``city`` dimensions (values expected in Kelvin).
        threshold: temperature threshold in Celsius
    Returns:
        dict: {city_name: avg_max_consecutive_days_above_threshold}, with plain ``str``
        keys and plain ``float`` values.
    """
    # Daily maximum temperature
    tasmax = ds_reduced.max_t2m

    # Longest run above the threshold for each year and city
    consec_days_per_year = find_max_consecutive_days_per_year(tasmax, threshold)

    # Mean across all years, leaving one value per city
    avg_consec_days_per_city = consec_days_per_year.mean(dim='year')

    # Keys are always converted with str(), matching compute_avg_hot_days_per_city and the
    # str(city) lookup done when the dataframe is assembled; values are read positionally.
    return {
        str(city): float(avg)
        for city, avg in zip(
            avg_consec_days_per_city.city.values, avg_consec_days_per_city.values
        )
    }

In [15]:
# Mean, over the years, of each city's longest yearly run of days above 30 degrees Celsius
avg_consecutive_days_above_30C_per_city = compute_avg_consec_hot_days_per_city(ds_cities_daily, threshold=30)

In [16]:
# Example: the values are yearly longest runs averaged over the years, so the label says so
# (the degree sign in the messages was mis-encoded)

print(f"Average longest run of consecutive days above 30°C per year in Lausanne: {avg_consecutive_days_above_30C_per_city['Lausanne']}")
print(f"Average longest run of consecutive days above 30°C per year in Madrid: {avg_consecutive_days_above_30C_per_city['Madrid']}")

Average number of consecutive days above 30¬∞C per year in Lausanne: 0.12903225806451613
Average number of consecutive days above 30¬∞C per year in Madrid: 15.225806451612904


## Aggregate daily data to seasonal data

In [17]:
def aggregate_per_season(dataset):
    """Average the data inside each quarterly bin anchored on December ('QS-DEC').

    The bins start in December, March, June and September, i.e. they follow the
    meteorological seasons.

    NOTE(review): a series that does not start on 1 December or end on the last day of
    November leaves the first/last bin only partially filled - confirm whether those
    partial seasons should be kept.
    """
    return dataset.resample(time='QS-DEC').mean()

In [18]:
# Seasonal means of the daily maximum and minimum temperature for every city
ds_seasonal = aggregate_per_season(ds_cities_daily)

In [19]:
# Display the seasonal dataset
ds_seasonal

In [20]:
# Verification: compare the number of time steps before and after the seasonal aggregation
print(f"Original dataset time points: {ds_cities_daily.sizes['time']}")
print(f"Seasonal aggregated dataset time points: {ds_seasonal.sizes['time']}")

Original dataset time points: 11323
Seasonal aggregated dataset time points: 125


## Average the seasonal features over the whole period

In [21]:
def average_over_period(dataset, month) -> xr.Dataset:
    """Time-average the entries whose timestamp falls in a given calendar month.

    Args:
        dataset: xarray object with a ``time`` coordinate.
        month: calendar month number to keep.
    Returns:
        The mean over ``time`` of the selected entries.
    """
    in_month = dataset['time.month'] == month
    return dataset.sel(time=in_month).mean(dim='time')

In [22]:
# Start month of each seasonal bin, in the order winter, spring, summer, autumn.
months = [12, 3, 6, 9]

# Average every season over all years, then stack the four results once along a new
# `time` axis. Positions along `time` follow the order of `months`; the axis carries
# no coordinate labels, so downstream code addresses the seasons by position.
ds_periods = xr.concat(
    [average_over_period(ds_seasonal, month) for month in months],
    dim='time',
)

In [23]:
ds_periods.sel(city='Lausanne')  # Example: all seasonal averages for one city (positions along `time` follow `months`)

## Build the dataframe with all features for every city

In [24]:
def construct_dataframe(dataset, hot_days, hot_days_consec) -> pd.DataFrame:
    """Build a one-row-per-city table of seasonal temperature features and hot-day statistics.

    Columns, in order: ``city``, ``latitude``, ``longitude``; then for each season every
    data variable as ``<variable>_<season>`` followed by ``temp_range_<season>``
    (``max_t2m`` minus ``min_t2m``); finally ``hot_days`` and ``hot_days_consec``.

    Args:
        dataset (xr.Dataset): Seasonal features with a ``city`` dimension, ``latitude`` and
            ``longitude`` coordinates along ``city``, and a ``time`` axis whose positions
            0-3 are read as winter, spring, summer and autumn. It must contain the
            ``max_t2m`` and ``min_t2m`` variables.
        hot_days (dict): Average number of hot days per city, keyed by ``str(city)``.
        hot_days_consec (dict): Average number of consecutive hot days per city, keyed by
            ``str(city)``.
    Returns:
        pd.DataFrame: DataFrame with seasonal features and hot days statistics, rows in
        the order of ``dataset.city``.
    Raises:
        KeyError: If a city of ``dataset`` is missing from ``hot_days`` or ``hot_days_consec``.
    """
    seasons = ['winter', 'spring', 'summer', 'autumn']

    # Plain-string city names, used both as a column and as dictionary keys
    cities = [str(city) for city in dataset.city.values]

    # Every column is taken as a whole array in the dataset's own city order, which keeps
    # the rows aligned without one label lookup per city and per column.
    data = {
        'city': cities,
        'latitude': dataset.latitude.values,
        'longitude': dataset.longitude.values,
    }

    for i, season in enumerate(seasons):
        # Seasons are addressed by position along `time`
        season_data = dataset.isel(time=i)
        for feature in season_data.data_vars:
            data[f"{feature}_{season}"] = season_data[feature].values
        # Temperature range of the season
        data[f'temp_range_{season}'] = (
            season_data['max_t2m'].values - season_data['min_t2m'].values
        )

    # Hot days statistics, looked up by city name
    data['hot_days'] = [hot_days[name] for name in cities]
    data['hot_days_consec'] = [hot_days_consec[name] for name in cities]

    return pd.DataFrame(data)

In [25]:
# Assemble the per-city feature table and order the rows alphabetically by city name
df_final = construct_dataframe(ds_periods, avg_days_above_30C_per_city, avg_consecutive_days_above_30C_per_city).sort_values('city').reset_index(drop=True)
df_final.head()

Unnamed: 0,city,latitude,longitude,max_t2m_winter,min_t2m_winter,temp_range_winter,max_t2m_spring,min_t2m_spring,temp_range_spring,max_t2m_summer,min_t2m_summer,temp_range_summer,max_t2m_autumn,min_t2m_autumn,temp_range_autumn,hot_days,hot_days_consec
0,Aachen,50.776642,6.08342,276.10883,272.22626,3.882568,285.0144,277.98325,7.031158,292.88797,285.56985,7.318115,285.62918,280.07114,5.558044,1.225806,0.677419
1,Aberdeen,57.143688,-2.09814,278.4561,275.89136,2.564728,281.3751,278.114,3.261078,287.36282,284.17975,3.183075,284.12115,281.33508,2.786072,0.0,0.0
2,Aix-en-Provence,43.528301,5.44973,281.84695,275.5724,6.274567,289.44458,281.70117,7.743408,299.23914,290.57208,8.667053,291.3472,284.20383,7.143372,8.741935,3.83871
3,Alcal√° de Henares,40.482052,-3.35996,281.22195,273.9165,7.30545,289.46692,279.2759,10.19101,301.44443,288.99738,12.447052,291.34665,281.74014,9.606506,40.0,13.967742
4,Alicante,38.345169,-0.48149,286.0779,280.24158,5.836334,291.297,284.60458,6.692413,300.09616,293.21368,6.882477,294.0286,287.95935,6.069244,9.096774,3.516129


In [27]:
# Expect one row per city and 17 columns (3 metadata + 4 seasons x 3 features + 2 hot-day statistics)
df_final.shape

(292, 17)

## Save the features as a CSV file

In [26]:
# The file name is defined once so the written path and the confirmation message cannot drift apart.
output_name = "climate_features_1940-1970_temperature.csv"
df_final.to_csv(f"datasets/{output_name}", index=False)
print(f"Saved {output_name}")

Saved climate_features_1940-1970_temperature.csv
