In [None]:
# Install packages from requirements.txt (needed for this session)
! pip install -r ./../../requirements.txt

# Importing Packages

In Python, **importing packages** means loading external libraries or modules into your program so you can use their pre-built functions and tools. 

This is important because it allows you to leverage existing, tested code — saving time and effort — and helps organize your work by using specialized functionality such as handling dates and tabular data (`pandas`), numerical computing (`numpy`), or working with labeled multi-dimensional data like NetCDF files (`xarray`).


In [None]:
import xarray as xr
import numpy as np
import pandas as pd

# Creation of a Synthetic NetCDF File Using xarray

## Understanding NetCDF Structure First

In this exercise, we will create a synthetic NetCDF dataset using the powerful Python library **xarray**. While xarray offers many convenient tools and abstractions for working with multi-dimensional scientific data, our primary focus here is to understand the underlying **structure and architecture of NetCDF files**.

NetCDF (Network Common Data Form) is a widely used format for storing array-oriented scientific data. It organizes data into:

- **Dimensions** (e.g., time, latitude, longitude, vertical levels)  
- **Coordinates** that describe these dimensions  
- **Variables** containing the actual data values  
- **Metadata (attributes)** that provide essential context and descriptions  

Before diving deep into xarray’s features, it's important to grasp how NetCDF files are structured so that you can better appreciate the abstractions xarray provides and understand the data you work with.

Let’s start by building a simple synthetic dataset and exploring its components step-by-step.


In [None]:
import numpy as np
import pandas as pd
import xarray as xr

# -------------------------------
# Define dimensions
# -------------------------------

# Hourly time axis starting 2025-01-01. N_DAYS sets the record length
# (24 time steps per day); increase it for a longer synthetic record.
N_DAYS = 1
time = pd.date_range("2025-01-01", periods=24 * N_DAYS, freq="h")

# Horizontal grid over Cyprus: approx. 34.5° to 35.7° latitude and
# 32.2° to 34.0° longitude, at 0.05° spacing.
# np.arange with a fractional step can gain or lose the end point through
# floating-point rounding, so the number of points is computed explicitly and
# np.linspace pins both ends of each axis.
GRID_STEP = 0.05  # degrees
LAT_MIN, LAT_MAX = 34.5, 35.7
LON_MIN, LON_MAX = 32.2, 34.0

n_lat = int(round((LAT_MAX - LAT_MIN) / GRID_STEP)) + 1
n_lon = int(round((LON_MAX - LON_MIN) / GRID_STEP)) + 1

# Rounding to 2 decimals (the grid resolution) gives clean coordinate labels
# such as 34.55 instead of 34.550000000000004.
lat = np.round(np.linspace(LAT_MIN, LAT_MAX, n_lat), 2)
lon = np.round(np.linspace(LON_MIN, LON_MAX, n_lon), 2)

level = [1000, 850, 700, 500, 300]  # pressure levels in hPa

# -------------------------------
# Create data variables
# -------------------------------

# Seeded generator so that Restart & Run All reproduces the same synthetic data.
SEED = 42
rng = np.random.default_rng(SEED)

# Temperature: Gaussian noise around 15 (std 8) per (time, lat, lon) cell,
# shifted by 273.15 to express the values in Kelvin.
tm2 = (15 + 8 * rng.standard_normal((len(time), len(lat), len(lon)))) + 273.15

# Compute baseline pressure profile (in hPa) decreasing exponentially with vertical level (altitude)
# 1013.25 hPa is sea level pressure, np.linspace creates levels from 0 to 1.5 (unitless scale height)
pressure_base = 1013.25 * np.exp(-np.linspace(0, 1.5, len(level)))

# Expand baseline profile to 3D (level, lat, lon) by adding new axes for lat and lon (broadcasting),
# then add Gaussian noise (mean=0, std=5 hPa) to simulate horizontal variability in pressure.
pressure_static = (
    pressure_base[:, np.newaxis, np.newaxis]
    + 5 * rng.standard_normal((len(level), len(lat), len(lon)))
)

# Repeat the static field along a leading time axis (via broadcasting) and add a
# second, independent Gaussian noise term (std=5 hPa) that differs at every time step.
# Resulting shape: (time, level, lat, lon).
pressure = pressure_static[np.newaxis, ...] + 5 * rng.standard_normal(
    (len(time), len(level), len(lat), len(lon))
)

# -------------------------------
# Build Dataset with variable attributes
# -------------------------------

# Describe the record length from the actual (hourly) time axis so the global
# title cannot drift out of sync with the data it describes.
n_days, leftover_hours = divmod(len(time), 24)
if leftover_hours == 0:
    record_length = f"{n_days} day" if n_days == 1 else f"{n_days} days"
else:
    record_length = f"{len(time)} hours"

ds = xr.Dataset(
    {
        # Each data variable is given as (dimension names, values, attributes).
        "tm2": (["time", "lat", "lon"], tm2,
                {
                    "units": "K",
                    "long_name": "Surface Air Temperature",
                    "standard_name": "air_temperature",
                    "description": "Temperature measured near the surface"
                }),
        "pressure": (["time", "level", "lat", "lon"], pressure,
                     {
                         "units": "hPa",
                         "long_name": "Atmospheric Pressure",
                         "standard_name": "air_pressure",
                         "description": "Pressure at standard atmospheric levels"
                     }),
    },
    coords={
        # Each coordinate is given as (dimension name, values, attributes).
        "time": ("time", time, {
            "long_name": "Time",
            "standard_name": "time"
        }),
        "level": ("level", level, {
            "units": "hPa",
            "long_name": "Pressure Level",
            "positive": "down",
            "standard_name": "air_pressure"
        }),
        "lat": ("lat", lat, {
            "units": "degrees_north",
            "long_name": "Latitude",
            "standard_name": "latitude"
        }),
        "lon": ("lon", lon, {
            "units": "degrees_east",
            "long_name": "Longitude",
            "standard_name": "longitude"
        }),
    },
    # Global (file-level) attributes.
    attrs={
        "title": f"Synthetic Multi-Variable NetCDF for Cyprus (Hourly, {record_length})",
        "description": "Surface temperature and atmospheric pressure variables with metadata",
        "institution": "Training Session 2025"
    }
)

# -------------------------------
# Optionally write the dataset to a NetCDF file
# -------------------------------
write_netcdf = True  # flip to False to skip writing the file
output_file = "cyprus_multi_variable_hourly_metadata.nc"

if write_netcdf:
    # "time" is passed as the unlimited dimension of the output file.
    ds.to_netcdf(output_file, unlimited_dims="time")

# Keep the dataset as the cell's last expression so the notebook renders it.
ds


In [None]:
# Print the plain-text summary of the dataset built in the previous cell.
print(ds)

# Data and Metadata in NetCDF

Metadata is a crucial feature of NetCDF files. 

- **Data** contains the actual measurements or variables (e.g., temperature, pressure).
- **Metadata** provides descriptive information that explains the data:
  - Units of measurement
  - Variable names and descriptions
  - Coordinates and dimensions
  - Creation history and source

Metadata helps make data self-describing and easier to interpret by people and software.

---

| Aspect        | Data                          | Metadata                              |
|---------------|-------------------------------|-------------------------------------|
| **Definition**| Actual measurements or values | Descriptions about the data          |
| **Examples**  | Temperature, pressure         | Units, variable names, attributes    |
| **Type**     | Numeric arrays (floats, ints) | Strings, numbers, dates, attributes  |
| **Purpose**   | Represents physical phenomena | Provides context and meaning         |
| **Stored as** | Variables                     | Attributes (global or variable-level)|


In [None]:
# Show the 'tm2' variable together with its metadata.
# Attribute access (ds.tm2) and key access (ds['tm2']) return the same DataArray.
print(ds.tm2)

# Visual separator: blank lines around a row of 50 dashes.
print("\n")
print(50 * "-")
print("\n")

# Heading for the per-variable metadata listing.
print("Detect and print metadata for each variable")

In [None]:
# Nested loop: the outer loop walks over every variable in the dataset
# (coordinates included); the inner loop lists that variable's attributes.
for var_name in ds.variables:
    print(f"\nAttributes for variable '{var_name}':")

    # Attribute dictionary of the current variable
    var_attrs = ds[var_name].attrs
    for attr_name, attr_value in var_attrs.items():
        print(f"  {attr_name}: {attr_value}")

    # Report whether the variable is one of the dataset's coordinates
    is_coordinate = var_name in ds.coords
    if not is_coordinate:
        print(f"  [-] '{var_name}' is full variable")
    else:
        print(f"  [+] {var_name} is a coordinate!")

In [None]:
# Derive a Celsius version of 'tm2' (stored in Kelvin). The result is saved as a
# NEW variable, 'tm2_C'; the original 'tm2' is left unchanged, so re-running
# this cell always gives the same result.
ds["tm2_C"] = ds["tm2"] - 273.15
ds["tm2_C"].attrs["units"] = "°C"  # Set the units metadata of the new variable

# .item() turns the 0-d reduction results into plain Python floats, so the
# ':.2f' format spec does not depend on DataArray formatting support.
tm2_c_min = ds["tm2_C"].min().item()
tm2_c_max = ds["tm2_C"].max().item()

print("A new variable 'tm2_C' (Celsius) has been derived from 'tm2' (Kelvin); 'tm2' itself is unchanged.")
print()

print(f"The minimum and maximum values of 'tm2_C' are: {tm2_c_min:.2f} °C and {tm2_c_max:.2f} °C respectively.")
print()

#Something missing