# Weather Data Retrieval Testing

The purpose of this notebook is to test the outputs of the weather data retrieval module to ensure data is in the correct format, and retrieved as requested.

### Importing Libraries

In [18]:
import numpy as np
import matplotlib.pyplot as plt
import xarray as xr
import os
from pathlib import Path
import zipfile
import magic

# import pygrib as pg

### Helper Functions

In [19]:
def show_dir_contents(path='.'):
    """List contents of current directory"""
    base = Path(path).resolve()
    print("Contents of current directory:")
    for p in base.rglob('*'):
        print("  -> " + str(p.relative_to(base)))

In [20]:
def print_directory_tree(start_path='.', show_files=False):
    """Print directory tree similar to 'tree' command"""
    for root, dirs, files in os.walk(start_path):
        # Calculate indentation
        level = root.replace(start_path, '').count(os.sep)
        indent = '‚îÇ   ' * level

        if root == start_path:
            print(f"üìÅ {os.path.basename(root) or '.'}")
        else:
            print(f"{indent}‚îî‚îÄ‚îÄ üìÅ {os.path.basename(root)}")

        subindent = '‚îÇ   ' * (level + 1)

        if show_files:
            for f in sorted(files):
                print(f"{subindent}‚îî‚îÄ‚îÄ üìÑ {f}")

In [21]:
def get_mime_type(path):
    """Get MIME type of file using python-magic"""
    m = magic.Magic(mime=True)
    return m.from_file(path)


In [22]:
def read_magic(path, n=8):
    """Read first n bytes of file to determine MIME type"""
    with open(path, 'rb') as f:
        return f.read(n)

In [23]:
def is_grib(path):
    """Check if file is GRIB by reading first 4 bytes"""
    try:
        with open(path, 'rb') as f:
            return f.read(4) == b'GRIB'
    except OSError:
        return False

In [24]:
def is_zip(path):
    """Check if file is ZIP by using zipfile module"""
    return zipfile.is_zipfile(path)

In [25]:
def testing_filepaths(path):
    print(f"Testing filepath for [{os.path.basename(path)}]:")
    print(f"  - MIME type: {get_mime_type(path)}")
    print(f"  - Magic bytes: {read_magic(path)}")

In [26]:
def print_grib_summary(ds, show_attrs=False, show_stats=True):
    """Pretty print GRIB dataset summary"""

    print("=" * 80)
    print("GRIB FILE SUMMARY")
    print("=" * 80)

    # Basic info
    print(f"\nüìä BASIC INFORMATION")
    print(f"   File size: {ds.nbytes / (1024 * 1024):.2f} MB")
    print(f"   GRIB edition: {ds.attrs.get('GRIB_edition', 'N/A')}")
    print(f"   Centre: {ds.attrs.get('GRIB_centreDescription', ds.attrs.get('GRIB_centre', 'N/A'))}")

    # Dimensions
    print(f"\nüìê DIMENSIONS ({len(ds.dims)} total)")
    for dim_name, dim_size in ds.dims.items():
        coord = ds[dim_name]
        if hasattr(coord, 'values') and len(coord.values) > 0:
            range_str = f"{coord.values[0]} to {coord.values[-1]}"
        else:
            range_str = "N/A"
        print(f"   {dim_name:12s}: {dim_size:6d} elements ({range_str})")

    # Variables
    print(f"\nüìà DATA VARIABLES ({len(ds.data_vars)} total)")
    for var_name in ds.data_vars:
        var = ds[var_name]
        attrs = var.attrs

        print(f"   ‚îú‚îÄ {var_name}")
        print(f"   ‚îÇ  Shape: {var.shape}")
        print(f"   ‚îÇ  Type: {var.dtype}")
        print(f"   ‚îÇ  Size: {var.nbytes / 1024:.1f} KB")

        if 'long_name' in attrs:
            print(f"   ‚îÇ  Description: {attrs['long_name']}")
        if 'units' in attrs:
            print(f"   ‚îÇ  Units: {attrs['units']}")

        if show_stats:
            try:
                min_val = float(var.min().values)
                max_val = float(var.max().values)
                mean_val = float(var.mean().values)
                print(f"   ‚îÇ  Range: {min_val:.4f} to {max_val:.4f}")
                print(f"   ‚îÇ  Mean: {mean_val:.4f}")
            except:
                pass

    # Time information
    print(f"\n‚è∞ TIME INFORMATION")
    if 'time' in ds.coords:
        time_coord = ds['time']
        if len(time_coord) > 0:
            print(f"   Start: {time_coord.values[0]}")
            print(f"   End: {time_coord.values[-1]}")
            print(f"   Steps: {len(time_coord)}")
            print(f"   Frequency: Inferring from data...")

    if 'step' in ds.coords:
        step_coord = ds['step']
        if len(step_coord) > 0:
            print(f"   Forecast steps: {len(step_coord)}")
            print(f"   Step range: {step_coord.values[0]} to {step_coord.values[-1]}")

    # Spatial information
    print(f"\nüåç SPATIAL INFORMATION")
    if 'latitude' in ds.coords and 'longitude' in ds.coords:
        lat = ds['latitude']
        lon = ds['longitude']
        print(f"   Latitude: {lat.values[0]:.2f}¬∞ to {lat.values[-1]:.2f}¬∞ ({len(lat)} points)")
        print(f"   Longitude: {lon.values[0]:.2f}¬∞ to {lon.values[-1]:.2f}¬∞ ({len(lon)} points)")
        print(f"   Grid resolution: {(lat.values[1] - lat.values[0]):.2f}¬∞ x {(lon.values[1] - lon.values[0]):.2f}¬∞")
        print(f"   Grid size: {len(lat)} x {len(lon)} = {len(lat) * len(lon)} points")

    # Metadata
    if show_attrs:
        print(f"\nüìù METADATA")
        for key, value in ds.attrs.items():
            if isinstance(value, str) and len(value) > 100:
                print(f"   {key}: {value[:100]}...")
            else:
                print(f"   {key}: {value}")

    print("\n" + "=" * 80)

### Defining Filepaths

In [27]:
!pwd

/Users/Daniel/Desktop/open-source-marginal-emissions.nosync/notebooks


In [35]:
current_dir = os.getcwd()
base_dir = os.path.dirname(current_dir)

print_directory_tree(base_dir, show_files=False)

üìÅ open-source-marginal-emissions.nosync
‚îÇ   ‚îî‚îÄ‚îÄ üìÅ information
‚îÇ   ‚îî‚îÄ‚îÄ üìÅ logs
‚îÇ   ‚îÇ   ‚îî‚îÄ‚îÄ üìÅ weather_data_retrieval
‚îÇ   ‚îÇ   ‚îî‚îÄ‚îÄ üìÅ grid_data_retrieval
‚îÇ   ‚îî‚îÄ‚îÄ üìÅ configs
‚îÇ   ‚îÇ   ‚îî‚îÄ‚îÄ üìÅ pipelines
‚îÇ   ‚îÇ   ‚îî‚îÄ‚îÄ üìÅ weather
‚îÇ   ‚îÇ   ‚îî‚îÄ‚îÄ üìÅ grid
‚îÇ   ‚îî‚îÄ‚îÄ üìÅ packages
‚îÇ   ‚îÇ   ‚îî‚îÄ‚îÄ üìÅ osme_common
‚îÇ   ‚îÇ   ‚îÇ   ‚îî‚îÄ‚îÄ üìÅ tests
‚îÇ   ‚îÇ   ‚îÇ   ‚îî‚îÄ‚îÄ üìÅ docs
‚îÇ   ‚îÇ   ‚îÇ   ‚îî‚îÄ‚îÄ üìÅ src
‚îÇ   ‚îÇ   ‚îÇ   ‚îÇ   ‚îî‚îÄ‚îÄ üìÅ osme_common
‚îÇ   ‚îÇ   ‚îÇ   ‚îÇ   ‚îÇ   ‚îî‚îÄ‚îÄ üìÅ __pycache__
‚îÇ   ‚îÇ   ‚îî‚îÄ‚îÄ üìÅ weather_data_retrieval
‚îÇ   ‚îÇ   ‚îÇ   ‚îî‚îÄ‚îÄ üìÅ tests
‚îÇ   ‚îÇ   ‚îÇ   ‚îî‚îÄ‚îÄ üìÅ docs
‚îÇ   ‚îÇ   ‚îÇ   ‚îî‚îÄ‚îÄ üìÅ src
‚îÇ   ‚îÇ   ‚îÇ   ‚îÇ   ‚îî‚îÄ‚îÄ üìÅ weather_data_retrieval
‚îÇ   ‚îÇ   ‚îÇ   ‚îÇ   ‚îÇ   ‚îî‚îÄ‚îÄ üìÅ io
‚îÇ   ‚îÇ   ‚îÇ   ‚îÇ   ‚îÇ   ‚îÇ   ‚îî‚îÄ‚îÄ üìÅ __pycache__
‚îÇ   ‚îÇ   ‚îÇ   ‚îÇ   ‚îÇ   ‚îî‚îÄ‚îÄ 

In [29]:
downloaded_data_dir = base_dir + "/data/era5-land/raw/"
show_dir_contents(downloaded_data_dir)

Contents of current directory:
  -> era5-land_N37W68S6E98_8b8bc3761fec_2018-06.grib
  -> era5-land_N38W68S36E70_02254a67da08_2018-06.grib
  -> .DS_Store
  -> era5-land_N37W68S6E98_8b8bc3761fec_2018-02.download
  -> era5-land_N38W68S36E70_02254a67da08_2018-01.grib
  -> era5-land_N38W68S36E70_02254a67da08_2018-02.grib
  -> era5-land_N37W68S6E98_8b8bc3761fec_2018-03.grib
  -> era5-land_N38W68S36E70_02254a67da08_2018-02.grib.5b7b6.idx
  -> era5-land_N38W68S36E70_02254a67da08_2018-03.grib
  -> era5-land_N38W68S36E70_02254a67da08_2018-04.grib
  -> era5-land_N37W68S6E98_8b8bc3761fec_2018-04.grib
  -> era5-land_N37W68S6E98_8b8bc3761fec_2018-01 copy.zip
  -> era5-land_N37W68S6E98_8b8bc3761fec_2018-05.grib
  -> era5-land_N37W68S6E98_8b8bc3761fec_2018-01.download
  -> data.grib
  -> era5-land_N38W68S36E70_02254a67da08_2018-05.grib


In [30]:
test_filepath_1 = downloaded_data_dir + "era5-land_N37W68S6E98_8b8bc3761fec_2018-02.download"
test_filepath_2 = downloaded_data_dir + "era5-land_N38W68S36E70_02254a67da08_2018-02.grib"
test_filepath_3 = downloaded_data_dir + "era5-land_N37W68S6E98_8b8bc3761fec_2018-01 copy.zip"

In [31]:
testing_filepaths(test_filepath_1)
testing_filepaths(test_filepath_2)
testing_filepaths(test_filepath_3)

Testing filepath for [era5-land_N37W68S6E98_8b8bc3761fec_2018-02.download]:
  - MIME type: application/zip
  - Magic bytes: b'PK\x03\x04\x14\x00\x00\x00'
Testing filepath for [era5-land_N38W68S36E70_02254a67da08_2018-02.grib]:
  - MIME type: application/grib;edition=1
  - Magic bytes: b'GRIB\x00\x03\xde\x01'
Testing filepath for [era5-land_N37W68S6E98_8b8bc3761fec_2018-01 copy.zip]:
  - MIME type: application/octet-stream
  - Magic bytes: b'PK\x03\x04\x14\x00\x00\x00'


In [32]:
grib_file = xr.open_dataset(test_filepath_2, engine='cfgrib')
print(grib_file)

<xarray.Dataset> Size: 2MB
Dimensions:     (time: 29, step: 24, latitude: 21, longitude: 21)
Coordinates:
  * time        (time) datetime64[ns] 232B 2018-01-31 2018-02-01 ... 2018-02-28
  * step        (step) timedelta64[ns] 192B 01:00:00 ... 1 days 00:00:00
  * latitude    (latitude) float64 168B 38.0 37.9 37.8 37.7 ... 36.2 36.1 36.0
  * longitude   (longitude) float64 168B 68.0 68.1 68.2 68.3 ... 69.8 69.9 70.0
    number      int64 8B ...
    surface     float64 8B ...
    valid_time  (time, step) datetime64[ns] 6kB ...
Data variables:
    t2m         (time, step, latitude, longitude) float32 1MB ...
    tp          (time, step, latitude, longitude) float32 1MB ...
Attributes:
    GRIB_edition:            1
    GRIB_centre:             ecmf
    GRIB_centreDescription:  European Centre for Medium-Range Weather Forecasts
    GRIB_subCentre:          0
    Conventions:             CF-1.7
    institution:             European Centre for Medium-Range Weather Forecasts
    history:      

In [33]:
print_grib_summary(grib_file, show_stats=True)

GRIB FILE SUMMARY

üìä BASIC INFORMATION
   File size: 2.35 MB
   GRIB edition: 1
   Centre: European Centre for Medium-Range Weather Forecasts

üìê DIMENSIONS (4 total)
   time        :     29 elements (2018-01-31T00:00:00.000000000 to 2018-02-28T00:00:00.000000000)
   step        :     24 elements (3600000000000 nanoseconds to 86400000000000 nanoseconds)
   latitude    :     21 elements (38.0 to 36.0)
   longitude   :     21 elements (68.0 to 70.0)

üìà DATA VARIABLES (2 total)
   ‚îú‚îÄ t2m
   ‚îÇ  Shape: (29, 24, 21, 21)
   ‚îÇ  Type: float32
   ‚îÇ  Size: 1199.0 KB
   ‚îÇ  Description: 2 metre temperature
   ‚îÇ  Units: K
   ‚îÇ  Range: 245.7868 to 298.0542
   ‚îÇ  Mean: 278.1219
   ‚îú‚îÄ tp
   ‚îÇ  Shape: (29, 24, 21, 21)
   ‚îÇ  Type: float32
   ‚îÇ  Size: 1199.0 KB
   ‚îÇ  Description: Total precipitation
   ‚îÇ  Units: m


  for dim_name, dim_size in ds.dims.items():


   ‚îÇ  Range: 0.0000 to 0.0309
   ‚îÇ  Mean: 0.0008

‚è∞ TIME INFORMATION
   Start: 2018-01-31T00:00:00.000000000
   End: 2018-02-28T00:00:00.000000000
   Steps: 29
   Frequency: Inferring from data...
   Forecast steps: 24
   Step range: 3600000000000 nanoseconds to 86400000000000 nanoseconds

üåç SPATIAL INFORMATION
   Latitude: 38.00¬∞ to 36.00¬∞ (21 points)
   Longitude: 68.00¬∞ to 70.00¬∞ (21 points)
   Grid resolution: -0.10¬∞ x 0.10¬∞
   Grid size: 21 x 21 = 441 points



In [34]:
test_filepath_4 = downloaded_data_dir + "era5-land_N38W68S36E70_02254a67da08_2019-02.grib"

testing_filepaths(test_filepath_4)

Testing filepath for [era5-land_N38W68S36E70_02254a67da08_2019-02.grib]:
  - MIME type: application/grib;edition=1
  - Magic bytes: b'GRIB\x00\x03\xde\x01'
