In [72]:
!pip install python-dotenv
!pip install --quiet xarray netCDF4 h5netcdf requests pandas numpy pyarrow adlfs



In [73]:
import os

# Never echo the value itself: it is a storage connection string that embeds
# the account key, and notebook outputs are saved and shared with the file.
# Report only whether the variable is present.
print("LINKS_FS is set:", bool(os.environ.get("LINKS_FS")))

[REDACTED — a storage connection string including the account key was captured in this saved output; it has been removed here and the exposed key must be rotated]


In [74]:
import io
from dotenv import load_dotenv
load_dotenv() # Reads the .env file in the current directory
import os
import re
from datetime import datetime

import numpy as np
import pandas as pd
import requests
import xarray as xr

# ==== Required settings, read from the environment ====
import os

LAADS_TOKEN = os.environ.get("LAADS_TOKEN")
ACCOUNT_NAME = os.environ.get("ACCOUNT_NAME")
ACCOUNT_KEY = os.environ.get("ACCOUNT_KEY")

# Names of any settings that are unset or empty, in declaration order.
missing = [
    key
    for key, value in {
        "LAADS_TOKEN": LAADS_TOKEN,
        "ACCOUNT_NAME": ACCOUNT_NAME,
        "ACCOUNT_KEY": ACCOUNT_KEY,
    }.items()
    if not value
]

# Fail fast, before any path is built from an incomplete configuration.
if len(missing) > 0:
    raise ValueError(f"Missing required environment variables: {missing}")

# Credentials passed as `storage_options` to the parquet reads/writes below.
storage_options = dict(account_name=ACCOUNT_NAME, account_key=ACCOUNT_KEY)

# Containers (filesystems) are literally named raw / curated / gold
RAW_CONTAINER = "raw"
CURATED_CONTAINER = "curated"
GOLD_CONTAINER = "gold"  # not used yet

# Keep blob names *very* short
raw_path = f"abfss://{RAW_CONTAINER}@{ACCOUNT_NAME}.dfs.core.windows.net/laads_links.parquet"
curated_path = f"abfss://{CURATED_CONTAINER}@{ACCOUNT_NAME}.dfs.core.windows.net/calgary_cloud_fraction.parquet"

# Echo each destination together with its length.
print("raw_path:", raw_path, "len:", len(raw_path))
print("curated_path:", curated_path, "len:", len(curated_path))


raw_path: abfss://raw@ucalgarydatalake01.dfs.core.windows.net/laads_links.parquet len: 71
curated_path: abfss://curated@ucalgarydatalake01.dfs.core.windows.net/calgary_cloud_fraction.parquet len: 86


In [75]:
# --- Query LAADS `content/details` ---
# NOTE(review): PRODUCT, TEMPORAL_RANGE, REGION_PARAM, BASE_DETAILS_URL and
# headers_nasa are not defined in any cell visible in this notebook; they must
# come from an earlier cell — confirm before relying on Restart & Run All.
params_details = {
    'products': PRODUCT,
    'temporalRanges': TEMPORAL_RANGE,
    'regions': REGION_PARAM,
    'formats': 'json',
}

# requests applies no timeout by default, so a stalled server would hang the
# kernel indefinitely; bound both the connect and the read phase.
resp = requests.get(
    BASE_DETAILS_URL,
    params=params_details,
    headers=headers_nasa,
    timeout=(10, 120),  # (connect, read) seconds
)
resp.raise_for_status()
details = resp.json()

# Extract list of items: the payload is either {'content': [...]} or a bare list.
if isinstance(details, dict) and 'content' in details:
    items = details['content']
elif isinstance(details, list):
    items = details
else:
    raise ValueError('Unexpected JSON structure from LAADS; inspect `details`.')

print('Granules returned:', len(items))
if items:
    # Show the available fields of the first granule as a schema hint.
    print('Example keys:', list(items[0].keys()))

# Field names that may carry a granule's path, in order of preference.
PATH_FIELD_CANDIDATES = ['archivePath', 'path', 'name', 'fileName']

def extract_path(item):
    """Return the value of the first candidate path field present in `item`.

    Candidates are tried in the order given by PATH_FIELD_CANDIDATES; only key
    presence is checked, so the stored value is returned as-is.

    Raises:
        KeyError: if `item` contains none of the candidate fields.
    """
    chosen = next((key for key in PATH_FIELD_CANDIDATES if key in item), None)
    if chosen is None:
        raise KeyError(f'No archive/path field found in keys: {list(item.keys())}')
    return item[chosen]

# One record per granule: its path, the bare file name (last '/'-separated
# segment of the path), and the untouched metadata entry.
rows = [
    {
        'rel_path': path,
        'file_name': path.split('/')[-1],
        'raw_json': granule,
    }
    for granule, path in zip(items, map(extract_path, items))
]

df_raw = pd.DataFrame(rows)
df_raw.head()

Granules returned: 1
Example keys: ['archiveSets', 'cksum', 'collections', 'dataDay', 'downloadsLink', 'fileId', 'md5sum', 'mtime', 'name', 'products', 'resourceType', 'self', 'size', 'start', 'status']


Unnamed: 0,rel_path,file_name,raw_json
0,MCD06COSP_D3_MODIS.A2010001.062.2022124195350.nc,MCD06COSP_D3_MODIS.A2010001.062.2022124195350.nc,"{'archiveSets': 62, 'cksum': '771683725', 'col..."


In [76]:
# --- Persist the raw granule metadata to the lake at `raw_path` ---
# The row index is not stored in the file.
df_raw.to_parquet(raw_path, index=False, storage_options=storage_options)
print('Raw metadata written to:', raw_path)

Raw metadata written to: abfss://raw@ucalgarydatalake01.dfs.core.windows.net/laads_links.parquet


In [77]:
# --- Read the raw links back from the lake ---
# Rebinds `df_raw` to the stored copy, so later cells work from what was persisted.
df_raw = pd.read_parquet(
    raw_path,
    storage_options=storage_options,
)
df_raw.head()

Unnamed: 0,rel_path,file_name,raw_json
0,MCD06COSP_D3_MODIS.A2010001.062.2022124195350.nc,MCD06COSP_D3_MODIS.A2010001.062.2022124195350.nc,"{'archiveSets': 62, 'cksum': '771683725', 'col..."


In [82]:
def date_from_mcd06cosp_filename(filename: str) -> datetime:
    """Parse the acquisition date out of an MCD06COSP_D3_MODIS file name.

    The name is searched for a ".A" marker followed by a four-digit year and a
    three-digit day of year (AYYYYDDD).

    Raises:
        ValueError: if no such marker is found, or the digits do not form a
            date that `strptime` accepts.
    """
    match = re.search(r"\.A(\d{4})(\d{3})", filename)
    if match is None:
        raise ValueError(f'Could not parse date from {filename}')
    year, doy = (int(part) for part in match.groups())
    stamp = f"{year}{doy:03d}"
    return datetime.strptime(stamp, "%Y%j")

from netCDF4 import Dataset
import numpy as np

def compute_cloud_fraction_from_bytes(
    file_bytes: bytes,
    lat_s: float = LAT_S,
    lat_n: float = LAT_N,
    lon_w: float = LON_W,
    lon_e: float = LON_E,
    group_name: str = "Cloud_Mask_Fraction",
) -> float:
    """
    Open one MCD06COSP_D3_MODIS netCDF file from raw bytes and compute
    the mean cloud fraction near the centre of a lat/lon bounding box.

    Strategy:
      - take the center of the bbox (the bounds are used for nothing else)
      - find the *nearest* latitude/longitude grid indices
      - average a window of up to 3x3 grid cells around that cell
        (the window is clipped at the edges of the grid)

    Missing cells are excluded from the mean: both cells that netCDF4 returns
    as masked and cells holding a value below -1e5.

    Args:
        file_bytes: Content of the netCDF file.
        lat_s, lat_n: Southern / northern latitude bounds of the box.
        lon_w, lon_e: Western / eastern longitude bounds of the box.
        group_name: netCDF group whose "Mean" variable is averaged.

    Returns:
        The mean over the valid cells of the window, or NaN when every cell
        in the window is missing.

    Raises:
        KeyError: if the group, or its "Mean" variable, is absent.
        ValueError: if the "Mean" array is not shaped (nlat, nlon) or
            (nlon, nlat).
    """
    with Dataset("inmem", mode="r", memory=file_bytes) as nc:
        # 1) Lat/lon coordinate arrays (1D)
        lats = nc.variables["latitude"][:].astype(float)   # (nlat,)
        lons = nc.variables["longitude"][:].astype(float)  # (nlon,)

        nlat = lats.size
        nlon = lons.size

        if group_name not in nc.groups:
            raise KeyError(f"Group '{group_name}' not in {list(nc.groups.keys())}")

        grp = nc.groups[group_name]
        if "Mean" not in grp.variables:
            raise KeyError(
                f"'Mean' not found in group '{group_name}': "
                f"{list(grp.variables.keys())}"
            )

        # 2) Data grid; axis order is inferred from its shape.
        cloud = grp.variables["Mean"][:]
        shape = cloud.shape

        if shape == (nlat, nlon):
            lat_first = True       # cloud[lat, lon]
        elif shape == (nlon, nlat):
            lat_first = False      # cloud[lon, lat]
        else:
            raise ValueError(
                f"Unexpected cloud array shape {shape} with nlat={nlat}, nlon={nlon}"
            )

        # 3) Grid cell nearest to the centre of the bbox.
        lat_center = 0.5 * (lat_s + lat_n)
        lon_center = 0.5 * (lon_w + lon_e)

        lat_idx0 = int(np.argmin(np.abs(lats - lat_center)))
        lon_idx0 = int(np.argmin(np.abs(lons - lon_center)))

        # 4) Contiguous window of +/-1 cell, clipped to the grid. It always
        #    contains at least the centre cell, so it can never be empty.
        lat_win = slice(max(0, lat_idx0 - 1), min(nlat, lat_idx0 + 2))
        lon_win = slice(max(0, lon_idx0 - 1), min(nlon, lon_idx0 + 2))

        if lat_first:
            subset = cloud[lat_win, lon_win]
        else:
            subset = cloud[lon_win, lat_win]

        # np.where() drops a MaskedArray's mask and would expose the raw fill
        # values of masked cells to the mean, so turn masked cells into NaN
        # explicitly. Cast to float first: NaN cannot be stored in an integer
        # array.
        subset = np.ma.filled(subset.astype(float), np.nan)
        subset = np.where(subset < -1e5, np.nan, subset)  # treat big negatives as fill

        if np.all(np.isnan(subset)):
            return float("nan")

        return float(np.nanmean(subset))


In [83]:
# Download every granule listed in `df_raw` and reduce it to one
# (date, cloud_fraction) record.
records = []
for _, row in df_raw.iterrows():
    rel_path = row['rel_path']
    fname = row['file_name']
    dt = date_from_mcd06cosp_filename(fname)
    url = f"{BASE_ARCHIVES_URL}/{rel_path.lstrip('/')}"
    print(f"[fetch] {fname} ({dt.date()})")
    # The whole body is read into memory right away, so streaming buys nothing
    # (and with stream=True a failed status check leaves the connection
    # unreleased). The timeout keeps a stalled download from hanging the
    # kernel: (connect, read) seconds.
    # `r` keeps its name: the inspection cell further down reads `r.content`.
    r = requests.get(url, headers=headers_nasa, timeout=(10, 300))
    r.raise_for_status()
    cf = compute_cloud_fraction_from_bytes(r.content)
    print(f"   -> mean cloud fraction over bbox: {cf}")
    records.append({'date': dt, 'cloud_fraction': cf})

# Explicit columns keep the frame well-formed — and the 'date' lookup below
# valid — even when no granules were processed.
df_curated = pd.DataFrame(records, columns=['date', 'cloud_fraction'])
df_curated['date'] = pd.to_datetime(df_curated['date'])
df_curated = df_curated.sort_values('date').reset_index(drop=True)
df_curated.head()

[fetch] MCD06COSP_D3_MODIS.A2010001.062.2022124195350.nc (2010-01-01)
   -> mean cloud fraction over bbox: 0.856668038528928


Unnamed: 0,date,cloud_fraction
0,2010-01-01,0.856668


In [80]:

test_bytes = r.content

from netCDF4 import Dataset
import numpy as np

# Inspection aid: print the layout of the file held in the last response `r`.
with Dataset("inmem", mode="r", memory=test_bytes) as nc:
    print("ROOT VARIABLES:")
    print(list(nc.variables.keys()))

    print("\nGROUPS:")
    print(list(nc.groups.keys()))

    # Latitude: report shape and range of the first alias present at root level.
    cand = next(
        (alias for alias in ["latitude", "lat", "Latitude", "LAT"] if alias in nc.variables),
        None,
    )
    if cand is not None:
        lat = nc.variables[cand][:]
        print(f"\nLatitude variable '{cand}': shape={lat.shape}, min={lat.min()}, max={lat.max()}")

    # Longitude: same lookup with the longitude aliases.
    cand = next(
        (alias for alias in ["longitude", "lon", "Longitude", "LON"] if alias in nc.variables),
        None,
    )
    if cand is not None:
        lon = nc.variables[cand][:]
        print(f"\nLongitude variable '{cand}': shape={lon.shape}, min={lon.min()}, max={lon.max()}")

    # Contents of the group used for the cloud-fraction computation, if present.
    if "Cloud_Mask_Fraction" in nc.groups:
        grp = nc.groups["Cloud_Mask_Fraction"]
        print("\nCloud_Mask_Fraction variables:", list(grp.variables.keys()))
        data = grp.variables["Mean"][:]
        print("Cloud fraction shape:", data.shape)



ROOT VARIABLES:
['latitude', 'longitude']

GROUPS:
['Solar_Zenith', 'Solar_Azimuth', 'Sensor_Zenith', 'Sensor_Azimuth', 'Cloud_Top_Pressure', 'Cloud_Mask_Fraction', 'Cloud_Mask_Fraction_Low', 'Cloud_Mask_Fraction_Mid', 'Cloud_Mask_Fraction_High', 'Cloud_Optical_Thickness_Liquid', 'Cloud_Optical_Thickness_Ice', 'Cloud_Optical_Thickness_Total', 'Cloud_Optical_Thickness_PCL_Liquid', 'Cloud_Optical_Thickness_PCL_Ice', 'Cloud_Optical_Thickness_PCL_Total', 'Cloud_Optical_Thickness_Log10_Liquid', 'Cloud_Optical_Thickness_Log10_Ice', 'Cloud_Optical_Thickness_Log10_Total', 'Cloud_Particle_Size_Liquid', 'Cloud_Particle_Size_Ice', 'Cloud_Particle_Size_PCL_Liquid', 'Cloud_Particle_Size_PCL_Ice', 'Cloud_Water_Path_Liquid', 'Cloud_Water_Path_Ice', 'Cloud_Water_Path_PCL_Liquid', 'Cloud_Water_Path_PCL_Ice', 'Cloud_Retrieval_Fraction_Liquid', 'Cloud_Retrieval_Fraction_Ice', 'Cloud_Retrieval_Fraction_Total', 'Cloud_Retrieval_Fraction_PCL_Liquid', 'Cloud_Retrieval_Fraction_PCL_Ice', 'Cloud_Retrieval_Frac

In [81]:
# Persist the curated (date, cloud_fraction) table to the lake at
# `curated_path`; the row index is not stored in the file.
df_curated.to_parquet(curated_path, index=False, storage_options=storage_options)
print('Curated dataset written to:', curated_path)

Curated dataset written to: abfss://curated@ucalgarydatalake01.dfs.core.windows.net/calgary_cloud_fraction.parquet
