In [1]:
from dotenv import load_dotenv
load_dotenv() # Reads the .env file in the current directory
import os
import re
from datetime import datetime
import numpy as np
import pandas as pd
import requests
from netCDF4 import Dataset
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType, DateType
from pyspark.sql.functions import pandas_udf
spark = SparkSession.builder.getOrCreate()
import os

ACCOUNT_NAME = os.environ.get("ACCOUNT_NAME")
ACCOUNT_KEY = os.environ.get("ACCOUNT_KEY")
LAADS_TOKEN = os.environ.get("LAADS_TOKEN")

missing = [name for name, val in [
    ('LAADS_TOKEN', LAADS_TOKEN),
    ('ACCOUNT_NAME', ACCOUNT_NAME),
    ('ACCOUNT_KEY', ACCOUNT_KEY),
] if not val]
if missing:
    raise ValueError(f'Missing required environment variables: {missing}')
PRODUCT = 'MCD06COSP_D3_MODIS'
BASE_DETAILS_URL = 'https://ladsweb.modaps.eosdis.nasa.gov/api/v2/content/details'
BASE_ARCHIVES_URL = 'https://ladsweb.modaps.eosdis.nasa.gov/api/v2/content/archives'
headers_nasa = {
    'X-Requested-With': 'XMLHttpRequest',
    'Authorization': f'Bearer {LAADS_TOKEN}',
}
LAT_N = 51.3769
LAT_S = 50.7726
LON_W = -114.3362
LON_E = -113.7319
RAW_CONTAINER = "raw"
CURATED_CONTAINER = "curated"
GOLD_CONTAINER = "gold"


raw_prefix = f"abfs://{RAW_CONTAINER}@{ACCOUNT_NAME}.dfs.core.windows.net"
curated_prefix = f"abfs://{CURATED_CONTAINER}@{ACCOUNT_NAME}.dfs.core.windows.net"

storage_options = {
    "account_name": ACCOUNT_NAME,
    "account_key": ACCOUNT_KEY,
}

print("raw_prefix:", raw_prefix)
print("curated_prefix:", curated_prefix)

raw_prefix: abfs://raw@ucalgarydatalake01.dfs.core.windows.net
curated_prefix: abfs://curated@ucalgarydatalake01.dfs.core.windows.net


In [2]:
RAW_CONTAINER = "raw"
raw_prefix = f"abfs://{RAW_CONTAINER}@{ACCOUNT_NAME}.dfs.core.windows.net"
storage_options = {
    "account_name": ACCOUNT_NAME,
    "account_key": ACCOUNT_KEY,
}

In [3]:
def date_from_mcd06cosp_filename(fname: str) -> datetime:
    m = re.search(r'\.A(\d{4})(\d{3})', fname)
    if not m:
        raise ValueError(f'Could not parse date from {fname}')
    year = int(m.group(1))
    doy = int(m.group(2))
    return datetime.strptime(f'{year}{doy:03d}', '%Y%j')

def compute_cloud_fraction_from_bytes(
    file_bytes: bytes,
    lat_s: float = LAT_S,
    lat_n: float = LAT_N,
    lon_w: float = LON_W,
    lon_e: float = LON_E,
    group_name: str = 'Cloud_Mask_Fraction',
) -> float:
    with Dataset('inmem', mode='r', memory=file_bytes) as nc:
        lats = nc.variables['latitude'][:].astype(float)
        lons = nc.variables['longitude'][:].astype(float)
        nlat = lats.size
        nlon = lons.size
        if group_name not in nc.groups:
            raise KeyError(f"Group '{group_name}' not in {list(nc.groups.keys())}")
        grp = nc.groups[group_name]
        if 'Mean' not in grp.variables:
            raise KeyError(f"'Mean' not found in group '{group_name}': {list(grp.variables.keys())}")
        cloud = grp.variables['Mean'][:].astype(float)
        shape = cloud.shape
        if shape == (nlat, nlon):
            lat_first = True
        elif shape == (nlon, nlat):
            lat_first = False
        else:
            raise ValueError(f'Unexpected cloud array shape {shape} with nlat={nlat}, nlon={nlon}')
        lat_center = 0.5 * (lat_s + lat_n)
        lon_center = 0.5 * (lon_w + lon_e)
        lat_idx0 = int(np.argmin(np.abs(lats - lat_center)))
        lon_idx0 = int(np.argmin(np.abs(lons - lon_center)))
        lat_idx = np.arange(max(0, lat_idx0 - 1), min(nlat, lat_idx0 + 2))
        lon_idx = np.arange(max(0, lon_idx0 - 1), min(nlon, lon_idx0 + 2))
        if lat_idx.size == 0 or lon_idx.size == 0:
            return float('nan')
        if lat_first:
            subset = cloud[np.ix_(lat_idx, lon_idx)]
        else:
            subset = cloud[np.ix_(lon_idx, lat_idx)]
        subset = np.where(subset < -1e5, np.nan, subset)
        if np.all(np.isnan(subset)):
            return float('nan')
        return float(np.nanmean(subset))

In [4]:
from datetime import date, timedelta, datetime
import pandas as pd
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

BASE_DETAILS_URL = "https://ladsweb.modaps.eosdis.nasa.gov/api/v2/content/details"

def fetch_summer_details_for_year(year: int) -> pd.DataFrame:
    rows = []
    lat_box = f"[BBOX]N{LAT_N} S{LAT_S} W{LON_W} E{LON_E}"

    def extract_path(item: dict) -> str:
        if "downloadsLink" in item:
            url = item["downloadsLink"]
            return url.split("/archives/", 1)[1]
        if "self" in item:
            url = item["self"]
            return url.split("/details/", 1)[1]
        if "name" in item:
            return item["name"]
        raise KeyError(f"No archive/path field in {list(item.keys())}")

    d = date(year, 6, 1)
    end = date(year, 8, 31)

    while d <= end:
        temporal_range = f"{d:%Y-%m-%d}..{d:%Y-%m-%d}"
        params = {
            "products": PRODUCT,
            "temporalRanges": temporal_range,
            "regions": lat_box,
            "formats": "json",
        }

        try:
            resp = requests.get(
                BASE_DETAILS_URL,
                params=params,
                headers=headers_nasa,
                timeout=60,
            )
            if resp.status_code >= 500:
                print(f"Year {year}, date {d}: LAADS {resp.status_code}, skipping")
                d += timedelta(days=1)
                continue
            resp.raise_for_status()
            data = resp.json()
        except requests.RequestException as e:
            print(f"Year {year}, date {d}: request failed ({e}), skipping")
            d += timedelta(days=1)
            continue

        if isinstance(data, dict) and "content" in data:
            items = data["content"]
        elif isinstance(data, list):
            items = data
        else:
            items = []

        for it in items:
            rel_path = extract_path(it)
            fname = it.get("name", rel_path.split("/")[-1])

            if "dataDay" in it:
                left = it["dataDay"].split("=", 1)[0].strip()
                year_str, doy_str = left.split("-")
                dd = datetime.strptime(year_str + doy_str.zfill(3), "%Y%j").date()
            else:
                dd = date_from_mcd06cosp_filename(fname).date()

            rows.append(
                {
                    "year": dd.year,
                    "date": dd,
                    "rel_path": rel_path,
                    "file_name": fname,
                }
            )

        d += timedelta(days=1)

    if not rows:
        return pd.DataFrame(columns=["year", "date", "rel_path", "file_name"])

    df = pd.DataFrame(rows)
    df["date"] = pd.to_datetime(df["date"])
    df = df.sort_values("date").reset_index(drop=True)
    return df


In [5]:
paths = []

def process_year(year: int):
    df_raw_year = fetch_summer_details_for_year(year)
    if df_raw_year.empty:
        print(f"Year {year}: no data")
        return None
    raw_path_year = f"{raw_prefix}/laads_links_summer_{year}.parquet"
    df_raw_year.to_parquet(raw_path_year, index=False, storage_options=storage_options)
    print(f"Wrote RAW {raw_path_year} with {len(df_raw_year)} rows")
    return raw_path_year

years = list(range(2000, 2026))

with ThreadPoolExecutor(max_workers=6) as ex:
    futures = {ex.submit(process_year, y): y for y in years}
    for fut in as_completed(futures):
        p = fut.result()
        if p is not None:
            paths.append(p)


Wrote RAW abfs://raw@ucalgarydatalake01.dfs.core.windows.net/laads_links_summer_2005.parquet with 92 rows
Year 2000: no data
Wrote RAW abfs://raw@ucalgarydatalake01.dfs.core.windows.net/laads_links_summer_2004.parquet with 92 rows
Wrote RAW abfs://raw@ucalgarydatalake01.dfs.core.windows.net/laads_links_summer_2002.parquet with 59 rows
Year 2001: no data
Wrote RAW abfs://raw@ucalgarydatalake01.dfs.core.windows.net/laads_links_summer_2003.parquet with 92 rows
Wrote RAW abfs://raw@ucalgarydatalake01.dfs.core.windows.net/laads_links_summer_2007.parquet with 92 rows
Wrote RAW abfs://raw@ucalgarydatalake01.dfs.core.windows.net/laads_links_summer_2008.parquet with 92 rows
Wrote RAW abfs://raw@ucalgarydatalake01.dfs.core.windows.net/laads_links_summer_2006.parquet with 92 rows
Wrote RAW abfs://raw@ucalgarydatalake01.dfs.core.windows.net/laads_links_summer_2009.parquet with 92 rows
Wrote RAW abfs://raw@ucalgarydatalake01.dfs.core.windows.net/laads_links_summer_2010.parquet with 92 rows
Wrote RA

In [6]:
paths_df = pd.DataFrame({'raw_path': paths})
manifest_path = f'{raw_prefix}/laads_links_manifest.parquet'
paths_df.to_parquet(manifest_path, index=False, storage_options=storage_options)
print('Wrote manifest to:', manifest_path)


Wrote manifest to: abfs://raw@ucalgarydatalake01.dfs.core.windows.net/laads_links_manifest.parquet


In [8]:
manifest_path = f'{raw_prefix}/laads_links_manifest.parquet'
paths_df = pd.read_parquet(manifest_path, storage_options=storage_options)
paths = paths_df['raw_path'].tolist()

dfs = [
    pd.read_parquet(p, storage_options=storage_options)
    for p in paths
]

df_raw_pd = pd.concat(dfs, ignore_index=True)

print("Total raw rows:", len(df_raw_pd))


def fetch_cf_for_rel_path(rel_path: str) -> float:
    try:
        url = f"{BASE_ARCHIVES_URL}/{rel_path.lstrip('/')}"
        r = requests.get(url, headers=headers_nasa, stream=True, timeout=60)
        r.raise_for_status()
        return compute_cloud_fraction_from_bytes(r.content)
    except Exception as e:
        print("failed for", rel_path, "->", e)
        return float("nan")

with ThreadPoolExecutor(max_workers=16) as pool:
    cf_values = list(pool.map(fetch_cf_for_rel_path, df_raw_pd["rel_path"].tolist()))

df_raw_pd["cloud_fraction"] = cf_values

df_raw_pd["date_check"] = df_raw_pd["file_name"].apply(
    lambda fn: date_from_mcd06cosp_filename(fn).date()
)
df_raw_pd["date"] = df_raw_pd["date"].fillna(df_raw_pd["date_check"])

df_curated = (
    df_raw_pd[["year", "date", "rel_path", "file_name", "cloud_fraction"]]
    .drop_duplicates(subset=["rel_path"])
    .sort_values("date")
    .reset_index(drop=True)
)

curated_output_path = (
    f"{curated_prefix}/calgary_cloud_fraction_summers_2000_2025.parquet"
)
df_curated.to_parquet(curated_output_path, index=False, storage_options=storage_options)
print("Wrote CURATED to:", curated_output_path)



Total raw rows: 2173
Wrote CURATED to: abfs://curated@ucalgarydatalake01.dfs.core.windows.net/calgary_cloud_fraction_summers_2000_2025.parquet


In [12]:
df_raw_pd

Unnamed: 0,year,date,rel_path,file_name,cloud_fraction,date_check
0,2005,2005-06-01,MCD06COSP_D3_MODIS.A2005152.062.2022125041308.nc,MCD06COSP_D3_MODIS.A2005152.062.2022125041308.nc,0.999701,2005-06-01
1,2005,2005-06-02,MCD06COSP_D3_MODIS.A2005153.062.2022125040416.nc,MCD06COSP_D3_MODIS.A2005153.062.2022125040416.nc,0.923896,2005-06-02
2,2005,2005-06-03,MCD06COSP_D3_MODIS.A2005154.062.2022125083517.nc,MCD06COSP_D3_MODIS.A2005154.062.2022125083517.nc,0.988393,2005-06-03
3,2005,2005-06-04,MCD06COSP_D3_MODIS.A2005155.062.2022125045129.nc,MCD06COSP_D3_MODIS.A2005155.062.2022125045129.nc,0.936813,2005-06-04
4,2005,2005-06-05,MCD06COSP_D3_MODIS.A2005156.062.2022125041611.nc,MCD06COSP_D3_MODIS.A2005156.062.2022125041611.nc,0.954493,2005-06-05
...,...,...,...,...,...,...
2168,2025,2025-08-27,MCD06COSP_D3_MODIS.A2025239.062.2025247000914.nc,MCD06COSP_D3_MODIS.A2025239.062.2025247000914.nc,0.089881,2025-08-27
2169,2025,2025-08-28,MCD06COSP_D3_MODIS.A2025240.062.2025248000944.nc,MCD06COSP_D3_MODIS.A2025240.062.2025248000944.nc,0.366335,2025-08-28
2170,2025,2025-08-29,MCD06COSP_D3_MODIS.A2025241.062.2025249001021.nc,MCD06COSP_D3_MODIS.A2025241.062.2025249001021.nc,0.198984,2025-08-29
2171,2025,2025-08-30,MCD06COSP_D3_MODIS.A2025242.062.2025250000123.nc,MCD06COSP_D3_MODIS.A2025242.062.2025250000123.nc,0.358446,2025-08-30


In [10]:
from concurrent.futures import ThreadPoolExecutor
import numpy as np

GOLD_CONTAINER = "gold"
gold_prefix = f"abfs://{GOLD_CONTAINER}@{ACCOUNT_NAME}.dfs.core.windows.net"

curated_path = f"{curated_prefix}/calgary_cloud_fraction_summers_2000_2025.parquet"
df_curated = pd.read_parquet(curated_path, storage_options=storage_options)
df_curated = df_curated.dropna(subset=["cloud_fraction"]).copy()

def classify_cf(cf: float) -> int:
    if np.isnan(cf):
        return -1
    if cf < 0.15:
        return 0
    elif cf < 0.35:
        return 1
    elif cf < 0.65:
        return 2
    else:
        return 3

with ThreadPoolExecutor(max_workers=8) as pool:
    classes = list(pool.map(classify_cf, df_curated["cloud_fraction"].tolist()))

df_curated["quality_class"] = classes

df_class = (
    df_curated[["year", "date", "rel_path", "file_name", "cloud_fraction", "quality_class"]]
    .sort_values("date")
    .reset_index(drop=True)
)

gold_path = f"{gold_prefix}/calgary_cloud_quality_classification_2000_2025.parquet"
df_class.to_parquet(gold_path, index=False, storage_options=storage_options)

print("Wrote classification dataset to:", gold_path)
df_class

Wrote classification dataset to: abfs://gold@ucalgarydatalake01.dfs.core.windows.net/calgary_cloud_quality_classification_2000_2025.parquet


Unnamed: 0,year,date,rel_path,file_name,cloud_fraction,quality_class
0,2002,2002-07-04,MCD06COSP_D3_MODIS.A2002185.062.2022125093622.nc,MCD06COSP_D3_MODIS.A2002185.062.2022125093622.nc,0.692232,3
1,2002,2002-07-05,MCD06COSP_D3_MODIS.A2002186.062.2022125090452.nc,MCD06COSP_D3_MODIS.A2002186.062.2022125090452.nc,0.351298,2
2,2002,2002-07-06,MCD06COSP_D3_MODIS.A2002187.062.2022125093033.nc,MCD06COSP_D3_MODIS.A2002187.062.2022125093033.nc,0.069563,0
3,2002,2002-07-07,MCD06COSP_D3_MODIS.A2002188.062.2022125085450.nc,MCD06COSP_D3_MODIS.A2002188.062.2022125085450.nc,0.096447,0
4,2002,2002-07-08,MCD06COSP_D3_MODIS.A2002189.062.2022125093738.nc,MCD06COSP_D3_MODIS.A2002189.062.2022125093738.nc,0.999263,3
...,...,...,...,...,...,...
2168,2025,2025-08-27,MCD06COSP_D3_MODIS.A2025239.062.2025247000914.nc,MCD06COSP_D3_MODIS.A2025239.062.2025247000914.nc,0.089881,0
2169,2025,2025-08-28,MCD06COSP_D3_MODIS.A2025240.062.2025248000944.nc,MCD06COSP_D3_MODIS.A2025240.062.2025248000944.nc,0.366335,2
2170,2025,2025-08-29,MCD06COSP_D3_MODIS.A2025241.062.2025249001021.nc,MCD06COSP_D3_MODIS.A2025241.062.2025249001021.nc,0.198984,1
2171,2025,2025-08-30,MCD06COSP_D3_MODIS.A2025242.062.2025250000123.nc,MCD06COSP_D3_MODIS.A2025242.062.2025250000123.nc,0.358446,2


In [13]:
import pandas as pd

gold_prefix = "abfs://gold@ucalgarydatalake01.dfs.core.windows.net/calculated_cloud_fraction"

gold_df = pd.read_parquet(
    gold_prefix,               # <-- directory, no glob
    storage_options=storage_options,
    engine="pyarrow",          # optional but explicit
)


print(gold_df)

output_csv_path = "gold_cloud_fraction.csv"
gold_df.to_csv(output_csv_path, index=False)

print("CSV written to:", output_csv_path)


FileNotFoundError: gold/calculated_cloud_fraction