In [1]:
# Cell 1 - Setup: Clone repo and install dependencies
# Mount Google Drive for output storage
from google.colab import drive
drive.mount('/content/drive')

import os

# Create output directory on Drive
TELNET_DATADIR = '/content/drive/MyDrive/telnet_data'
os.makedirs(TELNET_DATADIR, exist_ok=True)
os.makedirs(f'{TELNET_DATADIR}/data/models', exist_ok=True)
os.makedirs(f'{TELNET_DATADIR}/shapefiles', exist_ok=True)
os.makedirs(f'{TELNET_DATADIR}/results', exist_ok=True)
os.environ['TELNET_DATADIR'] = TELNET_DATADIR
print(f"TELNET_DATADIR set to: {TELNET_DATADIR}")

# Clone the repo
%cd /content
!rm -rf telnet
!git clone https://github.com/gscerveira/telnet.git
%cd telnet

# Install uv for fast package management
!pip install -q uv

# Install dependencies
!uv pip install --system -q -r docker/requirements.txt
!uv pip install --system -q gcsfs s3fs geopandas rioxarray

print("\nDependencies installed!")

Mounted at /content/drive
TELNET_DATADIR set to: /content/drive/MyDrive/telnet_data
/content
Cloning into 'telnet'...
remote: Enumerating objects: 345, done.[K
remote: Counting objects: 100% (50/50), done.[K
remote: Compressing objects: 100% (33/33), done.[K
remote: Total 345 (delta 27), reused 40 (delta 17), pack-reused 295 (from 1)[K
Receiving objects: 100% (345/345), 21.48 MiB | 49.88 MiB/s, done.
Resolving deltas: 100% (176/176), done.
/content/telnet
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.3/23.3 MB[0m [31m91.8 MB/s[0m eta [36m0:00:00[0m
[?25h
Dependencies installed!


In [None]:
# Cell 1.5 - Hardcode Maranhão coordinates (for non-interactive runs)
# Writes lat_lon_boundaries.txt so the notebook can run without user input.

import os

# Bounding box for Maranhão state, in degrees.
MARANHAO_COORDS = dict(
    init_lat=-10.5,   # Southernmost latitude
    final_lat=-1.0,   # Northernmost latitude
    init_lon=-49.0,   # Westernmost longitude
    final_lon=-41.5,  # Easternmost longitude
)

# The file lives in the data directory of the cloned repo.
data_dir = '/content/telnet/data'
os.makedirs(data_dir, exist_ok=True)
bounds_file = os.path.join(data_dir, 'lat_lon_boundaries.txt')

# One value per line, in this fixed order: lat start, lat end, lon start, lon end.
bounds_order = ('init_lat', 'final_lat', 'init_lon', 'final_lon')
with open(bounds_file, 'w') as handle:
    handle.writelines(f"{MARANHAO_COORDS[key]}\n" for key in bounds_order)

lat_start, lat_end, lon_start, lon_end = (MARANHAO_COORDS[key] for key in bounds_order)
print(f"Maranhão coordinates hardcoded to: {bounds_file}")
print(f"  Latitude: {lat_start} to {lat_end}")
print(f"  Longitude: {lon_start} to {lon_end}")

In [None]:
# Cell 2 - Verify GPU
!nvidia-smi

import torch
print(f"\nPyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
else:
    print("WARNING: No GPU available! Switch to a GPU runtime.")

In [None]:
# Cell 3 - Test ARCO ERA5 Access (quick connectivity check)
%cd /content/telnet

print("Testing ARCO ERA5 access...")
import gcsfs

fs = gcsfs.GCSFileSystem(token='anon')
path = 'gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3'
files = fs.ls(path)[:5]
print("Connection successful! Found files:")
for f in files:
    print(f"  {f}")
print("\nARCO ERA5 is accessible.")

In [None]:
# Cell 3.5 - Setup CDS API credentials for ERA5 download
# You need a free account at https://cds.climate.copernicus.eu/
# Get your API key from: https://cds.climate.copernicus.eu/how-to-api
#
# Writes ~/.cdsapirc (url + key). The file is only written when a real key
# is available, so running this cell with the placeholder never overwrites
# an existing, working configuration.

import os

# INSTRUCTIONS:
# 1. Register for a free account at https://cds.climate.copernicus.eu/
# 2. Go to https://cds.climate.copernicus.eu/how-to-api to get your API key
# 3. Either set the CDSAPI_KEY environment variable before running this cell,
#    or replace 'YOUR-API-KEY-HERE' below with your actual API key.
#    (Prefer the environment variable: a key typed into the notebook is saved
#    with it and is easy to share or commit by accident.)

CDS_URL = "https://cds.climate.copernicus.eu/api"
CDS_KEY_PLACEHOLDER = "YOUR-API-KEY-HERE"
# A non-empty CDSAPI_KEY environment variable takes precedence over the literal.
CDS_KEY = os.environ.get("CDSAPI_KEY", "").strip() or "YOUR-API-KEY-HERE"  # <-- Replace with your API key from CDS website

cdsapirc_path = os.path.expanduser('~/.cdsapirc')

if CDS_KEY == CDS_KEY_PLACEHOLDER:
    # No usable key: leave any existing config file untouched.
    print("⚠️  WARNING: You need to replace 'YOUR-API-KEY-HERE' with your actual API key!")
    print("   Register at: https://cds.climate.copernicus.eu/")
    print("   Get API key: https://cds.climate.copernicus.eu/how-to-api")
    print(f"   Nothing was written to: {cdsapirc_path}")
else:
    # Create .cdsapirc file
    cdsapirc_content = f"""url: {CDS_URL}
key: {CDS_KEY}
"""
    # The file holds a secret: create it owner-read/write only. The mode
    # passed to os.open applies only to newly created files, so chmod is
    # applied afterwards to cover a pre-existing file as well.
    fd = os.open(cdsapirc_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, 'w') as f:
        f.write(cdsapirc_content)
    os.chmod(cdsapirc_path, 0o600)

    print("CDS API credentials configured.")
    print(f"Config file written to: {cdsapirc_path}")
    print()
    print("✓ API key configured. Ready to download ERA5 data.")

In [None]:
# Cell 4 - Download ERSSTv5 and Maranhao shapefile
%cd /content/telnet
import os
os.environ['TELNET_DATADIR'] = '/content/drive/MyDrive/telnet_data'

# Download ERSSTv5
print("=" * 60)
print("  Downloading ERSSTv5 sea surface temperature data...")
print("=" * 60)
from download_preprocess_data import download_ersstv5
download_ersstv5('1940-01-01', '2025-12-01')

# Download shapefile
print("\n" + "=" * 60)
print("  Downloading Maranhao shapefile...")
print("=" * 60)
!python download_maranhao_shapefile.py

print("\nDownloads complete!")
print("\nNOTE: For ERA5 data, download manually from CDS:")
print("  - https://cds.climate.copernicus.eu/datasets/reanalysis-era5-single-levels-monthly-means")
print("  - https://cds.climate.copernicus.eu/datasets/reanalysis-era5-pressure-levels-monthly-means")
print("Then run Cell 4.5 to preprocess.")

In [None]:
# Cell 4.5 - Preprocess ERA5 files for climate indices
# Run this after downloading ERA5 data from CDS.
# Regrids the raw ERA5 files to a global 2-degree grid, renames the
# dimensions to time/lat/lon, and writes the *_preprocessed.nc files
# needed by compute_climate_indices.py.

import xarray as xr
import numpy as np
import os

era5_dir = '/content/drive/MyDrive/telnet_data/era5'
os.makedirs(era5_dir, exist_ok=True)

# Raw inputs expected in era5_dir, keyed by variable. This mapping is the
# single source of file names for both the existence check and the processing.
required_files = {
    'u10': 'era5_u10_1940-2025.nc',
    'v10': 'era5_v10_1940-2025.nc',
    'hgt_500': 'era5_hgt_500_1940-2025.nc',
    'hgt_700': 'era5_hgt_700_1940-2025.nc',
    'hgt_1000': 'era5_hgt_1000_1940-2025.nc',
}

# Check what raw files exist
print("Checking for ERA5 raw files...")
missing = []
for var, fname in required_files.items():
    fpath = os.path.join(era5_dir, fname)
    if os.path.exists(fpath):
        size = os.path.getsize(fpath) / 1e9
        print(f"  ✓ {fname} ({size:.2f} GB)")
    else:
        print(f"  ✗ {fname} MISSING")
        missing.append(var)

if missing:
    print(f"\nMissing files: {missing}")
    print("Download from CDS and place in:", era5_dir)
    raise FileNotFoundError("Missing ERA5 raw files - see above")

# Target 2-degree grid (global): latitude 88 -> -88 (descending), longitude 0 -> 358
lat2interp = np.arange(-88., 90., 2.0)[::-1]
lon2interp = np.arange(0., 360., 2.0)

# Divisor used to convert geopotential 'z' to height
g = 9.80665


def _raw_path(var):
    """Return the full path of the raw ERA5 file registered for `var`."""
    return os.path.join(era5_dir, required_files[var])


def _regrid_and_rename(ds):
    """Linearly interpolate `ds` to the target grid and rename dims to time/lat/lon.

    Expects the input to use the names 'valid_time', 'latitude' and
    'longitude'; `rename` raises if any of them is absent.
    """
    ds = ds.interp(latitude=lat2interp, longitude=lon2interp, method='linear')
    return ds.rename({'valid_time': 'time', 'latitude': 'lat', 'longitude': 'lon'})


def _preprocess_single_level(var):
    """Regrid one single-level variable file and save its preprocessed copy."""
    out_name = f'era5_{var}_1940-present_preprocessed.nc'
    # The context manager closes the *source* file. Calling close() on the
    # dataset returned by interp() would not, because that result is a new
    # object that does not own the file handle.
    with xr.open_dataset(_raw_path(var)) as raw:
        _regrid_and_rename(raw).to_netcdf(os.path.join(era5_dir, out_name))
    print(f"  Saved {out_name}")


# Process u10
print("\nProcessing u10...")
_preprocess_single_level('u10')

# Process v10
print("Processing v10...")
_preprocess_single_level('v10')

# Process geopotential (combine 3 levels, convert to height)
print("Processing geopotential height...")
with xr.open_dataset(_raw_path('hgt_500')) as ds_500, \
        xr.open_dataset(_raw_path('hgt_700')) as ds_700, \
        xr.open_dataset(_raw_path('hgt_1000')) as ds_1000:
    # Convert geopotential to height (divide by g=9.80665), one array per level
    heights = [(src['z'] / g).rename('height') for src in (ds_500, ds_700, ds_1000)]

    # Combine levels; the coordinate is set explicitly to match the concat order
    ds = xr.concat(heights, dim='pressure_level')
    ds = ds.assign_coords(pressure_level=[500, 700, 1000])
    ds = ds.to_dataset(name='height')

    # Written inside the with-block so the sources are still open while
    # the data is read.
    ds = _regrid_and_rename(ds)
    ds.to_netcdf(f'{era5_dir}/era5_hgt_1940-present_preprocessed.nc')
    print("  Saved era5_hgt_1940-present_preprocessed.nc")

print("\nDone! Ready to run climate indices (Cell 5).")

In [None]:
# Cell 5 - Compute Climate Indices (from ERSSTv5 data)
%cd /content/telnet
import os
os.environ['TELNET_DATADIR'] = '/content/drive/MyDrive/telnet_data'

print("=" * 60)
print("  Computing climate indices from ERSSTv5...")
print("=" * 60)

# Use 202412 to generate seasonal_climate_indices_1941-2024.txt
# (202512 would create a different filename that other scripts don't expect)
!python compute_climate_indices.py --finaldate 202412

print("\nClimate indices computed!")

In [None]:
# Cell 6 - Feature Pre-Selection (PMI ranking)
# Uses local ERA5 precipitation data (preprocessed from CDS)
%cd /content/telnet
import os
os.environ['TELNET_DATADIR'] = '/content/drive/MyDrive/telnet_data'

print("=" * 60)
print("  Running feature pre-selection (PMI ranking)")
print("  Using local ERA5 precipitation data...")
print("=" * 60)

# Use fewer samples for faster testing (100 is good, 1000 for production)
N_SAMPLES = 100

!python feature_pre_selection.py -n {N_SAMPLES}

# Copy results to Drive
import shutil
src = '/content/telnet/data/models/final_feats.txt'
dst = '/content/drive/MyDrive/telnet_data/data/models/final_feats.txt'
if os.path.exists(src):
    shutil.copy(src, dst)
    print(f"Copied final_feats.txt to Drive")

src_seeds = '/content/telnet/data/seeds_pmi.txt'
dst_seeds = '/content/drive/MyDrive/telnet_data/data/seeds_pmi.txt'
if os.path.exists(src_seeds):
    shutil.copy(src_seeds, dst_seeds)
    print(f"Copied seeds_pmi.txt to Drive")

print("\nFeature pre-selection complete!")

In [None]:
# Cell 7 - Model Selection (GPU grid search)
%cd /content/telnet
import os
os.environ['TELNET_DATADIR'] = '/content/drive/MyDrive/telnet_data'

print("=" * 60)
print("  Running model selection (hyperparameter grid search)")
print("  This is GPU-intensive and will take 2-3 hours...")
print("=" * 60)

!chmod +x model_selection.sh
!./model_selection.sh 100 1

In [None]:
# Cell 8 - Model Testing (final training with best hyperparameters)
%cd /content/telnet
import os
os.environ['TELNET_DATADIR'] = '/content/drive/MyDrive/telnet_data'

print("=" * 60)
print("  Running model testing (final training)")
print("=" * 60)

!python model_testing.py -n 100 -c 1

In [None]:
# Cell 9 - Generate Forecasts (4 quarterly initializations)
%cd /content/telnet
import os
os.environ['TELNET_DATADIR'] = '/content/drive/MyDrive/telnet_data'

print("=" * 60)
print("  Generating seasonal forecasts...")
print("=" * 60)

!chmod +x generate_forecast.sh

init_dates = ['202501', '202504', '202507', '202510']

for init_date in init_dates:
    print(f"\n=== Forecast for {init_date} ===")
    !./generate_forecast.sh {init_date} 1

In [None]:
# Cell 10 - Extract Maranhao Region from forecasts
%cd /content/telnet
import os
import glob

os.environ['TELNET_DATADIR'] = '/content/drive/MyDrive/telnet_data'
DATADIR = os.environ['TELNET_DATADIR']

print("=" * 60)
print("  Extracting Maranhao region from forecasts...")
print("=" * 60)

init_dates = ['202501', '202504', '202507', '202510']

for init_date in init_dates:
    results_dir = f'{DATADIR}/results/{init_date}'
    if os.path.exists(results_dir):
        for f in glob.glob(f'{results_dir}/*.nc'):
            basename = os.path.basename(f)
            if not basename.startswith('maranhao_'):
                output = f'{results_dir}/maranhao_{basename}'
                !python extract_maranhao.py "{f}" "{output}" --shapefile-dir {DATADIR}/shapefiles
                print(f"Extracted: {output}")

print("\nMaranhao extraction complete!")

In [None]:
# Cell 11 - View Results Summary
# Prints, per initialization date, how many .nc files exist in the results
# folder on Drive and lists the first three names alphabetically.
import os
import glob

DATADIR = '/content/drive/MyDrive/telnet_data'
init_dates = ['202501', '202504', '202507', '202510']

rule = "=" * 60
print(rule, "  ARCO ERA5 Workflow Complete!", rule, sep="\n")
print()
print("Results saved to Google Drive:")
print(f"  {DATADIR}/results/")
print()

for init_date in init_dates:
    results_dir = f'{DATADIR}/results/{init_date}'
    if not os.path.exists(results_dir):
        print(f"{init_date}: No results yet")
        continue

    # Sort once; the count is unaffected and the preview is a plain slice.
    nc_names = sorted(name for name in os.listdir(results_dir) if name.endswith('.nc'))
    print(f"{init_date}: {len(nc_names)} forecast files")
    for name in nc_names[:3]:
        print(f"  - {name}")
    not_shown = len(nc_names) - 3
    if not_shown > 0:
        print(f"  ... and {not_shown} more")

print()
print("Feature ranking saved to:")
print(f"  {DATADIR}/data/models/final_feats.txt")