# 🛰️ NASA PACE OCI Data Downloader

**Para o projeto LAG-FISH - Hackweek 2026**

In [8]:
# CELL 1: Install
# Installs the notebook's dependencies into the interpreter that runs this
# kernel (sys.executable), so the packages land in the active environment.
# NOTE(review): versions are not pinned, so a re-run may pull newer releases.
import sys, subprocess
pkgs = ["earthaccess", "pandas", "xarray", "netCDF4", "h5netcdf", "h5py", "ipywidgets", "aiohttp"]
# check_call raises CalledProcessError if pip exits with a non-zero status.
subprocess.check_call([sys.executable, "-m", "pip", "install"] + pkgs + ["--quiet"])
print("‚úì Instalado!")

‚úì Instalado!


In [9]:
# CELL 2: Imports
import os, re, tempfile
import earthaccess
import pandas as pd
import xarray as xr
import ipywidgets as widgets
from datetime import datetime, timedelta
from IPython.display import display, clear_output
from pathlib import Path
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation and data-quality warnings from the libraries above.
warnings.filterwarnings('ignore')

# Product catalogue used by the downloader and the UI.
# Key: short product code shown to the user and embedded in output file names.
# 'short_name': collection identifier passed to earthaccess.search_data.
# 'description': human-readable label (only printed, never used for lookups).
PACE_PRODUCTS = {
    'RRS': {'short_name': 'PACE_OCI_L3M_RRS', 'description': 'Remote Sensing Reflectance'},
    'CHL': {'short_name': 'PACE_OCI_L3M_CHL', 'description': 'Chlorophyll-a'},
    'POC': {'short_name': 'PACE_OCI_L3M_POC', 'description': 'Particulate Organic Carbon'},
    'CARBON': {'short_name': 'PACE_OCI_L3M_CARBON', 'description': 'Phytoplankton Carbon'},
    'IOP': {'short_name': 'PACE_OCI_L3M_IOP', 'description': 'Inherent Optical Properties'},
    'KD': {'short_name': 'PACE_OCI_L3M_KD', 'description': 'Diffuse Attenuation'}
}

# Show the library version and the available product codes as a sanity check.
print(f"earthaccess {earthaccess.__version__}")
for k, v in PACE_PRODUCTS.items(): print(f"  {k}: {v['description']}")

earthaccess 0.15.1
  RRS: Remote Sensing Reflectance
  CHL: Chlorophyll-a
  POC: Particulate Organic Carbon
  CARBON: Phytoplankton Carbon
  IOP: Inherent Optical Properties
  KD: Diffuse Attenuation


In [10]:
# CELL 3: Auth
# Log in to NASA Earthdata through earthaccess. The result is kept in the
# notebook-level name `auth`.
print("Autenticando...")
try:
    auth = earthaccess.login(persist=True)
    # login() returns an auth object rather than a boolean, so testing the
    # object itself can report success even when no credentials were accepted.
    # Its `authenticated` flag carries the real status; plain truthiness is
    # only the fallback when that attribute is not available.
    logged_in = bool(getattr(auth, "authenticated", auth))
    print("‚úì OK!" if logged_in else "‚ö† Falhou")
except Exception as e:
    # Best effort: report the failure and let the notebook keep running.
    print(f"‚úó {e}")

Autenticando...
‚úì OK!


In [11]:
# CELL 4: Core Functions - CHUNKED PROCESSING

def parse_dates_from_file(filepath):
    """Read a text/CSV file and return the unique dates it contains, sorted.

    Each line is handled independently:
      * everything after a ``#`` is treated as a comment and dropped;
      * blank lines and lines starting (case-insensitively) with one of the
        header/separator prefixes in ``skip`` are ignored;
      * otherwise the text before the first comma is parsed with
        ``pd.to_datetime``; lines that do not parse are silently skipped.

    Args:
        filepath: Path of the file to read (opened in text mode).

    Returns:
        Sorted list of unique ``pandas.Timestamp`` values.

    Raises:
        OSError: If the file cannot be opened.
    """
    dates = []
    skip = ('#', '=', '-', 'lista', 'total', 'date', 'unique', 'list', 'lat', 'lon')
    with open(filepath, 'r') as f:
        for line in f:
            line = line.split('#')[0].strip()
            if not line or line.lower().startswith(skip):
                continue
            first_field = line.split(',')[0].strip()
            try:
                dates.append(pd.to_datetime(first_field))
            except (ValueError, TypeError, OverflowError):
                # Unparseable text is expected (free-form headers, notes).
                # Only parsing failures are ignored, so Ctrl-C and genuine
                # bugs still surface instead of being swallowed.
                continue
    return sorted(set(dates))

def expand_dates(dates, window=4):
    """Return every day within ``window`` days of any date in ``dates``.

    Args:
        dates: Iterable of date-like objects that support ``+ timedelta``.
        window: Number of days added on each side of every date.

    Returns:
        Sorted list of the unique resulting days (the inputs included).
    """
    day_shifts = [timedelta(days=step) for step in range(-window, window + 1)]
    return sorted({day + shift for day in dates for shift in day_shifts})

def extract_date(filename):
    """Pull the date out of a file name.

    Looks for the first run of exactly eight digits enclosed by dots
    (``.NNNNNNNN.``) in ``str(filename)`` and parses it with
    ``pd.to_datetime``.

    Returns:
        The parsed timestamp, or ``None`` when no such token is present.
    """
    match = re.search(r'\.(\d{8})\.', str(filename))
    if not match:
        return None
    return pd.to_datetime(match.group(1))


def _save_bbox_subset(src_file, out_file, bbox):
    """Cut one downloaded granule down to ``bbox`` and write it to ``out_file``."""
    # Write under a temporary name first: an interrupted write must never
    # leave a truncated file that a later run would count as "already exists".
    part_file = out_file.with_name(out_file.name + '.part')
    try:
        # The context manager closes the source file even if the write fails.
        with xr.open_dataset(src_file, engine='h5netcdf') as ds:
            lat_values = ds['lat'].values
            # The slice bounds must follow the order of the latitude axis.
            # Descending (north to south) stays the default; the bounds are
            # only swapped when the axis is clearly ascending.
            ascending = lat_values.size > 1 and lat_values[0] < lat_values[-1]
            if ascending:
                lat_slice = slice(bbox['lat_min'], bbox['lat_max'])
            else:
                lat_slice = slice(bbox['lat_max'], bbox['lat_min'])
            ds_sub = ds.sel(
                lat=lat_slice,
                lon=slice(bbox['lon_min'], bbox['lon_max'])
            )
            ds_sub.to_netcdf(part_file, engine='h5netcdf')
        part_file.replace(out_file)
    finally:
        # No-op after a successful rename; removes the partial file otherwise.
        part_file.unlink(missing_ok=True)


def download_pace_chunked(products, dates, bbox, output_dir, 
                          resolution='0p1deg', window_days=4, chunk_size=8):
    """Download PACE granules in chunks and save a bounding-box subset of each.

    For every requested product the function searches the whole date span
    once, keeps the granules whose file-name date is one of the requested
    days, skips days whose output file already exists, and then processes the
    rest ``chunk_size`` granules at a time: download into a temporary
    directory, subset to ``bbox``, write ``pace_<product>_<YYYYMMDD>.nc`` into
    ``output_dir``. Progress and a final summary are printed.

    Args:
        products: Iterable of keys of ``PACE_PRODUCTS``; unknown keys are
            reported and ignored.
        dates: Iterable of date-like objects (must support ``strftime``).
        bbox: Dict with ``lat_min``, ``lat_max``, ``lon_min``, ``lon_max``.
        output_dir: Directory for the output files; created if missing.
        resolution: Resolution token used in the granule-name filter.
        window_days: If > 0, every date is expanded to +/- this many days.
        chunk_size: Granules downloaded per batch; values below 1 are
            treated as 1.

    Returns:
        None. Results are reported through printed output and written files.
    """
    output_path = Path(output_dir).resolve()
    output_path.mkdir(parents=True, exist_ok=True)
    
    print("\n" + "="*60)
    print("PACE DOWNLOAD (CHUNKED)")
    print("="*60)
    print(f"üìÅ OUTPUT: {output_path}")
    
    # Without dates there is no time span to search (min()/max() would raise).
    dates = list(dates)
    if not dates:
        print("  ‚ö† Nenhuma data informada")
        return
    
    # A non-positive chunk size would divide by zero or produce no chunks.
    chunk_size = max(1, int(chunk_size))
    
    # Expand dates
    if window_days > 0:
        orig = len(dates)
        dates = expand_dates(dates, window_days)
        print(f"üìÖ Datas: {orig} ‚Üí {len(dates)} (¬±{window_days}d)")
    
    date_min = min(dates).strftime('%Y-%m-%d')
    date_max = max(dates).strftime('%Y-%m-%d')
    dates_set = {d.strftime('%Y%m%d') for d in dates}
    
    valid_prods = [p for p in products if p in PACE_PRODUCTS]
    unknown_prods = [p for p in products if p not in PACE_PRODUCTS]
    print(f"üìä Produtos: {valid_prods}")
    if unknown_prods:
        print(f"  ‚ö† Produtos desconhecidos ignorados: {unknown_prods}")
    print(f"üó∫Ô∏è  Regi√£o: lat[{bbox['lat_min']}:{bbox['lat_max']}] lon[{bbox['lon_min']}:{bbox['lon_max']}]")
    print(f"üì¶ Chunk size: {chunk_size}")
    
    total_saved = 0
    total_skipped = 0
    total_errors = 0
    
    for prod_key in valid_prods:
        short_name = PACE_PRODUCTS[prod_key]['short_name']
        print(f"\n{'='*60}")
        print(f"üì¶ {prod_key}")
        print(f"{'='*60}")
        
        # Search all at once
        print(f"  üîç Buscando {date_min} a {date_max}...")
        results = earthaccess.search_data(
            short_name=short_name,
            temporal=(date_min, date_max),
            granule_name=f"*.DAY.*.{resolution}.*"
        )
        
        if not results:
            print(f"  ‚ö† Nenhum resultado")
            continue
        
        print(f"  ‚úì {len(results)} granules encontrados")
        
        # Filter by requested dates and existing files.
        # `pending` maps a date string to the granule chosen for that day.
        # There is one output file per product and day, so when several
        # granules share a date only the last one is kept -- the same file
        # that would win if all were downloaded and written in turn.
        # NOTE(review): if a product ships several files per day, the others
        # are dropped here -- confirm that is intended.
        pending = {}
        already_there = set()
        duplicates = 0
        
        for r in results:
            try:
                fname = r.data_links()[0].split('/')[-1]
                gdate = extract_date(fname)
                if not gdate: continue
                dstr = gdate.strftime('%Y%m%d')
            except Exception:
                # A granule without a usable link or date is skipped.
                continue
            
            if dstr not in dates_set: continue
            
            out_file = output_path / f"pace_{prod_key.lower()}_{dstr}.nc"
            if out_file.exists():
                already_there.add(dstr)
                continue
            
            if dstr in pending:
                duplicates += 1
            pending[dstr] = {'result': r, 'date_str': dstr,
                             'out_file': out_file, 'fname': fname}
        
        to_process = list(pending.values())
        skipped = len(already_there)
        total_skipped += skipped
        if skipped: print(f"  ‚óã {skipped} j√° existem")
        if duplicates: print(f"  ‚ö† {duplicates} granules duplicados por data ignorados")
        
        if not to_process:
            print(f"  ‚úì Tudo pronto!")
            continue
        
        print(f"  üì• {len(to_process)} para baixar")
        
        # Process in chunks
        n_chunks = (len(to_process) + chunk_size - 1) // chunk_size
        
        for chunk_idx in range(n_chunks):
            start = chunk_idx * chunk_size
            chunk = to_process[start:start + chunk_size]
            
            print(f"\n  --- Chunk {chunk_idx+1}/{n_chunks} ({len(chunk)} arquivos) ---")
            
            # The temporary directory (and the raw granules in it) is removed
            # as soon as the chunk has been subset and saved.
            with tempfile.TemporaryDirectory() as tmpdir:
                try:
                    downloaded = earthaccess.download(
                        [item['result'] for item in chunk],
                        local_path=tmpdir,
                        threads=min(chunk_size, 8)
                    )
                except Exception as e:
                    print(f"  ‚úó Download falhou: {e}")
                    total_errors += len(chunk)
                    continue
                
                # Pair every downloaded file with its request by file name.
                # Pairing by position would attach files to the wrong dates
                # as soon as one download is missing or comes back out of
                # order.
                awaiting = {item['fname']: item for item in chunk}
                
                for dl_file in downloaded or []:
                    item = awaiting.pop(Path(str(dl_file)).name, None)
                    if item is None:
                        # Not a file that was asked for (extra link, error
                        # placeholder); unmatched requests are counted below.
                        continue
                    out_file = item['out_file']
                    dstr = item['date_str']
                    
                    try:
                        _save_bbox_subset(dl_file, out_file, bbox)
                        
                        total_saved += 1
                        size_mb = out_file.stat().st_size / 1e6
                        print(f"    ‚úì {dstr} ‚Üí {out_file.name} ({size_mb:.1f}MB)")
                        
                    except Exception as e:
                        total_errors += 1
                        print(f"    ‚úó {dstr}: {str(e)[:40]}")
                
                # Requests for which no file came back are failures too.
                for item in awaiting.values():
                    total_errors += 1
                    print(f"    ‚úó {item['date_str']}: arquivo nao baixado")
            
            # Show running total
            print(f"  [Total salvo at√© agora: {total_saved}]")
    
    # Final summary
    print("\n" + "="*60)
    print("RESULTADO")
    print("="*60)
    print(f"  ‚úì Salvos: {total_saved}")
    print(f"  ‚óã Existiam: {total_skipped}")
    print(f"  ‚úó Erros: {total_errors}")
    print(f"\nüìÅ {output_path}")
    
    files = list(output_path.glob('pace_*.nc'))
    if files:
        total_mb = sum(f.stat().st_size for f in files) / 1e6
        print(f"   {len(files)} arquivos, {total_mb:.1f} MB")

print("‚úì Fun√ß√µes carregadas!")

‚úì Fun√ß√µes carregadas!


In [5]:
# CELL 5: Quick test
# Smoke test: one product, one day, a small box, no date expansion.
# This calls download_pace_chunked, which searches and downloads through
# earthaccess, so it needs the login from the auth cell and network access.
# NOTE(review): this cell's execution count (5) is lower than that of the
# cells above it; run the notebook top to bottom so the function it calls
# is the current definition.
test_out = Path('./pace_test').resolve()
test_out.mkdir(exist_ok=True)

download_pace_chunked(
    products=['CHL'],
    dates=[pd.to_datetime('2024-06-15')],
    bbox={'lat_min': 35, 'lat_max': 40, 'lon_min': -76, 'lon_max': -70},
    output_dir=test_out,
    window_days=0,  # exactly the requested day, no +/- window
    chunk_size=4
)


PACE DOWNLOAD (CHUNKED)
üìÅ OUTPUT: /home/jovyan/2026-proj-Trawling4PACE/contributor_folders/leandro/pace_test
üìä Produtos: ['CHL']
üó∫Ô∏è  Regi√£o: lat[35:40] lon[-76:-70]
üì¶ Chunk size: 4

üì¶ CHL
  üîç Buscando 2024-06-15 a 2024-06-15...
  ‚úì 1 granules encontrados
  üì• 1 para baixar

  --- Chunk 1/1 (1 arquivos) ---


QUEUEING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/1 [00:00<?, ?it/s]

    ‚úì 20240615 ‚Üí pace_chl_20240615.nc (0.0MB)
  [Total salvo at√© agora: 1]

RESULTADO
  ‚úì Salvos: 1
  ‚óã Existiam: 0
  ‚úó Erros: 0

üìÅ /home/jovyan/2026-proj-Trawling4PACE/contributor_folders/leandro/pace_test
   1 arquivos, 0.0 MB


In [12]:
# CELL 6: UI

class FolderBrowser:
    """Small ipywidgets-based folder picker.

    Shows the sub-folders of the current directory and lets the user go up,
    enter a folder, create a folder, and mark a folder as selected.

    Attributes set in ``__init__``:
        cur: Directory currently being listed (``Path``).
        sel: Directory currently marked as selected (``Path``).
        w:   Top-level widget to display.
    """

    def __init__(self, start='.'):
        """Build the widgets, starting in ``start`` (also the initial selection)."""
        self.cur = Path(start).resolve()
        self.sel = self.cur
        self.html = widgets.HTML(f"<code>{self.cur}</code>")
        self.dd = widgets.Select(options=self._list(), layout=widgets.Layout(width='100%', height='100px'))
        self.b_up = widgets.Button(description='‚¨Ü', layout=widgets.Layout(width='40px'))
        self.b_in = widgets.Button(description='üìÇ', layout=widgets.Layout(width='40px'))
        self.b_sel = widgets.Button(description='‚úì', button_style='success', layout=widgets.Layout(width='40px'))
        self.txt = widgets.Text(placeholder='nova', layout=widgets.Layout(width='80px'))
        self.b_new = widgets.Button(description='+', layout=widgets.Layout(width='40px'))
        self.selhtml = widgets.HTML(self._selection_html())
        # Button callbacks receive the button itself; it is not needed here.
        self.b_up.on_click(lambda b: self._up())
        self.b_in.on_click(lambda b: self._enter())
        self.b_sel.on_click(lambda b: self._select())
        self.b_new.on_click(lambda b: self._create())
        self.w = widgets.VBox([self.html, self.dd,
            widgets.HBox([self.b_up, self.b_in, self.b_sel, self.txt, self.b_new]), self.selhtml])

    def _selection_html(self):
        """Return the HTML snippet that shows the selected folder."""
        return f"<b style='color:green'>üìÅ {self.sel}</b>"

    def _list(self):
        """Return ``'.'`` followed by the sorted non-hidden sub-folder names."""
        try:
            names = [x.name for x in self.cur.iterdir()
                     if x.is_dir() and not x.name.startswith('.')]
        except OSError:
            # Unreadable or vanished directory: offer only the current folder.
            # Only filesystem errors are handled, so real bugs still surface.
            return ['.']
        return ['.'] + sorted(names)

    def _refresh(self):
        """Update the path label and the folder list after ``cur`` changed."""
        self.html.value = f"<code>{self.cur}</code>"
        self.dd.options = self._list()

    def _up(self):
        """Move to the parent directory (no-op at the filesystem root)."""
        if self.cur.parent != self.cur:
            self.cur = self.cur.parent
            self._refresh()

    def _enter(self):
        """Descend into the highlighted sub-folder, if one is highlighted."""
        if self.dd.value and self.dd.value != '.':
            self.cur = self.cur / self.dd.value
            self._refresh()

    def _select(self):
        """Mark the highlighted sub-folder (or the current folder for '.') as selected."""
        if self.dd.value and self.dd.value != '.':
            self.sel = self.cur / self.dd.value
        else:
            self.sel = self.cur
        self.selhtml.value = self._selection_html()

    def _create(self):
        """Create the folder named in the text box inside the current directory."""
        if self.txt.value:
            (self.cur / self.txt.value).mkdir(exist_ok=True)
            self.txt.value = ''
            self._refresh()

    def path(self):
        """Return the selected folder as an absolute path string."""
        return str(self.sel.resolve())

class FileBrowser:
    """Small ipywidgets-based file picker restricted to given extensions.

    Lists the sub-folders (prefixed with a folder marker) and the matching
    files of the current directory, and lets the user navigate and pick a file.

    Attributes set in ``__init__``:
        cur:     Directory currently being listed (``Path``).
        ext:     Lower-case file suffixes that are shown.
        selfile: Selected file (``Path``) or ``None`` while nothing is chosen.
        w:       Top-level widget to display.
    """

    # Marker put in front of folder entries so they can be told from files.
    _DIR_MARK = 'üìÅ'
    # Full prefix of a folder entry: the marker plus one separating space.
    _DIR_PREFIX = _DIR_MARK + ' '

    def __init__(self, start='.', ext=('.txt','.csv','.dat')):
        """Build the widgets, starting in ``start`` and showing files in ``ext``."""
        self.cur = Path(start).resolve()
        self.ext = ext
        self.selfile = None
        self.html = widgets.HTML(f"<code>{self.cur}</code>")
        self.dd = widgets.Select(options=self._list(), layout=widgets.Layout(width='100%', height='80px'))
        self.b_up = widgets.Button(description='‚¨Ü', layout=widgets.Layout(width='40px'))
        self.b_in = widgets.Button(description='üìÇ', layout=widgets.Layout(width='40px'))
        self.b_sel = widgets.Button(description='üìÑ', button_style='info', layout=widgets.Layout(width='40px'))
        self.selhtml = widgets.HTML("<i>-</i>")
        # Button callbacks receive the button itself; it is not needed here.
        self.b_up.on_click(lambda b: self._up())
        self.b_in.on_click(lambda b: self._enter())
        self.b_sel.on_click(lambda b: self._select())
        self.w = widgets.VBox([self.html, self.dd, widgets.HBox([self.b_up, self.b_in, self.b_sel]), self.selhtml])

    def _list(self):
        """Return ``'.'``, then the marked sub-folders, then the matching files."""
        folders, files = [], []
        try:
            # One sorted pass feeds both groups; folders are listed first.
            for x in sorted(self.cur.iterdir()):
                if x.is_dir():
                    if not x.name.startswith('.'):
                        folders.append(f"{self._DIR_PREFIX}{x.name}")
                elif x.is_file() and x.suffix.lower() in self.ext:
                    files.append(x.name)
        except OSError:
            # Unreadable or vanished directory: show whatever was collected.
            pass
        return ['.'] + folders + files

    def _refresh(self):
        """Update the path label and the entry list after ``cur`` changed."""
        self.html.value = f"<code>{self.cur}</code>"
        self.dd.options = self._list()

    def _up(self):
        """Move to the parent directory (no-op at the filesystem root)."""
        if self.cur.parent != self.cur:
            self.cur = self.cur.parent
            self._refresh()

    def _enter(self):
        """Descend into the highlighted entry if it is a folder entry."""
        entry = self.dd.value
        if entry and entry.startswith(self._DIR_PREFIX):
            # Strip exactly the prefix that _list added. A hard-coded length
            # breaks as soon as the marker is not a single character.
            self.cur = self.cur / entry[len(self._DIR_PREFIX):]
            self._refresh()

    def _select(self):
        """Mark the highlighted entry as the selected file (folders and '.' are ignored)."""
        entry = self.dd.value
        if entry and not entry.startswith(self._DIR_MARK) and entry != '.':
            self.selfile = self.cur / entry
            self.selhtml.value = f"<b style='color:blue'>üìÑ {self.selfile.name}</b>"

    def file(self):
        """Return the selected file path as a string, or ``None`` if none is selected."""
        return str(self.selfile) if self.selfile else None

# Pickers for the output folder and for the dates file (used in 'File' mode).
fb = FolderBrowser('.')
flb = FileBrowser('.')

# Bounding box: each slider holds a (min, max) pair in degrees.
w_lat = widgets.FloatRangeSlider(value=[30, 50], min=-90, max=90, step=0.5, description='Lat:')
w_lon = widgets.FloatRangeSlider(value=[-80, -60], min=-180, max=180, step=0.5, description='Lon:')
# Product codes offered are the keys of PACE_PRODUCTS.
w_products = widgets.SelectMultiple(options=list(PACE_PRODUCTS.keys()), value=['CHL', 'RRS'],
    layout=widgets.Layout(width='150px', height='100px'))
# Dropdown values are the resolution tokens passed on to the downloader.
w_resolution = widgets.Dropdown(options=[('0.1¬∞', '0p1deg'), ('4km', '4km')], value='0p1deg', description='Res:')
w_chunk = widgets.IntSlider(value=8, min=1, max=20, description='Chunk:')
# Date input mode: one day, a start/end range, or a file with dates.
w_mode = widgets.Dropdown(options=['Single', 'Range', 'File'], value='Single', description='Mode:')
w_single = widgets.DatePicker(description='Date:')
w_start = widgets.DatePicker(description='Start:')
w_end = widgets.DatePicker(description='End:')
# Days added on each side of every date (0 disables the expansion).
w_window = widgets.IntSlider(value=4, min=0, max=15, description='¬±days:')

# Container whose children are swapped when the mode changes.
w_datebox = widgets.VBox([w_single])
def on_mode(c):
    """Show the date inputs that belong to the newly chosen mode.

    ``c`` is the change notification of ``w_mode``; ``c['new']`` is the new
    mode name. 'Single' shows one date picker, 'Range' shows the start/end
    pickers, anything else shows the file browser.
    """
    mode = c['new']
    if mode == 'Single':
        visible = [w_single]
    elif mode == 'Range':
        visible = [w_start, w_end]
    else:
        visible = [flb.w]
    w_datebox.children = visible

# React only to changes of the dropdown's value.
w_mode.observe(on_mode, names='value')

# Start button and the scrollable output area that receives the download log.
w_btn = widgets.Button(description='üöÄ Download', button_style='primary')
w_log = widgets.Output(layout=widgets.Layout(max_height='500px', overflow='auto'))

def click(b):
    """Button handler: read the UI state and start the download.

    Everything printed while this runs goes into ``w_log``. The handler
    stops early, with a short message, when no dates file was picked, when no
    dates could be determined, or when no product is selected.
    """
    with w_log:
        clear_output()
        target_dir = fb.path()
        print(f"üéØ Output: {target_dir}")

        # Collect the requested dates according to the active mode.
        mode = w_mode.value
        chosen_dates = []
        if mode == 'Single':
            if w_single.value:
                chosen_dates = [pd.to_datetime(w_single.value)]
        elif mode == 'Range':
            if w_start.value and w_end.value:
                chosen_dates = pd.date_range(w_start.value, w_end.value, freq='D').tolist()
        else:
            dates_file = flb.file()
            if not dates_file:
                print("‚ö† Arquivo!")
                return
            print(f"üìÑ {dates_file}")
            chosen_dates = parse_dates_from_file(dates_file)

        if not chosen_dates:
            print("‚ö† Datas!")
            return

        chosen_products = list(w_products.value)
        if not chosen_products:
            print("‚ö† Produtos!")
            return

        # Each range slider yields a (low, high) pair.
        lat_low, lat_high = w_lat.value[0], w_lat.value[1]
        lon_low, lon_high = w_lon.value[0], w_lon.value[1]
        region = {
            'lat_min': lat_low,
            'lat_max': lat_high,
            'lon_min': lon_low,
            'lon_max': lon_high,
        }

        download_pace_chunked(
            products=chosen_products,
            dates=chosen_dates,
            bbox=region,
            output_dir=target_dir,
            resolution=w_resolution.value,
            window_days=w_window.value,
            chunk_size=w_chunk.value,
        )

# Run `click` whenever the download button is pressed.
w_btn.on_click(click)

# Assemble and show the form: output folder, region, products, dates,
# the start button and the log area, in that order.
display(widgets.VBox([
    widgets.HTML("<h2>üõ∞Ô∏è PACE Downloader</h2><hr>"),
    widgets.HTML("<b>üìÅ Output:</b>"), fb.w,
    widgets.HTML("<hr><b>üó∫Ô∏è Region:</b>"), w_lat, w_lon,
    widgets.HTML("<hr><b>üìä Products:</b>"), w_products, w_resolution, w_chunk,
    widgets.HTML("<hr><b>üìÖ Dates:</b>"), w_mode, w_datebox, w_window,
    widgets.HTML("<hr>"), w_btn,
    widgets.HTML("<b>Log:</b>"), w_log
]))

VBox(children=(HTML(value='<h2>üõ∞Ô∏è PACE Downloader</h2><hr>'), HTML(value='<b>üìÅ Output:</b>'), VBox(children=(H‚Ä¶

In [None]:
# CELL 7: Programmatic
# Example of driving the downloader without the widget UI.
# The example sits inside a bare triple-quoted string, so it is never
# executed; remove the surrounding quotes and adjust the two placeholder
# paths to run it.
'''
dates = parse_dates_from_file('/path/to/dates.txt')
download_pace_chunked(
    products=['CHL', 'RRS'],
    dates=dates,
    bbox={'lat_min': 30, 'lat_max': 50, 'lon_min': -80, 'lon_max': -60},
    output_dir='/path/to/output',
    window_days=4,
    chunk_size=8
)
'''
print("Descomente para usar")

In [None]:
# CELL 8: Check files
def list_files(d):
    """Print a short inventory of the ``pace_*.nc`` files in directory ``d``.

    Prints the resolved directory, the number of matching files and their
    combined size in MB, followed by the first five file names (sorted) and,
    if there are more, how many were left out. Prints a single notice and
    returns when nothing matches.
    """
    folder = Path(d).resolve()
    matches = sorted(folder.glob('pace_*.nc'))
    if len(matches) == 0:
        print(f"Nenhum em {folder}")
        return

    size_mb = sum(entry.stat().st_size for entry in matches) / 1e6
    print(f"{folder}: {len(matches)} files, {size_mb:.1f}MB")

    shown = matches[:5]
    for entry in shown:
        print(f"  {entry.name}")

    remaining = len(matches) - 5
    if remaining > 0:
        print(f"  ...+{remaining}")

# list_files('./pace_data')