# Downloading JAXA Precipitation Dataset (daily)

## Dataset Information


## Notebook Overview

## Installing the JAXA Earth API


In [None]:
pip install /content/jaxa-earth-0.1.4.zip

Processing ./jaxa-earth-0.1.4.zip
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: jaxa-earth
  Building wheel for jaxa-earth (pyproject.toml) ... [?25l[?25hdone
  Created wheel for jaxa-earth: filename=jaxa_earth-0.1.4-py3-none-any.whl size=47459 sha256=c7c52d3f7dd8f91f3b633b97238a2373dbac2465c87250080d5246d9acea253f
  Stored in directory: /root/.cache/pip/wheels/1b/d1/d0/a78c3e9c8fc8440dbaa31270a93f1e0c536706bf19d69a93f8
Successfully built jaxa-earth
Installing collected packages: jaxa-earth
Successfully installed jaxa-earth-0.1.4


In [None]:
from jaxa.earth import je

## Retrieving the data


### Structure of Data in API

In [None]:
# Ask the API for every available image collection. `collections` and `bands`
# are used below as parallel lists: the bands of collections[i] are bands[i].
collections, bands = je.ImageCollectionList(ssl_verify=True).filter_name()

 - Geting image collection information : completed
 - Searching : 96 image collections found!


In [None]:
# Identifier of the collection downloaded in this notebook; it is looked up in
# `collections` and passed to je.ImageCollection below.
precip_collection = "JAXA.EORC_GSMaP_standard.Gauge.00Z-23Z.v6_daily"

In [None]:
# Position of the precipitation collection in the list returned by the API
# (raises ValueError if the identifier is not present).
precip_index = collections.index(precip_collection)

# Bands offered by that collection
bands[precip_index]

['PRECIP']

### Loading data into memory

In [None]:
# Load module
from jaxa.earth import je

# Query parameters
# dlim: ISO 8601 timestamps (YYYY-MM-DDTHH:MM:SS). The download log shows that
# the end date itself (2024-06-01) is part of the result.
dlim = ["2024-05-01T00:00:00", "2024-06-01T00:00:00"]
# ppu: resolution in pixels per degree (reported as such in the download log)
ppu = 5
# bbox: presumably [min_lon, min_lat, max_lon, max_lat] in degrees - confirm against the API docs
bbox = [110, 20, 160, 50]

# Build the query step by step and download the matching images
data = (
    je.ImageCollection(collection=precip_collection, ssl_verify=True)
    .filter_date(dlim=dlim)
    .filter_resolution(ppu=ppu)
    .filter_bounds(bbox=bbox)
    .select(band=bands[precip_index][0])
    .get_images()
)

 - Collection : JAXA.EORC_GSMaP_standard.Gauge.00Z-23Z.v6_daily
 - Date : 2024-05/01/, 2024-05/02/, 2024-05/03/, 2024-05/04/, 2024-05/05/, 2024-05/06/, 2024-05/07/, 2024-05/08/, 2024-05/09/, 2024-05/10/, 2024-05/11/, 2024-05/12/, 2024-05/13/, 2024-05/14/, 2024-05/15/, 2024-05/16/, 2024-05/17/, 2024-05/18/, 2024-05/19/, 2024-05/20/, 2024-05/21/, 2024-05/22/, 2024-05/23/, 2024-05/24/, 2024-05/25/, 2024-05/26/, 2024-05/27/, 2024-05/28/, 2024-05/29/, 2024-05/30/, 2024-05/31/, 2024-06/01/, 
 - Resolution : 5.0 pixels per 1 degree 
 - Bounds : [110, 20, 160, 50]
 - Band : PRECIP
 - Loading images No.0 : 2024-05/01/
   ------10------20------30------40------50------60------70------80------90-----100
   ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
 - Loading images No.1 : 2024-05/02/
   ------10------20------30------40------50------60------70------80------90-----100
   ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
 - Loading

In [None]:
# Wrap the downloaded collection in an ImageProcess object; its `raster.img`
# attribute is inspected in the next cell.
images = je.ImageProcess(data)

In [None]:
# Dimensions of the loaded image stack. The first axis has one entry per
# downloaded date (32); 150 x 250 matches the 30 deg x 50 deg bounding box at
# 5 pixels per degree. The trailing axis of length 1 is presumably the band.
images.raster.img.shape

(32, 150, 250, 1)

In [None]:
# Wrap the downloaded data once more and display the images
img = (
    je.ImageProcess(data)
    .show_images()
)

Problematic...

## Writing to disk and chunking

In [None]:
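# rasterio is required below for writing GeoTIFF files.
# Uncomment the next line if it is not installed in this environment:
# %pip install rasterio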

In [None]:
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

def prepare_date_chunks(date_range, chunking="monthly"):
    """Split a date range into calendar-aligned chunks.

    Every chunk ends at the next calendar boundary (midnight of the next day,
    of the first day of the next month, or of January 1st of the next year),
    regardless of the time of day of the start date. The first chunk starts
    at ``date_range[0]`` and the last chunk is cut off at ``date_range[1]``.

    Args:
        date_range: Two-element sequence ``[start, end]`` of ISO 8601 strings
            (a trailing ``Z`` is accepted). ``start`` is inclusive, ``end`` is
            exclusive. Both must be either timezone-aware or naive; mixing
            the two makes the comparison raise ``TypeError``.
        chunking: ``"daily"``, ``"monthly"`` or ``"yearly"``.

    Returns:
        A chronologically ordered list of ``(name, [chunk_start, chunk_end])``
        tuples. ``name`` is the chunk start formatted as ``YYYY-MM-DD``,
        ``YYYY-MM`` or ``YYYY``; both limits are ISO 8601 strings and
        ``chunk_end`` is one second before the start of the following chunk.
        The list is empty when ``start`` is not before ``end``.

    Raises:
        ValueError: If ``chunking`` is not supported or a date string cannot
            be parsed.
    """
    if chunking not in ("daily", "monthly", "yearly"):
        raise ValueError(f"chunking must be 'daily', 'monthly', or 'yearly', got '{chunking}'")

    # Parse input dates ('Z' is rewritten because older Python versions of
    # datetime.fromisoformat do not accept it)
    start_dt = datetime.fromisoformat(date_range[0].replace('Z', '+00:00'))
    end_dt = datetime.fromisoformat(date_range[1].replace('Z', '+00:00'))

    def _midnight(dt):
        # Drop the time of day so that boundaries fall exactly on 00:00:00
        return dt.replace(hour=0, minute=0, second=0, microsecond=0)

    def _next_day(dt):
        return _midnight(dt) + timedelta(days=1)

    def _next_month(dt):
        first_of_month = _midnight(dt).replace(day=1)
        if first_of_month.month == 12:
            return first_of_month.replace(year=first_of_month.year + 1, month=1)
        return first_of_month.replace(month=first_of_month.month + 1)

    def _next_year(dt):
        return _midnight(dt).replace(year=dt.year + 1, month=1, day=1)

    # Chunking strategies: how to find the next boundary and how to name a chunk
    strategies = {
        "daily": {"next_chunk": _next_day, "name_format": "%Y-%m-%d"},
        "monthly": {"next_chunk": _next_month, "name_format": "%Y-%m"},
        "yearly": {"next_chunk": _next_year, "name_format": "%Y"},
    }

    strategy = strategies[chunking]
    next_chunk = strategy["next_chunk"]
    name_format = strategy["name_format"]

    chunks = []
    current_dt = start_dt

    while current_dt < end_dt:
        chunk_start = current_dt
        # Never run past the requested end of the range
        chunk_end = min(next_chunk(current_dt), end_dt)

        name = chunk_start.strftime(name_format)
        dlim = [
            chunk_start.isoformat(),
            # Subtract one second so the upper limit is the last moment of
            # this chunk and the (exclusive) boundary itself is not included
            (chunk_end - timedelta(seconds=1)).isoformat(),
        ]

        chunks.append((name, dlim))
        current_dt = chunk_end

    return chunks

In [None]:
import os
import numpy as np
from jaxa.earth import je
import rasterio
from rasterio.transform import from_bounds
import time

def save_data_to_disk(collection, band, date_range, out_path, chunking="monthly", ppu=5, bbox=[-180, -90, 180, 90]):
    """Download a collection chunk by chunk and write each chunk to a GeoTIFF.

    For every requested band and every date chunk produced by
    ``prepare_date_chunks`` the images are loaded into memory and written to
    ``<out_path>/<collection with '.' replaced by '_'>/<band>/<chunk name>.tif``.
    Each timestep of a chunk becomes one band of the GeoTIFF.

    Args:
        collection: Identifier of the image collection to download.
        band: Band name, an iterable of band names, or a falsy value
            (``None``, ``[]``, ``""``) to download all bands the API lists
            for ``collection``.
        date_range: ``[start, end]`` ISO 8601 strings, forwarded to
            ``prepare_date_chunks``.
        out_path: Root directory for the output; missing directories are
            created.
        chunking: Chunk size, forwarded to ``prepare_date_chunks``.
        ppu: Resolution passed to ``filter_resolution``.
        bbox: Bounding box passed to ``filter_bounds``. The default list is
            never mutated by this function.

    Returns:
        None. Any exception is caught and printed, which aborts the remaining
        downloads; the elapsed time is printed in every case.
    """
    started = time.time()
    try:
        # isinstance also covers str subclasses, which would otherwise be
        # iterated character by character in the loop below
        if isinstance(band, str):
            band = [band]
        elif not band:  # load all bands for collection
            collection_names, collection_bands = je.ImageCollectionList(ssl_verify=True).filter_name()
            band = collection_bands[collection_names.index(collection)]

        date_chunks = prepare_date_chunks(date_range, chunking)
        crs = "EPSG:4326"  # coordinate reference system used

        for b in band:
            # The target directory depends only on the band, so create it once
            inner_path = os.path.join(out_path, collection.replace('.', '_'), b)
            os.makedirs(inner_path, exist_ok=True)

            for name, dlim in date_chunks:
                # 1. load data to memory
                data = (
                    je.ImageCollection(collection=collection, ssl_verify=True)
                    .filter_date(dlim=dlim)
                    .filter_resolution(ppu=ppu)
                    .filter_bounds(bbox=bbox)
                    .select(band=b)
                    .get_images()
                )

                raster = data.raster  # raster is a Raster object per API doc
                # img is expected as (t, h, w, 1); drop the trailing axis
                arr = raster.img.squeeze(axis=-1)

                # 2. bring to correct format for rasterio
                # NOTE(review): assumes the first entry of lonlim / latlim is
                # the (min, max) pair valid for all timesteps - confirm
                min_lon, max_lon = raster.lonlim[0]
                min_lat, max_lat = raster.latlim[0]
                num_timesteps, height, width = arr.shape
                transform = from_bounds(min_lon, min_lat, max_lon, max_lat, width, height)

                # 3. Write to GeoTIFF
                filepath = os.path.join(inner_path, f"{name}.tif")
                with rasterio.open(
                    filepath,
                    "w",
                    driver="GTiff",
                    height=height,
                    width=width,
                    count=num_timesteps,  # Each timestep becomes a band
                    dtype=arr.dtype,
                    crs=crs,
                    transform=transform
                ) as dst:
                    for t in range(num_timesteps):
                        dst.write(arr[t], t + 1)  # GeoTIFF bands are 1-indexed

    except Exception as exc:
        print(f"Error: {exc}")
        return

    finally:
        elapsed = time.time() - started
        print(f"Execution time: {elapsed} seconds")