# HDF download

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

import fire.downloader as fdown
import fire.utils.io as uio
import fire.utils.modis as umod

# Get URLs to MODIS HDF files

## Alternative 1: Scrape MOD-HDF-URLs from LPDAAC

In [14]:
# MODIS data from 2020 is (mostly) missing -- presumably from the URL list that
# was collected earlier; confirm -- so everything from 2020-01-01 on is loaded.
# The compact string form is kept as well because it is reused in file names.
min_date_format = r"%Y%m%d"
min_date_str = "20200101"
min_date = datetime.strptime(min_date_str, min_date_format)
min_date

datetime.datetime(2020, 1, 1, 0, 0)

In [3]:
# Collect HDF file URLs from the LP DAAC listing of the MOD14A1.006 product.
# `min_date` presumably limits the scrape to dates on/after it -- confirm in
# fire.downloader. The recorded run took about 2.4 minutes.
mod14a1_listing_url = "https://e4ftl01.cr.usgs.gov/MOLT/MOD14A1.006/"
mod14a1_urls = fdown.collect_hdf_urls_from_lpdaac(mod14a1_listing_url, min_date=min_date)

processed 14/14, est. time left: 0.0 m   
   total time: 2.37 m



In [5]:
# Number of URLs returned by the scrape above.
len(mod14a1_urls)

4116

In [8]:
# Persist the scraped URLs; the file name carries `min_date_str` so the list
# can be reloaded later instead of scraping again.
scraped_urls_path = f"../data/_urls/MOD14A1.006_since_{min_date_str}.txt"
uio.write_lines(mod14a1_urls, scraped_urls_path)

In [4]:
# Reload the URL list stored for this `min_date_str` and show how many entries it has.
saved_urls_path = f"../data/_urls/MOD14A1.006_since_{min_date_str}.txt"
mod14a1_urls = uio.read_lines(saved_urls_path)
len(mod14a1_urls)

4116

## Alternative 2: Load MOD-HDF-URLs from disk

Load the pre-collected URLs of all MOD14A1 HDF files.

In [2]:
# URL lists to merge: the full MOD14A1.006 list plus the one gathered since 2020-01-01.
# NOTE: these are absolute, machine-specific paths.
url_files = [
    "/home/jonas/data/_urls/MOD14A1.006.txt",
    "/home/jonas/data/_urls/MOD14A1.006_since_20200101.txt",
]

In [3]:
# Read every URL list and flatten the results into a single sequence.
urls_from_all_files = [url for url_file in url_files for url in uio.read_lines(url_file)]

# np.unique drops duplicates and returns a sorted NumPy string array (not a
# list); that array is what the following cells work with.
mod14a1_urls = np.unique(urls_from_all_files)

print("Count:  ", len(mod14a1_urls))
mod14a1_urls[:5]

Count:   272502


array(['https://e4ftl01.cr.usgs.gov/MOLT/MOD14A1.006/2000.02.18/MOD14A1.A2000049.h00v08.006.2015041132347.hdf',
       'https://e4ftl01.cr.usgs.gov/MOLT/MOD14A1.006/2000.02.18/MOD14A1.A2000049.h00v09.006.2015041132244.hdf',
       'https://e4ftl01.cr.usgs.gov/MOLT/MOD14A1.006/2000.02.18/MOD14A1.A2000049.h00v10.006.2015041132152.hdf',
       'https://e4ftl01.cr.usgs.gov/MOLT/MOD14A1.006/2000.02.18/MOD14A1.A2000049.h01v07.006.2015041130011.hdf',
       'https://e4ftl01.cr.usgs.gov/MOLT/MOD14A1.006/2000.02.18/MOD14A1.A2000049.h01v08.006.2015041131418.hdf'],
      dtype='<U101')

# Filter URLs to our needs (Spain etc.)

In [4]:
# MODIS tiles per region, written as (v, h) pairs; these correspond to the
# "hXXvYY" part of the HDF file names (e.g. (7, 27) <-> "h27v07").
# A region covered by one tile maps to a single pair, while "myanmar" maps to
# a list of two pairs -- callers have to handle both shapes.
vh_tiles = {
    "california": (5, 8),
    "spain": (4, 17),
    "myanmar": [(6, 26), (7, 27)],
}

In [5]:
# Select the tile to work with: the second of the two Myanmar entries.
myanmar_tiles = vh_tiles["myanmar"]
v, h = myanmar_tiles[1]
(v, h)

(7, 27)

In [6]:
# Index the URLs as a table. The preview shows the columns url, fname,
# sat_name, fname_date, h and v (presumably parsed from each file name --
# confirm in fire.utils.modis).
hdf_index = umod.make_hdf_index_from_paths(mod14a1_urls)
hdf_index.head()

Unnamed: 0,url,fname,sat_name,fname_date,h,v
0,https://e4ftl01.cr.usgs.gov/MOLT/MOD14A1.006/2...,MOD14A1.A2000049.h00v08.006.2015041132347.hdf,MOD,2000-02-18,0,8
1,https://e4ftl01.cr.usgs.gov/MOLT/MOD14A1.006/2...,MOD14A1.A2000049.h00v09.006.2015041132244.hdf,MOD,2000-02-18,0,9
2,https://e4ftl01.cr.usgs.gov/MOLT/MOD14A1.006/2...,MOD14A1.A2000049.h00v10.006.2015041132152.hdf,MOD,2000-02-18,0,10
3,https://e4ftl01.cr.usgs.gov/MOLT/MOD14A1.006/2...,MOD14A1.A2000049.h01v07.006.2015041130011.hdf,MOD,2000-02-18,1,7
4,https://e4ftl01.cr.usgs.gov/MOLT/MOD14A1.006/2...,MOD14A1.A2000049.h01v08.006.2015041131418.hdf,MOD,2000-02-18,1,8


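We are interested in a single region, i.e. one MODIS tile (for Spain that would be v=4, h=17; the run recorded here uses the Myanmar tile v=7, h=27 selected above). Also, we are only interested in data since 2010. Filter!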

In [7]:
# MODIS HDF files typically hold 8 days of data, so a file dated shortly before
# 2010-01-01 may still contain 2010 data: shift the cut-off back by 8 days.
earliest_modis_date_2010 = datetime.strptime("2010-01-01", r"%Y-%m-%d") - timedelta(days=8)
print("Earliest hdf file date for 2010:  ", earliest_modis_date_2010)

# Step 1: keep only the rows of the selected tile (the `h` and `v` chosen earlier).
files_for_tile = hdf_index.query("h==@h & v==@v")
# Step 2: keep only files dated on or after the cut-off (inclusive comparison).
files_for_tile_since_2010 = files_for_tile.query("fname_date >= @earliest_modis_date_2010")
# Step 3: reduce the table to a plain list of URLs.
hdf_urls_for_region_since_2010 = files_for_tile_since_2010["url"].to_list()

print("Count:  ", len(hdf_urls_for_region_since_2010))
hdf_urls_for_region_since_2010[:5]

Earliest hdf file date for 2010:   2009-12-24 00:00:00
Count:   475


['https://e4ftl01.cr.usgs.gov/MOLT/MOD14A1.006/2009.12.27/MOD14A1.A2009361.h27v07.006.2015198033054.hdf',
 'https://e4ftl01.cr.usgs.gov/MOLT/MOD14A1.006/2010.01.01/MOD14A1.A2010001.h27v07.006.2015198042638.hdf',
 'https://e4ftl01.cr.usgs.gov/MOLT/MOD14A1.006/2010.01.09/MOD14A1.A2010009.h27v07.006.2015198101022.hdf',
 'https://e4ftl01.cr.usgs.gov/MOLT/MOD14A1.006/2010.01.17/MOD14A1.A2010017.h27v07.006.2015198094606.hdf',
 'https://e4ftl01.cr.usgs.gov/MOLT/MOD14A1.006/2010.01.25/MOD14A1.A2010025.h27v07.006.2015198192519.hdf']

# Make target paths

In [8]:
# Derive one local destination per URL below the data root. The preview shows
# the product / date directory / file name layout of the URL being kept.
# NOTE: the root is an absolute, machine-specific path.
local_data_root = "/home/jonas/data/"
target_paths = []
for hdf_url in hdf_urls_for_region_since_2010:
    target_paths.append(umod.default_target_path_scheme(hdf_url, local_data_root))
target_paths[:5]

['/home/jonas/data/MOD14A1.006/2009.12.27/MOD14A1.A2009361.h27v07.006.2015198033054.hdf',
 '/home/jonas/data/MOD14A1.006/2010.01.01/MOD14A1.A2010001.h27v07.006.2015198042638.hdf',
 '/home/jonas/data/MOD14A1.006/2010.01.09/MOD14A1.A2010009.h27v07.006.2015198101022.hdf',
 '/home/jonas/data/MOD14A1.006/2010.01.17/MOD14A1.A2010017.h27v07.006.2015198094606.hdf',
 '/home/jonas/data/MOD14A1.006/2010.01.25/MOD14A1.A2010025.h27v07.006.2015198192519.hdf']

# Download

In [9]:
# Get user and password for LPDAAC: look up the credentials stored for the
# Earthdata login host (presumably in the user's ~/.netrc -- confirm in
# fire.downloader), so no secret has to be written into the notebook.
user_pwd = fdown.get_auth_from_netrc('urs.earthdata.nasa.gov')

In [10]:
# Download the files, authenticating with `user_pwd`. The URL list and
# `target_paths` are parallel lists, so each URL is presumably written to the
# path at the same position; `successes` presumably holds the per-file
# outcome -- confirm both in fire.downloader.
successes = fdown.fetch_many_files(hdf_urls_for_region_since_2010, target_paths, user_pwd)

Downloading 475 files.

processed 474/475, est. time left: 0.0 m                                             
   total time: 0.0 m
processed 475/475, est. time left: 0.0 m  


475/475 files downloaded successfully (100.0 %)
processed 473/475, est. time left: 0.0 m