In [1]:
%load_ext autotime

time: 581 µs (started: 2021-07-23 16:30:14 -03:00)


In [1]:
import requests
from requests.auth import HTTPBasicAuth
from tqdm.auto import tqdm
from urllib.parse import unquote
from multiprocessing.pool import ThreadPool
import os

FOLDER_PATH = 'raw_historical'
URL_LIST_PATH = f'{FOLDER_PATH}/subset_M2I3NVASM_5.12.4_20210723_192925.txt'
USERNAME = 'danlessa'
PASSWORD = 'Pa$$w0rd'

with open(URL_LIST_PATH, 'r') as fid:
    url_list = [el.strip() 
                for el in fid.readlines()
                if 'README' not in el]
    

def url_filename(url):
    filename = (unquote(url).split("FILENAME")[1]
                        .split("&")[0]
                        .split("/")[-1])
    return filename
    
    
files = os.listdir(FOLDER_PATH) 
url_list = [el
            for el in url_list
            if url_filename(el) not in files]

In [2]:
# Source: file:///Users/danlessa/Downloads/EL-HowToAccessDataWithPython-230721-1449-2632.pdf
# overriding requests.Session.rebuild_auth to mantain headers when redirected
class SessionWithHeaderRedirection(requests.Session):
 AUTH_HOST = 'urs.earthdata.nasa.gov'
 def __init__(self, username, password):
     super().__init__()
     self.auth = (username, password)
     # Overrides from the library to keep headers when redirected to or from
     # the NASA auth host.
 def rebuild_auth(self, prepared_request, response):
     headers = prepared_request.headers
     url = prepared_request.url
     if 'Authorization' in headers:
         original_parsed = requests.utils.urlparse(response.request.url)
         redirect_parsed = requests.utils.urlparse(url)
     if (original_parsed.hostname != redirect_parsed.hostname) and \
         redirect_parsed.hostname != self.AUTH_HOST and \
         original_parsed.hostname != self.AUTH_HOST:
         del headers['Authorization']
     return



In [3]:
print(f"Number of files: {len(url_list)}")
print(f"Expected time (3s avg per file): {len(url_list) / (3 * 60) :.2f} min")
print(f"Expected time (3s avg per file, 20x): {len(url_list) / (3 * 60 * 20) :.2f} min")

Number of files: 2093
Expected time (3s avg per file): 11.63 min
Expected time (3s avg per file, 20x): 0.58 min


In [4]:
def download_file(session, url, output_path):
    r = session.get(url)
    with open(output_path, 'wb') as fid:
        fid.write(r.content)
        
def download_files(url_list):
    session = SessionWithHeaderRedirection(USERNAME, PASSWORD)
    for url in url_list:
        filename = (unquote(url).split("FILENAME")[1]
                                .split("&")[0]
                                .split("/")[-1])
        download_file(session, url, f"{FOLDER_PATH}/{filename}")

In [5]:
def chunks(l, n):
    """Yield n number of striped chunks from l."""
    for i in range(0, n):
        yield l[i::n]

N_chunks = 50
url_list_chunks = tuple(chunks(url_list, N_chunks))
with ThreadPool(N_chunks) as pool:
    results = pool.map(download_files, url_list_chunks)

In [7]:
import xarray as xr

ds_genexpr = (xr.load_dataset(f"{FOLDER_PATH}/{file}", engine='netcdf4')
              for file in files
              if '.nc' in file)

ds = xr.concat(ds_genexpr, dim='time')
ds.to_netcdf("historical_wind_data.nc")