# Function to get temporary S3 credentials

In [None]:
def init_S3FileSystem():
    """Create an authenticated S3 filesystem from temporary PO.DAAC credentials.

    Temporary AWS credentials are requested from the PO.DAAC service at
    https://archive.podaac.earthdata.nasa.gov/s3credentials. No credentials
    are passed explicitly; the request relies on your Earthdata Login (EDL)
    entry in the .netrc file being picked up automatically.

    Return:
    =======

    s3: an AWS S3 filesystem (``s3fs.S3FileSystem``)

    Raises:
    =======

    requests.HTTPError: if the credential service answers with an error status.
    """
    # Imported locally so the function can be pasted into a notebook on its own.
    import requests
    import s3fs

    response = requests.get('https://archive.podaac.earthdata.nasa.gov/s3credentials')
    # Fail here with the HTTP status instead of a KeyError on the payload below.
    response.raise_for_status()
    creds = response.json()

    return s3fs.S3FileSystem(
        anon=False,
        key=creds['accessKeyId'],
        secret=creds['secretAccessKey'],
        token=creds['sessionToken'],
    )

# Class to subset data by space and time.

In [1]:
# Standard imports
import netrc

from urllib import request
from http.cookiejar import CookieJar
from socket import gethostname, gethostbyname

# Third-party imports
import requests

class S3List:
    """Query NASA's CMR API for the direct-access (S3) URLs of granules.

    Earthdata Login credentials are read from the user's .netrc file, from
    the entry for the ``URS`` host.
    """

    CMR = "cmr.earthdata.nasa.gov"
    URS = "urs.earthdata.nasa.gov"

    def __init__(self):
        # CMR token id: set by get_token(), cleared by a successful delete_token().
        self._token = None

    def _read_credentials(self):
        """Return the (username, password) stored for ``URS`` in .netrc.

        Raises an exception if the file or the ``URS`` entry is missing.
        """

        try:
            username, _, password = netrc.netrc().authenticators(self.URS)
        except (FileNotFoundError, TypeError) as error:
            # TypeError: authenticators() returned None (no entry for the host).
            raise Exception("ERROR: There not .netrc file or endpoint indicated in .netrc file.") from error
        return username, password

    @staticmethod
    def _token_request_xml(username, password, client_id, ip_address):
        """Build the XML body of a token request, escaping every value."""

        # Local import: keeps the class usable without touching file-level imports.
        from xml.sax.saxutils import escape

        fields = (
            ("username", username),
            ("password", password),
            ("client_id", client_id),
            ("user_ip_address", ip_address),
        )
        # Escaping matters: a password containing '&' or '<' would otherwise
        # produce malformed XML.
        body = "".join(f"<{tag}>{escape(str(value))}</{tag}>" for tag, value in fields)
        return f"<token>{body}</token>"

    @staticmethod
    def _direct_access_urls(coll):
        """Return the direct-access URLs listed in a UMM-JSON search response."""

        return [
            related["URL"]
            for item in coll["items"]
            # Tolerate granules that carry no RelatedUrls at all.
            for related in item["umm"].get("RelatedUrls", [])
            if related["Type"] == "GET DATA VIA DIRECT ACCESS"
        ]

    def login(self):
        """Install a urllib opener with Earthdata basic auth and cookie storage.

        No network request is made here; the opener is installed process-wide
        via ``urllib.request.install_opener``.

        NOTE(review): the other methods talk to CMR through ``requests``,
        which does not go through this urllib opener - confirm whether this
        step is still needed.

        Raises an exception if can't authenticate with .netrc file.
        """

        username, password = self._read_credentials()

        # Create Earthdata authentication request
        manager = request.HTTPPasswordMgrWithDefaultRealm()
        manager.add_password(None, self.URS, username, password)
        auth = request.HTTPBasicAuthHandler(manager)

        # Set up the storage of cookies
        jar = CookieJar()
        processor = request.HTTPCookieProcessor(jar)

        # Define an opener to handle fetching auth request
        opener = request.build_opener(auth, processor)
        request.install_opener(opener)

    def get_token(self, client_id, ip_address):
        """Get CMR authentication token for searching records.

        The token id is stored on the instance; nothing is returned.

        Parameters
        ----------
        client_id: str
            client identifier to obtain token
        ip_address: str
            client's IP address

        Raises
        ------
        Exception
            if the .netrc file or its ``URS`` entry is missing
        requests.HTTPError
            if the token endpoint answers with an error status
        """

        username, password = self._read_credentials()

        # Post a token request and keep the token id from the response
        token_url = f"https://{self.CMR}/legacy-services/rest/tokens"
        token_xml = self._token_request_xml(username, password, client_id, ip_address)
        headers = {"Content-Type" : "application/xml", "Accept" : "application/json"}
        res = requests.post(url=token_url, data=token_xml, headers=headers)
        # Report an HTTP failure as such rather than as a KeyError below.
        res.raise_for_status()
        self._token = res.json()["token"]["id"]

    def delete_token(self):
        """Delete CMR authentication token.

        Returns the HTTP status code of the DELETE request, or None when
        there is no token to delete. The stored token is cleared once the
        server reports success.
        """

        # Nothing to delete: avoid sending a DELETE for ".../tokens/None".
        if self._token is None:
            return None

        token_url = f"https://{self.CMR}/legacy-services/rest/tokens"
        headers = {"Content-Type" : "application/xml", "Accept" : "application/json"}
        try:
            res = requests.request("DELETE", f"{token_url}/{self._token}", headers=headers)
        except Exception as e:
            raise Exception(f"Failed to delete token: {e}.") from e

        if res.ok:
            self._token = None
        return res.status_code

    def run_query(self, shortname, provider, temporal_range, bbox):
        """Run query on collection referenced by shortname from provider.

        Parameters
        ----------
        shortname: str
            collection short name
        provider: str
            CMR provider id
        temporal_range: str
            value sent as CMR's ``temporal`` parameter
        bbox: str
            value sent as CMR's ``bounding_box`` parameter

        Returns
        -------
        list of str
            URLs whose related-URL type is "GET DATA VIA DIRECT ACCESS"

        NOTE(review): a single request is issued with page_size 2000, so only
        the first page of results is returned even though "scroll" is set.
        """

        url = f"https://{self.CMR}/search/granules.umm_json"
        params = {
            "provider" : provider,
            "ShortName" : shortname,
            "token" : self._token,
            "scroll" : "true",
            "page_size" : 2000,
            "sort_key" : "start_date",
            "temporal" : temporal_range,
            "bounding_box": bbox,
        }
        res = requests.get(url=url, params=params)
        res.raise_for_status()
        return self._direct_access_urls(res.json())

    def login_and_run_query(self, short_name, provider, temporal_range, bbox):
        """Log into CMR and run query to retrieve a sorted list of S3 URLs.

        The token obtained for the query is deleted afterwards, whether or
        not the query succeeds. Any exception from the steps involved
        propagates to the caller.
        """

        # Login and retrieve token
        self.login()
        client_id = "podaac_cmr_client"
        ip_addr = gethostbyname(gethostname())
        self.get_token(client_id, ip_addr)

        try:
            # Run query
            s3_urls = self.run_query(short_name, provider, temporal_range, bbox)
        finally:
            # Clean up and delete token, even when the query failed
            self.delete_token()

        s3_urls.sort()
        return s3_urls

# Define the S3 bucket for each satellite (useful?)

In [20]:
# Maps a human-readable sensor label to the S3 location of its collection.
# Every collection lives in the same bucket, so only the collection name varies.
# NOTE: the 'L2P GOES-16 ' label carries a trailing space.
S3Buckets = {
    label: f's3://podaac-ops-cumulus-protected/{collection}'
    for label, collection in (
        ('L2P S-NPP VIIRS', 'VIIRS_NPP-STAR-L2P-v2.80'),
        ('L2P MetopB AVHRR', 'AVHRRF_MB-STAR-L2P-v2.80'),
        ('L2P GCOM AMSR2', 'AMSR2-REMSS-L2P-v8a'),
        ('L2P GOES-16 ', 'ABI_G16-STAR-L2P-v2.70'),
        ('L2P Meteosat SEVIRI', 'MSG03-OSPO-L2P-v1.0'),
    )
}

# Use above class to retrieve S3 urls
### But the last two satellites from `S3Buckets` return no files using the code below

In [33]:
import os

# Search parameters shared by every collection queried below
# short_name = 'VIIRS_NPP-STAR-L2P-v2.80'
provider = 'POCLOUD'
temporal_range = '2022-07-18T00:00:00Z,2022-07-18T23:59:59Z'
# NOTE(review): CMR's bounding_box parameter is ordered west,south,east,north
# (lon,lat,lon,lat). As written this box spans lon 21..66 / lat -64..-7; if
# lat 21..66 / lon -64..-7 was intended it should be "-64,21,-7,66" - confirm.
bbox = "21,-64,66,-7"

s3_obj = S3List()

# `s3_urls` is overwritten on every pass, so keep each collection's result
# here, keyed by collection short name.
s3_urls_by_collection = {}

for sat in S3Buckets.values():
    # The collection short name is the last component of the bucket URL
    short_name = os.path.basename(sat)

    s3_urls = s3_obj.login_and_run_query(
        short_name,
        provider,
        temporal_range,
        bbox
    )
    s3_urls_by_collection[short_name] = s3_urls
    print(len(s3_urls))

33
35
16
0
0


# Open together all files for each satellite
### Based on the [metop preprocessing notebook](https://github.com/gridSST-hackathon/data-pre-processing/blob/main/metop_preprocessing.ipynb), I'm not sure whether opening all the files together is the right thing to do

In [38]:
import xarray as xr

# from glob import glob
from tqdm import tqdm

# Authenticate against S3, then open one file handle per URL in `s3_urls`;
# tqdm reports progress while the handles are created.
# NOTE(review): `s3_urls` holds whatever the most recent query returned -
# confirm it is the collection you mean to open.
s3 = init_S3FileSystem()
fileset = list(map(s3.open, tqdm(s3_urls)))

# # This works
# data = xr.open_mfdataset(
#     fileset,
#     engine='h5netcdf'
# )

In [7]:
# Open only the first remote file and display it as the cell output.
data = xr.open_dataset(fileset[0], engine='h5netcdf')
data