In [9]:
from __future__ import print_function
import requests
import base64
import itertools
import json
import netrc
import ssl
import sys
from multiprocessing import Pool
from multiprocessing.dummy import Pool as ThreadPool
from getpass import getpass
import h5py

try:
    from urllib.parse import urlparse
    from urllib.request import urlopen, Request, build_opener, HTTPCookieProcessor
    from urllib.error import HTTPError, URLError
except ImportError:
    from urlparse import urlparse
    from urllib2 import urlopen, Request, HTTPError, URLError, build_opener, HTTPCookieProcessor

short_name = 'SPL3SMP_E'
version = '003'
time_start = '2015-04-01T00:00:00Z'
time_end = '2020-01-31T23:59:59Z'
bounding_box = '-103,33.6166,-94.433,37'
polygon = ''
filename_filter = '*'
url_list = []

CMR_URL = 'https://cmr.earthdata.nasa.gov'
URS_URL = 'https://urs.earthdata.nasa.gov'
CMR_PAGE_SIZE = 2000
CMR_FILE_URL = ('{0}/search/granules.json?provider=NSIDC_ECS'
                '&sort_key[]=start_date&sort_key[]=producer_granule_id'
                '&scroll=true&page_size={1}'.format(CMR_URL, CMR_PAGE_SIZE))
def get_credentials(url):
    credentials = None
    try:
        info = netrc.netrc()
        username, account, password = info.authenticators(urlparse(URS_URL).hostname)
    except Exception:
        try:
            username, account, password = info.authenticators(urlparse(CMR_URL).hostname)
        except Exception:
            username = None
            password = None

    return credentials

def build_version_query_params(version):
    desired_pad_length = 3
    version = str(int(version)) 
    query_params = ''

    while len(version) <= desired_pad_length:
        padded_version = version.zfill(desired_pad_length)
        query_params += '&version={0}'.format(padded_version)
        desired_pad_length -= 1
    return query_params

def build_cmr_query_url(short_name, version, time_start, time_end,
                        bounding_box=None, polygon=None,
                        filename_filter=None):
    params = '&short_name={0}'.format(short_name)
    params += build_version_query_params(version)
    params += '&temporal[]={0},{1}'.format(time_start, time_end)
    if polygon:
        params += '&polygon={0}'.format(polygon)
    elif bounding_box:
        params += '&bounding_box={0}'.format(bounding_box)
    if filename_filter:
        option = '&options[producer_granule_id][pattern]=true'
        params += '&producer_granule_id[]={0}{1}'.format(filename_filter, option)
    return CMR_FILE_URL + params


def cmr_filter_urls(search_results):
    if 'feed' not in search_results or 'entry' not in search_results['feed']:
        return []

    entries = [e['links']
               for e in search_results['feed']['entry']
               if 'links' in e]
    links = list(itertools.chain(*entries))

    urls = []
    unique_filenames = set()
    for link in links:
        if 'href' not in link:
            continue
        if 'inherited' in link and link['inherited'] is True:
            continue
        if 'rel' in link and 'data#' not in link['rel']:
            continue
        if 'title' in link and 'opendap' in link['title'].lower():
            continue
        filename = link['href'].split('/')[-1]    
        if filename in unique_filenames:
            continue
        unique_filenames.add(filename)
        urls.append(link['href'])

    return urls

def cmr_search(short_name, version, time_start, time_end,
               bounding_box='', polygon='', filename_filter=''):
    cmr_query_url = build_cmr_query_url(short_name=short_name, version=version,
                                        time_start=time_start, time_end=time_end,
                                        bounding_box=bounding_box,
                                        polygon=polygon, filename_filter=filename_filter)
    print('Querying for data:\n\t{0}\n'.format(cmr_query_url))

    cmr_scroll_id = None
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE

    try:
        urls = []
        while True:
            req = Request(cmr_query_url)
            if cmr_scroll_id:
                req.add_header('cmr-scroll-id', cmr_scroll_id)
            response = urlopen(req, context=ctx)
            if not cmr_scroll_id:
                headers = {k.lower(): v for k, v in dict(response.info()).items()}
                cmr_scroll_id = headers['cmr-scroll-id']
                hits = int(headers['cmr-hits'])
                if hits > 0:
                    print('Found {0} matches.'.format(hits))
                else:
                    print('Found no matches.')
            search_page = response.read()
            search_page = json.loads(search_page.decode('utf-8'))
            url_scroll_results = cmr_filter_urls(search_page)
            if not url_scroll_results:
                break
            if hits > CMR_PAGE_SIZE:
                print('.', end='')
                sys.stdout.flush()
            urls += url_scroll_results

        if hits > CMR_PAGE_SIZE:
            print()
        return urls
    except KeyboardInterrupt:
        quit()

url_list = cmr_search(short_name, version, time_start, time_end,
                              bounding_box=bounding_box,
                              polygon=polygon, filename_filter=filename_filter)

url_count = len(url_list)
print('Downloading {0} files...'.format(url_count))


b=[url_list[i:i+3] for i in range(0,len(url_list),1)]

def cmr_download(url):
    credentials = None

#     for index, url in enumerate(url_list, start=1):
#         if not credentials and urlparse(url).scheme == 'https':
#             credentials = get_credentials(url)

#         filename = url.split('/')[-1]
    
#         print(filename)
    filename= url.split('/')[-1]
    credentials = get_credentials(url)
#     print(url)
    resp = requests.get(url, auth=("hong1158","HZ08110106hz"))
#     print(resp.response)
#     print(resp.content)
    if resp.status_code==200:
        try:
            with h5py.File('/media/scratch/ZhiLi/SMAP/'+filename, 'r') as f:
                pass
        except OSError:
            print(url)
            with open('/media/scratch/ZhiLi/SMAP/'+filename, 'wb') as f:
                f.write(resp.content)
    else:
        print('failed to connect to server %s, %s'%(str(resp.status_code), filename))




Querying for data:
	https://cmr.earthdata.nasa.gov/search/granules.json?provider=NSIDC_ECS&sort_key[]=start_date&sort_key[]=producer_granule_id&scroll=true&page_size=2000&short_name=SPL3SMP_E&version=003&version=03&version=3&temporal[]=2015-04-01T00:00:00Z,2020-01-31T23:59:59Z&bounding_box=-103,33.6166,-94.433,37&producer_granule_id[]=*&options[producer_granule_id][pattern]=true

Found 1729 matches.
Downloading 3458 files...


In [10]:
url_list= [url for url in url_list if url.endswith('.h5')]

In [11]:
dateURL= [url.split('/')[-1].split('_')[5] for url in url_list]

In [12]:
from glob import glob

In [4]:
files= glob('/media/scratch/ZhiLi/SMAP/*.h5')

In [5]:
dateFILE= [f.split('/')[-1].split('_')[5] for f in files]

In [7]:
remaining= [date  for date in dateURL if date not in dateFILE]

In [8]:
remainingURL= [url for url in url_list if url.split('/')[-1].split('_')[5] in remaining]

In [14]:
i=0
while i<1000:
    print("%d/%d"%(i,1000))
#     files= glob('/media/scratch/ZhiLi/SMAP/*.h5')
#     dateFILE= [f.split('/')[-1].split('_')[5] for f in files]
#     remaining= [date  for date in dateURL if date not in dateFILE]
#     remainingURL= [url for url in url_list if url.split('/')[-1].split('_')[5] in remaining]
#     print('remaining: %d'%(len(remaining)))
    if len(remainingURL)>0:
        try:
            pool = Pool(30)
            pool.map(cmr_download, url_list)
            pool.close()
        except:
            pass
    i+=1
    


0/1000
1/1000
2/1000
3/1000
4/1000
5/1000
6/1000
7/1000
8/1000
9/1000
10/1000
11/1000
12/1000
13/1000
14/1000
15/1000
16/1000
17/1000
18/1000
19/1000
20/1000
21/1000
22/1000
23/1000
24/1000
25/1000
26/1000
27/1000
28/1000
29/1000
30/1000
31/1000
32/1000
33/1000
34/1000
35/1000
36/1000
37/1000
38/1000
39/1000
40/1000
41/1000
42/1000
43/1000
44/1000
45/1000
46/1000
47/1000
48/1000
49/1000
50/1000
51/1000
52/1000
53/1000
54/1000
55/1000
56/1000
57/1000
58/1000
59/1000
60/1000
61/1000
62/1000
63/1000
64/1000
65/1000
66/1000
67/1000
68/1000
69/1000
70/1000
71/1000
72/1000
73/1000
74/1000
75/1000
76/1000
77/1000
78/1000
79/1000
80/1000
81/1000
82/1000
83/1000
84/1000
85/1000
86/1000
87/1000
88/1000
89/1000
90/1000
91/1000
92/1000
93/1000
94/1000
95/1000
96/1000
97/1000
98/1000
99/1000
100/1000
101/1000
102/1000
103/1000
104/1000
105/1000
106/1000
107/1000
108/1000
109/1000
110/1000
111/1000
112/1000
113/1000
114/1000
115/1000
116/1000
117/1000
118/1000
119/1000
120/1000
121/1000
122/1000
123

In [8]:
len(url_list)

1729