-
Notifications
You must be signed in to change notification settings - Fork 70
/
core.py
685 lines (589 loc) · 26.1 KB
/
core.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
"""
The main Pooch class and a factory function for it.
"""
import contextlib
import os
from pathlib import Path
import shutil
import ftplib
import requests
from .utils import (
check_version,
parse_url,
get_logger,
make_local_storage,
cache_location,
hash_matches,
temporary_file,
os_cache,
unique_file_name,
file_hash,
)
from .downloaders import choose_downloader
def retrieve(url, known_hash, fname=None, path=None, processor=None, downloader=None):
    """
    Download and cache a single file locally.

    The protocol found in *url* decides how the file is downloaded (HTTP or
    FTP by default). Pass a custom *downloader* to use a different method (see
    below).

    The download is first written to a temporary location and its hash is
    compared to *known_hash* to make sure that the download happened
    correctly and securely. If the hash doesn't match, the file is deleted and
    an exception is raised.

    If the file already exists locally, its hash is compared to *known_hash*.
    A mismatch is interpreted as the file needing to be updated and it will be
    downloaded again.

    These checks can be bypassed by passing ``known_hash=None``. In that case,
    the SHA256 hash of the downloaded file is logged to the screen. It is
    highly recommended that you copy and paste this hash as *known_hash* so
    that future downloads are guaranteed to be the exact same file. This is
    crucial for reproducible computations.

    If the file exists in the given *path* with the given *fname* and the hash
    matches, nothing is downloaded and the absolute path to the file is
    returned.

    .. note::

        This function is meant for downloading single files. If you need to
        manage the download and caching of several files, with versioning, use
        :func:`pooch.create` and :class:`pooch.Pooch` instead.

    Parameters
    ----------
    url : str
        The URL to the file that is to be downloaded. Ideally, the URL should
        end in a file name.
    known_hash : str
        A known hash (checksum) of the file. Will be used to verify the
        download or check if an existing file needs to be updated. By default,
        will assume it's a SHA256 hash. To specify a different hashing method,
        prepend the hash with ``algorithm:``, for example
        ``md5:pw9co2iun29juoh`` or ``sha1:092odwhi2ujdp2du2od2odh2wod2``. If
        None, will NOT check the hash of the downloaded file or check if an
        existing file needs to be updated.
    fname : str or None
        The name that will be used to save the file. Should NOT include the
        full path, just the file name (it will be appended to *path*). If
        None, will create a unique file name using a combination of the last
        part of the URL (assuming it's the file name) and the MD5 hash of the
        URL. For example, ``81whdo2d2e928yd1wi22-data-file.csv``. This ensures
        that files from different URLs never overwrite each other, even if they
        have the same name.
    path : str or PathLike or None
        The location of the cache folder on disk. This is where the file will
        be saved. If None, will save to a ``pooch`` folder in the default cache
        location for your operating system (see :func:`pooch.os_cache`).
    processor : None or callable
        If not None, then a function (or callable object) that will be called
        before returning the full path and after the file has been downloaded
        (if required). See :ref:`processors` for details.
    downloader : None or callable
        If not None, then a function (or callable object) that will be called
        to download a given URL to a provided local file name. See
        :ref:`downloaders` for details.

    Returns
    -------
    full_path : str
        The absolute path (including the file name) of the file in the local
        storage. When a *processor* is given, whatever the processor returns
        is returned instead.

    Examples
    --------

    Download one of the data files from the Pooch repository on GitHub:

    >>> import os
    >>> from pooch import version, check_version, retrieve
    >>> # Make a URL for the version of pooch we have installed
    >>> url = "https://github.com/fatiando/pooch/raw/{}/data/tiny-data.txt"
    >>> url = url.format(check_version(version.full_version))
    >>> # Download the file and save it locally. Will check the MD5 checksum of
    >>> # the downloaded file against the given value to make sure it's the
    >>> # right file. You can use other hashes by specifying different
    >>> # algorithm names (sha256, sha1, etc).
    >>> fname = retrieve(
    ...     url, known_hash="md5:70e2afd3fd7e336ae478b1e740a5f08e",
    ... )
    >>> with open(fname) as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1 2 3 4 5 6
    >>> # Running again won't trigger a download and only return the path to
    >>> # the existing file.
    >>> fname2 = retrieve(
    ...     url, known_hash="md5:70e2afd3fd7e336ae478b1e740a5f08e",
    ... )
    >>> print(fname2 == fname)
    True
    >>> os.remove(fname)

    Files that are compressed with gzip, xz/lzma, or bzip2 can be automatically
    decompressed by using the :class:`pooch.Decompress` processor:

    >>> from pooch import Decompress
    >>> # URLs to a gzip compressed version of the data file.
    >>> url = ("https://github.com/fatiando/pooch/raw/{}/"
    ...        + "pooch/tests/data/tiny-data.txt.gz")
    >>> url = url.format(check_version(version.full_version))
    >>> # By default, you would have to decompress the file yourself
    >>> fname = retrieve(
    ...     url,
    ...     known_hash="md5:8812ba10b6c7778014fdae81b03f9def",
    ... )
    >>> print(os.path.splitext(fname)[1])
    .gz
    >>> # Use the processor to decompress after download automatically and
    >>> # return the path to the decompressed file instead.
    >>> fname2 = retrieve(
    ...     url,
    ...     known_hash="md5:8812ba10b6c7778014fdae81b03f9def",
    ...     processor=Decompress(),
    ... )
    >>> print(fname2 == fname)
    False
    >>> with open(fname2) as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1 2 3 4 5 6
    >>> os.remove(fname)
    >>> os.remove(fname2)

    When downloading archives (zip or tar), it can be useful to unpack them
    after download to avoid having to do that yourself. Use the processors
    :class:`pooch.Unzip` or :class:`pooch.Untar` to do this automatically:

    >>> from pooch import Unzip
    >>> # URLs to a zip archive with a single data file.
    >>> url = ("https://github.com/fatiando/pooch/raw/{}/"
    ...        + "pooch/tests/data/tiny-data.zip")
    >>> url = url.format(check_version(version.full_version))
    >>> # By default, you would get the path to the archive
    >>> fname = retrieve(
    ...     url,
    ...     known_hash="md5:e9592cb46cf3514a1079051f8a148148",
    ... )
    >>> print(os.path.splitext(fname)[1])
    .zip
    >>> os.remove(fname)
    >>> # Using the processor, the archive will be unzipped and a list with the
    >>> # path to every file will be returned instead of a single path.
    >>> fnames = retrieve(
    ...     url,
    ...     known_hash="md5:e9592cb46cf3514a1079051f8a148148",
    ...     processor=Unzip(),
    ... )
    >>> # There was only a single file in our archive.
    >>> print(len(fnames))
    1
    >>> with open(fnames[0]) as f:
    ...     print(f.read().strip())
    # A tiny data file for test purposes only
    1 2 3 4 5 6
    >>> for f in fnames:
    ...     os.remove(f)

    """
    # Fill in the defaults for the cache folder and the file name.
    if path is None:
        path = os_cache("pooch")
    if fname is None:
        fname = unique_file_name(url)

    # Make sure the cache folder exists before building the absolute path of
    # the target file inside of it.
    path = cache_location(path, env=None, version=None)
    make_local_storage(path)
    full_path = path.resolve() / fname
    local_file = str(full_path)

    action, verb = download_action(full_path, known_hash)
    needs_download = action in ("download", "update")

    if needs_download:
        get_logger().info(
            "%s data from '%s' to file '%s'.",
            verb,
            url,
            local_file,
        )
        # Only guess a downloader from the URL when the caller didn't give one
        if downloader is None:
            downloader = choose_downloader(url)
        stream_download(url, full_path, known_hash, downloader, pooch=None)
        # Without a reference hash, tell the user what we got so that they can
        # pin it in future calls.
        if known_hash is None:
            get_logger().info(
                "SHA256 hash of downloaded file: %s\n"
                "Use this value as the 'known_hash' argument of 'pooch.retrieve'"
                " to ensure that the file hasn't changed if it is downloaded again"
                " in the future.",
                file_hash(local_file),
            )

    # There is no Pooch instance here, so processors receive None in its place
    return local_file if processor is None else processor(local_file, action, None)
def create(
    path,
    base_url,
    version=None,
    version_dev="master",
    env=None,
    registry=None,
    urls=None,
):
    """
    Create a :class:`~pooch.Pooch` with sensible defaults to fetch data files.

    When a version string is given, the Pooch is versioned: both the local
    storage folder and the base URL depend on the project version. This is
    necessary if your users have multiple versions of your library installed
    (using virtual environments) and you updated the data files between
    versions. Otherwise, every time a user switches environments would trigger
    a re-download of the data. The version string will be appended to the local
    storage path (for example, ``~/.mypooch/cache/v0.1``) and inserted into the
    base URL (for example,
    ``https://github.com/fatiando/pooch/raw/v0.1/data``). If the version string
    contains ``+XX.XXXXX``, it will be interpreted as a development version.

    Does **not** create the local data storage folder. The folder will only be
    created the first time a download is attempted with
    :meth:`pooch.Pooch.fetch`. This makes it safe to use this function at the
    module level (so it's executed on ``import`` and the resulting
    :class:`~pooch.Pooch` is a global variable).

    Parameters
    ----------
    path : str, PathLike, list or tuple
        The path to the local data storage folder. If this is a list or tuple,
        we'll join the parts with the appropriate separator. The *version* will
        be appended to the end of this path. Use :func:`pooch.os_cache` for a
        sensible default.
    base_url : str
        Base URL for the remote data source. All requests will be made relative
        to this URL. The string should have a ``{version}`` formatting mark in
        it. We will call ``.format(version=version)`` on this string. If the
        URL is a directory path, it must end in a ``'/'`` because we will not
        include it.
    version : str or None
        The version string for your project. Should be PEP440 compatible. If
        None is given, will not attempt to format *base_url* and no subfolder
        will be appended to *path*.
    version_dev : str
        The name used for the development version of a project. If your data is
        hosted on Github (and *base_url* is a Github raw link), then
        ``"master"`` is a good choice (default). Ignored if *version* is None.
    env : str or None
        An environment variable that can be used to overwrite *path*. This
        allows users to control where they want the data to be stored. We'll
        append *version* to the end of this value as well.
    registry : dict or None
        A record of the files that are managed by this Pooch. Keys should be
        the file names and the values should be their hashes. Only files
        in the registry can be fetched from the local storage. Files in
        subdirectories of *path* **must use Unix-style separators** (``'/'``)
        even on Windows.
    urls : dict or None
        Custom URLs for downloading individual files in the registry. A
        dictionary with the file names as keys and the custom URLs as values.
        Not all files in *registry* need an entry in *urls*. If a file has an
        entry in *urls*, the *base_url* will be ignored when downloading it in
        favor of ``urls[fname]``.

    Returns
    -------
    pooch : :class:`~pooch.Pooch`
        The :class:`~pooch.Pooch` initialized with the given arguments.

    Examples
    --------

    Create a :class:`~pooch.Pooch` for a release (v0.1):

    >>> pup = create(path="myproject",
    ...              base_url="http://some.link.com/{version}/",
    ...              version="v0.1",
    ...              registry={"data.txt": "9081wo2eb2gc0u..."})
    >>> print(pup.path.parts)  # The path is a pathlib.Path
    ('myproject', 'v0.1')
    >>> # The local folder is only created when a dataset is first downloaded
    >>> print(pup.path.exists())
    False
    >>> print(pup.base_url)
    http://some.link.com/v0.1/
    >>> print(pup.registry)
    {'data.txt': '9081wo2eb2gc0u...'}
    >>> print(pup.registry_files)
    ['data.txt']

    If this is a development version (12 commits ahead of v0.1), then the
    ``version_dev`` will be used (defaults to ``"master"``):

    >>> pup = create(path="myproject",
    ...              base_url="http://some.link.com/{version}/",
    ...              version="v0.1+12.do9iwd")
    >>> print(pup.path.parts)
    ('myproject', 'master')
    >>> print(pup.base_url)
    http://some.link.com/master/

    Versioning is optional (but highly encouraged):

    >>> pup = create(path="myproject",
    ...              base_url="http://some.link.com/",
    ...              registry={"data.txt": "9081wo2eb2gc0u..."})
    >>> print(pup.path.parts)  # The path is a pathlib.Path
    ('myproject',)
    >>> print(pup.base_url)
    http://some.link.com/

    To place the storage folder at a subdirectory, pass in a list and we'll
    join the path for you using the appropriate separator for your operating
    system:

    >>> pup = create(path=["myproject", "cache", "data"],
    ...              base_url="http://some.link.com/{version}/",
    ...              version="v0.1")
    >>> print(pup.path.parts)
    ('myproject', 'cache', 'data', 'v0.1')

    The user can overwrite the storage path by setting an environment variable:

    >>> # The variable is not set so we'll use *path*
    >>> pup = create(path=["myproject", "not_from_env"],
    ...              base_url="http://some.link.com/{version}/",
    ...              version="v0.1",
    ...              env="MYPROJECT_DATA_DIR")
    >>> print(pup.path.parts)
    ('myproject', 'not_from_env', 'v0.1')
    >>> # Set the environment variable and try again
    >>> import os
    >>> os.environ["MYPROJECT_DATA_DIR"] = os.path.join("myproject", "env")
    >>> pup = create(path=["myproject", "not_env"],
    ...              base_url="http://some.link.com/{version}/",
    ...              version="v0.1",
    ...              env="MYPROJECT_DATA_DIR")
    >>> print(pup.path.parts)
    ('myproject', 'env', 'v0.1')

    """
    is_versioned = version is not None
    if is_versioned:
        # Normalize the version (development versions fall back to
        # *version_dev*) before inserting it into the URL template.
        version = check_version(version, fallback=version_dev)
        base_url = base_url.format(version=version)
    # Only compute the storage location, never create the folder here. This
    # function usually runs at import time, where touching the file system is
    # discouraged: several processes/threads importing at once would all try
    # to create the same folder.
    storage = cache_location(path, env, version)
    return Pooch(path=storage, base_url=base_url, registry=registry, urls=urls)
class Pooch:
    """
    Manager for a local data storage that can fetch from a remote source.

    Avoid creating ``Pooch`` instances directly. Use :func:`pooch.create`
    instead.

    Parameters
    ----------
    path : str
        The path to the local data storage folder. The path must exist in the
        file system.
    base_url : str
        Base URL for the remote data source. All requests will be made relative
        to this URL.
    registry : dict or None
        A record of the files that are managed by this good boy. Keys should be
        the file names and the values should be their hashes. Only files
        in the registry can be fetched from the local storage. Files in
        subdirectories of *path* **must use Unix-style separators** (``'/'``)
        even on Windows.
    urls : dict or None
        Custom URLs for downloading individual files in the registry. A
        dictionary with the file names as keys and the custom URLs as values.
        Not all files in *registry* need an entry in *urls*. If a file has an
        entry in *urls*, the *base_url* will be ignored when downloading it in
        favor of ``urls[fname]``.

    """

    def __init__(self, path, base_url, registry=None, urls=None):
        self.path = path
        self.base_url = base_url
        if registry is None:
            registry = dict()
        # The registry is stored as given (not copied), so entries added by
        # load_registry are visible through the dictionary that was passed in.
        self.registry = registry
        if urls is None:
            urls = dict()
        # The custom URLs are copied, so the caller's dictionary is never
        # modified by load_registry.
        self.urls = dict(urls)

    @property
    def abspath(self):
        "Absolute path to the local storage"
        return Path(os.path.abspath(os.path.expanduser(str(self.path))))

    @property
    def registry_files(self):
        "List of file names on the registry"
        return list(self.registry)

    def fetch(self, fname, processor=None, downloader=None):
        """
        Get the absolute path to a file in the local storage.

        If it's not in the local storage, it will be downloaded. If the hash of
        the file in local storage doesn't match the one in the registry, will
        download a new copy of the file. This is considered a sign that the
        file was updated in the remote storage. If the hash of the downloaded
        file still doesn't match the one in the registry, will raise an
        exception to warn of possible file corruption.

        Post-processing actions sometimes need to be taken on downloaded files
        (unzipping, conversion to a more efficient format, etc). If these
        actions are time or memory consuming, it would be best to do this only
        once right after the file is downloaded. Use the *processor* argument
        to specify a function that is executed after the download to perform
        these actions. See :ref:`processors` for details.

        Custom file downloaders can be provided through the *downloader*
        argument. By default, Pooch will determine the download protocol from
        the URL in the registry. If the server for a given file requires
        authentication (username and password), use a downloader that supports
        these features. Downloaders can also be used to print custom messages
        (like a progress bar), etc. See :ref:`downloaders` for details.

        Parameters
        ----------
        fname : str
            The file name (relative to the *base_url* of the remote data
            storage) to fetch from the local storage.
        processor : None or callable
            If not None, then a function (or callable object) that will be
            called before returning the full path and after the file has been
            downloaded. See :ref:`processors` for details.
        downloader : None or callable
            If not None, then a function (or callable object) that will be
            called to download a given URL to a provided local file name. See
            :ref:`downloaders` for details.

        Returns
        -------
        full_path : str
            The absolute path (including the file name) of the file in the
            local storage. When a *processor* is given, whatever the processor
            returns is returned instead.

        Raises
        ------
        ValueError
            If *fname* is not in the registry.

        """
        self._assert_file_in_registry(fname)

        # Create the local data directory if it doesn't already exist
        make_local_storage(str(self.abspath))

        url = self.get_url(fname)
        full_path = self.abspath / fname
        known_hash = self.registry[fname]
        action, verb = download_action(full_path, known_hash)

        if action in ("download", "update"):
            get_logger().info(
                "%s file '%s' from '%s' to '%s'.",
                verb,
                fname,
                url,
                str(self.abspath),
            )
            # Only guess a downloader from the URL when none was given
            if downloader is None:
                downloader = choose_downloader(url)
            stream_download(url, full_path, known_hash, downloader, pooch=self)

        if processor is not None:
            return processor(str(full_path), action, self)

        return str(full_path)

    def _assert_file_in_registry(self, fname):
        """
        Check if a file is in the registry and raise :class:`ValueError` if
        it's not.
        """
        if fname not in self.registry:
            raise ValueError(f"File '{fname}' is not in the registry.")

    def get_url(self, fname):
        """
        Get the full URL to download a file in the registry.

        A custom URL registered for the file takes precedence. Otherwise, the
        file name is appended to the base URL as is (no separator is added).

        Parameters
        ----------
        fname : str
            The file name (relative to the *base_url* of the remote data
            storage) to fetch from the local storage.

        Returns
        -------
        url : str
            The URL that will be used to download the file.

        Raises
        ------
        ValueError
            If *fname* is not in the registry.

        """
        self._assert_file_in_registry(fname)
        return self.urls.get(fname, "".join([self.base_url, fname]))

    def load_registry(self, fname):
        """
        Load entries from a file and add them to the registry.

        Use this if you are managing many files.

        Each line of the file should have file name and its hash separated by
        a space. Hash can specify checksum algorithm using "alg:hash" format.
        In case no algorithm is provided, SHA256 is used by default.
        Only one file per line is allowed. Custom download URLs for individual
        files can be specified as a third element on the line. Line comments
        can be added and must be prepended with ``#``. Blank lines are
        ignored.

        Registry files given as a path are read as UTF-8, the same encoding
        used to decode the lines of file objects opened in binary mode.

        Parameters
        ----------
        fname : str | fileobj
            Path (or open file object) to the registry file.

        Raises
        ------
        OSError
            If a line that is not blank or a comment doesn't have 2 or 3
            elements.

        """
        with contextlib.ExitStack() as stack:
            if hasattr(fname, "read"):
                # It's a file object. The caller owns it, so we don't close it.
                fin = fname
            else:
                # It's a file path. Read it as UTF-8 instead of the locale
                # default so that the result doesn't depend on the platform
                # and agrees with how bytes lines are decoded below.
                fin = stack.enter_context(open(fname, encoding="utf-8"))

            for linenum, line in enumerate(fin, start=1):
                if isinstance(line, bytes):
                    line = line.decode("utf-8")

                line = line.strip()
                # skip line comments
                if line.startswith("#"):
                    continue

                elements = line.split()
                # skip blank lines
                if not elements:
                    continue
                if len(elements) not in (2, 3):
                    raise OSError(
                        f"Invalid entry in Pooch registry file '{fname}': "
                        f"expected 2 or 3 elements in line {linenum} but got "
                        f"{len(elements)}. Offending entry: '{line}'"
                    )

                file_name = elements[0]
                file_checksum = elements[1]
                if len(elements) == 3:
                    file_url = elements[2]
                    self.urls[file_name] = file_url
                self.registry[file_name] = file_checksum

    def is_available(self, fname):
        """
        Check availability of a remote file without downloading it.

        Use this method when working with large files to check if they are
        available for download.

        For FTP URLs, the directory of the file is listed (using an anonymous
        login) and searched for the file name. For any other protocol, a HEAD
        request (following redirects) is made and the file is considered
        available if the response status code is 200.

        Parameters
        ----------
        fname : str
            The file name (relative to the *base_url* of the remote data
            storage) to fetch from the local storage.

        Returns
        -------
        status : bool
            True if the file is available for download. False otherwise.

        Raises
        ------
        ValueError
            If *fname* is not in the registry.

        """
        self._assert_file_in_registry(fname)
        source = self.get_url(fname)
        parsed_url = parse_url(source)
        if parsed_url["protocol"] == "ftp":
            directory, file_name = os.path.split(parsed_url["path"])
            ftp = ftplib.FTP()
            # Connect inside the try block so that the socket is also closed
            # when the connection is opened but the server greeting fails.
            try:
                ftp.connect(host=parsed_url["netloc"])
                ftp.login()
                available = file_name in ftp.nlst(directory)
            finally:
                ftp.close()
        else:
            response = requests.head(source, allow_redirects=True)
            available = response.status_code == 200
        return available
def download_action(path, known_hash):
    """
    Decide what has to be done to have an up-to-date file on disk.

    Parameters
    ----------
    path : PathLike
        The path to the file on disk.
    known_hash : str
        A known hash (checksum) of the file. Will be used to verify the
        download or check if an existing file needs to be updated. By default,
        will assume it's a SHA256 hash. To specify a different hashing method,
        prepend the hash with ``algorithm:``, for example
        ``md5:pw9co2iun29juoh`` or ``sha1:092odwhi2ujdp2du2od2odh2wod2``.

    Returns
    -------
    action, verb : str
        The action that must be taken and the matching capitalized English
        verb (present participle, for example ``'Downloading'``) used in the
        log:

        * ``'download'``: File does not exist locally and must be downloaded.
        * ``'update'``: File exists locally but needs to be updated.
        * ``'fetch'``: File exists locally and only need to inform its path.

    """
    # Nothing on disk yet. The hash is not even looked at in this case.
    if not path.exists():
        return "download", "Downloading"
    # The file is there and is the one we expect
    if hash_matches(str(path), known_hash):
        return "fetch", "Fetching"
    # The file is there but its hash differs from the known one
    return "update", "Updating"
def stream_download(url, fname, known_hash, downloader, pooch=None):
    """
    Stream the file and check that its hash matches the known one.

    The file is first downloaded to a temporary file name in the cache folder.
    It will be moved to the desired file name only if the hash matches the
    known hash. Otherwise, the temporary file is deleted.

    Parameters
    ----------
    url : str
        The URL of the file, passed on to the *downloader*.
    fname : PathLike
        The path where the downloaded file will be placed. Must provide the
        ``parent`` and ``name`` attributes (like :class:`pathlib.Path`).
    known_hash : str or None
        The hash that the downloaded file is checked against (in strict
        mode) before it is moved to *fname*.
    downloader : callable
        Called as ``downloader(url, output_file, pooch)`` to download *url* to
        the temporary file.
    pooch : :class:`~pooch.Pooch` or None
        Passed on to the *downloader* as its third argument.

    """
    # Ensure the parent directory exists in case the file is in a subdirectory.
    # Otherwise, move will cause an error. Use exist_ok instead of checking for
    # the folder first so that concurrent downloads into the same new
    # subdirectory don't fail when another process creates it between the
    # check and the call to makedirs.
    os.makedirs(str(fname.parent), exist_ok=True)
    # Stream the file to a temporary so that we can safely check its hash
    # before overwriting the original.
    with temporary_file(path=str(fname.parent)) as tmp:
        downloader(url, tmp, pooch)
        hash_matches(tmp, known_hash, strict=True, source=str(fname.name))
        shutil.move(tmp, str(fname))