# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 noet:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Various handlers/functionality for different types of files (e.g. for archives)
"""
import hashlib
import patoolib
from .external_versions import external_versions
# There were issues, so let's stay consistent with a recent version
assert external_versions["patoolib"] >= "1.7"
import os
import tempfile
from .exceptions import MissingExternalDependency
from .path import (
basename,
join as opj,
exists, abspath, isabs, normpath, relpath, pardir, isdir,
realpath,
sep as opsep,
)
from six import next, PY2
from six.moves.urllib.parse import unquote as urlunquote
import string
import random
from .locking import lock_if_check_fails
from ..utils import (
any_re_search,
assure_bytes,
chpwd,
rmdir,
unlink,
)
import logging
lgr = logging.getLogger('datalad.files')
# Monkey-patch patoolib's logging, so it logs coherently with the rest of
# datalad
import patoolib.util
#
# Seems we have managed with swallow_outputs
#
# def _patool_log(level, msg):
# lgr.log(level, "patool: %s" % msg)
#
# def _patool_log_info(msg, *args, **kwargs):
# _patool_log(logging.DEBUG, msg)
#
# def _patool_log_error(msg, *args, **kwargs):
# _patool_log(logging.ERROR, msg)
#
# patoolib.util.log_info = _patool_log_info
# patoolib.util.log_error = _patool_log_error
# patoolib.util.log_internal_error = _patool_log_error
# we need to decorate patool.util.run
# because otherwise it just lets processes spit out everything to stdout, and
# we do want to use it at "verbosity>=0" so we can get an idea of what is
# going on. And I don't want to mock for every invocation
from ..support.exceptions import CommandError
from ..utils import swallow_outputs
from ..utils import rmtemp
from ..cmd import Runner
from ..consts import ARCHIVES_TEMP_DIR
from ..utils import rmtree
from ..utils import get_tempfile_kwargs
from ..utils import assure_unicode
from ..utils import on_windows
_runner = Runner()
def _patool_run(cmd, verbosity=0, **kwargs):
"""Decorated runner for patool so it doesn't spit out outputs to stdout"""
# use our runner
try:
# kwargs_ = kwargs[:]; kwargs_['shell'] = True
# Any debug/progress output could be spit out to stderr so let's
# "expect" it.
#
if isinstance(cmd, (list, tuple)) and kwargs.get('shell'):
            # patool (as far as I see it) takes care of quoting args
cmd = ' '.join(cmd)
out, err = _runner.run(cmd,
#log_stdout='offline',
#log_stderr='offline',
#expect_stderr=True,
#stdin=open('/dev/null'),
**kwargs)
lgr.debug("Finished running for patool. stdout=%s, stderr=%s", out, err)
return 0
except CommandError as e:
return e.code
except Exception as e:
lgr.error("While invoking runner caught unexpected exception: %s" % e)
return 100 # unknown beast
patoolib.util.run = _patool_run
# yoh: only keys are used ATM; the logic in decompress_file was replaced to
# use patool
DECOMPRESSORS = {
r'\.(tar\.bz|tbz)$': 'tar -xjvf %(file)s -C %(dir)s',
r'\.(tar\.xz)$': 'tar -xJvf %(file)s -C %(dir)s',
r'\.(tar\.gz|tgz)$': 'tar -xzvf %(file)s -C %(dir)s',
r'\.(zip)$': 'unzip %(file)s -d %(dir)s',
}
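# A minimal sketch of how these keys are meant to be matched against a
# filename (the path below is hypothetical); kept as a comment so importing
# this module has no side effects:
#
#   import re
#   matches = [regex for regex in DECOMPRESSORS
#              if re.search(regex, '/tmp/data.tar.gz')]
#   # -> [r'\.(tar\.gz|tgz)$']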
def unixify_path(path):
r"""On windows convert paths from drive:\d\file to /drive/d/file
This overcomes problems with various cmdline tools we are to use,
such as tar etc
"""
if on_windows:
drive, path_ = os.path.splitdrive(path)
path_ = path_.split(os.sep)
path_ = '/'.join(path_)
if drive:
# last one must be :
assert(drive[-1] == ":")
return '/%s%s' % (drive[:-1], path_)
else:
return path_
else:
return path
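# A quick illustration of unixify_path (example paths are hypothetical; the
# conversion only takes effect when running on Windows):
#
#   unixify_path(r'C:\Users\me\data.tar.gz')  # -> '/C/Users/me/data.tar.gz'
#   unixify_path('/home/me/data.tar.gz')      # unchanged on other platforms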
def decompress_file(archive, dir_, leading_directories='strip'):
"""Decompress `archive` into a directory `dir_`
Parameters
----------
archive: str
dir_: str
leading_directories: {'strip', None}
      If `strip`, and the archive contains a single leading directory under
      which all content is stored, all content will be moved one directory up
      and that leading directory will be removed.
"""
if not exists(dir_):
lgr.debug("Creating directory %s to extract archive into" % dir_)
os.makedirs(dir_)
with swallow_outputs() as cmo:
archive = assure_bytes(archive)
dir_ = assure_bytes(dir_)
patoolib.util.check_existing_filename(archive)
patoolib.util.check_existing_filename(dir_, onlyfiles=False)
        # Call the protected function to avoid existence checks on the
        # unixified path
outdir = unixify_path(dir_)
if not PY2:
# should be supplied in PY3 to avoid b''
outdir = assure_unicode(outdir)
archive = assure_unicode(archive)
format_compression = patoolib.get_archive_format(archive)
if format_compression == ('gzip', None):
            # Yarik fell into the trap of being lazy and not providing proper
            # support for .gz, .xz, etc. "stream compression" formats in the
            # handling of archives. ATM our support for .gz relies on the
            # behavior of 7z while extracting them, respecting a possibly
            # present filename field in the .gz header.
            # See more https://github.com/datalad/datalad/pull/3176#issuecomment-466819861
            # TODO: provide proper handling of all those archives without
            # relying on any filename being stored in the header
program = patoolib.find_archive_program(
format_compression[0], 'extract')
if basename(program) != '7z':
raise MissingExternalDependency(
"cmd:7z",
msg="(Not) Funny enough but ATM we need p7zip installation "
"to handle .gz files extraction 'correctly'"
)
patoolib._extract_archive(unixify_path(archive),
outdir=outdir,
verbosity=100)
if cmo.out:
lgr.debug("patool gave stdout:\n%s" % cmo.out)
if cmo.err:
lgr.debug("patool gave stderr:\n%s" % cmo.err)
        # Note: (ben) Experienced an issue where an extracted tarball lacked
        # the execution bit on directories, leading to not being able to
        # delete them despite having write permission.
        # Can't imagine a situation where we would want to fail on that kind
        # of mess. So, to be sure, set it.
if not on_windows:
os.chmod(dir_,
os.stat(dir_).st_mode |
os.path.stat.S_IEXEC)
for root, dirs, files in os.walk(dir_, followlinks=False):
for d in dirs:
subdir = opj(root, d)
os.chmod(subdir,
os.stat(subdir).st_mode |
os.path.stat.S_IEXEC)
if leading_directories == 'strip':
_, dirs, files = next(os.walk(dir_))
if not len(files) and len(dirs) == 1:
# move all the content under dirs[0] up 1 level
widow_dir = opj(dir_, dirs[0])
lgr.debug("Moving content within %s upstairs" % widow_dir)
subdir, subdirs_, files_ = next(os.walk(opj(dir_, dirs[0])))
for f in subdirs_ + files_:
os.rename(opj(subdir, f), opj(dir_, f))
rmdir(widow_dir)
elif leading_directories is None:
pass # really do nothing
else:
        raise NotImplementedError("Unsupported leading_directories=%r"
                                  % leading_directories)
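# A minimal usage sketch for decompress_file (paths are hypothetical); kept
# as a comment so importing this module has no side effects:
#
#   decompress_file('/tmp/snapshot.tar.gz', '/tmp/extracted',
#                   leading_directories='strip')
#   # /tmp/extracted now holds the content, with a single leading directory
#   # (if the archive had one) collapsed into /tmp/extracted itself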
def compress_files(files, archive, path=None, overwrite=True):
"""Compress `files` into an `archive` file
Parameters
----------
files : list of str
archive : str
path : str
      Alternative directory from which the compressor will be invoked, e.g. to
      take into account relative paths of files and/or the archive
overwrite : bool
Whether to allow overwriting the target archive file if one already exists
"""
with swallow_outputs() as cmo:
with chpwd(path):
if not overwrite:
patoolib.util.check_new_filename(archive)
patoolib.util.check_archive_filelist(files)
            # Call the protected function to avoid existence checks on the
            # unixified path
patoolib._create_archive(unixify_path(archive),
[unixify_path(f) for f in files],
verbosity=100)
if cmo.out:
lgr.debug("patool gave stdout:\n%s" % cmo.out)
if cmo.err:
lgr.debug("patool gave stderr:\n%s" % cmo.err)
def _get_cached_filename(archive):
"""A helper to generate a filename which has original filename and additional suffix
which wouldn't collide across files with the same name from different locations
"""
#return "%s_%s" % (basename(archive), hashlib.md5(archive).hexdigest()[:5])
# per se there is no reason to maintain any long original name here.
archive_cached = hashlib.md5(assure_bytes(realpath(archive))).hexdigest()[:10]
lgr.debug("Cached directory for archive %s is %s", archive, archive_cached)
return archive_cached
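# E.g. (hypothetical path; the actual digest depends on the resolved
# realpath):
#
#   _get_cached_filename('/home/me/archives/data.tar.gz')
#   # -> a stable 10-character hex string such as '1b2f3c4d5e'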
def _get_random_id(size=6, chars=string.ascii_uppercase + string.digits):
"""Return a random ID composed from digits and uppercase letters
upper-case so we are tolerant to unlikely collisions on dummy FSs
"""
return ''.join(random.choice(chars) for _ in range(size))
class ArchivesCache(object):
"""Cache to maintain extracted archives
Parameters
----------
toppath : str
      Top directory, under whose .git/ the temporary directory will be
      created. If not provided, a random tempdir is used
persistent : bool, optional
Passed over into generated ExtractedArchives
"""
# TODO: make caching persistent across sessions/runs, with cleanup
# IDEA: extract under .git/annex/tmp so later on annex unused could clean it
# all up
def __init__(self, toppath=None, persistent=False):
self._toppath = toppath
if toppath:
path = opj(toppath, ARCHIVES_TEMP_DIR)
if not persistent:
tempsuffix = "-" + _get_random_id()
lgr.debug("For non-persistent archives using %s suffix for path %s",
tempsuffix, path)
path += tempsuffix
else:
if persistent:
raise ValueError("%s cannot be persistent since no toppath was provided" % self)
path = tempfile.mktemp(**get_tempfile_kwargs())
self._path = path
self.persistent = persistent
# TODO? assure that it is absent or we should allow for it to persist a bit?
#if exists(path):
# self._clean_cache()
self._archives = {}
# TODO: begging for a race condition
if not exists(path):
lgr.debug("Initiating clean cache for the archives under %s" % self.path)
try:
self._made_path = True
os.makedirs(path)
lgr.debug("Cache initialized")
            except Exception:
                lgr.error("Failed to initialize cache under %s" % path)
raise
else:
lgr.debug("Not initiating existing cache for the archives under %s" % self.path)
self._made_path = False
@property
def path(self):
return self._path
def clean(self, force=False):
for aname, a in list(self._archives.items()):
a.clean(force=force)
del self._archives[aname]
        # Probably we should not rely on _made_path, and not bother removing
        # it if persistent
# if ((not self.persistent) or force) and self._made_path:
# lgr.debug("Removing the entire archives cache under %s" % self.path)
# rmtemp(self.path)
if (not self.persistent) or force:
lgr.debug("Removing the entire archives cache under %s" % self.path)
rmtemp(self.path)
def _get_normalized_archive_path(self, archive):
"""Return full path to archive
So we have consistent operation from different subdirs,
while referencing archives from the topdir
TODO: why do we need it???
"""
if not isabs(archive) and self._toppath:
out = normpath(opj(self._toppath, archive))
if relpath(out, self._toppath).startswith(pardir):
raise RuntimeError("%s points outside of the topdir %s"
% (archive, self._toppath))
if isdir(out):
raise RuntimeError("got a directory here... bleh")
return out
return archive
def get_archive(self, archive):
archive = self._get_normalized_archive_path(archive)
if archive not in self._archives:
self._archives[archive] = \
ExtractedArchive(archive,
opj(self.path, _get_cached_filename(archive)),
persistent=self.persistent)
return self._archives[archive]
def __getitem__(self, archive):
return self.get_archive(archive)
def __delitem__(self, archive):
archive = self._get_normalized_archive_path(archive)
self._archives[archive].clean()
del self._archives[archive]
def __del__(self):
try:
# we can at least try
if not self.persistent:
self.clean()
except: # MIH: IOError?
pass
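# A minimal usage sketch for ArchivesCache (the repository path is
# hypothetical); kept as a comment so importing has no side effects:
#
#   cache = ArchivesCache('/path/to/repo', persistent=False)
#   extracted = cache['archives/data.tar.gz']   # an ExtractedArchive
#   print(extracted.assure_extracted())         # extracts on first access
#   cache.clean()                               # non-persistent: removes all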
class ExtractedArchive(object):
"""Container for the extracted archive
"""
    # suffix to use for a stamp file so we can tell that the archive was
    # fully extracted and is up to date
STAMP_SUFFIX = '.stamp'
def __init__(self, archive, path=None, persistent=False):
self._archive = archive
# TODO: bad location for extracted archive -- use tempfile
if not path:
path = tempfile.mktemp(**get_tempfile_kwargs(prefix=_get_cached_filename(archive)))
if exists(path) and not persistent:
raise RuntimeError("Directory %s already exists whenever it should not "
"persist" % path)
self._persistent = persistent
self._path = path
def __repr__(self):
return "%s(%r, path=%r)" % (self.__class__.__name__, self._archive, self.path)
def clean(self, force=False):
# would interfere with tests
# if os.environ.get('DATALAD_TESTS_TEMP_KEEP'):
# lgr.info("As instructed, not cleaning up the cache under %s"
# % self._path)
# return
for path, name in [
(self._path, 'cache'),
(self.stamp_path, 'stamp file')
]:
if exists(path):
if (not self._persistent) or force:
lgr.debug("Cleaning up the %s for %s under %s", name, self._archive, path)
                    # TODO: we must be careful here not to modify permissions
                    # of files, only of directories
(rmtree if isdir(path) else unlink)(path)
@property
def path(self):
"""Given an archive -- return full path to it within cache (extracted)
"""
return self._path
@property
def stamp_path(self):
return self._path + self.STAMP_SUFFIX
@property
def is_extracted(self):
return exists(self.path) and exists(self.stamp_path) \
and os.stat(self.stamp_path).st_mtime >= os.stat(self.path).st_mtime
def assure_extracted(self):
"""Return path to the extracted `archive`. Extract archive if necessary
"""
path = self.path
with lock_if_check_fails(
check=(lambda s: s.is_extracted, (self,)),
lock_path=path,
operation="extract"
) as (check, lock):
if lock:
assert not check
self._extract_archive(path)
return path
def _extract_archive(self, path):
# we need to extract the archive
# TODO: extract to _tmp and then move in a single command so we
# don't end up picking up broken pieces
lgr.debug(u"Extracting {self._archive} under {path}".format(**locals()))
if exists(path):
lgr.debug(
"Previous extracted (but probably not fully) cached archive "
"found. Removing %s",
path)
rmtree(path)
os.makedirs(path)
assert (exists(path))
# remove old stamp
if exists(self.stamp_path):
rmtree(self.stamp_path)
decompress_file(self._archive, path, leading_directories=None)
        # TODO: must be optional since we might want to use this content,
        # move it into the tree, etc.
# lgr.debug("Adjusting permissions to R/O for the extracted content")
# rotree(path)
assert (exists(path))
# create a stamp
with open(self.stamp_path, 'wb') as f:
f.write(assure_bytes(self._archive))
# assert that stamp mtime is not older than archive's directory
assert (self.is_extracted)
# TODO: remove?
#def has_file_ready(self, afile):
# lgr.debug(u"Checking file {afile} from archive {archive}".format(**locals()))
# return exists(self.get_extracted_filename(afile))
def get_extracted_filename(self, afile):
"""Return full path to the `afile` within extracted `archive`
It does not actually extract any archive
"""
return opj(self.path, urlunquote(afile))
def get_extracted_files(self):
"""Generator to provide filenames which are available under extracted archive
"""
path = self.assure_extracted()
path_len = len(path) + (len(os.sep) if not path.endswith(os.sep) else 0)
for root, dirs, files in os.walk(path): # TEMP
for name in files:
yield assure_unicode(opj(root, name)[path_len:])
def get_leading_directory(self, depth=None, consider=None, exclude=None):
"""Return leading directory of the content within archive
Parameters
----------
depth: int or None, optional
      Maximal depth of leading directories to consider. If None, there is
      no upper limit
consider : list of str, optional
Regular expressions for file/directory names to be considered (before
exclude). Applied to the entire relative path to the file as in the archive
exclude: list of str, optional
Regular expressions for file/directory names to be excluded from consideration.
Applied to the entire relative path to the file as in the archive
Returns
-------
    str or None:
      None is returned if there is no single leading directory
"""
leading = None
# returns only files, so no need to check if a dir or not
for fpath in self.get_extracted_files():
if consider and not any_re_search(consider, fpath):
continue
if exclude and any_re_search(exclude, fpath):
continue
lpath = fpath.split(opsep)
dpath = lpath[:-1] # directory path components
if leading is None:
leading = dpath if depth is None else dpath[:depth]
else:
if dpath[:len(leading)] != leading:
# find smallest common path
leading_ = []
# TODO: there might be more efficient pythonic way
for d1, d2 in zip(leading, dpath):
if d1 != d2:
break
leading_.append(d1)
leading = leading_
if not len(leading):
# no common leading - ready to exit
return None
        return opj(*leading) if leading else None
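    # E.g., for an archive containing only 'pkg/src/a.py' and 'pkg/README'
    # (hypothetical content), given an ExtractedArchive instance `extracted`,
    # one would expect:
    #
    #   extracted.get_leading_directory()                     # -> 'pkg'
    #   extracted.get_leading_directory(exclude=['README'])   # -> 'pkg/src'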
def get_extracted_file(self, afile):
lgr.debug(u"Requested file {afile} from archive {self._archive}".format(**locals()))
        # TODO: this could be a good place to provide a "compatibility" layer
        # if filenames within the archive are too obscure for the local file
        # system. We could somehow adjust them while extracting and channel
        # the "fixed up" names back here, since they only point to the content
self.assure_extracted()
path = self.get_extracted_filename(afile)
# TODO: make robust
lgr.log(2, "Verifying that %s exists" % abspath(path))
assert exists(path), "%s must exist" % path
return path
def __del__(self):
try:
            # clean up only non-persistent archives; clean() is a no-op for
            # persistent ones anyway (the original condition was inverted,
            # making __del__ never clean; cf. ArchivesCache.__del__)
            if not self._persistent:
self.clean()
except: # MIH: IOError?
pass
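# A minimal end-to-end sketch for ExtractedArchive (paths are hypothetical):
#
#   ea = ExtractedArchive('/tmp/data.tar.gz', path='/tmp/cache/data')
#   path = ea.assure_extracted()           # extracts once, returns the path
#   list(ea.get_extracted_files())         # relative paths within the archive
#   ea.get_extracted_file('sub/file.dat')  # absolute path to one member
#   ea.clean(force=True)                   # remove the extracted content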