/
files.py
237 lines (198 loc) · 8.75 KB
/
files.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
import os
import sys
import tarfile
import signal
import errno
from pathlib2 import PurePath
from shutil import rmtree
# Use the built-in version of scandir if possible, otherwise
# use the scandir module version
try:
from os import scandir
except ImportError:
from scandir import scandir # noqa: F401
from clint.textui.progress import Bar as ProgressBar
from floyd.manager.floyd_ignore import FloydIgnoreManager
from floyd.log import logger as floyd_logger
def get_unignored_file_paths(ignore_list=None, whitelist=None):
    """
    Walk the current directory tree and collect every file path that is
    not excluded by the ignore_list / whitelist glob patterns.

    Returns a list of relative paths (os-native separators).
    """
    ignore_list = [] if ignore_list is None else ignore_list
    whitelist = [] if whitelist is None else whitelist

    unignored = []
    for root, dirs, file_names in os.walk("."):
        floyd_logger.debug("Root:%s, Dirs:%s", root, dirs)

        if ignore_path(unix_style_path(root), ignore_list, whitelist):
            # Prune the walk: emptying `dirs` in place stops os.walk from
            # descending, so everything under this directory is skipped.
            #
            # Whitelisted files inside an ignored directory stay ignored,
            # mirroring .gitignore semantics: "It is not possible to
            # re-include a file if a parent directory of that file is
            # excluded." https://git-scm.com/docs/gitignore#_pattern_format
            del dirs[:]
            floyd_logger.debug("Ignoring directory : %s", root)
            continue

        for name in file_names:
            joined = os.path.join(root, name)
            if ignore_path(unix_style_path(joined), ignore_list, whitelist):
                floyd_logger.debug("Ignoring file : %s", name)
                continue
            unignored.append(joined)

    return unignored
def ignore_path(path, ignore_list=None, whitelist=None):
    """
    Decide whether *path* should be ignored.

    A path is ignored when it matches a pattern in ignore_list and is not
    rescued by a match in whitelist. A missing ignore_list ignores
    everything; a missing whitelist rescues nothing.
    """
    if ignore_list is None:
        return True
    if not matches_glob_list(path, ignore_list):
        # Not ignored in the first place; the whitelist is irrelevant.
        return False
    if whitelist is None:
        return True
    # Ignored unless a whitelist pattern re-includes it.
    return not matches_glob_list(path, whitelist)
def matches_glob_list(path, glob_list):
"""
Given a list of glob patterns, returns a boolean
indicating if a path matches any glob in the list
"""
for glob in glob_list:
try:
if PurePath(path).match(glob):
return True
except TypeError:
pass
return False
def get_files_in_current_directory(file_type):
    """
    Build upload entries for every unignored file under the current
    directory, honoring the .floydignore configuration.

    Returns a (files, total_size) pair: files is a list of
    (file_type, (unix_path, open_binary_handle, 'text/plain')) tuples and
    total_size is the combined size of those files in bytes.
    """
    ignore_list, whitelist = FloydIgnoreManager.get_lists()
    floyd_logger.debug("Ignoring: %s", ignore_list)
    floyd_logger.debug("Whitelisting: %s", whitelist)

    upload_entries = []
    total_bytes = 0
    for path in get_unignored_file_paths(ignore_list, whitelist):
        # NOTE: the handles are left open on purpose — presumably they are
        # consumed (and closed) by the upload machinery downstream.
        upload_entries.append((file_type, (unix_style_path(path), open(path, 'rb'), 'text/plain')))
        total_bytes += os.path.getsize(path)

    return (upload_entries, total_bytes)
def unix_style_path(path):
    """Return *path* with the platform's separator replaced by '/'."""
    sep = os.path.sep
    return path if sep == '/' else path.replace(sep, '/')
def sizeof_fmt(num, suffix='B'):
    """
    Format a byte count as a human-friendly string using binary prefixes
    (KiB, MiB, ...). Values beyond Zi fall through to Yi.
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    index = 0
    while index < len(prefixes) and abs(num) >= 1024.0:
        num /= 1024.0
        index += 1
    if index < len(prefixes):
        return "%3.1f%s%s" % (num, prefixes[index], suffix)
    return "%.1f%s%s" % (num, 'Yi', suffix)
class DataCompressor(object):
    """
    Compress a local data directory into a gzipped tarball, showing a
    progress bar while entries are added.

    On permission errors, a full disk, or Ctrl-C, the partially written
    archive is purged and the process exits via sys.exit().
    """
    def __init__(self,
                 source_dir,
                 filename):
        # Data directory to compress
        self.source_dir = source_dir
        # Archive (Tar file) name
        # e.g. "/tmp/contents.tar.gz"
        self.filename = filename
        # Progress bar for tracking data compression
        self.__compression_bar = None
        # Number of files to compress (counted up front to size the bar)
        self.__files_to_compress = 0
        self.__get_nfiles_to_compress()
        # Number of files already compressed
        self.__files_compressed = 0

    def __get_nfiles_to_compress(self):
        """
        Count the entries under source_dir and store the total in
        self.__files_to_compress; exits the process on a permission error.

        Directories are counted too, because tarfile's add() invokes the
        filter callback once per directory entry as well as per file.

        Note: it should take about 0.1s for counting 100k files on a dual
        core machine.
        """
        floyd_logger.info("Get number of files to compress... (this could take a few seconds)")
        paths = [self.source_dir]
        try:
            # Traverse each subdir of source_dir and count files/dirs
            while paths:
                path = paths.pop()
                for item in scandir(path):
                    if item.is_dir():
                        paths.append(item.path)
                        self.__files_to_compress += 1
                    elif item.is_file():
                        self.__files_to_compress += 1
        except OSError as e:
            # OSError: [Errno 13] Permission denied
            if e.errno == errno.EACCES:
                self.source_dir = os.getcwd() if self.source_dir == '.' else self.source_dir  # Expand cwd
                sys.exit(("Permission denied. Make sure to have read permission "
                          "for all the files and directories in the path: %s")
                         % (self.source_dir))
        floyd_logger.info("Compressing %d files", self.__files_to_compress)

    def create_tarfile(self):
        """
        Create a gzipped tar archive (self.filename) from self.source_dir,
        updating the progress bar once per archived entry.

        Handles three failure modes by purging the archive and exiting:
        permission denied, no space left on device, and Ctrl-C.
        """
        floyd_logger.info("Compressing data...")
        # Show progress bar (file_compressed/file_to_compress)
        self.__compression_bar = ProgressBar(expected_size=self.__files_to_compress, filled_char='=')

        # Auxiliary functions
        def dfilter_file_counter(tarinfo):
            """
            Dummy filter function used to track the progression at file level;
            it never modifies or excludes entries, only counts them.
            """
            self.__compression_bar.show(self.__files_compressed)
            self.__files_compressed += 1
            return tarinfo

        def warn_purge_exit(info_msg, filename, progress_bar, exit_msg):
            """
            Warn the user that something went wrong,
            remove the tarball and provide an exit message.
            """
            progress_bar.done()
            floyd_logger.info(info_msg)
            # NOTE(review): this removes the archive's whole parent directory,
            # not just the tarball — presumably the archive always lives in a
            # dedicated temp dir; verify against callers before reuse.
            rmtree(os.path.dirname(filename))
            sys.exit(exit_msg)

        try:
            # Define the default signal handler for catching: Ctrl-C
            signal.signal(signal.SIGINT, signal.default_int_handler)
            with tarfile.open(self.filename, "w:gz") as tar:
                tar.add(self.source_dir, arcname=os.path.basename(self.source_dir), filter=dfilter_file_counter)
            self.__compression_bar.done()

        except (OSError, IOError) as e:
            # OSError: [Errno 13] Permission denied
            if e.errno == errno.EACCES:
                self.source_dir = os.getcwd() if self.source_dir == '.' else self.source_dir  # Expand cwd
                warn_purge_exit(info_msg="Permission denied. Removing compressed data...",
                                filename=self.filename,
                                progress_bar=self.__compression_bar,
                                exit_msg=("Permission denied. Make sure to have read permission "
                                          "for all the files and directories in the path: %s")
                                         % (self.source_dir))
            # OSError: [Errno 28] No Space Left on Device (IOError on python2.7)
            elif e.errno == errno.ENOSPC:
                dir_path = os.path.dirname(self.filename)
                warn_purge_exit(info_msg="No space left. Removing compressed data...",
                                filename=self.filename,
                                progress_bar=self.__compression_bar,
                                exit_msg=("No space left when compressing your data in: %s.\n"
                                          "Make sure to have enough space before uploading your data.")
                                         % (os.path.abspath(dir_path)))

        except KeyboardInterrupt:  # Purge tarball on Ctrl-C
            warn_purge_exit(info_msg="Ctrl-C signal detected: Removing compressed data...",
                            filename=self.filename,
                            progress_bar=self.__compression_bar,
                            exit_msg="Stopped the data upload gracefully.")