-
Notifications
You must be signed in to change notification settings - Fork 1
/
mimesort.py
executable file
·266 lines (223 loc) · 9.66 KB
/
mimesort.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function, division
import errno
import getopt
import math
import mimetypes
import operator
import os
import shutil
import sys
__author__ = "Eric Pruitt"
__license__ = "Public Domain, FreeBSD, or PSF"
# Names exported by `from mimesort import *`. `IGNORED_FILES` is populated at
# the bottom of the module, after `classify` has been defined.
__all__ = ['IGNORED_FILES', 'MIXED_TYPES_LABEL', 'UNKNOWN_TYPES_LABEL',
           'categorize', 'classify', 'diversity', 'guess_mime_type', 'organize']
# Default ceiling for a directory's Shannon diversity index. `categorize`
# treats a directory whose index exceeds this value as mixed; with 0.0 a
# directory must contain a single classification to avoid that label.
MAX_DIVERSITY = 0.0
# Folder name `organize` uses for directories that `categorize` reports as
# mixed (a category of `False`).
MIXED_TYPES_LABEL = 'mixed'
# Folder name used when no MIME type could be determined, or when the type is
# the generic 'octet-stream'.
UNKNOWN_TYPES_LABEL = 'unknown'
# If python-magic is installed, it will be used as a fall-back when
# mimetypes.guess_type cannot identify a file. I have tested this with two
# versions of python-magic: the current github master as of 2011.09.26 and the
# package found in the Debian Squeeze repositories. For the latter, we rely on
# monkey-patching. The github master can be found at the following address:
# https://github.com/ahupp/python-magic/raw/master/magic.py
try:
    import magic

    if not hasattr(magic, 'from_file'):
        # Older bindings (such as the Debian Squeeze package) only expose the
        # cookie-based API, so a compatible `from_file` is monkey-patched in.
        cookie = magic.open(magic.MAGIC_MIME & ~magic.MAGIC_MIME_ENCODING)
        cookie.load()
        magic.from_file = lambda x, mime: cookie.file(x)
    elif 'mime' not in magic.from_file.__code__.co_varnames:
        # `__code__` exists on Python 2.6+ and on Python 3; the `func_code`
        # alias is Python 2 only and made every Python 3 installation of
        # python-magic look unsupported.
        raise AttributeError

    def guess_mime_type(path):
        """
        Guesses the MIME type of a file based on its extension using
        mimetypes.guess_type and python-magic as a fall-back when the
        mimetypes method is unable to identify the file or only reports a
        generic `*/octet-stream` type. Returns a tuple containing the MIME
        type and encoding (usually `None`). When python-magic answers with
        something that is not of the form `type/subtype`, `(None, None)` is
        returned.
        """
        mimetype, encoding = mimetypes.guess_type(path)
        if mimetype and not mimetype.endswith('/octet-stream'):
            return mimetype, encoding

        magicmime = magic.from_file(path, mime=True)
        if magicmime and '/' not in magicmime:
            return None, None
        return magicmime, None

except Exception as err:
    # Any failure while importing or probing python-magic (missing module,
    # missing libmagic, unexpected API) downgrades to extension-based
    # guessing. Only an API mismatch is worth telling the user about.
    if isinstance(err, AttributeError):
        print(
            "Available version of python-magic not supported.",
            file=sys.stderr
        )
    guess_mime_type = mimetypes.guess_type
def diversity(elements):
    """
    Computes the Shannon index of diversity for `elements`, which may be any
    iterable of hashable values (including iterators and generators). The
    calculated index and a list containing the most frequently occurring
    element or elements are returned. Tied elements appear in the order in
    which they reached the highest frequency. An empty iterable yields
    `(0.0, [])`.
    """
    counts = dict()
    mode = list()
    maxfreq = 0
    total = 0

    # Tally up the elements, counting them along the way so that `len` is
    # never needed and single-pass iterables are supported.
    for element in elements:
        total += 1
        frequency = counts.get(element, 0) + 1
        counts[element] = frequency
        if frequency == maxfreq:
            mode.append(element)
        elif frequency > maxfreq:
            maxfreq = frequency
            mode = [element]

    total = float(total)
    freqs = [count / total for count in counts.values()]
    index = -sum(freq * math.log(freq) for freq in freqs)

    # Do not return a negative zero
    return index or 0.0, mode
def classify(path, guesser=None):
    """
    Turns the MIME type of `path` into a short, human-friendly label.

    The primary type (e.g. 'image') is normally used as the label. For the
    catch-all primary types 'application' and 'text', the sub-type is used
    instead. A label of 'octet-stream' becomes `UNKNOWN_TYPES_LABEL`, a
    'vnd.' label is reduced to its last dot-separated component and a
    leading 'x-' is dropped. `None` is returned when no MIME type could be
    determined.

    `guesser`, when given, replaces `guess_mime_type`. It is called with the
    path as its only argument and must return a pair whose first item is the
    MIME type; the second item is ignored.
    """
    identify = guesser or guess_mime_type
    mimetype, _ = identify(os.path.normcase(path))
    if not mimetype:
        return None

    # Anything that is not exactly "type/subtype" is treated as opaque data.
    pieces = mimetype.split('/')
    if len(pieces) == 2:
        mediatype, mediasubtype = pieces
    else:
        mediatype, mediasubtype = "application", "octet-stream"

    if mediatype in ('application', 'text'):
        label = mediasubtype
    else:
        label = mediatype

    if label == 'octet-stream':
        return UNKNOWN_TYPES_LABEL

    if label.startswith('vnd.'):
        label = label.rsplit('.', 1)[-1]

    if label.startswith('x-'):
        label = label[2:]

    return label
def categorize(path, maxdiversity=MAX_DIVERSITY):
    """
    When `path` is a directory, two components are used to determine its
    classification: a Shannon index of diversity generated by running
    `classify` on every file found recursively under the directory and the
    `maxdiversity` parameter. When the directory contains files whose
    classifications differ, a Shannon index of diversity greater than
    `maxdiversity` will cause the folder to be classified as mixed, which is
    reported by returning `False` as the classification. The same happens
    when several classifications tie for most prevalent, or when the
    directory contains no files at all. Otherwise, the most prevalent
    classification is used. In every directory case the second returned
    value is the computed index.

    If `path` is a file, `categorize` is nothing more than an alias for
    `classify` that returns a file's classification and `None`.
    """
    if not os.path.isdir(path):
        return classify(path), None

    categories = list()
    for root, _, files in os.walk(path):
        # `classify` needs the full path rather than the bare file name
        # because python-magic may have to open the file.
        for filename in files:
            categories.append(classify(os.path.join(root, filename)))

    dirdiversity, mode = diversity(categories)
    if len(mode) == 1 and dirdiversity <= maxdiversity:
        return mode[0], dirdiversity

    return False, dirdiversity
def organize(folder, dest=False, detectdirs=True, maxdiversity=MAX_DIVERSITY):
    """
    Classifies the files in `folder` and moves like items into appropriately
    named folders. The `dest` parameter is the destination folder for sorted
    files. When `dest` is False, the sorted contents of `folder` remain in
    `folder`. When `dest` is `None`, organize will simply print a list showing
    the Shannon index of diversity and classification for each item in
    `folder`. The `maxdiversity` is passed to `categorize` unchanged.

    When `detectdirs` is True, the function ignores folders that appear to be
    sorted based on the folders' names.

    A line describing each item is printed to standard output; problems are
    reported on standard error. Failing to create a destination folder
    terminates the process with exit status 1, whereas a failure to move an
    individual item is reported and the remaining items are still processed.
    """
    files = [os.path.join(folder, base) for base in os.listdir(folder)]
    if detectdirs:
        files = [F for F in files if os.path.basename(F) not in IGNORED_FILES]

    # Categorize everything before the first move. A lazy `map` (Python 3)
    # would interleave categorization with moves, so results could depend on
    # files that were relocated earlier in the same run.
    categorized = [
        (path, categorize(path, maxdiversity=maxdiversity)) for path in files
    ]

    for path, (category, pathdiversity) in categorized:
        if category is None:
            category = UNKNOWN_TYPES_LABEL
        elif category is False:
            category = MIXED_TYPES_LABEL

        # Long paths are abbreviated to their first 7 and last 24 characters.
        if len(path) < 34:
            displayname = path
        else:
            displayname = path[:7] + '...' + path[-24:]

        if pathdiversity is not None:
            left = '%-34s %.3f' % (displayname, pathdiversity)
        else:
            left = '%-34s ' % (displayname)

        # Truncate spaces starting from the right side of the string until
        # the column is 40 characters wide. The count must be positive:
        # `str.replace` treats a negative count as "replace everything",
        # which would also strip the spaces inside the file name.
        excess = len(left) - 40
        if excess > 0:
            left = left[::-1].replace(' ', '', excess)[::-1]

        print(left + ' ' + category)
        if dest is None:
            continue

        destination = os.path.join(dest or folder, category)
        try:
            os.makedirs(destination)
        except OSError as err:
            # An already existing destination is expected on every item
            # after the first one in a category.
            if err.errno != errno.EEXIST:
                print('%s: %s' % (destination, err), file=sys.stderr)
                sys.exit(1)

        if not os.path.isdir(destination):
            print('%s: Destination is not a folder.' % path, file=sys.stderr)
        elif os.path.samefile(path, destination):
            print('%s: Source is destination.' % path, file=sys.stderr)
        else:
            try:
                shutil.move(path, destination)
            except Exception as err:
                # Exceptions have no `message` attribute on Python 3, so the
                # exception itself is formatted.
                print('%s: %s' % (path, err), file=sys.stderr)
def main(args=sys.argv[1:]):
    """
    Entry point when run as a stand-alone script.

    `args` is the list of command line arguments without the program name.
    Recognized options are `-h`, `-d NUMBER`, `-i`, `-n`, `-m` and `-y`; see
    the usage text printed for `-h`. The remaining arguments are the folders
    to sort. When more than one is given, the last one is the destination
    for all the others; a single folder is sorted in place and no folder at
    all means the current directory, for which confirmation is requested
    unless `-y` is supplied.

    Unknown options raise `getopt.GetoptError`. A non-numeric `-d` value is
    reported on standard error and ends the process with exit status 2.
    """
    global guess_mime_type

    # 'y' has to be listed here, otherwise getopt rejects the option that
    # the prompt logic below checks for.
    arguments, trailing = getopt.gnu_getopt(args, 'd:nihmy')
    argdict = dict(arguments)

    if '-h' in argdict:
        print(os.path.basename(__file__), '[OPTIONS] [DIR... [DEST]]')
        print('\t-h Display this message and quit')
        print('\t-d NUMBER Threshold for Shannon diversity index')
        print('\t-i Do not ignore folders that appear to be sorted')
        print('\t-n Display categorizations and exit')
        print('\t-m Do not use python-magic even if it is available')
        print('\t-y Do not prompt before sorting the current directory')
        return

    # getopt hands option values over as strings; the threshold is compared
    # against floats, so it has to be converted.
    try:
        maxdiversity = float(argdict.get('-d', MAX_DIVERSITY))
    except ValueError:
        print('-d: %r is not a number' % argdict['-d'], file=sys.stderr)
        sys.exit(2)

    if '-m' in argdict:
        guess_mime_type = mimetypes.guess_type

    detectdirs = '-i' not in argdict
    dryrun = '-n' in argdict
    promptuser = '-y' not in argdict

    # Work out the sources and the shared destination once, instead of
    # popping from the list that is being iterated over.
    if len(trailing) > 1:
        folders = trailing[:-1]
        shareddest = trailing[-1]
    else:
        folders = trailing or ['.']
        shareddest = None

    # Python 2's `input` evaluates what the user types; `raw_input` is its
    # equivalent of Python 3's `input`.
    try:
        readline = raw_input
    except NameError:
        readline = input

    for folder in folders:
        dest = folder if shareddest is None else shareddest
        if dryrun:
            print('Destination folder: %s' % dest)
            dest = None
        elif not trailing and promptuser:
            response = readline('Sort %s? [N/y] ' % os.getcwd())
            if not response.strip().lower().startswith('y'):
                sys.exit(0)

        organize(folder, dest, detectdirs, maxdiversity)
# Build the set of every folder name that sorting can produce: the two special
# labels plus the classification of each extension known to `mimetypes`.
IGNORED_FILES = {MIXED_TYPES_LABEL, UNKNOWN_TYPES_LABEL}
strict, loose = mimetypes.MimeTypes().types_map
for extension in list(strict) + list(loose):
    # The extension-only guesser is passed explicitly because the probe name
    # is made up and there is no real file for python-magic to inspect.
    IGNORED_FILES.add(
        classify('x' + extension, guesser=mimetypes.guess_type)
    )

if __name__ == '__main__':
    main()