This repository has been archived by the owner on Sep 24, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 3
/
flake8.py
357 lines (297 loc) · 12.9 KB
/
flake8.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
# coding: utf-8
from __future__ import unicode_literals
import bisect
import io
import sys
from lib2to3.pgen2 import driver, token, tokenize
from lib2to3 import patcomp, pygram, pytree
import pycodestyle
import six
import venusian
from intervaltree import Interval, IntervalTree
from ebb_lint._version import __version__
from ebb_lint.errors import Errors
from ebb_lint import checkers
# Save pycodestyle's real `# noqa` matcher before disabling it:
# EbbLint._scan_node_for_ranges uses the saved matcher to *report* noqa
# comments (Errors.no_noqa) instead of honoring them.
_pycodestyle_noqa = pycodestyle.noqa
# This is a blight. Disable it unconditionally.
pycodestyle.noqa = lambda ign: False
def tokenize_source_string(s, base_byte=0):
    """Tokenize *s*, yielding (token type, token text, byte Interval) triples.

    Trailing whitespace and backslashes are stripped first (a bare trailing
    backslash would otherwise be a tokenize error).  The Interval bounds are
    absolute byte offsets into the string, shifted by *base_byte*.
    """
    text = six.text_type(s).rstrip(' \t\r\n\\')
    buf = io.StringIO(text)
    index = Lines(buf)
    buf.seek(0)
    token_stream = tokenize.generate_tokens(buf.readline)
    for tok_type, tok_text, start, end, _ in token_stream:
        span = Interval(
            index.byte_of_pos(*start) + base_byte,
            index.byte_of_pos(*end) + base_byte)
        yield tok_type, tok_text, span
# detect_future_features isn't fully covered, but I don't really care, because
# I don't want to rewrite it. Maybe if it becomes more relevant I'll pull it
# out of this suite and actually properly unit test it, but right now I feel
# like it's mostly just working around a lib2to3 deficiency so I don't care
# enough to do anything else. It's stolen from lib2to3 directly. Why was this a
# private function? Ugh.
def detect_future_features(s):  # pragma: nocover
    """Return a frozenset of ``__future__`` feature names imported by *s*.

    Scans the token stream of the module prologue only: at most one
    docstring, followed by ``from __future__ import ...`` statements.  The
    scan stops at the first token that can't be part of that prologue.
    """
    have_docstring = False
    gen = tokenize_source_string(s)

    def advance():
        # Pull the next token, keeping only its (type, text) pair.
        tok = next(gen)
        return tok[0], tok[1]

    ignore = frozenset((token.NEWLINE, tokenize.NL, token.COMMENT))
    features = set()
    try:
        while True:
            tp, value = advance()
            if tp in ignore:
                continue
            elif tp == token.STRING:
                # Only one docstring may precede the __future__ imports.
                if have_docstring:
                    break
                have_docstring = True
            elif tp == token.NAME and value == 'from':
                tp, value = advance()
                if tp != token.NAME or value != '__future__':
                    break
                tp, value = advance()
                if tp != token.NAME or value != 'import':
                    break
                tp, value = advance()
                if tp == token.OP and value == '(':
                    # Parenthesized import list: skip the open paren.
                    tp, value = advance()
                while tp == token.NAME:
                    # Collect comma-separated feature names.
                    features.add(value)
                    tp, value = advance()
                    if tp != token.OP or value != ',':
                        break
                    tp, value = advance()
            else:
                # Any other statement ends the __future__-import prologue.
                break
    except StopIteration:
        pass
    return frozenset(features)
if six.PY3:  # ✘py27
    def grammar_for_future_features(future_features):
        # On Python 3, `print` is always a function, so the grammar without
        # the print statement is always the right one.
        return pygram.python_grammar_no_print_statement
else:  # ✘py33 ✘py34 ✘py35
    def grammar_for_future_features(future_features):
        # On Python 2, the grammar depends on whether the source did
        # `from __future__ import print_function`.
        if 'print_function' in future_features:
            return pygram.python_grammar_no_print_statement
        else:
            return pygram.python_grammar
def find_comments(s, base_byte=0):
    """Yield (comment text, byte Interval) for every comment token in *s*."""
    token_stream = tokenize_source_string(s, base_byte=base_byte)
    for tok_type, tok_text, span in token_stream:
        if tok_type != tokenize.COMMENT:
            continue
        yield tok_text, span
def decode_string_using_source_encoding(b):
    """Decode bytes *b* to text using the PEP 263 coding cookie they declare.

    Falls back to the tokenizer's default encoding when no cookie is present.
    """
    readline = io.BytesIO(b).readline
    encoding, _ = tokenize.detect_encoding(readline)
    return b.decode(encoding)
def read_file_using_source_encoding(filename):
    """Read *filename* as text, honoring its PEP 263 coding cookie.

    The file is opened twice: once in binary mode to sniff the declared
    encoding, then again in text mode with that encoding to read it all.
    """
    with open(filename, 'rb') as binary_file:
        encoding = tokenize.detect_encoding(binary_file.readline)[0]
    with io.open(filename, 'r', encoding=encoding) as text_file:
        return text_file.read()
def parse_source(driver, source):
    """Parse *source* with *driver*, returning (tree, had_trailing_newline).

    NOTE(review): the `driver` parameter shadows the module-level lib2to3
    `driver` import; callers pass a Driver instance here.
    """
    had_newline = not source or source.endswith('\n')
    # lib2to3 crashes hard on input without a final newline, so append one
    # before parsing and report the omission to the caller instead.
    text = source if had_newline else source + '\n'
    return driver.parse_string(text), had_newline
class Lines(object):
    """Bidirectional map between 1-based (lineno, column) and byte offsets.

    Built from any iterable of lines (an open file, a list from
    ``splitlines(True)``).  ``self.lines[n]`` holds ``(start_offset, text)``
    for line *n*; index 0 is a sentinel so line numbers match the 1-based
    convention used by the tokenizer and lib2to3 nodes.
    """

    def __init__(self, infile):
        offset = 0
        self.lines = [(0, '')]
        for text in infile:
            self.lines.append((offset, text))
            offset += len(text)
        # Position just past the end of the final line, and the total size.
        self.last_pos = len(self.lines) - 1, len(self.lines[-1][1])
        self.last_byte = offset

    def __getitem__(self, idx):
        return self.lines[idx]

    def __iter__(self):
        # Yield (lineno, start_offset, text), skipping the index-0 sentinel.
        for lineno, (start, text) in enumerate(self.lines):
            if lineno:
                yield lineno, start, text

    def position_of_byte(self, byte):
        """Return the (lineno, column) containing absolute offset *byte*."""
        # Find the last line whose start offset is <= byte.
        lineno = bisect.bisect_left(self.lines, (byte + 1,)) - 1
        start = self.lines[lineno][0]
        return lineno, byte - start

    def byte_of_pos(self, lineno, column):
        """Return the absolute byte offset of (lineno, column)."""
        # This requires a bit of explanation. The source passed to lib2to3's
        # parser has an extra newline added in some cases, to deal with a bug
        # in lib2to3 where it crashes hard if files don't end with a trailing
        # newline. When that extra line is added, the final DEDENT token in
        # the file will have a lineno equal to the lines in the file plus one,
        # because it's "at" a location that doesn't exist in the real file. If
        # this case wasn't specifically caught, self[lineno] would raise an
        # exception because lineno is beyond the last index in self.lines.
        # So, when that case is detected, return the final byte position.
        if lineno == len(self.lines) and column == 0:
            return self.last_byte
        start, _ = self[lineno]
        return start + column

    def byte_of_node(self, node):
        """Return the absolute byte offset of a lib2to3 node's position."""
        return self.byte_of_pos(node.lineno, node.column)
def byte_intersection(tree, lower, upper):
    """Return how many bytes of [lower, upper) are covered by *tree*.

    *tree* is an IntervalTree; overlapping intervals are clamped to the
    query range and their lengths summed.
    """
    total = 0
    for interval in tree.search(lower, upper):
        clamped_start = max(interval.begin, lower)
        clamped_end = min(interval.end, upper)
        total += clamped_end - clamped_start
    return total
class EbbLint(object):
    """The flake8 plugin: parses a file with lib2to3 and runs checkers on it.

    flake8 constructs this with (tree, filename), calls the classmethod
    hooks ``add_options``/``parse_options``, then iterates ``run()`` for
    (lineno, column, message, type) result tuples.
    """

    name = 'ebb_lint'
    version = __version__
    # Class-level cache of (pattern, checker, extra) triples collected by
    # venusian; populated once by parse_options.
    collected_checkers = None
    # Lazily-populated backing fields for the `source` and `lines` properties.
    _source = None
    _lines = None

    def __init__(self, tree, filename):
        self.tree = tree
        self.filename = filename
        # Byte-offset interval trees of comment and string-literal spans,
        # filled in by _scan_node_for_ranges and consulted by
        # _check_line_lengths. The keys double as human-readable names in
        # the soft-limit error message.
        self._intervals = {
            'comments': IntervalTree(),
            'string literals': IntervalTree(),
        }

    @classmethod
    def add_options(cls, parser):
        """Register this plugin's command-line/config options with flake8."""
        parser.add_option('--hard-max-line-length', default=119, type=int,
                          metavar='n',
                          help='absolute maximum line length allowed')
        parser.config_options.append('hard-max-line-length')
        parser.add_option('--permissive-bulkiness-percentage', default=67,
                          type=int, metavar='p', help=(
                              'integer percentage of a line which must be '
                              'string literals or comments to be allowed to '
                              'pass the soft line limit'))
        parser.config_options.append('permissive-bulkiness-percentage')

    @classmethod
    def parse_options(cls, options):
        """Stash parsed options and collect checkers via venusian (once)."""
        # We implement our own line-length checker because it's not possible
        # to customize how another checker does its checking.
        options.ignore += 'E501',
        cls.options = options
        # This vastly speeds up the test suite, since parse_options is called
        # on every test now, and venusian does a lot of work.
        if cls.collected_checkers is not None:
            return
        collected_checkers = []

        def register_checker(pattern, checker, extra):
            # Skip checkers whose declared python version bounds exclude
            # the running interpreter.
            if ('python_minimum_version' in extra
                    and sys.version_info < extra['python_minimum_version']):
                return
            if ('python_disabled_version' in extra
                    and sys.version_info > extra['python_disabled_version']):
                return
            # Compile the lib2to3 node pattern once, up front.
            pattern = patcomp.compile_pattern(pattern)
            collected_checkers.append((pattern, checker, extra))

        scanner = venusian.Scanner(register=register_checker)
        scanner.scan(checkers)
        cls.collected_checkers = collected_checkers

    @property
    def source(self):
        """The file's text, decoded per its source encoding (lazy)."""
        if self._source is None:
            if self.filename != 'stdin':
                self._source = read_file_using_source_encoding(self.filename)
            elif six.PY2:  # ✘py33 ✘py34 ✘py35
                # On python 2, reading from stdin gives you bytes, which must
                # be decoded.
                self._source = decode_string_using_source_encoding(
                    pycodestyle.stdin_get_value())
            else:  # ✘py27
                # On python 3, reading from stdin gives you text.
                self._source = pycodestyle.stdin_get_value()
        return self._source

    @property
    def lines(self):
        """A Lines index over `source`, built on first use."""
        if self._lines is None:
            self._lines = Lines(self.source.splitlines(True))
        return self._lines

    def _message_for_node(self, node, error, **kw):
        """Build a result tuple positioned at *node* (plus optional offsets).

        Consumes 'line_offset'/'column' or 'offset' from *kw*; remaining
        keywords are passed through to the error message format.
        """
        line_offset = kw.pop('line_offset', None)
        if line_offset is None:
            byte = self.lines.byte_of_node(node) + kw.pop('offset', 0)
            lineno, column = self.lines.position_of_byte(byte)
        else:
            lineno = node.lineno + line_offset
            column = kw.pop('column')
        return self._message_for_pos((lineno, column), error, **kw)

    def _message_for_pos(self, pos, error, **kw):
        """Build a flake8 result tuple for an Errors member at *pos*.

        The message is prefixed with the zero-padded error code ('L042 ...');
        *kw* fills the error's message template.
        """
        lineno, column = pos
        message = 'L{:03d} {}'.format(
            error.value.code, error.value.message.format(**kw))
        return lineno, column, message, type(self)

    def run(self):
        """flake8 entry point: yield every error found in the file."""
        self.future_features = detect_future_features(self.source)
        d = driver.Driver(
            grammar_for_future_features(self.future_features),
            convert=pytree.convert)
        tree, trailing_newline = parse_source(d, self.source)
        if not trailing_newline:
            yield self._message_for_pos(
                self.lines.last_pos, Errors.no_trailing_newline)
        for error in self._check_tree(tree):
            yield error
        for error in self._check_line_lengths():
            yield error

    def _check_tree(self, tree):
        """Walk the parse tree, running every collected checker on each node."""
        for node in tree.pre_order():
            # Record string/comment spans (and report noqa comments) first,
            # so _check_line_lengths has complete interval data.
            for error in self._scan_node_for_ranges(node):
                yield error
            for pattern, checker, extra in self.collected_checkers:
                results = {}
                if not pattern.match(node, results):
                    continue
                for k in extra.get('comments_for', ()):
                    # XXX: this doesn't use `k` for finding the node; `k` is
                    # supposed to name a specific node, but it isn't used when
                    # choosing which node is added to results.
                    results[k + '_comments'] = [
                        c for c, _ in find_comments(node.prefix)]
                if extra.get('pass_filename', False):
                    results['filename'] = self.filename
                if extra.get('pass_future_features', False):
                    results['future_features'] = self.future_features
                for error_node, error, kw in checker(**results):
                    yield self._message_for_node(error_node, error, **kw)

    def _scan_node_for_ranges(self, node):
        """Record the byte spans of string literals and comments at *node*.

        Only leaf nodes matter; comments live in a leaf's prefix. Also
        yields a no_noqa error for any `# noqa` comment found (flake8's
        noqa handling itself is disabled at module import).
        """
        if node.children or (node.type != token.STRING and not node.prefix):
            return
        byte = self.lines.byte_of_node(node)
        if node.type == token.STRING:
            self._intervals['string literals'].add(Interval(
                byte, byte + len(node.value)))
        # The prefix sits immediately before the node, so its spans start
        # at `byte - len(prefix)`.
        comments = list(
            find_comments(node.prefix, byte - len(node.prefix)))
        for c, i in comments:
            self._intervals['comments'].add(i)
            m = _pycodestyle_noqa(c)
            if m is not None:
                yield self._message_for_pos(
                    self.lines.position_of_byte(i.begin + m.start()),
                    Errors.no_noqa)

    def _check_line_lengths(self):
        """Yield line_too_long errors per the soft/hard two-tier limits.

        Lines over the hard limit always fail; lines between the soft and
        hard limits pass only if a sufficient percentage of the line is
        comments or string literals.
        """
        soft_limit = self.options.max_line_length
        hard_limit = self.options.hard_max_line_length
        permitted_percentage = self.options.permissive_bulkiness_percentage
        for lineno, line_start, line in self.lines:
            line = line.rstrip('\r\n')
            if len(line) <= soft_limit:
                continue
            if len(line) > hard_limit:
                yield self._message_for_pos(
                    (lineno, hard_limit), Errors.line_too_long,
                    length=len(line), which_limit='hard', limit=hard_limit,
                    extra='')
                continue
            line_end = line_start + len(line)
            percentages = {}
            for name, i in self._intervals.items():
                n_bytes = byte_intersection(i, line_start, line_end)
                percentages[name] = p = n_bytes * 100 // len(line)
                assert 0 <= p <= 100, 'line percentage not in range'
            # Either category alone reaching the threshold excuses the line.
            if any(p >= permitted_percentage for p in percentages.values()):
                continue
            extra = ' since the line has ' + '; '.join(
                '{p}% {name}'.format(p=p, name=name)
                for name, p in percentages.items())
            yield self._message_for_pos(
                (lineno, soft_limit), Errors.line_too_long,
                length=len(line), which_limit='soft', limit=soft_limit,
                extra=extra)