-
Notifications
You must be signed in to change notification settings - Fork 42
Ability to skip last rows. #225
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,6 +11,7 @@ | |
import warnings | ||
from copy import copy | ||
from itertools import chain | ||
from collections import deque | ||
from .loaders.stream import StreamLoader | ||
from . import exceptions | ||
from . import helpers | ||
|
@@ -444,8 +445,37 @@ def builtin_processor(extended_rows): | |
|
||
yield (row_number, headers, row) | ||
|
||
def skip_negative_rows(extended_rows): | ||
""" | ||
This processor will skip rows which counts from the end, e.g. | ||
-1: skip last row, -2: skip pre-last row, etc. | ||
Rows to skip are taken from Stream.__skip_rows_by_numbers | ||
""" | ||
rows_to_skip = [n for n in self.__skip_rows_by_numbers if n < 0] | ||
if not rows_to_skip: | ||
for row in extended_rows: | ||
yield row | ||
else: | ||
buffer_size = abs(min(rows_to_skip)) + 1 | ||
# collections.deque - takes O[1] time to push/pop values from any side. | ||
buffer = deque() | ||
|
||
# use buffer to save last rows | ||
for row in extended_rows: | ||
buffer.append(row) | ||
if len(buffer) == buffer_size: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agree. |
||
yield buffer.popleft() | ||
|
||
# Now squeeze out the buffer | ||
last_row_number = buffer[len(buffer)-1][0] | ||
# with last_row_number, we could transform negative row numbers to positive | ||
rows_to_skip_positive = [last_row_number + 1 + n for n in rows_to_skip] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. a bit convoluted... n = len(buffer)
for i, row in enumerate(buffer):
if i-n not in rows_to_skip:
yield row There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I used 'last_row_number' from |
||
for row in buffer: | ||
if row[0] not in rows_to_skip_positive: | ||
yield row | ||
|
||
# Apply processors to iterator | ||
processors = [builtin_processor] + self.__post_parse | ||
processors = [builtin_processor, skip_negative_rows] + self.__post_parse | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we should add this processor only if there is an actual need to skip rows at the end of the file. So could you please use a simple condition here? It will save some CPU ticks. And fewer processors are easier to debug. In 99% of the cases there are no negative row numbers. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fixed |
||
for processor in processors: | ||
iterator = processor(iterator) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -302,6 +302,21 @@ def test_stream_skip_rows_with_headers(): | |
assert stream.read() == [['2', '中国人']] | ||
|
||
|
||
def test_stream_skip_rows_from_the_end(): | ||
source = 'data/special/skip-rows.csv' | ||
with Stream(source, skip_rows=[-2, 1]) as stream: | ||
assert stream.read() == [['1', 'english'], ['2', '中国人']] | ||
|
||
with Stream(source, skip_rows=[-1, -2]) as stream: | ||
assert stream.read() == [['id', 'name'], ['1', 'english']] | ||
|
||
|
||
def test_stream_skip_rows_no_double_skip(): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since skip rows runs before skip negative numbers, it's possible that 'skip_rows' will remove the last line, and then 'skip_negative_numbers' wouldn't know which line is the last line... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, you are right, this does happen :( Fixing... |
||
source = 'data/special/skip-rows.csv' | ||
with Stream(source, skip_rows=[3, -2]) as stream: | ||
assert stream.read() == [['id', 'name'], ['1', 'english'], ['2', '中国人']] | ||
|
||
|
||
# Post parse | ||
|
||
def test_stream_post_parse_headers(): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is a bit confusing - the buffer size is actually
`abs(min(rows_to_skip))`
(without the `+1`).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Agree, will fix.