Update mapreduce framework.

Sync with upstream changes r319-r339.
commit 0bdde503d97906e807a92b421ce933f810d2eeec 1 parent fce3cac
cloudysunny14 authored

Showing 44 changed files with 2,073 additions and 537 deletions.

  1. +1 1  appengine-mapreduce2GCS/googlestorage_test/input_readers_test.py
  2. +19 0 src/mapreduce/errors.py
  3. +92 68 src/mapreduce/handlers.py
  4. +137 180 src/mapreduce/input_readers.py
  5. +1 18 src/mapreduce/lib/files/blobstore.py
  6. +212 49 src/mapreduce/lib/files/file.py
  7. +307 1 src/mapreduce/lib/files/file_service_pb.py
  8. +203 0 src/mapreduce/lib/files/gs.py
  9. +9 0 src/mapreduce/lib/files/testutil.py
  10. +44 11 src/mapreduce/lib/key_range/__init__.py
  11. +15 11 test/mapreduce_test/gs_files_test.py → src/mapreduce/lib/pipeline/index.yaml
  12. +1 1  src/mapreduce/lib/pipeline/models.py
  13. +192 124 src/mapreduce/lib/pipeline/pipeline.py
  14. +160 0 src/mapreduce/lib/pipeline/status_ui.py
  15. +28 1 src/mapreduce/lib/pipeline/ui/common.css
  16. +41 0 src/mapreduce/lib/pipeline/ui/root_list.css
  17. +61 0 src/mapreduce/lib/pipeline/ui/root_list.html
  18. +124 0 src/mapreduce/lib/pipeline/ui/root_list.js
  19. +16 5 src/mapreduce/lib/pipeline/ui/status.css
  20. +1 1  src/mapreduce/lib/pipeline/ui/status.html
  21. +35 10 src/mapreduce/mapreduce_pipeline.py
  22. +39 32 src/mapreduce/output_writers.py
  23. 0  test/{mapreduce_test → mapreduce}/base_handler_test.py
  24. 0  test/{mapreduce_test → mapreduce}/combiner_test.py
  25. 0  test/{mapreduce_test → mapreduce}/context_test.py
  26. 0  test/{mapreduce_test → mapreduce}/control_test.py
  27. +22 0 test/{mapreduce_test → mapreduce}/end_to_end_test.py
  28. +106 8 test/{mapreduce_test → mapreduce}/handlers_test.py
  29. +155 7 test/{mapreduce_test → mapreduce}/input_readers_test.py
  30. 0  test/{mapreduce_test → mapreduce}/large_mapreduce_test.py
  31. 0  test/{mapreduce_test → mapreduce}/main_test.py
  32. 0  test/{mapreduce_test → mapreduce}/mapper_pipeline_test.py
  33. +43 9 test/{mapreduce_test → mapreduce}/mapreduce_pipeline_test.py
  34. 0  test/{mapreduce_test → mapreduce}/model_test.py
  35. 0  test/{mapreduce_test → mapreduce}/namespace_range_test.py
  36. 0  test/{mapreduce_test → mapreduce}/operation/counters_test.py
  37. 0  test/{mapreduce_test → mapreduce}/operation/db_test.py
  38. 0  test/{mapreduce_test → mapreduce}/output_writers_end_to_end_test.py
  39. +9 0 test/{mapreduce_test → mapreduce}/output_writers_test.py
  40. 0  test/{mapreduce_test → mapreduce}/quota_test.py
  41. 0  test/{mapreduce_test → mapreduce}/shuffler_end_to_end_test.py
  42. 0  test/{mapreduce_test → mapreduce}/shuffler_test.py
  43. 0  test/{mapreduce_test → mapreduce}/status_test.py
  44. 0  test/{mapreduce_test → mapreduce}/util_test.py
2  appengine-mapreduce2GCS/googlestorage_test/input_readers_test.py
@@ -27,7 +27,7 @@
27 27
28 28 class GoogleStorageInputReaderTest(testutil.HandlerTestBase):
29 29 READER_NAME = (
30   - "mapreduce.input_readers.CloudStorageLineInputReader")
  30 + "googlestorage.input_readers.GoogleStorageLineInputReader")
31 31
32 32 def assertDone(self, reader):
33 33 self.assertRaises(StopIteration, reader.next)
19 src/mapreduce/errors.py
@@ -25,8 +25,11 @@
25 25 "BadWriterParamsError",
26 26 "BadYamlError",
27 27 "Error",
  28 + "FailJobError",
28 29 "MissingYamlError",
29 30 "MultipleDocumentsInMrYaml",
  31 + "NotEnoughArgumentsError",
  32 + "RetrySliceError",
30 33 "ShuffleServiceError",
31 34 ]
32 35
@@ -65,3 +68,19 @@ class ShuffleServiceError(Error):
65 68 class BadCombinerOutputError(Error):
66 69 """Combiner outputs data instead of yielding it."""
67 70
  71 +
  72 +class FailJobError(Error):
  73 + """The job will be failed if this exception is thrown anywhere."""
  74 +
  75 +
  76 +class RetrySliceError(Error):
  77 + """The slice will be retried up to some maximum number of times.
  78 +
  79 + The job will be failed if the slice can't progress before maximum
  80 + number of retries.
  81 + """
  82 +
  83 +
  84 +class NotEnoughArgumentsError(Error):
  85 + """Required argument is missing."""
  86 +
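
For orientation, here is a minimal sketch (not part of this commit) of how a user-supplied map function might use the two new exception types; the fetch_payload helper and the payload attribute are hypothetical stand-ins for any flaky external call.

from mapreduce import errors
from mapreduce import operation as op


def fetch_payload(entity):
  """Hypothetical helper standing in for a flaky external call."""
  return getattr(entity, "payload", None)


def process(entity):
  """Hypothetical map function showing the new error semantics."""
  try:
    payload = fetch_payload(entity)
  except IOError, e:
    # Transient failure: the slice is retried, up to the framework's
    # limit (see _RETRY_SLICE_ERROR_MAX_RETRIES in handlers.py below).
    raise errors.RetrySliceError("fetch failed: %s" % e)
  if payload is None:
    # Permanent failure: fail the whole mapreduce job.
    raise errors.FailJobError("no payload for %s" % entity)
  yield op.counters.Increment("payloads-processed")
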
160 src/mapreduce/handlers.py
@@ -41,6 +41,11 @@
41 41 from mapreduce import quota
42 42 from mapreduce import util
43 43
  44 +try:
  45 + from google.appengine.ext import ndb
  46 +except ImportError:
  47 + ndb = None
  48 +
44 49
45 50 # TODO(user): Make this a product of the reader or in quotas.py
46 51 _QUOTA_BATCH_SIZE = 20
@@ -52,22 +57,14 @@
52 57 # Delay between consecutive controller callback invocations.
53 58 _CONTROLLER_PERIOD_SEC = 2
54 59
  60 +# How many times to cope with a RetrySliceError before totally
  61 +# giving up and aborting the whole job.
  62 +_RETRY_SLICE_ERROR_MAX_RETRIES = 10
  63 +
55 64 # Set of strings of various test-injected faults.
56 65 _TEST_INJECTED_FAULTS = set()
57 66
58 67
59   -class Error(Exception):
60   - """Base class for exceptions in this module."""
61   -
62   -
63   -class NotEnoughArgumentsError(Error):
64   - """Required argument is missing."""
65   -
66   -
67   -class NoDataError(Error):
68   - """There is no data present for a desired input."""
69   -
70   -
71 68 def _run_task_hook(hooks, method, task, queue_name):
72 69 """Invokes hooks.method(task, queue_name).
73 70
@@ -134,10 +131,8 @@ def handle(self):
134 131 if control and control.command == model.MapreduceControl.ABORT:
135 132 logging.info("Abort command received by shard %d of job '%s'",
136 133 shard_state.shard_number, shard_state.mapreduce_id)
137   - if tstate.output_writer:
138   - tstate.output_writer.finalize(ctx, shard_state.shard_number)
139   - # We recieved a command to abort. We don't care if we override
140   - # some data.
  134 + # NOTE: When aborting, specifically do not finalize the output writer
  135 + # because it might be in a bad state.
141 136 shard_state.active = False
142 137 shard_state.result_status = model.ShardState.RESULT_ABORTED
143 138 shard_state.put(config=util.create_datastore_write_config(spec))
@@ -154,6 +149,16 @@ def handle(self):
154 149 else:
155 150 quota_consumer = None
156 151
  152 + # Tell NDB to never cache anything in memcache or in-process. This ensures
  153 + # that entities fetched from Datastore input_readers via NDB will not bloat
  154 + # up the request memory size and Datastore Puts will avoid doing calls
  155 + # to memcache. Without this you get soft memory limit exits, which hurts
  156 + # overall throughput.
  157 + if ndb is not None:
  158 + ndb_ctx = ndb.get_context()
  159 + ndb_ctx.set_cache_policy(lambda key: False)
  160 + ndb_ctx.set_memcache_policy(lambda key: False)
  161 +
157 162 context.Context._set(ctx)
158 163 try:
159 164 # consume quota ahead, because we do not want to run a datastore
@@ -162,49 +167,67 @@ def handle(self):
162 167 scan_aborted = False
163 168 entity = None
164 169
165   - # We shouldn't fetch an entity from the reader if there's not enough
166   - # quota to process it. Perform all quota checks proactively.
167   - if not quota_consumer or quota_consumer.consume():
168   - for entity in input_reader:
169   - if isinstance(entity, db.Model):
170   - shard_state.last_work_item = repr(entity.key())
171   - else:
172   - shard_state.last_work_item = repr(entity)[:100]
173   -
174   - scan_aborted = not self.process_data(
175   - entity, input_reader, ctx, tstate)
176   -
177   - # Check if we've got enough quota for the next entity.
178   - if (quota_consumer and not scan_aborted and
179   - not quota_consumer.consume()):
180   - scan_aborted = True
181   - if scan_aborted:
182   - break
183   - else:
  170 + try:
  171 + # We shouldn't fetch an entity from the reader if there's not enough
  172 + # quota to process it. Perform all quota checks proactively.
  173 + if not quota_consumer or quota_consumer.consume():
  174 + for entity in input_reader:
  175 + if isinstance(entity, db.Model):
  176 + shard_state.last_work_item = repr(entity.key())
  177 + else:
  178 + shard_state.last_work_item = repr(entity)[:100]
  179 +
  180 + scan_aborted = not self.process_data(
  181 + entity, input_reader, ctx, tstate)
  182 +
  183 + # Check if we've got enough quota for the next entity.
  184 + if (quota_consumer and not scan_aborted and
  185 + not quota_consumer.consume()):
  186 + scan_aborted = True
  187 + if scan_aborted:
  188 + break
  189 + else:
  190 + scan_aborted = True
  191 +
  192 + if not scan_aborted:
  193 + logging.info("Processing done for shard %d of job '%s'",
  194 + shard_state.shard_number, shard_state.mapreduce_id)
  195 + # We consumed extra quota item at the end of for loop.
  196 + # Just be nice here and give it back :)
  197 + if quota_consumer:
  198 + quota_consumer.put(1)
  199 + shard_state.active = False
  200 + shard_state.result_status = model.ShardState.RESULT_SUCCESS
  201 +
  202 + operation.counters.Increment(
  203 + context.COUNTER_MAPPER_WALLTIME_MS,
  204 + int((time.time() - self._start_time)*1000))(ctx)
  205 +
  206 + # TODO(user): Mike said we don't want this happen in case of
  207 + # exception while scanning. Figure out when it's appropriate to skip.
  208 + ctx.flush()
  209 + except errors.RetrySliceError, e:
  210 + logging.error("Slice error: %s", e)
  211 + retry_count = int(
  212 + os.environ.get("HTTP_X_APPENGINE_TASKRETRYCOUNT") or 0)
  213 + if retry_count <= _RETRY_SLICE_ERROR_MAX_RETRIES:
  214 + raise
  215 + logging.error("Too many retries: %d, failing the job", retry_count)
184 216 scan_aborted = True
185   -
186   -
187   - if not scan_aborted:
188   - logging.info("Processing done for shard %d of job '%s'",
189   - shard_state.shard_number, shard_state.mapreduce_id)
190   - # We consumed extra quota item at the end of for loop.
191   - # Just be nice here and give it back :)
192   - if quota_consumer:
193   - quota_consumer.put(1)
194 217 shard_state.active = False
195   - shard_state.result_status = model.ShardState.RESULT_SUCCESS
196   -
197   - operation.counters.Increment(
198   - context.COUNTER_MAPPER_WALLTIME_MS,
199   - int((time.time() - self._start_time)*1000))(ctx)
200   -
201   - # TODO(user): Mike said we don't want this happen in case of
202   - # exception while scanning. Figure out when it's appropriate to skip.
203   - ctx.flush()
  218 + shard_state.result_status = model.ShardState.RESULT_FAILED
  219 + except errors.FailJobError, e:
  220 + logging.error("Job failed: %s", e)
  221 + scan_aborted = True
  222 + shard_state.active = False
  223 + shard_state.result_status = model.ShardState.RESULT_FAILED
204 224
205 225 if not shard_state.active:
206   - # shard is going to stop. Finalize output writer if any.
207   - if tstate.output_writer:
  226 + # shard is going to stop. Don't finalize output writer unless the job is
  227 + # going to be successful, because writer might be stuck in some bad state
  228 + # otherwise.
  229 + if (shard_state.result_status == model.ShardState.RESULT_SUCCESS and
  230 + tstate.output_writer):
208 231 tstate.output_writer.finalize(ctx, shard_state.shard_number)
209 232
210 233 config = util.create_datastore_write_config(spec)
@@ -219,6 +242,8 @@ def handle(self):
219 242 def tx():
220 243 fresh_shard_state = db.get(
221 244 model.ShardState.get_key_by_shard_id(shard_id))
  245 + if not fresh_shard_state:
  246 + raise db.Rollback()
222 247 if (not fresh_shard_state.active or
223 248 "worker_active_state_collision" in _TEST_INJECTED_FAULTS):
224 249 shard_state.active = False
@@ -273,8 +298,6 @@ def process_data(self, data, input_reader, ctx, transient_shard_state):
273 298 output_writer.write(output, ctx)
274 299
275 300 if self._time() - self._start_time > _SLICE_DURATION_SEC:
276   - logging.debug("Spent %s seconds. Rescheduling",
277   - self._time() - self._start_time)
278 301 return False
279 302 return True
280 303
@@ -372,11 +395,6 @@ def handle(self):
372 395 spec = model.MapreduceSpec.from_json_str(
373 396 self.request.get("mapreduce_spec"))
374 397
375   - # TODO(user): Make this logging prettier.
376   - logging.debug("post: id=%s headers=%s spec=%s",
377   - spec.mapreduce_id, self.request.headers,
378   - self.request.get("mapreduce_spec"))
379   -
380 398 state, control = db.get([
381 399 model.MapreduceState.get_key_by_job_id(spec.mapreduce_id),
382 400 model.MapreduceControl.get_key_by_job_id(spec.mapreduce_id),
@@ -407,6 +425,8 @@ def handle(self):
407 425 state.active_shards = len(active_shards)
408 426 state.failed_shards = len(failed_shards)
409 427 state.aborted_shards = len(aborted_shards)
  428 + if not control and failed_shards:
  429 + model.MapreduceControl.abort(spec.mapreduce_id)
410 430
411 431 if (not state.active and control and
412 432 control.command == model.MapreduceControl.ABORT):
@@ -512,9 +532,13 @@ def _finalize_job(mapreduce_spec, mapreduce_state, base_path):
512 532 base_path: handler base path.
513 533 """
514 534 config = util.create_datastore_write_config(mapreduce_spec)
515   - # Enqueue done_callback if needed.
516   - if mapreduce_spec.mapper.output_writer_class():
  535 +
  536 + # Only finalize the output writers if we the job is successful.
  537 + if (mapreduce_spec.mapper.output_writer_class() and
  538 + mapreduce_state.result_status == model.MapreduceState.RESULT_SUCCESS):
517 539 mapreduce_spec.mapper.output_writer_class().finalize_job(mapreduce_state)
  540 +
  541 + # Enqueue done_callback if needed.
518 542 def put_state(state):
519 543 state.put(config=config)
520 544 done_callback = mapreduce_spec.params.get(
@@ -680,11 +704,11 @@ def _get_required_param(self, param_name):
680 704 parameter value
681 705
682 706 Raises:
683   - NotEnoughArgumentsError: if parameter is not specified.
  707 + errors.NotEnoughArgumentsError: if parameter is not specified.
684 708 """
685 709 value = self.request.get(param_name)
686 710 if not value:
687   - raise NotEnoughArgumentsError(param_name + " not specified")
  711 + raise errors.NotEnoughArgumentsError(param_name + " not specified")
688 712 return value
689 713
690 714 @classmethod
@@ -821,11 +845,11 @@ def _get_required_param(self, param_name):
821 845 parameter value
822 846
823 847 Raises:
824   - NotEnoughArgumentsError: if parameter is not specified.
  848 + errors.NotEnoughArgumentsError: if parameter is not specified.
825 849 """
826 850 value = self.request.get(param_name)
827 851 if not value:
828   - raise NotEnoughArgumentsError(param_name + " not specified")
  852 + raise errors.NotEnoughArgumentsError(param_name + " not specified")
829 853 return value
830 854
831 855 @classmethod
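
As a side note, a small sketch (an illustration, not the framework code) of the slice-retry bookkeeping introduced above: re-raising RetrySliceError lets the task queue re-run the slice, and App Engine's X-AppEngine-TaskRetryCount header, surfaced to the handler as an environment variable, bounds how often that happens before the shard is marked RESULT_FAILED.

import os

# Mirrors _RETRY_SLICE_ERROR_MAX_RETRIES in handlers.py above.
_RETRY_SLICE_ERROR_MAX_RETRIES = 10


def slice_has_retry_budget():
  """True while the current slice may still be retried by the task queue."""
  retry_count = int(os.environ.get("HTTP_X_APPENGINE_TASKRETRYCOUNT") or 0)
  return retry_count <= _RETRY_SLICE_ERROR_MAX_RETRIES
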
317 src/mapreduce/input_readers.py
@@ -31,12 +31,12 @@
31 31 "DatastoreEntityInputReader",
32 32 "DatastoreInputReader",
33 33 "DatastoreKeyInputReader",
  34 + "RandomStringInputReader",
34 35 "Error",
35 36 "InputReader",
36 37 "LogInputReader",
37 38 "NamespaceInputReader",
38 39 "RecordsReader",
39   - "CloudStorageLineInputReader"
40 40 ]
41 41
42 42 # pylint: disable-msg=C6409
@@ -44,6 +44,9 @@
44 44 import base64
45 45 import copy
46 46 import logging
  47 +import random
  48 +import string
  49 +import StringIO
47 50 import time
48 51 import zipfile
49 52
@@ -66,10 +69,6 @@
66 69 from mapreduce import operation
67 70 from mapreduce import util
68 71
69   -try:
70   - from cStringIO import StringIO
71   -except:
72   - from StringIO import StringIO
73 72
74 73 # Classes moved to errors module. Copied here for compatibility.
75 74 Error = errors.Error
@@ -246,6 +245,7 @@ class AbstractDatastoreInputReader(InputReader):
246 245 KEY_RANGE_PARAM = "key_range"
247 246 NAMESPACE_RANGE_PARAM = "namespace_range"
248 247 CURRENT_KEY_RANGE_PARAM = "current_key_range"
  248 + FILTERS_PARAM = "filters"
249 249
250 250 # TODO(user): Add support for arbitrary queries. It's not possible to
251 251 # support them without cursors since right now you can't even serialize query
@@ -255,7 +255,8 @@ def __init__(self,
255 255 key_ranges=None,
256 256 ns_range=None,
257 257 batch_size=_BATCH_SIZE,
258   - current_key_range=None):
  258 + current_key_range=None,
  259 + filters=None):
259 260 """Create new AbstractDatastoreInputReader object.
260 261
261 262 This is internal constructor. Use split_query in a concrete class instead.
@@ -268,6 +269,9 @@ def __init__(self,
268 269 key_ranges or ns_range can be non-None.
269 270 batch_size: size of read batch as int.
270 271 current_key_range: the current key_range.KeyRange being processed.
  272 + filters: optional list of filters to apply to the query. Each filter is
  273 + a tuple: (<property_name_as_str>, <query_operation_as_str>, <value>).
  274 + User filters are applied first.
271 275 """
272 276 assert key_ranges is not None or ns_range is not None, (
273 277 "must specify one of 'key_ranges' or 'ns_range'")
@@ -282,7 +286,7 @@ def __init__(self,
282 286 self._ns_range = ns_range
283 287 self._batch_size = int(batch_size)
284 288 self._current_key_range = current_key_range
285   -
  289 + self._filters = filters
286 290
287 291 @classmethod
288 292 def _get_raw_entity_kind(cls, entity_kind):
@@ -293,7 +297,6 @@ def _get_raw_entity_kind(cls, entity_kind):
293 297 entity_kind, cls.__name__)
294 298 return entity_kind
295 299
296   -
297 300 def __iter__(self):
298 301 """Iterates over the given KeyRanges or NamespaceRange.
299 302
@@ -411,8 +414,6 @@ def _split_input_from_namespace(cls, app, namespace, entity_kind,
411 414 # With one shard we don't need to calculate any splitpoints at all.
412 415 return [key_range.KeyRange(namespace=namespace, _app=app)]
413 416
414   - # we use datastore.Query instead of ext.db.Query here, because we can't
415   - # erase ordering on db.Query once we set it.
416 417 ds_query = datastore.Query(kind=raw_entity_kind,
417 418 namespace=namespace,
418 419 _app=app,
@@ -523,6 +524,20 @@ def validate(cls, mapper_spec):
523 524 "Expected a single namespace string")
524 525 if cls.NAMESPACES_PARAM in params:
525 526 raise BadReaderParamsError("Multiple namespaces are no longer supported")
  527 + if cls.FILTERS_PARAM in params:
  528 + filters = params[cls.FILTERS_PARAM]
  529 + if not isinstance(filters, list):
  530 + raise BadReaderParamsError("Expected list for filters parameter")
  531 + for f in filters:
  532 + if not isinstance(f, tuple):
  533 + raise BadReaderParamsError("Filter should be a tuple: %s", f)
  534 + if len(f) != 3:
  535 + raise BadReaderParamsError("Filter should be a 3-tuple: %s", f)
  536 + if not isinstance(f[0], basestring):
  537 + raise BadReaderParamsError("First element should be string: %s", f)
  538 + if f[1] != "=":
  539 + raise BadReaderParamsError(
  540 + "Only equality filters are supported: %s", f)
526 541
527 542 @classmethod
528 543 def split_input(cls, mapper_spec):
@@ -552,6 +567,7 @@ def split_input(cls, mapper_spec):
552 567 shard_count = mapper_spec.shard_count
553 568 namespace = params.get(cls.NAMESPACE_PARAM)
554 569 app = params.get(cls._APP_PARAM)
  570 + filters = params.get(cls.FILTERS_PARAM)
555 571
556 572 if namespace is None:
557 573 # It is difficult to efficiently shard large numbers of namespaces because
@@ -578,21 +594,27 @@ def split_input(cls, mapper_spec):
578 594 return [cls(entity_kind_name,
579 595 key_ranges=None,
580 596 ns_range=ns_range,
581   - batch_size=batch_size)
  597 + batch_size=batch_size,
  598 + filters=filters)
582 599 for ns_range in ns_ranges]
583 600 elif not namespace_keys:
584 601 return [cls(entity_kind_name,
585 602 key_ranges=None,
586 603 ns_range=namespace_range.NamespaceRange(),
587   - batch_size=shard_count)]
  604 + batch_size=shard_count,
  605 + filters=filters)]
588 606 else:
589 607 namespaces = [namespace_key.name() or ""
590 608 for namespace_key in namespace_keys]
591 609 else:
592 610 namespaces = [namespace]
593 611
594   - return cls._split_input_from_params(
  612 + readers = cls._split_input_from_params(
595 613 app, namespaces, entity_kind_name, params, shard_count)
  614 + if filters:
  615 + for reader in readers:
  616 + reader._filters = filters
  617 + return readers
596 618
597 619 def to_json(self):
598 620 """Serializes all the data in this query range into json form.
@@ -624,7 +646,8 @@ def to_json(self):
624 646 self.NAMESPACE_RANGE_PARAM: namespace_range_json,
625 647 self.CURRENT_KEY_RANGE_PARAM: current_key_range_json,
626 648 self.ENTITY_KIND_PARAM: self._entity_kind,
627   - self.BATCH_SIZE_PARAM: self._batch_size}
  649 + self.BATCH_SIZE_PARAM: self._batch_size,
  650 + self.FILTERS_PARAM: self._filters}
628 651 return json_dict
629 652
630 653 @classmethod
@@ -664,7 +687,8 @@ def from_json(cls, json):
664 687 key_ranges,
665 688 ns_range,
666 689 json[cls.BATCH_SIZE_PARAM],
667   - current_key_range)
  690 + current_key_range,
  691 + filters=json.get(cls.FILTERS_PARAM))
668 692
669 693
670 694 class DatastoreInputReader(AbstractDatastoreInputReader):
@@ -682,7 +706,8 @@ def _iter_key_range(self, k_range):
682 706 cursor = None
683 707 while True:
684 708 query = k_range.make_ascending_query(
685   - util.for_name(self._entity_kind))
  709 + util.for_name(self._entity_kind),
  710 + filters=self._filters)
686 711 if isinstance(query, db.Query):
687 712 # Old db version.
688 713 if cursor:
@@ -746,7 +771,7 @@ class DatastoreKeyInputReader(AbstractDatastoreInputReader):
746 771 def _iter_key_range(self, k_range):
747 772 raw_entity_kind = self._get_raw_entity_kind(self._entity_kind)
748 773 query = k_range.make_ascending_datastore_query(
749   - raw_entity_kind, keys_only=True)
  774 + raw_entity_kind, keys_only=True, filters=self._filters)
750 775 for key in query.Run(
751 776 config=datastore_query.QueryOptions(batch_size=self._batch_size)):
752 777 yield key, key
@@ -758,7 +783,7 @@ class DatastoreEntityInputReader(AbstractDatastoreInputReader):
758 783 def _iter_key_range(self, k_range):
759 784 raw_entity_kind = self._get_raw_entity_kind(self._entity_kind)
760 785 query = k_range.make_ascending_datastore_query(
761   - raw_entity_kind)
  786 + raw_entity_kind, self._filters)
762 787 for entity in query.Run(
763 788 config=datastore_query.QueryOptions(batch_size=self._batch_size)):
764 789 yield entity.key(), entity
@@ -1243,7 +1268,7 @@ def next(self):
1243 1268 raise StopIteration()
1244 1269 entry = self._entries.pop()
1245 1270 value = self._zip.read(entry.filename)
1246   - self._filestream = StringIO(value)
  1271 + self._filestream = StringIO.StringIO(value)
1247 1272 if self._initial_offset:
1248 1273 self._filestream.seek(self._initial_offset)
1249 1274 self._filestream.readline()
@@ -1313,6 +1338,95 @@ def __str__(self):
1313 1338 self._next_offset())
1314 1339
1315 1340
  1341 +class RandomStringInputReader(InputReader):
  1342 + """RandomStringInputReader generates random strings as output.
  1343 +
  1344 + Primary usage is to populate output with testing entries.
  1345 + """
  1346 +
  1347 + # Total number of entries this reader should generate.
  1348 + COUNT = "count"
  1349 + # Length of the generated strings.
  1350 + STRING_LENGTH = "string_length"
  1351 +
  1352 + DEFAULT_STRING_LENGTH = 10
  1353 +
  1354 + def __init__(self, count, string_length):
  1355 + """Initialize input reader.
  1356 +
  1357 + Args:
  1358 + count: number of entries this shard should generate.
  1359 + string_length: the length of generated random strings.
  1360 + """
  1361 + self._count = count
  1362 + self._string_length = string_length
  1363 +
  1364 + def __iter__(self):
  1365 + ctx = context.get()
  1366 +
  1367 + while self._count:
  1368 + self._count -= 1
  1369 + start_time = time.time()
  1370 + content = "".join(random.choice(string.ascii_lowercase)
  1371 + for _ in range(self._string_length))
  1372 + if ctx:
  1373 + operation.counters.Increment(
  1374 + COUNTER_IO_READ_MSEC, int((time.time() - start_time) * 1000))(ctx)
  1375 + operation.counters.Increment(COUNTER_IO_READ_BYTES, len(content))(ctx)
  1376 + yield content
  1377 +
  1378 + @classmethod
  1379 + def split_input(cls, mapper_spec):
  1380 + params = _get_params(mapper_spec)
  1381 + count = params[cls.COUNT]
  1382 + string_length = cls.DEFAULT_STRING_LENGTH
  1383 + if cls.STRING_LENGTH in params:
  1384 + string_length = params[cls.STRING_LENGTH]
  1385 +
  1386 + shard_count = mapper_spec.shard_count
  1387 + count_per_shard = count // shard_count
  1388 +
  1389 + mr_input_readers = [
  1390 + cls(count_per_shard, string_length) for _ in range(shard_count)]
  1391 +
  1392 + left = count - count_per_shard*shard_count
  1393 + if left > 0:
  1394 + mr_input_readers.append(cls(left, string_length))
  1395 +
  1396 + return mr_input_readers
  1397 +
  1398 + @classmethod
  1399 + def validate(cls, mapper_spec):
  1400 + if mapper_spec.input_reader_class() != cls:
  1401 + raise BadReaderParamsError("Mapper input reader class mismatch")
  1402 +
  1403 + params = _get_params(mapper_spec)
  1404 + if cls.COUNT not in params:
  1405 + raise BadReaderParamsError("Must specify %s" % cls.COUNT)
  1406 + if not isinstance(params[cls.COUNT], int):
  1407 + raise BadReaderParamsError("%s should be an int but is %s" %
  1408 + (cls.COUNT, type(params[cls.COUNT])))
  1409 + if params[cls.COUNT] <= 0:
  1410 + raise BadReaderParamsError("%s should be a positive int")
  1411 + if cls.STRING_LENGTH in params and not (
  1412 + isinstance(params[cls.STRING_LENGTH], int) and
  1413 + params[cls.STRING_LENGTH] > 0):
  1414 + raise BadReaderParamsError("%s should be a positive int but is %s" %
  1415 + (cls.STRING_LENGTH, params[cls.STRING_LENGTH]))
  1416 + if (not isinstance(mapper_spec.shard_count, int) or
  1417 + mapper_spec.shard_count <= 0):
  1418 + raise BadReaderParamsError(
  1419 + "shard_count should be a positive int but is %s" %
  1420 + mapper_spec.shard_count)
  1421 +
  1422 + @classmethod
  1423 + def from_json(cls, json):
  1424 + return cls(json[cls.COUNT], json[cls.STRING_LENGTH])
  1425 +
  1426 + def to_json(self):
  1427 + return {self.COUNT: self._count, self.STRING_LENGTH: self._string_length}
  1428 +
  1429 +
1316 1430 class ConsistentKeyReader(DatastoreKeyInputReader):
1317 1431 """A key reader which reads consistent data from datastore.
1318 1432
@@ -1641,6 +1755,10 @@ def __iter__(self):
1641 1755 COUNTER_IO_READ_MSEC, int((time.time() - start_time) * 1000))(ctx)
1642 1756 operation.counters.Increment(COUNTER_IO_READ_BYTES, len(record))(ctx)
1643 1757 yield record
  1758 + except (files.ExistenceError), e:
  1759 + raise errors.FailJobError("ExistenceError: %s" % e)
  1760 + except (files.UnknownError), e:
  1761 + raise errors.RetrySliceError("UnknownError: %s" % e)
1644 1762 except EOFError:
1645 1763 self._filenames.pop(0)
1646 1764 if not self._filenames:
@@ -1939,164 +2057,3 @@ def __str__(self):
1939 2057 params.append("%s=%s" % (key, value))
1940 2058
1941 2059 return "LogInputReader(%s)" % ", ".join(params)
1942   -
1943   -class CloudStorageLineInputReader(InputReader):
1944   - """Input reader for files from a stored in the CloudStorage.
1945   -
1946   - You requires activate the cloud storage and create bucket.
1947   - The class shouldn't be instantiated directly. Use the split_input class method
1948   - instead.
1949   - """
1950   - # TODO(user): Should we set this based on MAX_BLOB_FETCH_SIZE?
1951   - _BLOB_BUFFER_SIZE = 64000
1952   -
1953   - # Maximum number of shards to allow.
1954   - _MAX_SHARD_COUNT = 256
1955   -
1956   - # Maximum number of file path
1957   - _MAX_FILE_PATHS_COUNT = 1
1958   -
1959   - # Mapreduce parameters.
1960   - FILE_PATHS_PARAM = "file_paths"
1961   - # Serialyzation parameters.
1962   - INITIAL_POSITION_PARAM = "initial_position"
1963   - START_POSITION_PARAM = "start_position"
1964   - END_POSITION_PARAM = "end_position"
1965   - FILE_PATH_PARAM = "file_path"
1966   -
1967   - def __init__(self, file_path, start_position, end_position):
1968   - """Initializes this instance with the given blob key and character range.
1969   -
1970   - This BlobstoreInputReader will read from the first record starting after
1971   - strictly after start_position until the first record ending at or after
1972   - end_position (exclusive). As an exception, if start_position is 0, then
1973   - this InputReader starts reading at the first record.
1974   -
1975   - Args:
1976   - blob_key: the BlobKey that this input reader is processing.
1977   - start_position: the position to start reading at.
1978   - end_position: a position in the last record to read.
1979   - """
1980   - self._file_path = file_path
1981   - self._start_position = start_position
1982   - self._end_position = end_position
1983   - self._has_iterated = False
1984   - with files.open(self._file_path, 'r') as fp:
1985   - fp.seek(self._start_position, 0)
1986   - value = fp.read(self._BLOB_BUFFER_SIZE)
1987   - self._filestream = StringIO(value)
1988   - self._read_before_start = bool(start_position)
1989   -
1990   - @classmethod
1991   - def validate(cls, mapper_spec):
1992   - """Validates mapper spec and all mapper parameters.
1993   -
1994   - Args:
1995   - mapper_spec: The MapperSpec for this InputReader.
1996   -
1997   - Raises:
1998   - BadReaderParamsError: required parameters are missing or invalid.
1999   - """
2000   - if mapper_spec.input_reader_class() != cls:
2001   - raise BadReaderParamsError("Mapper input reader class mismatch")
2002   - params = _get_params(mapper_spec)
2003   - if cls.FILE_PATHS_PARAM not in params:
2004   - raise BadReaderParamsError("Must specify 'file_path' for mapper input")
2005   -
2006   - file_paths = params[cls.FILE_PATHS_PARAM]
2007   - if isinstance(file_paths, basestring):
2008   - # This is a mechanism to allow multiple blob keys (which do not contain
2009   - # commas) in a single string. It may go away.
2010   - file_paths = file_paths.split(",")
2011   - if len(file_paths) > cls._MAX_FILE_PATHS_COUNT:
2012   - raise BadReaderParamsError("Too many 'file_paht' for mapper input")
2013   - if not file_paths:
2014   - raise BadReaderParamsError("No 'file_pahts' specified for mapper input")
2015   -
2016   - @classmethod
2017   - def split_input(cls, mapper_spec):
2018   - """Returns a list of shard_count input_spec_shards for input_spec.
2019   -
2020   - Args:
2021   - mapper_spec: The mapper specification to split from. Must contain
2022   - 'blob_keys' parameter with one or more blob keys.
2023   -
2024   - Returns:
2025   - A list of BlobstoreInputReaders corresponding to the specified shards.
2026   - """
2027   - params = _get_params(mapper_spec)
2028   - file_paths = params[cls.FILE_PATHS_PARAM]
2029   - if isinstance(file_paths, basestring):
2030   - # This is a mechanism to allow multiple blob keys (which do not contain
2031   - # commas) in a single string. It may go away.
2032   - file_paths = file_paths.split(",")
2033   -
2034   - file_sizes = {}
2035   - for file_path in file_paths:
2036   - with files.open(file_path, 'r') as fp:
2037   - fp.seek(0,2)
2038   - file_sizes[file_path] = fp.tell()
2039   -
2040   - shard_count = min(cls._MAX_SHARD_COUNT, mapper_spec.shard_count)
2041   -
2042   - shards_per_blob = shard_count // len(file_paths)
2043   - if shards_per_blob == 0:
2044   - shards_per_blob = 1
2045   -
2046   - chunks = []
2047   - for blob_key, blob_size in file_sizes.items():
2048   - blob_chunk_size = blob_size // shards_per_blob
2049   - for i in xrange(shards_per_blob - 1):
2050   - chunks.append(CloudStorageLineInputReader.from_json(
2051   - {cls.FILE_PATH_PARAM: blob_key,
2052   - cls.INITIAL_POSITION_PARAM: blob_chunk_size * i,
2053   - cls.END_POSITION_PARAM: blob_chunk_size * (i + 1)}))
2054   - chunks.append(CloudStorageLineInputReader.from_json(
2055   - {cls.FILE_PATH_PARAM: blob_key,
2056   - cls.INITIAL_POSITION_PARAM: blob_chunk_size * (shards_per_blob - 1),
2057   - cls.END_POSITION_PARAM: blob_size}))
2058   -
2059   - return chunks
2060   -
2061   - def next(self):
2062   - """Returns the next input from as an (offset, line) tuple."""
2063   - self._has_iterated = True
2064   -
2065   - if self._read_before_start:
2066   - self._filestream.readline()
2067   - self._read_before_start = False
2068   -
2069   - start_position = self._filestream.tell()
2070   -
2071   - if start_position > self._end_position:
2072   - self.stopIteration()
2073   -
2074   - line = self._filestream.readline()
2075   -
2076   - if not line:
2077   - self.stopIteration()
2078   -
2079   - return start_position, line.rstrip("\n")
2080   -
2081   - def stopIteration(self):
2082   - self._filestream.close()
2083   - self._filestream = None
2084   - raise StopIteration()
2085   -
2086   - def to_json(self):
2087   - """Returns an json-compatible input shard spec for remaining inputs."""
2088   - return {self.FILE_PATH_PARAM: self._file_path,
2089   - self.INITIAL_POSITION_PARAM: self._start_position,
2090   - self.END_POSITION_PARAM: self._end_position}
2091   -
2092   - def __str__(self):
2093   - """Returns the string representation of this BlobstoreLineInputReader."""
2094   - return "blobstore.BlobKey(%r):[%d, %d]" % (
2095   - self._file_path, self._filestream.tell(), self._end_position)
2096   -
2097   - @classmethod
2098   - def from_json(cls, json):
2099   - """Instantiates an instance of this InputReader for the given shard spec."""
2100   - return cls(json[cls.FILE_PATH_PARAM],
2101   - json[cls.INITIAL_POSITION_PARAM],
2102   - json[cls.END_POSITION_PARAM])
19 src/mapreduce/lib/files/blobstore.py
@@ -37,7 +37,6 @@
37 37 _BLOBSTORE_FILESYSTEM = files.BLOBSTORE_FILESYSTEM
38 38 _BLOBSTORE_DIRECTORY = '/' + _BLOBSTORE_FILESYSTEM + '/'
39 39 _BLOBSTORE_NEW_FILE_NAME = 'new'
40   -_CREATION_HANDLE_PREFIX = 'writable:'
41 40 _MIME_TYPE_PARAMETER = 'content_type'
42 41 _BLOBINFO_UPLOADED_FILENAME_PARAMETER = 'file_name'
43 42
@@ -102,7 +101,7 @@ def get_blob_key(create_file_name):
102 101 (create_file_name, _BLOBSTORE_DIRECTORY))
103 102 ticket = create_file_name[len(_BLOBSTORE_DIRECTORY):]
104 103
105   - if not ticket.startswith(_CREATION_HANDLE_PREFIX):
  104 + if not ticket.startswith(files._CREATION_HANDLE_PREFIX):
106 105
107 106 return blobstore.BlobKey(ticket)
108 107
@@ -153,19 +152,3 @@ def get_file_name(blob_key):
153 152 if not isinstance(blob_key, (blobstore.BlobKey, basestring)):
154 153 raise files.InvalidArgumentError('Expected string or blobstore.BlobKey')
155 154 return '%s%s' % (_BLOBSTORE_DIRECTORY, blob_key)
156   -
157   -
158   -def _delete(filename):
159   - """Permanently delete a file.
160   -
161   - Args:
162   - filename: finalized file name as string.
163   - """
164   -
165   - blob_key = get_blob_key(filename)
166   - if blob_key is None:
167   - return
168   - blob_info = blobstore.BlobInfo.get(blob_key)
169   - if blob_info is None:
170   - return
171   - blob_info.delete()
261 src/mapreduce/lib/files/file.py
@@ -52,13 +52,13 @@
52 52
53 53 'delete',
54 54 'finalize',
  55 + 'listdir',
55 56 'open',
56 57 'stat',
57 58
58 59 'BufferedFile',
59 60 ]
60 61
61   -import gc
62 62 import os
63 63 import sys
64 64 import StringIO
@@ -72,6 +72,8 @@
72 72 GS_FILESYSTEM = 'gs'
73 73 FILESYSTEMS = (BLOBSTORE_FILESYSTEM, GS_FILESYSTEM)
74 74 READ_BLOCK_SIZE = 1024 * 512
  75 +_CREATION_HANDLE_PREFIX = 'writable:'
  76 +_DEFAULT_BUFFER_SIZE = 512 * 1024
75 77
76 78
77 79 class Error(Exception):
@@ -400,7 +402,7 @@ def read(self, size=None):
400 402 buf.close()
401 403
402 404 def _verify_read_mode(self):
403   - if self._mode != 'r':
  405 + if self._mode not in ('r', 'rb'):
404 406 raise WrongOpenModeError('File is opened for write.')
405 407
406 408 def _open(self):
@@ -411,9 +413,9 @@ def _open(self):
411 413 request.set_exclusive_lock(self._exclusive_lock)
412 414 request.set_content_type(self._content_type)
413 415
414   - if self._mode == 'a' or self._mode == 'ab':
  416 + if self._mode in ('a', 'ab'):
415 417 request.set_open_mode(file_service_pb.OpenRequest.APPEND)
416   - elif self._mode == 'r' or self._mode == 'rb':
  418 + elif self._mode in ('r', 'rb'):
417 419 request.set_open_mode(file_service_pb.OpenRequest.READ)
418 420 else:
419 421 raise UnsupportedOpenModeError('Unsupported open mode: %s', self._mode)
@@ -464,11 +466,17 @@ def stat(self):
464 466 file_stat.filename = file_stat_pb.filename()
465 467 file_stat.finalized = file_stat_pb.finalized()
466 468 file_stat.st_size = file_stat_pb.length()
  469 + file_stat.st_mtime = file_stat_pb.mtime()
  470 + file_stat.st_ctime = file_stat_pb.ctime()
467 471
468 472 return file_stat
469 473
470 474
471   -def open(filename, mode='r', content_type=RAW, exclusive_lock=False):
  475 +def open(filename,
  476 + mode='r',
  477 + content_type=RAW,
  478 + exclusive_lock=False,
  479 + buffering=0):
472 480 """Open a file.
473 481
474 482 Args:
@@ -477,10 +485,17 @@ def open(filename, mode='r', content_type=RAW, exclusive_lock=False):
477 485 content_type: File's content type. Value from FileContentType.ContentType
478 486 enum.
479 487 exclusive_lock: If file should be exclusively locked. All other exclusive
480   - lock attempts will file untile file is correctly closed.
  488 + lock attempts will file until file is correctly closed.
  489 + buffering: optional argument similar to the one in Python's open.
  490 + It specifies the file's desired buffer size: 0 means unbuffered, positive
  491 + value means use a buffer of that size, any negative value means the
  492 + default size. Only read buffering is supported.
481 493
482 494 Returns:
483 495 File object.
  496 +
  497 + Raises:
  498 + InvalidArgumentError: Raised when given illegal argument value or type.
484 499 """
485 500 if not filename:
486 501 raise InvalidArgumentError('Filename is empty')
@@ -489,12 +504,48 @@ def open(filename, mode='r', content_type=RAW, exclusive_lock=False):
489 504 (filename.__class__, filename))
490 505 if content_type != RAW:
491 506 raise InvalidArgumentError('Invalid content type')
  507 + if not (isinstance(buffering, int) or isinstance(buffering, long)):
  508 + raise InvalidArgumentError('buffering should be an int but is %s'
  509 + % buffering)
  510 +
  511 + if mode == 'r' or mode == 'rb':
  512 + if buffering > 0:
  513 + return BufferedFile(filename, buffering)
  514 + elif buffering < 0:
  515 + return BufferedFile(filename, _DEFAULT_BUFFER_SIZE)
  516 +
  517 + return _File(filename,
  518 + mode=mode,
  519 + content_type=content_type,
  520 + exclusive_lock=exclusive_lock)
  521 +
492 522
493   - f = _File(filename,
494   - mode=mode,
495   - content_type=content_type,
496   - exclusive_lock=exclusive_lock)
497   - return f
  523 +def listdir(path, **kwargs):
  524 + """Return a sorted list of filenames (matching a pattern) in the given path.
  525 +
  526 + Only Google Cloud Storage paths are supported in current implementation.
  527 +
  528 + Args:
  529 + path: a Google Cloud Storage path of "/gs/bucketname" form.
  530 + kwargs: other keyword arguments to be relayed to Google Cloud Storage.
  531 + This can be used to select certain files with names matching a pattern.
  532 + See mapreduce.lib.files.gs.listdir for details.
  533 +
  534 + Returns:
  535 + a list containing filenames (matching a pattern) from the given path.
  536 + Sorted by Python String.
  537 + """
  538 +
  539 + from mapreduce.lib.files import gs
  540 +
  541 + if not isinstance(path, basestring):
  542 + raise InvalidArgumentError('path should be a string, but is %s(%r)' %
  543 + (path.__class__.__name__, path))
  544 +
  545 + if path.startswith(gs._GS_PREFIX):
  546 + return gs.listdir(path, kwargs)
  547 + else:
  548 + raise InvalidFileNameError('Unsupported path: %s' % path)
498 549
499 550
500 551 def finalize(filename, content_type=RAW):
@@ -527,12 +578,12 @@ class _FileStat(object):
527 578 filename: the uploaded filename of the file;
528 579 finalized: whether the file is finalized. This is always true by now;
529 580 st_size: number of bytes of the file;
530   - st_ctime: creation time. Currently not set;
531   - st_mtime: modification time. Currently not set.;
  581 + st_ctime: creation time;
  582 + st_mtime: modification time.
532 583 """
533 584 def __init__(self):
534 585 self.filename = None
535   - self.finlized = True
  586 + self.finalized = True
536 587 self.st_size = None
537 588 self.st_ctime = None
538 589 self.st_mtime = None
@@ -597,21 +648,67 @@ def _create(filesystem, content_type=RAW, filename=None, params=None):
597 648 return response.filename()
598 649
599 650
600   -def delete(filename):
601   - """Permanently delete a file.
  651 +def __checkIsFinalizedName(filename):
  652 + """Check if a filename is finalized.
  653 +
  654 + A filename is finalized when it has creation handle prefix, which is the same
  655 + for both blobstore and gs files.
  656 +
  657 + Args:
  658 + filename: a gs or blobstore filename that starts with '/gs/' or
  659 + '/blobstore/'
  660 +
  661 + Raises:
  662 + InvalidFileNameError: raised when filename is finalized.
  663 + """
  664 + if filename.split('/')[2].startswith(_CREATION_HANDLE_PREFIX):
  665 + raise InvalidFileNameError('File %s should have finalized filename' %
  666 + filename)
  667 +
  668 +
  669 +def delete(*filenames):
  670 + """Permanently delete files.
  671 +
  672 + Delete on non-finalized/non-existent files is a no-op.
602 673
603 674 Args:
604   - filename: finalized file name as string.
  675 + filenames: finalized file names as strings. filename should has format
  676 + "/gs/bucket/filename" or "/blobstore/blobkey".
  677 +
  678 + Raises:
  679 + InvalidFileNameError: Raised when any filename is not of valid format or
  680 + not a finalized name.
  681 + IOError: Raised if any problem occurs contacting the backend system.
605 682 """
  683 +
606 684 from mapreduce.lib.files import blobstore as files_blobstore
  685 + from mapreduce.lib.files import gs
  686 + from google.appengine.ext import blobstore
607 687
608   - if not isinstance(filename, basestring):
609   - raise InvalidArgumentError('Filename should be a string, but is %s(%r)' %
610   - (filename.__class__.__name__, filename))
611   - if filename.startswith(files_blobstore._BLOBSTORE_DIRECTORY):
612   - files_blobstore._delete(filename)
613   - else:
614   - raise InvalidFileNameError( 'Unsupported file name: %s' % filename)
  688 + blobkeys = []
  689 +
  690 + for filename in filenames:
  691 + if not isinstance(filename, basestring):
  692 + raise InvalidArgumentError('Filename should be a string, but is %s(%r)' %
  693 + (filename.__class__.__name__, filename))
  694 + if filename.startswith(files_blobstore._BLOBSTORE_DIRECTORY):
  695 + __checkIsFinalizedName(filename)
  696 + blobkey = files_blobstore.get_blob_key(filename)
  697 + if blobkey:
  698 + blobkeys.append(blobkey)
  699 + elif filename.startswith(gs._GS_PREFIX):
  700 +
  701 + __checkIsFinalizedName(filename)
  702 + blobkeys.append(blobstore.create_gs_key(filename))
  703 + else:
  704 + raise InvalidFileNameError('Filename should start with /%s or /%s' %
  705 + (files_blobstore._BLOBSTORE_DIRECTORY,
  706 + gs._GS_PREFIX))
  707 +
  708 + try:
  709 + blobstore.delete(blobkeys)
  710 + except Exception, e:
  711 + raise IOError('Blobstore failure.', e)
615 712
616 713
617 714 def _get_capabilities():
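
Taken together, a small usage sketch (not part of the diff) of the API surface this file now exposes: listdir for /gs/ paths, read buffering in open(), and variadic delete(). The bucket and object names are made up, the import path mirrors how the rest of the framework refers to the files package, and the buffered-read behaviour comes from the BufferedFile changes shown below.

from mapreduce.lib import files

# listdir only supports Google Cloud Storage paths of the /gs/bucket form.
names = files.listdir("/gs/my_bucket")

# A positive buffering value makes open() return a BufferedFile that reads
# the underlying file in 64KB chunks.
f = files.open("/gs/my_bucket/data.txt", "r", buffering=64 * 1024)
head = f.read(1024)
f.close()

# delete() now accepts any number of finalized /gs/... or /blobstore/...
# filenames and removes them in a single call.
files.delete("/gs/my_bucket/data.txt", "/gs/my_bucket/old.txt")
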
@@ -630,9 +727,7 @@ def _get_capabilities():
630 727 class BufferedFile(object):
631 728 """BufferedFile is a file-like object reading underlying file in chunks."""
632 729
633   - _BUFFER_SIZE = 512 * 1024
634   -
635   - def __init__(self, filename, buffer_size=_BUFFER_SIZE):
  730 + def __init__(self, filename, buffer_size=_DEFAULT_BUFFER_SIZE):
636 731 """Constructor.
637 732
638 733 Args:
@@ -644,6 +739,18 @@ def __init__(self, filename, buffer_size=_BUFFER_SIZE):
644 739 self._buffer = ''
645 740 self._buffer_pos = 0
646 741 self._buffer_size = buffer_size
  742 + self._eof = False
  743 +
  744 + def __enter__(self):
  745 + return self
  746 +
  747 + def __exit__(self, atype, value, traceback):
  748 + self.close()
  749 +
  750 + def close(self):
  751 + self._buffer = ''
  752 + self._eof = True
  753 + self._buffer_pos = 0
647 754
648 755 def tell(self):
649 756 """Return file's current position."""
@@ -659,28 +766,78 @@ def read(self, size):
659 766 Returns:
660 767 A string with data read.
661 768 """
662   - while len(self._buffer) - self._buffer_pos < size:
663   - self._buffer = self._buffer[self._buffer_pos:]
664   - self._buffer_pos = 0
665   - with open(self._filename, 'r') as f:
666   - f.seek(self._position + len(self._buffer))
667   - data = f.read(self._buffer_size)
668   - if not data:
669   - break
670   - self._buffer += data
671   - gc.collect()
  769 + data_list = []
  770 + while True:
  771 + result = self.__readBuffer(size)
  772 + data_list.append(result)
  773 + size -= len(result)
  774 + if size == 0 or self._eof:
  775 + return ''.join(data_list)
  776 + self.__refillBuffer()
  777 +
  778 + def readline(self, size=-1):
  779 + """Read one line delimited by '\n' from the file.
  780 +
  781 + A trailing newline character is kept in the string. It may be absent when a
  782 + file ends with an incomplete line. If the size argument is non-negative,
  783 + it specifies the maximum string size (counting the newline) to return. An