# Pipelines

Data pipelines allow large amounts of data to be processed without loading all of it into memory at once.

## Pull Pipelines

Generators can be used to build iteration-based data pipelines.

In [1]:
def pipeline_stage_one_a(limit):
    """Source stage: lazily yield 0, 1, 2, ... for as long as the value is <= ``limit``.

    Prints "stage 1 yield" immediately before each value is handed to the
    consumer, so the interleaving of the pipeline stages is visible.
    """
    current = 0
    while True:
        # Stop the generator as soon as the counter has passed the limit.
        if not (current <= limit):
            return
        print("stage 1 yield")
        yield current
        current += 1

def pipeline_stage_two(previous_stage_generator):
    """Middle stage: yield the square of every value pulled from upstream.

    ``previous_stage_generator`` is any iterable; values are requested from it
    one at a time. Prints "stage 2 yield" after a value has been pulled and
    before its square is yielded.
    """
    for item in previous_stage_generator:
        print("stage 2 yield")
        squared = item * item
        yield squared

def pipeline_stage_three(previous_stage_generator):
    """Final stage: yield each upstream value increased by one.

    ``previous_stage_generator`` is any iterable; values are requested from it
    one at a time. Prints "stage 3 yield" after the increment and before the
    result is yielded.
    """
    for result in previous_stage_generator:
        result += 1
        print("stage 3 yield")
        yield result

Create a data pipeline using generators

In [2]:
# Wire the three stages together. Calling a generator function only builds a
# generator object; no stage body runs until a value is requested from stage3.
stage1 = pipeline_stage_one_a(limit=10)         # source: creates the generator object, no execution yet
stage2 = pipeline_stage_two(stage1)             # squaring stage, pulls from stage1 on demand
stage3 = pipeline_stage_three(stage2)           # add-one stage, pulls from stage2 on demand

Use iteration to pull data through the pipeline

In [3]:
# Each iteration asks stage3 for one value, which in turn pulls one value
# through stage2 and stage1, so the stages run interleaved, one item at a time.
for i in stage3:
    print("x^2 + 1 =", i)

stage 1 yield
stage 2 yield
stage 3 yield
x^2 + 1 = 1
stage 1 yield
stage 2 yield
stage 3 yield
x^2 + 1 = 2
stage 1 yield
stage 2 yield
stage 3 yield
x^2 + 1 = 5
stage 1 yield
stage 2 yield
stage 3 yield
x^2 + 1 = 10
stage 1 yield
stage 2 yield
stage 3 yield
x^2 + 1 = 17
stage 1 yield
stage 2 yield
stage 3 yield
x^2 + 1 = 26
stage 1 yield
stage 2 yield
stage 3 yield
x^2 + 1 = 37
stage 1 yield
stage 2 yield
stage 3 yield
x^2 + 1 = 50
stage 1 yield
stage 2 yield
stage 3 yield
x^2 + 1 = 65
stage 1 yield
stage 2 yield
stage 3 yield
x^2 + 1 = 82
stage 1 yield
stage 2 yield
stage 3 yield
x^2 + 1 = 101


The same can be done using generator expressions (also known as generator comprehensions).

In [4]:
# NOTE: range(10) produces 0..9, so this version emits ten results, whereas the
# function-based pipeline was given an inclusive limit of 10 and emitted eleven.
stage1 = (x for x in range(10))                 # create initial generator
stage2 = (x*x for x in stage1)                  # square each value, lazily, as it is requested
stage3 = (x+1 for x in stage2)                  # add one to each squared value, lazily

# here we use iteration to pull the data through the pipeline from the end
for i in stage3:
    print("x^2 + 1 =", i)

x^2 + 1 = 1
x^2 + 1 = 2
x^2 + 1 = 5
x^2 + 1 = 10
x^2 + 1 = 17
x^2 + 1 = 26
x^2 + 1 = 37
x^2 + 1 = 50
x^2 + 1 = 65
x^2 + 1 = 82


You can create many-to-one stages in the pipeline by combining several generators with `zip`.

In [5]:

def pipeline_stage_one_b(limit):
    """Second source stage: lazily yield the label "(x = N)" for N = 0, 1, 2, ... while N <= ``limit``.

    Prints "stage 1a yield" before each label is yielded.
    NOTE(review): the printed tag says "1a" although this is the "b" source;
    it is kept unchanged here so the recorded output stays valid.
    """
    num = 0
    while True:
        # Stop the generator as soon as the counter has passed the limit.
        if not (num <= limit):
            return
        print("stage 1a yield")
        label = f"(x = {num})"
        yield label
        num += 1

def pipeline_stage_multiplex(previous_stage_generator_a, previous_stage_generator_b):
    """Many-to-one stage: combine two upstream iterables item by item.

    Values from ``previous_stage_generator_a`` are increased by one and joined
    with the matching item from ``previous_stage_generator_b`` into a single
    "<value> <label>" string. The inputs are paired with ``zip``, so the stage
    ends as soon as either input is exhausted. Prints "stage 3 yield" before
    each combined string is yielded.
    """
    paired_inputs = zip(previous_stage_generator_a, previous_stage_generator_b)
    for pair in paired_inputs:
        squared_value, value_label = pair
        squared_value += 1
        print("stage 3 yield")
        yield f"{squared_value} {value_label}"

# Two independent sources feed one final stage: the numeric branch is squared
# by pipeline_stage_two first, while the label branch goes straight to the multiplexer.
stage1a = pipeline_stage_one_a(limit=10)                # create first generator (numbers)
stage1b = pipeline_stage_one_b(limit=10)                # create second generator (labels)
stage2 = pipeline_stage_two(stage1a)                    # pass first to the next in the pipeline
stage3 = pipeline_stage_multiplex(stage2, stage1b)      # pass two generators to the next in the pipeline

# Pulling from stage3 advances both branches in lockstep, one item per iteration.
for i in stage3:
    print("x^2 + 1 =", i)

stage 1 yield
stage 2 yield
stage 1a yield
stage 3 yield
x^2 + 1 = 1 (x = 0)
stage 1 yield
stage 2 yield
stage 1a yield
stage 3 yield
x^2 + 1 = 2 (x = 1)
stage 1 yield
stage 2 yield
stage 1a yield
stage 3 yield
x^2 + 1 = 5 (x = 2)
stage 1 yield
stage 2 yield
stage 1a yield
stage 3 yield
x^2 + 1 = 10 (x = 3)
stage 1 yield
stage 2 yield
stage 1a yield
stage 3 yield
x^2 + 1 = 17 (x = 4)
stage 1 yield
stage 2 yield
stage 1a yield
stage 3 yield
x^2 + 1 = 26 (x = 5)
stage 1 yield
stage 2 yield
stage 1a yield
stage 3 yield
x^2 + 1 = 37 (x = 6)
stage 1 yield
stage 2 yield
stage 1a yield
stage 3 yield
x^2 + 1 = 50 (x = 7)
stage 1 yield
stage 2 yield
stage 1a yield
stage 3 yield
x^2 + 1 = 65 (x = 8)
stage 1 yield
stage 2 yield
stage 1a yield
stage 3 yield
x^2 + 1 = 82 (x = 9)
stage 1 yield
stage 2 yield
stage 1a yield
stage 3 yield
x^2 + 1 = 101 (x = 10)


## Push Pipelines

Coroutines can be used to create producer-consumer data pipelines

In [6]:
def pipeline_producer(next_stage_coroutine, limit):
    """Drive a push pipeline by sending 0, 1, 2, ... (while <= ``limit``) into its first stage.

    The first stage is primed with ``send(None)`` so that it advances to its
    first ``yield`` before real data arrives. Each value is then pushed with
    ``send`` and the value handed back up the pipeline is printed.

    Args:
        next_stage_coroutine: First stage of the pipeline; must provide
            ``send`` and ``close`` (for example a generator-based coroutine).
        limit: Largest value to send, inclusive.

    Raises:
        Whatever ``send`` raises. The first stage is closed before the
        exception propagates, so a failing pipeline is not left open.
    """
    try:
        next_stage_coroutine.send(None)                             # prime the next stage in the pipeline
        num = 0
        while num <= limit:
            print("stage producer send")
            _return_value = next_stage_coroutine.send(num)
            print("return value from producer send =", _return_value)
            num += 1
    finally:
        # Always shut the pipeline down, including when a stage raised mid-stream.
        next_stage_coroutine.close()

def pipeline_stage_one(next_stage_coroutine):
    """Coroutine stage: square each value sent in and push the square downstream.

    On its first resumption the stage primes ``next_stage_coroutine`` with
    ``send(None)``. After that it repeatedly yields the most recent downstream
    reply back to its caller, waits for the next value, and sends that value
    multiplied by itself to the next stage. Prints "stage 1 yield" before each
    yield and "stage 1 send" before each downstream send.
    """
    downstream_reply = next_stage_coroutine.send(None)              # prime the next stage
    while True:
        print("stage 1 yield")
        received = yield downstream_reply                           # hand the reply back up the pipeline
        print("stage 1 send")
        squared = received * received
        downstream_reply = next_stage_coroutine.send(squared)       # push the result down the pipeline

def pipeline_stage_two(next_stage_coroutine):
    """Coroutine stage: add one to each value sent in and push the result downstream.

    On its first resumption the stage primes ``next_stage_coroutine`` with
    ``send(None)``. After that it repeatedly yields the most recent downstream
    reply back to its caller, waits for the next value, and sends that value
    plus one to the next stage. Prints "stage 2 yield" before each yield and
    "stage 2 send" before each downstream send.
    """
    downstream_reply = next_stage_coroutine.send(None)              # prime the next stage
    while True:
        print("stage 2 yield")
        received = yield downstream_reply                           # hand the reply back up the pipeline
        print("stage 2 send")
        incremented = received + 1
        downstream_reply = next_stage_coroutine.send(incremented)   # push the result down the pipeline

Create a data pipeline using coroutines.

In [7]:
def pipeline_stage_consumer(prefix_string):
    """Terminal coroutine stage: print every value sent in, preceded by ``prefix_string``.

    Each time the stage waits for input it yields the string 'Consumed', which
    becomes the return value of the sender's ``send`` call. Prints
    "stage consumer yield" before each wait.
    """
    acknowledgement = 'Consumed'
    while True:
        print("stage consumer yield")
        received = yield acknowledgement
        print(prefix_string, received)

# Build the chain from the far end backwards: every coroutine stage receives its
# downstream stage as an argument, so the consumer has to exist first.
# pipeline_stage_two is the coroutine version from this section; it replaces the
# generator function of the same name defined under "Pull Pipelines".
stage_consumer = pipeline_stage_consumer("x^2 + 1 =")
stage2 = pipeline_stage_two(stage_consumer)
stage1 = pipeline_stage_one(stage2)

print("push data data through the pipeline by calling producer")
# The producer primes stage1, sends 0..10 into it, then closes it.
pipeline_producer(stage1, 10)

push data data through the pipeline by calling producer
stage consumer yield
stage 2 yield
stage 1 yield
stage producer send
stage 1 send
stage 2 send
x^2 + 1 = 1
stage consumer yield
stage 2 yield
stage 1 yield
return value from producer send = Consumed
stage producer send
stage 1 send
stage 2 send
x^2 + 1 = 2
stage consumer yield
stage 2 yield
stage 1 yield
return value from producer send = Consumed
stage producer send
stage 1 send
stage 2 send
x^2 + 1 = 5
stage consumer yield
stage 2 yield
stage 1 yield
return value from producer send = Consumed
stage producer send
stage 1 send
stage 2 send
x^2 + 1 = 10
stage consumer yield
stage 2 yield
stage 1 yield
return value from producer send = Consumed
stage producer send
stage 1 send
stage 2 send
x^2 + 1 = 17
stage consumer yield
stage 2 yield
stage 1 yield
return value from producer send = Consumed
stage producer send
stage 1 send
stage 2 send
x^2 + 1 = 26
stage consumer yield
stage 2 yield
stage 1 yield
return value from producer send = Consumed
[... remaining output truncated ...]

Create a data pipeline with broadcasting coroutines

In [8]:
def pipeline_stage_broadcaster(next_stage_coroutines):
    """Coroutine stage: forward every value sent in to each downstream coroutine.

    On its first resumption the stage primes every downstream coroutine with
    ``send(None)``. Each time it waits for input it yields the string
    'Broadcasting', which becomes the return value of the sender's ``send``
    call. Replies from the downstream coroutines are discarded.

    Args:
        next_stage_coroutines: Iterable of objects providing ``send``. Any
            iterable is accepted, including one-shot iterators and generators.
    """
    # Snapshot the sinks: they are walked once for priming and again on every
    # broadcast, so a one-shot iterable would be exhausted after priming and
    # nothing would ever be forwarded.
    sinks = tuple(next_stage_coroutines)
    for _sink in sinks:
        _sink.send(None)
    while True:
        print("stage broadcast yield")
        value = (yield 'Broadcasting')
        print("stage broadcasting")
        for _sink in sinks:
            _sink.send(value)

# Two terminal consumers share one broadcaster; the rest of the chain is built
# from the far end backwards, as each stage takes its downstream stage as an argument.
stage_consumer1 = pipeline_stage_consumer("consumer1: x^2 + 1 =")
stage_consumer2 = pipeline_stage_consumer("consumer2: x^2 + 1 =")
stage_broadcaster = pipeline_stage_broadcaster([stage_consumer1, stage_consumer2])
stage2 = pipeline_stage_two(stage_broadcaster)
stage1 = pipeline_stage_one(stage2)

print("again push data data through the pipeline by calling producer")
# The producer primes stage1, sends 0..10 into it, then closes it.
pipeline_producer(stage1, 10)

again push data data through the pipeline by calling producer
stage consumer yield
stage consumer yield
stage broadcast yield
stage 2 yield
stage 1 yield
stage producer send
stage 1 send
stage 2 send
stage broadcasting
consumer1: x^2 + 1 = 1
stage consumer yield
consumer2: x^2 + 1 = 1
stage consumer yield
stage broadcast yield
stage 2 yield
stage 1 yield
return value from producer send = Broadcasting
stage producer send
stage 1 send
stage 2 send
stage broadcasting
consumer1: x^2 + 1 = 2
stage consumer yield
consumer2: x^2 + 1 = 2
stage consumer yield
stage broadcast yield
stage 2 yield
stage 1 yield
return value from producer send = Broadcasting
stage producer send
stage 1 send
stage 2 send
stage broadcasting
consumer1: x^2 + 1 = 5
stage consumer yield
consumer2: x^2 + 1 = 5
stage consumer yield
stage broadcast yield
stage 2 yield
stage 1 yield
return value from producer send = Broadcasting
stage producer send
stage 1 send
stage 2 send
stage broadcasting
consumer1: x^2 + 1 = 10
stage c