#### A starter example for Dataflow

In [4]:
# Standard imports
import apache_beam as beam

# Create a pipeline executing on a direct runner (local, non-cloud).
p = beam.Pipeline('DirectPipelineRunner')
# Create a PCollection with names and write it to a file.
(p
 | beam.Create('add names', ['Ann', 'Joe'])
 | beam.Write('save', beam.io.TextFileSink('./names')))
# Execute the pipeline.
p.run()
!cat names-00000-of-00001

Ann
Joe


In [3]:
# List the files in the working directory whose names start with "name".
!ls name*

names-00000-of-00001


In [4]:
# Print the contents of the names-00000-of-00001 file.
!cat names-00000-of-00001

Ann
Joe


##### Another way to write the same thing: the `'label' >> transform` syntax

Before each pipeline definition, a new `beam.Pipeline` instance has to be created.

In [29]:
p = beam.Pipeline('DirectPipelineRunner')
(p
 | 'add nuhguyguygouames' >> beam.Create(['Ann', 'Joe'])
 | 'save' >> beam.Write(beam.io.TextFileSink('./names')))
# Execute the pipeline.
p.run()
!cat names-00000-of-00001

Ann
Joe


Play with `Map`: pair each name with the sorted letters of its lowercase form.

In [22]:
p = beam.Pipeline('DirectPipelineRunner')
(p
 | 'add names' >> beam.Create(['Ann', 'Joe'])
 | 'try map' >> beam.Map(lambda x: (x, sorted(x.lower())))
 | 'save' >> beam.Write(beam.io.TextFileSink('./names')))
# Execute the pipeline.
p.run()
!cat names-00000-of-00001

('Ann', ['a', 'n', 'n'])
('Joe', ['e', 'j', 'o'])


#### Map example

Note that the lambda can take multiple parameters: only the first is the element from the collection; the remaining ones are supplied as extra arguments to `beam.Map`.

In [33]:
p = beam.Pipeline('DirectPipelineRunner')
# Read file with names, add a greeting for each, and write results.
(p
 | beam.Read('load messages', beam.io.TextFileSource('./names-00000-of-00001'))
 | beam.Map('add greeting',
          lambda name, msg, details: '%s %s %s!' % (msg, name, details),
          'Hello', 'lala')   
 | beam.Write('save', beam.io.TextFileSink('./greetings')))
p.run()
!cat greetings-00000-of-00001

Hello Ann lala!
Hello Joe lala!


#### FlatMap example

In [35]:
p = beam.Pipeline('DirectPipelineRunner')
# Read previous file, add a name to each greeting and write results.
(p
 | beam.Read('load messages', beam.io.TextFileSource('./names-00000-of-00001'))
 | beam.FlatMap('add greetings',
              lambda name, msgs: ['%s %s!' % (m, name) for m in msgs],
              ['Hello', 'Hola'])
 | beam.Write('save', beam.io.TextFileSink('./greetings')))
p.run()
!cat greetings-00000-of-00001

Hello Ann!
Hola Ann!
Hello Joe!
Hola Joe!


##### What if we just use a regular Map? Each name then yields one list instead of separate greetings

In [36]:
p = beam.Pipeline('DirectPipelineRunner')
# Read previous file, add a name to each greeting and write results.
(p
 | beam.Read('load messages', beam.io.TextFileSource('./names-00000-of-00001'))
 | beam.Map('add greetings',
              lambda name, msgs: ['%s %s!' % (m, name) for m in msgs],
              ['Hello', 'Hola'])
 | beam.Write('save', beam.io.TextFileSink('./greetings')))
p.run()
!cat greetings-00000-of-00001

[u'Hello Ann!', u'Hola Ann!']
[u'Hello Joe!', u'Hola Joe!']
