In [4]:
import apache_beam as beam
# Each record is a list of strings: [icon, name, duration].
garden_rows = [
    ['🍓', 'Strawberry', 'perennial'],
    ['🥕', 'Carrot', 'biennial'],
    ['🍆', 'Eggplant', 'perennial'],
    ['🍅', 'Tomato', 'annual'],
    ['🥔', 'Potato', 'perennial'],
]

# Turn every record into a single line of text and print it.
with beam.Pipeline() as p:
    plants_csv = (
        p
        | 'Garden plants' >> beam.Create(garden_rows)
        # With no arguments, the items of each record end up comma-separated
        # (see the cell output below).
        | beam.ToString.Iterables()
        | beam.Map(print)
    )

🍓,Strawberry,perennial
🥕,Carrot,biennial
🍆,Eggplant,perennial
🍅,Tomato,annual
🥔,Potato,perennial


In [8]:

import apache_beam as beam

# Each record is a list of strings: [icon, name, duration].
garden_rows = [
    ['🍓', 'Strawberry', 'perennial'],
    ['🥕', 'Carrot', 'biennial'],
    ['🍆', 'Eggplant', 'perennial'],
    ['🍅', 'Tomato', 'annual'],
    ['🥔', 'Potato', 'perennial'],
]

# Convert every element to a string as-is and print it.
with beam.Pipeline() as pipeline:
    plant_lists = (
        pipeline
        | 'Garden plants' >> beam.Create(garden_rows)
        # Unlike ToString.Iterables, the whole list is stringified as one
        # value, so the printed line keeps the brackets and quotes.
        | beam.ToString.Element()
        | beam.Map(print)
    )

['🍓', 'Strawberry', 'perennial']
['🥕', 'Carrot', 'biennial']
['🍆', 'Eggplant', 'perennial']
['🍅', 'Tomato', 'annual']
['🥔', 'Potato', 'perennial']


In [9]:
import apache_beam as beam

# (icon, name) key-value pairs.
garden_pairs = [
    ('🍓', 'Strawberry'),
    ('🥕', 'Carrot'),
    ('🍆', 'Eggplant'),
    ('🍅', 'Tomato'),
    ('🥔', 'Potato'),
]

# Render every key-value pair as one line of text and print it.
with beam.Pipeline() as pipeline:
    plants = (
        pipeline
        | 'Garden plants' >> beam.Create(garden_pairs)
        # With no arguments, key and value end up comma-separated
        # (see the cell output below).
        | 'To string' >> beam.ToString.Kvs()
        | beam.Map(print)
    )

🍓,Strawberry
🥕,Carrot
🍆,Eggplant
🍅,Tomato
🥔,Potato


In [17]:
import apache_beam as beam

class GetTimestamp(beam.DoFn):
    """DoFn that labels each plant with the timestamp attached to its element."""

    def process(self, plant, timestamp=beam.DoFn.TimestampParam):
        """Yield one '<UTC datetime> - <plant name>' string for the element.

        Args:
            plant: mapping that provides the plant's 'name'.
            timestamp: requested via ``beam.DoFn.TimestampParam``; the
                timestamp carried by the element being processed.
        """
        event_time = timestamp.to_utc_datetime()
        plant_name = plant['name']
        yield '{} - {}'.format(event_time, plant_name)

# 'season' holds a Unix timestamp in seconds.
garden_seasons = [
    {'name': 'Strawberry', 'season': 1585699200},  # April, 2020
    {'name': 'Carrot', 'season': 1590969600},      # June, 2020
    {'name': 'Artichoke', 'season': 1583020800},   # March, 2020
    {'name': 'Tomato', 'season': 1588291200},      # May, 2020
    {'name': 'Potato', 'season': 1598918400},      # September, 2020
]


def with_season_timestamp(plant):
    """Wrap a plant so that its 'season' value becomes the element timestamp."""
    return beam.window.TimestampedValue(plant, plant['season'])


# Attach an event-time timestamp to every plant, then print it back out.
with beam.Pipeline() as pipeline:
    plant_timestamps = (
        pipeline
        | 'Garden plants' >> beam.Create(garden_seasons)
        | 'With timestamps' >> beam.Map(with_season_timestamp)
        | 'Get timestamp' >> beam.ParDo(GetTimestamp())
        | beam.Map(print)
    )

2020-04-01 00:00:00 - Strawberry
2020-06-01 00:00:00 - Carrot
2020-03-01 00:00:00 - Artichoke
2020-05-01 00:00:00 - Tomato
2020-09-01 00:00:00 - Potato


In [20]:
import apache_beam as beam

class GetTimestamp(beam.DoFn):
    """DoFn that recovers the numeric event id stored as the element timestamp."""

    def process(self, plant, timestamp=beam.DoFn.TimestampParam):
        """Yield one '<seconds> - <plant name>' string for the element.

        Args:
            plant: mapping that provides the plant's 'name'.
            timestamp: requested via ``beam.DoFn.TimestampParam``; the
                timestamp carried by the element being processed.
        """
        # The timestamp exposes microseconds; scale down to whole seconds.
        whole_seconds = int(timestamp.micros / 1e6)
        plant_name = plant['name']
        yield '{} - {}'.format(whole_seconds, plant_name)

# 'event_id' is a logical clock value rather than a wall-clock time.
garden_events = [
    {'name': 'Strawberry', 'event_id': 1},
    {'name': 'Carrot', 'event_id': 4},
    {'name': 'Artichoke', 'event_id': 2},
    {'name': 'Tomato', 'event_id': 3},
    {'name': 'Potato', 'event_id': 5},
]

# Use each plant's event id as its element timestamp, then print it back out.
with beam.Pipeline() as pipeline:
    plant_events = (
        pipeline
        | 'Garden plants' >> beam.Create(garden_events)
        | 'With timestamps' >> beam.Map(
            lambda plant: beam.window.TimestampedValue(plant, plant['event_id']))
        | 'Get timestamp' >> beam.ParDo(GetTimestamp())
        | beam.Map(print)
    )


1 - Strawberry
4 - Carrot
2 - Artichoke
3 - Tomato
5 - Potato


In [24]:
import apache_beam as beam

class GetTimestamp(beam.DoFn):
    """DoFn that reports the timestamp each plant element was stamped with."""

    def process(self, plant, timestamp=beam.DoFn.TimestampParam):
        """Yield one '<UTC datetime> - <plant name>' string for the element.

        Args:
            plant: mapping that provides the plant's 'name'.
            timestamp: requested via ``beam.DoFn.TimestampParam``; the
                timestamp carried by the element being processed.
        """
        stamped_at = timestamp.to_utc_datetime()
        label = plant['name']
        yield '{} - {}'.format(stamped_at, label)

# `time.time()` is called below, but `time` is not imported by this cell (nor by
# any cell visible above), so on a fresh kernel the cell failed with NameError.
# Import it here so the cell runs on its own.
import time

# Stamp every plant with the wall-clock time at which it is processed, then
# print the resulting timestamp next to the plant name.
with beam.Pipeline() as pipeline:
    plant_events = (
        pipeline
        | 'Garden plants' >> beam.Create([
            {'name': 'Strawberry', 'event_id': 1},
            {'name': 'Carrot', 'event_id': 4},
            {'name': 'Artichoke', 'event_id': 2},
            {'name': 'Tomato', 'event_id': 3},
            {'name': 'Potato', 'event_id': 5},
        ])
        # time.time() is evaluated once per element, so elements may receive
        # slightly different timestamps (as the recorded output shows).
        | 'With timestamps' >> beam.Map(
            lambda plant: beam.window.TimestampedValue(plant, time.time()))
        | 'Get timestamp' >> beam.ParDo(GetTimestamp())
        | beam.Map(print)
    )


2020-03-10 14:43:03.832376 - Strawberry
2020-03-10 14:43:03.836376 - Carrot
2020-03-10 14:43:03.836376 - Artichoke
2020-03-10 14:43:03.836376 - Tomato
2020-03-10 14:43:03.836376 - Potato


In [39]:
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

# (name, email) pairs; a name may appear more than once.
emails_list = [
    ('amy', 'amy@example.com'),
    ('carl', 'carl@example.com'),
    ('julia', 'julia@example.com'),
    ('carl', 'carl@email.com'),
]

# (name, phone) pairs; a name may appear more than once.
phones_list = [
    ('amy', '111-222-3333'),
    ('james', '222-333-4444'),
    ('amy', '333-444-5555'),
    ('carl', '444-555-6666'),
]


def print_contact(entry):
    """Print one joined record as 'name; [sorted emails]; [sorted phones]'."""
    name, grouped = entry
    emails_sorted = sorted(grouped['emails'])
    phones_sorted = sorted(grouped['phones'])
    print('%s; %s; %s' % (name, emails_sorted, phones_sorted))


# Join the two keyed collections by name and print one line per name.
with beam.Pipeline(options=PipelineOptions()) as p:
    emails = p | 'CreateEmails' >> beam.Create(emails_list)
    phones = p | 'CreatePhones' >> beam.Create(phones_list)
    # The dict keys become the tags under which each input's values are grouped.
    results = {'emails': emails, 'phones': phones} | beam.CoGroupByKey()
    results | beam.Map(print_contact)
 

amy; ['amy@example.com']; ['111-222-3333', '333-444-5555']
james; []; ['222-333-4444']
carl; ['carl@email.com', 'carl@example.com']; ['444-555-6666']
julia; ['julia@example.com']; []
