In [2]:
import apache_beam as beam
beam.__version__

'2.61.0'

In [7]:
# Raise the root logger threshold to ERROR so that lower-severity
# log records (DEBUG/INFO/WARNING) are suppressed in the cell outputs.
import logging

logging.getLogger().setLevel(logging.ERROR)
     

In [9]:
## 1. Simple User-Defined Function (UDF)

In [16]:

# Plain Python list used as the input collection; its elements sum to 10111.
pc = [1, 10, 100, 10000]

# User-defined function
def bounded_sum(values, bound=500):
  """Return the sum of `values`, capped at `bound`.

  Args:
    values: iterable of numbers to add up.
    bound: upper limit for the returned total (default 500).

  Returns:
    The total of `values` if it does not exceed `bound`, otherwise `bound`.
  """
  total = sum(values)
  # Same selection rule as min(total, bound): the total wins ties.
  return bound if bound < total else total

# Apply CombineGlobally with the UDF directly to the list `pc`; the second call
# forwards `bound=5000` to bounded_sum. Each result prints as a one-element list.
small_sum = pc | beam.CombineGlobally(bounded_sum)  # [500]
large_sum = pc | beam.CombineGlobally(bounded_sum, bound=5000)  # [5000] (sum is 10111, capped at the bound)

print(small_sum, large_sum)

[500] [5000]


In [17]:
## 2. Transforms: ParDo and Combine
# A ParDo transform considers each element in the input PCollection,
# runs your user code on that element, and emits zero, one, or multiple elements to an output PCollection.
# Combine is another Beam transform, used for combining collections of elements or values in your data.
# Both accept flexible UDFs that define how the data is processed.

In [21]:
# Input elements for the ParDo example.
data = [1, 2, 3, 4, 5]
class MultiplyByFive(beam.DoFn):
  """DoFn that multiplies each input element by five.

  The per-element processing code of a DoFn lives in its `process` method,
  which every DoFn is required to define.
  """

  def process(self, element):
    """Emit a single output for `element`: the element multiplied by five."""
    product = element * 5
    yield product
        

In [24]:
# Build a pipeline that turns `data` into a PCollection, multiplies every
# element by five with the MultiplyByFive DoFn, and prints each result.
# The pipeline is run when the `with` block exits.
with beam.Pipeline() as pipeline:
    outputs = (
        pipeline
        | 'Create values' >> beam.Create(data)
        | 'Multiply by Five' >> beam.ParDo(MultiplyByFive())
    )
    outputs | beam.Map(print)

5
10
15
20
25
