#### Script: Dataflow Basics

Description: Notebook where we will see the functioning of each transformation discussed during the theory.

EDEM. Master Data Analytics<br>
Professor: Javi Briones

### Setup

In [3]:
# GCP Auth

# Local
!gcloud auth application-default login

# Google Colab
# from google.colab import auth
# auth.authenticate_user()

"gcloud" no se reconoce como un comando interno o externo,
programa o archivo por lotes ejecutable.


In [1]:
# Install requirements
!pip3 install "apache_beam[interactive]"

Collecting apache_beam[interactive]
  Downloading apache_beam-2.53.0-cp39-cp39-win_amd64.whl.metadata (6.7 kB)
Collecting crcmod<2.0,>=1.7 (from apache_beam[interactive])
  Downloading crcmod-1.7.tar.gz (89 kB)
     ---------------------------------------- 0.0/89.7 kB ? eta -:--:--
     ------------------------------------ --- 81.9/89.7 kB 1.5 MB/s eta 0:00:01
     ---------------------------------------- 89.7/89.7 kB 1.7 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting orjson<4,>=3.9.7 (from apache_beam[interactive])
  Downloading orjson-3.9.12-cp39-none-win_amd64.whl.metadata (50 kB)
     ---------------------------------------- 0.0/50.6 kB ? eta -:--:--
     ---------------------

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
scipy 1.7.1 requires numpy<1.23.0,>=1.16.5, but you have numpy 1.24.4 which is incompatible.

[notice] A new release of pip is available: 23.3.1 -> 23.3.2
[notice] To update, run: C:\Users\DELL\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip


In [1]:
# Import Python Libraries
import logging
import apache_beam as beam
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner
import apache_beam.runners.interactive.interactive_beam as ib

### Beam Basics

<img src="../00_DocAux/.images/Beam_Pipeline.png" width="1000"/>

##### 01 Understanding basic concepts: PCollection, PTransform & Pipeline Object

In [2]:
with beam.Pipeline(InteractiveRunner()) as p:

    (p   
        | "Read Text from a File" >> beam.io.ReadFromText('../00_DocAux/input_text.txt')
        | "FlatMap" >> beam.FlatMap(lambda z: z.split())
        | "map" >> beam.Map(lambda x: (x,1))
        | "resultado" >> beam.CombinePerKey(sum)
        | "Show content" >> beam.Map(print))

('En', 1)
('un', 2)
('lugar', 1)
('de', 12)
('la', 1)
('Mancha,', 1)
('cuyo', 1)
('nombre', 1)
('no', 2)
('quiero', 1)
('acordarme,', 1)
('ha', 1)
('mucho', 1)
('tiempo', 1)
('que', 2)
('vivía', 1)
('hidalgo', 1)
('los', 5)
('lanza', 1)
('en', 1)
('astillero,', 1)
('adarga', 1)
('antigua,', 1)
('rocín', 1)
('flaco', 1)
('y', 2)
('galgo', 1)
('corredor.', 1)
('Una', 1)
('olla', 1)
('algo', 1)
('más', 3)
('vaca', 1)
('carnero,', 1)
('salpicón', 1)
('las', 3)
('noches,', 1)
('duelos', 1)
('quebrantos', 1)
('sábados,', 1)
('lantejas', 1)
('viernes,', 1)
('algún', 1)
('palomino', 1)
('añadidura', 1)
('domingos,', 1)
('consumían', 1)
('tres', 1)
('partes', 1)
('su', 2)
('hacienda.', 1)
('El', 1)
('resto', 1)
('della', 1)
('concluían', 1)
('sayo', 1)
('velarte,', 1)
('calzas', 1)
('velludo', 1)
('para', 1)
('fiestas', 1)
('con', 2)
('sus', 1)
('pantuflos', 1)
('lo', 2)
('mismo,', 1)
('días', 1)
('entre', 1)
('semana', 1)
('se', 1)
('honraba', 1)
('vellorí', 1)
('fino.', 1)


##### 02 Understanding Core Transformations: DoFn & Map

In [8]:
# Map
def edem_map(element, num):
    return element * num

# DoFn
class edemDoFn(beam.DoFn):

    def __init__(self, num):
        self.num_ = num

    def process(self, element):
        yield element * self.num_

# Pipeline
with beam.Pipeline(InteractiveRunner()) as p:
  data = (
      p 
        | "Create a PCollection" >> beam.Create([1,2,3,4,5])
        | "Map" >> beam.Map(edem_map, num=2)
        | "DoFn" >> beam.ParDo(edemDoFn(4))
        | "Print" >> beam.Map(print)
  )

8
16
24
32
40


In [9]:
# PTransform
class edem_PTransform(beam.PTransform):

    # Map
    def edem_map(element):
        return element * 2

    # DoFn
    class edemDoFn(beam.DoFn):

        def process(self, element, num):
            yield element * num
    
    def expand(self,PColl):
        
        PColl_ = (PColl 
            | "Map" >> beam.Map(lambda x: x * 2)
            | "ParDo" >> beam.ParDo(edemDoFn(), num=4)
            | "Print" >> beam.Map(print))
        
        yield PColl_

# Pipeline
with beam.Pipeline(InteractiveRunner()) as p:
    data = (
        p 
            | "Create a PCollection" >> beam.Create([1,2,3,4,5])
    )
    
    data | edem_PTransform() | "Print" >> beam.Map(print)

usage: ipykernel_launcher.py [-h] [--dataflow_endpoint DATAFLOW_ENDPOINT]
                             [--project PROJECT] [--job_name JOB_NAME]
                             [--staging_location STAGING_LOCATION]
                             [--temp_location TEMP_LOCATION] [--region REGION]
                             [--service_account_email SERVICE_ACCOUNT_EMAIL]
                             [--no_auth]
                             [--template_location TEMPLATE_LOCATION]
                             [--label LABELS] [--update]
                             [--transform_name_mapping TRANSFORM_NAME_MAPPING]
                             [--enable_streaming_engine]
                             [--dataflow_kms_key DATAFLOW_KMS_KEY]
                             [--create_from_snapshot CREATE_FROM_SNAPSHOT]
                             [--flexrs_goal {COST_OPTIMIZED,SPEED_OPTIMIZED}]
                             [--dataflow_service_option DATAFLOW_SERVICE_OPTIONS]
                           

AttributeError: 'tuple' object has no attribute 'tb_frame'

##### 03 DoFn Lifecycle

In [None]:
from datetime import datetime
class DoFnLifeCycle(beam.DoFn):

  def now(self):
    self._now = datetime.now()
    return self._now

  def __init__(self):
    print("Constructor started at: %s" % self.now())

  def setup(self):
    print("worker started at: %s" % self.now())

  def start_bundle(self):
    print("bundle started at: %s" % self.now())

  def process(self, element):
    words = element.split()
    for word in words:
      print("Processing element: %s" % word)
      yield word.upper()

  def finish_bundle(self):
    print("bundle finished at: %s" % self.now())

  def teardown(self):
    print("worker finished at: %s" % self.now())

with beam.Pipeline(InteractiveRunner()) as p:
  input_data = (
      p 
        | "Reading the input file" >> beam.io.ReadFromText('../00_DocAux/input_text.txt')
        | "DoFn Life Cycle" >> beam.ParDo(DoFnLifeCycle())
  )

##### 04 Transformations

In [None]:
# GroupByKey
with beam.Pipeline(InteractiveRunner()) as p:

    data = (p | "PCollection" >> beam.Create([('Spain', 'Valencia'), ('Spain','Barcelona'), ('France', 'Paris')]))

    (data 
        | "Combined" >> beam.GroupByKey()
        | "Print" >> beam.Map(print))

In [None]:
# CoGroupByKey
with beam.Pipeline(InteractiveRunner()) as p:

    p1 = p | "PCollection 01" >> beam.Create([('Spain', 'Valencia'), ('Spain','Barcelona'), ('France', 'Paris')])
    p2 = p | "PCollection 02" >> beam.Create([('Spain', 'Madrid'), ('Spain','Alicante'), ('France', 'Lyon')])

    data = ((p1,p2) | beam.CoGroupByKey())

    data | "Print" >> beam.Map(print)

In [None]:
# Combine
with beam.Pipeline(InteractiveRunner()) as p:

    data = (p | "PCollection" >> beam.Create([('User1', 1), ('User2', 5), ('User1', 7)]))

    (data 
        | "Combined" >> beam.CombinePerKey(sum)
        | "Print" >> beam.Map(print))

In [None]:
# Flatten
with beam.Pipeline(InteractiveRunner()) as p:

    p1 = p | "PCollection 01" >> beam.Create(['New York', 'Los Angeles', 'Miami', 'Chicago'])
    p2 = p | "Pcollection 02" >> beam.Create(['Madrid', 'Barcelona', 'Valencia', 'Malaga'])
    p3 = p | "Pcollection 03" >> beam.Create(['London','Manchester', 'Liverpool'])

    merged = ((p1,p2,p3)| beam.Flatten())

    merged | beam.Map(print)

In [None]:
# Partition
countries = ['Spain', 'USA', 'Switzerland']

def partition_fn(country,num_countries):
    return countries.index(country['country'])

with beam.Pipeline(InteractiveRunner()) as p:

        p1,p2,p3 = (
                p 
                | "PCollection" >> beam.Create([
                        {'country': 'Spain', 'city': 'Valencia'},
                        {'country': 'Spain', 'city': 'Barcelona'},
                        {'country': 'USA', 'city': 'New York'},
                        {'country': 'Switzerland', 'city': 'Zurich'},
                        {'country': 'Switzerland', 'city': 'Geneva'}  
                ])
                | "partition" >> beam.Partition(partition_fn, len(countries))
        )

        p3 | "PCollection for Spain" >> beam.Map(print)
        

##### 05 Streaming

In [5]:
# PubSub
from apache_beam.options.pipeline_options import PipelineOptions

with beam.Pipeline(options=PipelineOptions(streaming=True)) as p:

    data = (p | "ReadFromPubSub" >> beam.io.ReadFromPubSub(subscription='projects/dataflow-1-411618/subscriptions/new_topic-sub'))

    data | beam.Map(print)

usage: ipykernel_launcher.py [-h] [--dataflow_endpoint DATAFLOW_ENDPOINT]
                             [--project PROJECT] [--job_name JOB_NAME]
                             [--staging_location STAGING_LOCATION]
                             [--temp_location TEMP_LOCATION] [--region REGION]
                             [--service_account_email SERVICE_ACCOUNT_EMAIL]
                             [--no_auth]
                             [--template_location TEMPLATE_LOCATION]
                             [--label LABELS] [--update]
                             [--transform_name_mapping TRANSFORM_NAME_MAPPING]
                             [--enable_streaming_engine]
                             [--dataflow_kms_key DATAFLOW_KMS_KEY]
                             [--create_from_snapshot CREATE_FROM_SNAPSHOT]
                             [--flexrs_goal {COST_OPTIMIZED,SPEED_OPTIMIZED}]
                             [--dataflow_service_option DATAFLOW_SERVICE_OPTIONS]
                           

AttributeError: 'tuple' object has no attribute 'tb_frame'

In [4]:
!pip install google-cloud-PubSub


Collecting google-cloud-PubSub
  Obtaining dependency information for google-cloud-PubSub from https://files.pythonhosted.org/packages/1e/5f/0d7fb021e4e5a0da2fc6fdfe7cb89e4f9c14f832c34ea2af8c4834c3c7fd/google_cloud_pubsub-2.19.0-py2.py3-none-any.whl.metadata
  Downloading google_cloud_pubsub-2.19.0-py2.py3-none-any.whl.metadata (9.3 kB)
Collecting grpcio<2.0dev,>=1.51.3 (from google-cloud-PubSub)
  Obtaining dependency information for grpcio<2.0dev,>=1.51.3 from https://files.pythonhosted.org/packages/6a/b9/f94bea4c6f0e322a239f7ba66ba3b0ce766d1c6a2d50055f7c8acf0fba38/grpcio-1.60.0-cp311-cp311-win_amd64.whl.metadata
  Downloading grpcio-1.60.0-cp311-cp311-win_amd64.whl.metadata (4.2 kB)
Downloading google_cloud_pubsub-2.19.0-py2.py3-none-any.whl (265 kB)
   ---------------------------------------- 0.0/265.3 kB ? eta -:--:--
   - -------------------------------------- 10.2/265.3 kB ? eta -:--:--
   ------------- -------------------------- 92.2/265.3 kB 1.1 MB/s eta 0:00:01
   -----------