In [21]:
import getpass
import os

import apache_beam as beam
import mysql.connector

In [22]:
class ReadFromMySQL(beam.DoFn):
    """DoFn that runs a fixed SQL query against MySQL and emits each result row.

    The connection and cursor are created in ``setup`` and released in
    ``teardown``. ``process`` executes the query once per input element and
    ignores the element's value.

    Args:
        host: MySQL server host name or address.
        database: Name of the database to connect to.
        user: User name used to authenticate.
        password: Password used to authenticate.
        query: SQL statement executed for every input element.
    """

    def __init__(self, host, database, user, password, query):
        self.host = host
        self.database = database
        self.user = user
        self.password = password
        self.query = query
        # Populated by setup(). Starting at None keeps teardown() safe even
        # when setup() never ran or failed part-way through.
        self.connection = None
        self.cursor = None

    def setup(self):
        """Open the MySQL connection and create the cursor used by ``process``."""
        self.connection = mysql.connector.connect(
            host=self.host,
            database=self.database,
            user=self.user,
            password=self.password
        )
        self.cursor = self.connection.cursor()

    def process(self, element):
        """Execute the configured query and yield its rows one at a time.

        Args:
            element: Triggering input element; its value is not used.

        Yields:
            Each row of the query result, as produced by the cursor.
        """
        self.cursor.execute(self.query)
        # Iterate the cursor instead of calling fetchall() so the whole result
        # set is never materialized as one in-memory list.
        for row in self.cursor:
            yield row

    def teardown(self):
        """Release the cursor and the connection, tolerating a failed setup."""
        try:
            if self.cursor is not None:
                self.cursor.close()
        finally:
            # Close the connection even if closing the cursor raised.
            if self.connection is not None:
                self.connection.close()
            self.cursor = None
            self.connection = None


In [None]:
from apache_beam.utils.windowed_value import WindowedValue
import apache_beam.transforms.window as window
import re
import json
from collections import Counter

class ParseAndAnalyzeSMS(beam.DoFn):
    """Extract ``(status, cost)`` from SMS log rows and count message words.

    Main output:
        One ``(status, cost)`` tuple per input row.

    Tagged output ``WORD_COUNT_TAG``:
        ``{"word": ..., "count": ...}`` dicts holding the word frequencies of
        one bundle, emitted from ``finish_bundle``. Counts are per bundle, so
        a consumer has to sum them per word to get totals. To receive them,
        apply the DoFn with
        ``beam.ParDo(ParseAndAnalyzeSMS()).with_outputs(
        ParseAndAnalyzeSMS.WORD_COUNT_TAG, main="status_cost")``.
    """

    # Tag under which per-bundle word counts are emitted.
    WORD_COUNT_TAG = "word_counts"

    def __init__(self):
        self.word_counter = Counter()  # Word frequencies of the current bundle

    def start_bundle(self):
        """Start every bundle with an empty counter so counts never carry over."""
        self.word_counter = Counter()

    def process(self, record):
        """Parse one SMS log row.

        Args:
            record: Sequence of exactly 13 fields; unpacking raises
                ``ValueError`` for any other length. The field order below is
                assumed to match the column order of the source query --
                TODO confirm against the table schema.

        Yields:
            ``(status, cost)`` taken from the row's ``SMSMessageResponse`` JSON,
            or ``("Unknown", 0.0)`` when that JSON cannot be interpreted.
        """
        (record_id, message, sender, recipient, source, sourceID, SMSMessageResponse,
         createdDt, updateDt, msgResponseID, row_status, blastID, SMSType) = record

        # Status and cost come from the provider response JSON, not from the
        # row's own status column.
        cost, status = self.process_cost_and_status(SMSMessageResponse)

        # Update word counter with tokenized words from the message
        self.word_counter.update(self.tokenize_message(message))

        # Yield status and cost for the main output
        yield (status, cost)

    @staticmethod
    def tokenize_message(message):
        """Tokenizes a message into words, ignoring punctuation and case.

        Returns an empty list for ``None`` or an empty message.
        """
        if not message:
            return []
        return re.findall(r'\w+', message.lower())

    def finish_bundle(self):
        """
        Emit this bundle's word counts on the ``WORD_COUNT_TAG`` output.

        Values produced here must be ``WindowedValue`` instances; wrapping them
        in ``TaggedOutput`` keeps them out of the main ``(status, cost)`` output.
        """
        for word, count in self.word_counter.most_common():
            yield beam.pvalue.TaggedOutput(
                self.WORD_COUNT_TAG,
                WindowedValue(
                    {"word": word, "count": count},
                    timestamp=0,
                    windows=[window.GlobalWindow()],
                ),
            )
        # Drop emitted counts so they cannot be emitted a second time.
        self.word_counter.clear()

    @staticmethod
    def process_cost_and_status(SMSMessageResponse):
        """Extract cost and status of the first recipient from a response JSON.

        Args:
            SMSMessageResponse: JSON text with a ``"Recipients"`` list whose
                first entry has a ``"cost"`` string (an optional ``"KES "``
                prefix is stripped) and a ``"status"`` value.

        Returns:
            ``[cost, status]`` with ``cost`` as a float, or ``[0.0, "Unknown"]``
            when the input is missing, is not valid JSON, or lacks the
            expected structure.
        """
        try:
            sms_response = json.loads(SMSMessageResponse)
            first_recipient = sms_response["Recipients"][0]
            cost = float(first_recipient["cost"].replace("KES ", ""))
            status = first_recipient["status"]
            return [cost, status]
        except (KeyError, ValueError, IndexError, TypeError, AttributeError,
                json.JSONDecodeError):
            # TypeError/AttributeError cover a NULL response and JSON whose
            # shape is not the expected dict -> list -> dict with a string cost.
            return [0.0, "Unknown"]

In [None]:
# Define pipeline options and inputs
# flags=[] stops Beam from parsing the notebook kernel's own sys.argv.
pipeline_options = beam.options.pipeline_options.PipelineOptions(flags=[])
host = "127.0.0.1"
database = 'defaultdb'
user = 'root'
# Keep the password out of the notebook: read it from MYSQL_PASSWORD, or
# prompt for it when the variable is not set (set it for unattended runs).
password = os.environ.get("MYSQL_PASSWORD") or getpass.getpass("MySQL password: ")
query = 'select * from smslog limit 20000;'

# Create the pipeline, actually passing the options defined above
with beam.Pipeline(options=pipeline_options) as pipeline:
    mysql_data = (
        pipeline
        | "Create Input" >> beam.Create([None])  # Single seed element: triggers one query run
        | "Read MySQL Data" >> beam.ParDo(ReadFromMySQL(host, database, user, password, query))
        | "Parse and Analyze SMS" >> beam.ParDo(ParseAndAnalyzeSMS())
        # Sum the cost per status without first collecting every value per key.
        | "Sum Costs Per Status" >> beam.CombinePerKey(sum)
        | "Print Results" >> beam.Map(print)
    )