In [0]:
# Source DB: Defaults to MySQL
# Source DB Access Name: Required
# Source DB Table: Required [schema_name.table_name] [If the DML uses multiple tables, enter the first table name]
# DML: Defaults to: SELECT * FROM Source-Table
# Target DB: Defaults to MySQL
# Target DB Access Name: Required
# Target DB Table: Required [schema_name.table_name] 
# Email Notification: Defaults to skip [Requires email id to send notification]

In [0]:
# #Source Inputs
# dbutils.widgets.removeAll()
# dbutils.widgets.dropdown("Source_DB", "MySQL", ["MySQL", "PSQL"])
# dbutils.widgets.text("Source_DB_Acces", "")
# dbutils.widgets.text("Source_DB_Table", "")
# dbutils.widgets.text("DML", "")
# dbutils.widgets.dropdown("Load_Type","Append",["Append","Overwrite"])

# #Target Inputs
# dbutils.widgets.dropdown("Target_DB", "MySQL", ["MySQL", "PSQL"])
# dbutils.widgets.text("Target_DB_Access", "")
# dbutils.widgets.text("Target_DB_Table", "")
# dbutils.widgets.text("Notification_Recipient","")

In [0]:
%run ./configs

In [0]:
# Pull every widget value for this run in one pass, trimming surrounding whitespace
(source_db, source_access, source_table, DML, load_type,
 target_db, target_access, target_table, notify_recipient) = (
  getArgument(widget_name).strip()
  for widget_name in (
    'Source_DB', 'Source_DB_Acces', 'Source_DB_Table', 'DML', 'Load_Type',
    'Target_DB', 'Target_DB_Access', 'Target_DB_Table', 'Notification_Recipient',
  )
)
# Connection details for source and target, filled in once the access names are verified
sdb_conn = {}
tdb_conn = {}

# Required-input checks, listed in priority order; only the first failing one is reported.
# A table name must contain exactly one '.' (schema_name.table_name).
validation_failures = (
  (source_access == '', "Source DB Access name missing!"),
  (source_table == '', "Source Table info missing!"),
  (source_table.count('.') != 1, "Schema name missing in source table!"),
  (target_access == '', "Target DB Access Name missing!"),
  (target_table == '', "Target Table info missing!"),
  (target_table.count('.') != 1, "Schema name missing in target table!"),
)
first_failure = next((message for failed, message in validation_failures if failed), None)
if first_failure is not None:
  dbutils.notebook.exit(first_failure)

# Resolve the source access name to its host and credentials
# (db_access_list / db_hosts / db_usernames / db_passwords are presumably defined by ./configs - TODO confirm)
if source_access not in db_access_list:
  dbutils.notebook.exit("Invalid Source DB Access Name!")
else:
  sdb_conn.update(
    db_host=db_hosts[source_access],
    db_username=db_usernames[source_access],
    db_password=db_passwords[source_access],
  )

# Same lookup for the target access name
if target_access not in db_access_list:
  dbutils.notebook.exit("Invalid Target DB Access Name!")
else:
  tdb_conn.update(
    db_host=db_hosts[target_access],
    db_username=db_usernames[target_access],
    db_password=db_passwords[target_access],
  )

# Service label used in the notification subject and body
service = 'DB-DB'

In [0]:
import boto3
import uuid
import datetime, time
from datetime import datetime
import logging
import re
# Create a logger
# getLogger() with no name returns the root logger; INFO is enabled on it so the
# logging.info(...) progress messages emitted by this notebook are not filtered out.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [0]:
# Random (version 4) UUID identifying this run; later cells use it for run_log rows and the notification
run_id = uuid.uuid4()
print(f"run_id: {run_id}")

run_id: 1a87326b-95e2-409b-a4db-c3bcb6570910


In [0]:
def run_logger(service,log_op, opn, DML, rc, target,status):
  """
  Records the outcome of one pipeline step in the run_log table.

  Inputs:
    service: service label stored with the row (e.g. 'DB-DB')
    log_op: 'insert' adds a new row; 'update' changes the status of the row
            already logged for this run_id and operation
    opn: operation name stored with the row / used to find the row to update
    DML: query text stored with the row. It is placed inside a single-quoted
         SQL literal, so the caller must escape any single quotes beforehand
    rc: record count stored with the row
    target: accepted for call compatibility; not used when building the statement
    status: status text to store
  Output: Returns True once the statement has been executed
  Raises: ValueError if log_op is neither 'insert' nor 'update'

  run_id, source_access, source_table, target_access and target_table are read
  from the notebook-level variables of the same names.
  """
  logging.info(f"Updating run log table for {opn} operation.....")
  if log_op == 'insert' :
    query = f"INSERT INTO TABLE run_log VALUES('{service}','{run_id}','{source_access}','{source_table}','{DML}','{opn}',{rc},'{target_access}','{target_table}','{status}',current_timestamp())"
  elif log_op == 'update':
    query = f"UPDATE run_log SET status='{status}' where run_id = '{run_id}' and operation='{opn}'"
  else:
    # Fail with a clear message instead of reaching spark.sql with no statement built
    raise ValueError(f"Unsupported log_op {log_op!r}; expected 'insert' or 'update'")
  # Executing SQL Query
  spark.sql(query)
  return True

In [0]:
def _jdbc_url(db_type, host, db_name):
  """Builds the JDBC url for a 'MySQL' or 'PSQL' database; raises ValueError for any other type."""
  if db_type == 'MySQL':
    return f"jdbc:mysql://{host}:3306/{db_name}"
  if db_type == 'PSQL':
    return f"jdbc:postgresql://{host}:5432/{db_name}"
  raise ValueError(f"Unsupported database type: {db_type}")


def table_data_read_write(sdb_conn, tdb_conn, source_table, DML, target_table, load_type):
  """
  Reads data from the source database over JDBC and writes it to the target table.

  Inputs:
    sdb_conn / tdb_conn: dicts with 'db_host', 'db_username' and 'db_password'
                         for the source and target databases
    source_table: 'schema_name.table_name'; the schema part is used as the
                  database name in the source JDBC url
    DML: query to read with; if blank, SELECT * FROM source_table is used
    target_table: 'schema_name.table_name'; the schema part is used as the
                  database name in the target JDBC url and the table part as
                  the JDBC dbtable
    load_type: save mode passed to the DataFrame writer (e.g. 'Append', 'Overwrite')
  Output: number of records read (and written) on success, None if the read or
          the write failed

  The database types are taken from the notebook-level variables source_db and
  target_db. Every read/write outcome is recorded through run_logger.
  """

  # Reading user input DML
  if DML == '':
    query = f"(SELECT * FROM {source_table})as query"
  else:
    query = '( '+ DML+' ) as query'

  print('Executable Query:',query)

  # run_logger embeds the query in a single-quoted SQL literal. Escape it exactly
  # once, up front, and use this copy for every log call: the JDBC read must get
  # the unescaped query, and escaping an already-escaped string corrupts it.
  log_query = query.replace("'", "\\'")

  sdb_name = source_table.split('.')[0]
  tdb_name = target_table.split('.')[0]
  tdbtable = target_table.split('.')[1]

  # Reading source query data. The count is taken inside the try as well, so a
  # failure while counting the rows is logged as a failed read too.
  try:
    surl = _jdbc_url(source_db, sdb_conn['db_host'], sdb_name)
    read_df = (spark.read
    .format("jdbc")
    .option("url", surl)
    .option("dbtable", query)
    .option("user", sdb_conn['db_username'])
    .option("password", sdb_conn['db_password'])
    .load()
    )
    rc = read_df.count()
  except Exception as e:
    logging.error(f"Unable to read data from table:{e}")
    run_logger('DB-DB','insert','read', log_query, 0, target_db ,'failed')
    return None

  # Reaching this point means load() and count() both succeeded, so no separate
  # "empty result object" branch is needed.
  run_logger('DB-DB','insert','read', log_query, rc, target_db ,'success')
  logging.info("Completed reading from table ......")

  # Writing read df to target table
  try:
    turl = _jdbc_url(target_db, tdb_conn['db_host'], tdb_name)
    (
    read_df.write
    .format("jdbc")
    .option("url", turl)
    .option("dbtable",tdbtable)
    .option("user", tdb_conn['db_username'])
    .option("password", tdb_conn['db_password'])
    .mode(load_type)
    .save()
    )
  except Exception as e:
    logging.error(f"Unable to write data to table: {e}")
    run_logger('DB-DB','insert','write', log_query, 0, target_db ,'failed')
    return None

  # Logged outside the try so a problem while logging is not recorded as a failed write
  run_logger('DB-DB','insert','write', log_query, rc, target_db ,'success')
  logging.info("Completed writing to target table.......")
  return rc

In [0]:
# Run the load; the function returns the record count on success and None on failure
status = table_data_read_write(sdb_conn, tdb_conn, source_table, DML, target_table, load_type)
# Checking run
# Compare against None rather than truthiness: a successful load of an empty
# result set returns 0, which must not be reported as a failure.
if status is not None:
  logging.info(f"{status} records loaded to table {target_table}")
else:
  logging.error("Data load failed to target table!!")

Executable Query: (SELECT * FROM reddit.reddit_posts_agg)as query


INFO:root:Updating run log table for read operation.....
INFO:root:Completed reading from table ......
INFO:root:Updating run log table for write operation.....
INFO:root:Completed writing to target table.......
INFO:root:2754 records loaded to table reddit.reddit_posts_agg_c


In [0]:
from botocore.exceptions import ClientError
def send_email(subject, body_html, sender, recipients):
  """
  Sends the run status notification to the recipients through Amazon SES.

  Inputs:
    subject: subject line of the email
    body_html: HTML body of the email
    sender: address used as the message source
    recipients: list of receiver email addresses
  Output: None. Prints the SES message id on success, or the error message
          returned by the service when the call is rejected.
  """
  # Credentials come from the notebook-level ses_conn mapping
  key_id = ses_conn['access_key']
  key_secret = ses_conn['secret_key']

  # boto3 SES client
  ses = boto3.client(
    'ses',
    aws_access_key_id=key_id,
    aws_secret_access_key=key_secret,
    region_name = 'us-east-2',
  )

  # Plain-text alternative shown by mail clients that cannot render HTML
  fallback_text = "This email requires HTML support. Please view in a HTML-compatible email client."
  encoding = "UTF-8"

  def _part(text):
    # One charset-tagged content element of the SES message structure
    return {'Charset': encoding, 'Data': text}

  message = {
    'Body': {
      'Html': _part(body_html),
      'Text': _part(fallback_text),
    },
    'Subject': _part(subject),
  }

  # Send, reporting a service-side rejection instead of raising
  try:
    response = ses.send_email(
      Destination={'ToAddresses': recipients},
      Message=message,
      Source=sender,
    )
  except ClientError as e:
    print(e.response['Error']['Message'])
    return
  print("Email sent! Message ID:", response['MessageId'])

In [0]:
def dataframe_to_html_table(df):
    """
    Renders a pandas DataFrame as an HTML document containing a single table.

    The index is left out, every plain <th> opening tag gets an inline style
    (orange background, white text), and the table is wrapped in <html><body>.
    Inputs: pandas DataFrame
    Output: HTML string
    """
    header_cell = '<th style="background-color: #FB451D; color: white;">'
    # Splitting on the bare tag and re-joining with the styled one swaps every occurrence
    table_markup = header_cell.join(df.to_html(index=False).split('<th>'))
    return '<html><body>' + table_markup + '</body></html>'

In [0]:
# Build the run status report for this run_id from the run_log table
subject = f"{service} run status for run id - {run_id}"
df = spark.sql(""" select distinct service as Service, source_path_table as `Source_Table`,source_file_dml as DML, operation as Task, record_count as Record_Count, target_path_table as Target_Table, Status, Timestamp from run_log where run_id = '{}' order by task """.format(run_id ))
html_table = dataframe_to_html_table(df.toPandas())
# NOTE(review): html_table already carries its own <html><body> wrapper, so the
# body below nests one document inside another; left unchanged here.
body_html = f"<html><body><p>Hi,</p><p>Please find the status of service: {service} with run id: {run_id}</p>{html_table}</body></html>"
displayHTML(body_html)
sender = "noreplyd22snotification@gmail.com"
# Comma-separated recipient list: trim each address and drop empty entries so
# "a@x.com, b@x.com" or a trailing comma does not yield invalid addresses
recipients = [address.strip() for address in notify_recipient.split(',') if address.strip()]
if recipients:
  send_email(subject, body_html, sender, recipients)
else:
  print('No recipients to send email!')

Service,Source_Table,DML,Task,Record_Count,Target_Table,Status,Timestamp
DB-DB,reddit.reddit_posts_agg,(SELECT * FROM reddit.reddit_posts_agg)as query,read,2754,reddit.reddit_posts_agg_c,success,2024-03-23 16:31:09.045
DB-DB,reddit.reddit_posts_agg,(SELECT * FROM reddit.reddit_posts_agg)as query,write,2754,reddit.reddit_posts_agg_c,success,2024-03-23 16:33:28.687


No recipients to send email!
