In [0]:
# S3 Access Name: Required
# S3 File Path: Required [Full S3 URI, e.g. s3://bucket/prefix/filename.csv. Patterns accepted: s3://bucket/prefix/filename_yyyymmdd.txt/csv, filename_yyyymmddHHMMSS.txt/csv] [Note: If the file name is omitted, all files at the path will be used]
# File Delimiter: Defaults to comma [comma: ',' tab: '\t', 'pipe': '|']
# Target DB: Defaults to MySQL
# DB Access Name: Required
# DB Target Table: Required [schema_name.table_name]
# Load Type: Defaults to Append
# Header Row: Defaults to True
# Email Notification: Defaults to skip [Requires email id to send notification]

In [0]:
# #Source Inputs
# dbutils.widgets.removeAll()
# dbutils.widgets.text("Source_S3_Access", "")
# dbutils.widgets.text("Source_S3_File_Path", "")
# dbutils.widgets.dropdown("Header_Row", "True", ["True", "False"])
# dbutils.widgets.text("File_Delimiter", ",")

# #Target Inputs
# dbutils.widgets.dropdown("Target_DB", "MySQL", ["MySQL", "PSQL"])
# dbutils.widgets.dropdown("Load_Type", "Append", ["Append", "Overwrite"])
# dbutils.widgets.text("Target_DB_Access", "")
# dbutils.widgets.text("Target_DB_Table", "")
# dbutils.widgets.text("Notification_Recipient","")

In [0]:
%run ./configs

In [0]:
# Collect the notebook arguments (free-text values are trimmed)
source_access = getArgument('Source_S3_Access').strip()
source_s3_path = getArgument('Source_S3_File_Path').strip()
target_db = getArgument('Target_DB')
target_access = getArgument('Target_DB_Access').strip()
target_table = getArgument('Target_DB_Table').strip()
load_type = getArgument('Load_Type')
header = getArgument('Header_Row')
delimiter = getArgument('File_Delimiter').strip()
notify_recipient = getArgument('Notification_Recipient').strip()
s3_conn = dict()
db_conn = dict()

# Mandatory-input checks, in priority order: only the first failing rule is reported
input_rules = [
  (source_access == '', "Source S3 Access name missing!"),
  (source_s3_path == '', "Source S3 File Path missing!"),
  (target_access == '', "Target DB Access Name missing!"),
  (target_table == '', "Target Table missing!"),
  (len(target_table.split('.')) != 2, "Schema name missing in target table!"),
]
first_problem = next((message for failed, message in input_rules if failed), None)
if first_problem is not None:
  dbutils.notebook.exit(first_problem)

# Resolve the S3 credentials registered under the given access name
if source_access not in s3_access_list:
  dbutils.notebook.exit("Invalid S3 Access Name!")
else:
  s3_conn['s3_access_key'] = s3_access_keys[source_access]
  s3_conn['s3_secret_key'] = s3_secret_keys[source_access]

# Resolve the database connection details registered under the given access name
if target_access not in db_access_list:
  dbutils.notebook.exit("Invalid DB Access Name!")
else:
  db_conn['db_host'] = db_hosts[target_access]
  db_conn['db_username'] = db_usernames[target_access]
  db_conn['db_password'] = db_passwords[target_access]
service = 'S3-DB'

In [0]:
import boto3
import uuid
import re
import datetime, time
import logging
# Create a logger
# getLogger() without a name returns the root logger, which is the logger the
# module-level logging.info(...) / logging.error(...) calls in this notebook use.
logger = logging.getLogger()
# Emit INFO and above so the progress messages are shown.
logger.setLevel(logging.INFO)

In [0]:
# Random identifier that ties together every run_log row written by this run
run_id = uuid.uuid4()
print(f'run_id: {run_id}')

run_id: be7a1a4a-b37c-46c9-9e16-64ae2d34cb50


In [0]:
def run_logger(service,log_op, opn, file_name, rc, status):
  """
  Record the outcome of one operation of the current run in the run_log table.

  Inputs:
    service   : service label stored with the row (e.g. 'S3-DB')
    log_op    : 'insert' to add a row, 'update' to change the status of the row
                already logged for this run_id and operation
    opn       : operation name (callers in this notebook pass 'read' / 'write')
    file_name : source file name(s) stored with the row
    rc        : record count; None or a blank string is stored as NULL
    status    : status text (callers in this notebook pass 'success' / 'failed')
  Output: Returns True once the statement has been executed
  Raises: ValueError for an unknown log_op or a non-numeric record count

  Also reads the notebook-level names run_id, source_access, source_s3_path,
  target_access, target_table and the `spark` session.
  """
  logging.info(f"Updating run log for {opn} operation....")

  def _quoted(value):
    # Spark SQL string literals use backslash escapes by default, so escape
    # backslashes and single quotes before embedding user-supplied text.
    text = str(value).replace("\\", "\\\\").replace("'", "\\'")
    return f"'{text}'"

  if log_op == 'insert':
    # A blank count used to render as ",," which is not valid SQL.
    if rc is None or str(rc).strip() == '':
      count_sql = 'NULL'
    else:
      count_sql = str(int(rc))
    values = ','.join([
      _quoted(service), _quoted(run_id), _quoted(source_access), _quoted(source_s3_path),
      _quoted(file_name), _quoted(opn), count_sql, _quoted(target_access),
      _quoted(target_table), _quoted(status), 'current_timestamp()',
    ])
    query = f"INSERT INTO TABLE run_log VALUES({values})"
  elif log_op == 'update':
    query = f"UPDATE run_log SET status={_quoted(status)} where run_id = {_quoted(run_id)} and operation={_quoted(opn)}"
  else:
    # Previously fell through to an UnboundLocalError on `query`.
    raise ValueError(f"Unsupported log_op: {log_op!r} (expected 'insert' or 'update')")

  # Executing SQL Query
  spark.sql(query)
  return True

In [0]:
def file_pattern_check(source_s3_path,s3_conn):
  """
  Resolve a date/timestamp file-name pattern to the newest matching S3 object.

  Inputs: S3 file path of the form s3://bucket/prefix/name_yyyymmdd.ext or
          s3://bucket/prefix/name_yyyymmddHHMMSS.ext,
          S3 connection details (dict with 's3_access_key' and 's3_secret_key')
  Output: Name (without prefix) of the matching file with the greatest
          date/timestamp suffix, or None when nothing matches

  Only .txt / .csv / .parquet objects located directly under the prefix are
  considered; objects in nested folders are ignored.
  """
  logging.info('Executing File pattern check......')

  # 's3://bucket/a/b/file' splits into ['s3:', '', 'bucket', 'a', 'b', 'file']
  path_parts = source_s3_path.split('/')
  bucket_name = path_parts[2]
  prefix = '/'.join(path_parts[3:-1])
  # Everything before the last '_' is the fixed stem of the file name
  file_name_pattern = '_'.join(path_parts[-1].split('_')[:-1])
  file_format = source_s3_path.split('.')[-1]

  # Check the longer token first: 'yyyymmdd' is a substring of 'yyyymmddHHMMSS'
  if 'yyyymmddHHMMSS' in source_s3_path:
    digits = 14
  elif 'yyyymmdd' in source_s3_path:
    digits = 8
  else:
    digits = None

  if digits is None:
    logging.info(f"No files found at {source_s3_path} for input file pattern")
    return None

  # Escape the literal parts so characters such as '.' or '+' in the file name
  # are not interpreted as regex syntax.
  pattern = rf"{re.escape(file_name_pattern)}_\d{{{digits}}}\.{re.escape(file_format)}"

  # Creating boto3 client
  s3 = boto3.client(
    's3',
    aws_access_key_id=s3_conn['s3_access_key'],
    aws_secret_access_key=s3_conn['s3_secret_key'],
  )

  # Paginate so locations holding more than one page of keys are fully scanned;
  # a page without 'Contents' (nothing under the prefix) contributes no files.
  match_list = []
  paginator = s3.get_paginator('list_objects_v2')
  for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
    for item in page.get('Contents', []):
      folder, _, obj_name = item['Key'].rpartition('/')
      # Keep only direct children of the requested prefix, whatever its depth
      if folder != prefix:
        continue
      if not obj_name.endswith(('.txt', '.csv', '.parquet')):
        continue
      if re.fullmatch(pattern, obj_name):
        match_list.append(obj_name)

  if len(match_list) == 0:
    logging.info(f"No files found at {source_s3_path} for input file pattern")
    return None

  # Names share the same stem and a fixed-width numeric suffix, so the
  # lexicographic maximum is the latest date/timestamp.
  latest_file = max(match_list)
  logging.info(f"Latest file for pattern - {pattern} at s3://{bucket_name}/{prefix}: {latest_file}")
  return latest_file
  

In [0]:
def s3_file_read(s3_conn, source_s3_path) :

  """
  Read the file(s) at an S3 path into a Spark DataFrame and log the outcome.

  Three path shapes are handled:
    * file name containing 'yyyymmdd' / 'yyyymmddHHMMSS': the newest matching
      file is resolved through file_pattern_check
    * explicit .txt / .csv / .parquet file: read as given
    * anything else: treated as a folder, and the .txt / .csv / .parquet
      files directly under it are read together

  Input: S3 connection keys (dict with 's3_access_key' and 's3_secret_key'),
         S3 File Path of the form s3://bucket/prefix/file
  Output: [Spark DataFrame, file name] on success, None on failure.
          For a folder the file name is the '|'-joined list of files (blank
          when the folder is read as parquet).

  Header and delimiter options are taken from the Header_Row and
  File_Delimiter notebook arguments; a blank delimiter falls back to comma.
  Every outcome is recorded as a 'read' row through run_logger.
  """
  logging.info('Executing S3 File read.......')

  # Accessing keys from connection inputs
  access_key = s3_conn['s3_access_key']
  secret_key = s3_conn['s3_secret_key']

  # Setting Spark configs to access S3
  hadoop_conf = sc._jsc.hadoopConfiguration()
  hadoop_conf.set("fs.s3n.awsAccessKeyId", access_key)
  hadoop_conf.set("fs.s3n.awsSecretAccessKey", secret_key)

  # 's3://bucket/a/file' splits into ['s3:', '', 'bucket', 'a', 'file'];
  # a path with fewer parts has no bucket and is rejected instead of raising.
  s3_parts = source_s3_path.split('/')
  bucket_name = s3_parts[2] if len(s3_parts) > 2 else ''

  if bucket_name == '':
    logging.error("Invalid S3 File Path")
    # Record count must be numeric: a blank value produced invalid SQL in the log insert
    run_logger('S3-DB','insert','read','',0,'failed')
    return None

  header = getArgument("Header_Row")
  raw_delimiter = getArgument("File_Delimiter")
  # strip() would erase a literal tab, so detect it before defaulting to comma
  delimiter = raw_delimiter.strip() or ('\t' if '\t' in raw_delimiter else ',')

  supported = ('.txt', '.csv', '.parquet')

  def _load(path, file_format):
    # Parquet carries its own schema; everything else is parsed as delimited text
    if file_format == 'parquet':
      return spark.read.parquet(path)
    return (
      spark.read
      .option("inferSchema", "true")
      .option("header", header)
      .option("delimiter", delimiter)
      .csv(path)
    )

  # Check if file name is present in file path & create df accordingly
  if source_s3_path.endswith(supported):
    if 'yyyymmdd' in source_s3_path:
      # Calling file pattern check function to get latest file
      file_name = file_pattern_check(source_s3_path, s3_conn)
      if not file_name:
        logging.error("No latest file found")
        run_logger('S3-DB','insert','read','',0,'failed')
        return None
      # Join prefix and name as one key so a bucket-root file does not get '//'
      object_key = '/'.join(s3_parts[3:-1] + [file_name])
      file_path = f"s3://{bucket_name}/{object_key}"
    else:
      file_name = s3_parts[-1]
      file_path = source_s3_path

    # Extension is whatever follows the LAST dot (names may contain other dots)
    file_format = file_name.rsplit('.', 1)[-1]
    df = _load(file_path, file_format)
    rc = df.count()
    # Calling run logger to insert record in log table
    run_logger('S3-DB','insert','read',file_name,rc,'success')
    return [df,file_name]

  # No file name in the path: list the supported files directly under it
  listing_prefix = '/'.join(s3_parts[3:])
  folder = listing_prefix.strip('/')

  # Creating boto3 client
  s3 = boto3.client('s3', aws_access_key_id=access_key, aws_secret_access_key=secret_key)
  try:
    obj_list = s3.list_objects(Bucket=bucket_name, Prefix=listing_prefix)
  except Exception as e:
    logging.error(f"Error during listing files at s3: {e}")
    run_logger('S3-DB','insert','read','',0,'failed')
    return None

  # An empty location has no 'Contents' key; treat it as "no files"
  keys = [item['Key'] for item in obj_list.get('Contents', [])]
  file_names = [
    key.rpartition('/')[2]
    for key in keys
    if key.endswith(supported) and key.rpartition('/')[0] == folder
  ]

  if not file_names:
    logging.info("No files found at source s3 ......")
    run_logger('S3-DB','insert','read','',0,'failed')
    return None

  # The read format follows the last listed file, as before
  file_format = file_names[-1].split('.')[-1]
  df = _load(source_s3_path, file_format)
  # Parquet folder reads report a blank file name (behaviour kept from the original)
  file_name = '' if file_format == 'parquet' else '|'.join(file_names)
  rc = df.count()

  logging.info("Completed reading files to df ........")
  run_logger('S3-DB','insert','read','MultipleFiles',rc,'success')
  return [df,file_name]

In [0]:
def db_file_write(db_conn, target_table, input_df):

  """
  Write a Spark DataFrame to the target database table over JDBC.

  Input: Target access connection details (dict with 'db_host', 'db_username',
         'db_password'), Target Database table name as schema_name.table_name,
         Spark df from reader function
  Output: Record count of the input DataFrame
  Raises: ValueError when the notebook-level target_db is not 'MySQL' or 'PSQL'

  The notebook-level load_type is passed to the writer as the save mode.
  The part of target_table before the dot is used as the database name in the
  JDBC URL and the part after it as the table name.
  """
  logging.info("Executing file writing function .....")

  # JDBC sub-protocol and port per supported target
  jdbc_targets = {'MySQL': ('mysql', 3306), 'PSQL': ('postgresql', 5432)}
  if target_db not in jdbc_targets:
    # Fail with a clear message before any Spark job runs; this used to
    # surface as an UnboundLocalError on `url` after a full count.
    raise ValueError(f"Unsupported target DB: {target_db!r} (expected one of {sorted(jdbc_targets)})")

  table_parts = target_table.split('.')
  db_name = table_parts[0]
  dbtable = table_parts[1]

  # Preparing url string for JDBC connection
  protocol, port = jdbc_targets[target_db]
  url = f"jdbc:{protocol}://{db_conn['db_host']}:{port}/{db_name}"

  # Input record count
  rc = input_df.count()

  # Writing input df to target table
  (
  input_df.write
  .format("jdbc")
  .option("url", url)
  .option("dbtable",dbtable)
  .option("user", db_conn['db_username'])
  .option("password", db_conn['db_password'])
  .mode(load_type)
  .save()
  )
  logging.info("Completed writing to table.......")
  return rc


In [0]:
def s3_db(s3_conn, source_s3_path, db_conn, target_table):
  
  """
  Main function to call read and write functions
  Inputs: source parameters (S3 connection details, S3 file path),
          target parameters (DB connection details, schema_name.table_name)
  Output: record count written to the target table, or None when the read
          step produced nothing to write

  A 'write' row is added to the run log for every outcome. If the write
  itself raises, a 'failed' row is logged and the exception is re-raised.
  """
  inputs = s3_file_read(s3_conn, source_s3_path)
  if not inputs:
    # Nothing was read, so the write step is recorded as failed
    run_logger('S3-DB','insert','write','',0,'failed')
    return None

  input_df, file_name = inputs[0], inputs[1]
  try:
    result = db_file_write(db_conn, target_table, input_df)
  except Exception:
    # Leave a trace of the failed write in the run log before propagating
    run_logger('S3-DB','insert','write',file_name,0,'failed')
    raise

  # Compare against None explicitly: a successful write of 0 records is
  # still a success and must not be reported as a failure.
  if result is not None:
    run_logger('S3-DB','insert','write',file_name,result,'success')
  else:
    run_logger('S3-DB','insert','write','',0,'failed')

  return result

In [0]:
# Calling main function
status = s3_db(s3_conn, source_s3_path, db_conn, target_table)

# Checking status
# s3_db returns the record count, or None when nothing could be read; compare
# with None so that a transfer of 0 records is not reported as a failure.
if status is not None:
  logging.info(f"{status} records transferred from {source_s3_path} to {target_table}")
else:
  logging.error("Failed to transfer file from S3 to Target table")

INFO:root:Executing S3 File read.......
INFO:root:Executing File pattern check......
INFO:root:Latest file for pattern - sftp_\d{8}\.txt at s3://sdevalla-portfolio/orch_test: sftp_20240323.txt
INFO:root:Updating run log for read operation....
INFO:root:Executing file writing function .....
INFO:root:Completed writing to table.......
INFO:root:Updating run log for write operation....
INFO:root:918 records transferred from s3://sdevalla-portfolio/orch_test/sftp_yyyymmdd.txt to reddit.reddit_posts_agg_c


In [0]:
from botocore.exceptions import ClientError
def send_email(subject, body_html, sender, recipients, region_name='us-east-2'):

  """
  Send the run-status notification through Amazon SES.

  Inputs:
    subject     : email subject line
    body_html   : HTML body of the email
    sender      : address used as the SES Source
    recipients  : list of receiver email addresses
    region_name : AWS region of the SES endpoint (defaults to 'us-east-2')
  Output: SES message id when the email was sent, None when SES rejected the
          request (the SES error message is printed in that case)

  Credentials are read from the notebook-level `ses_conn` dict
  ('access_key' / 'secret_key'), which is not defined in this notebook's own
  cells - presumably it comes from ./configs.
  """
  access_key = ses_conn['access_key']
  secret_key = ses_conn['secret_key']

  # Creating boto3 ses client
  ses_client = boto3.client(
    'ses',
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key,
    region_name=region_name,
  )

  # Plain-text fallback shown by clients that cannot render the HTML part
  body_text = "This email requires HTML support. Please view in a HTML-compatible email client."
  charset = "UTF-8"

  # Assemble and send the email
  try:
    response = ses_client.send_email(
      Destination={
        'ToAddresses': recipients,
      },
      Message={
        'Body': {
          'Html': {
            'Charset': charset,
            'Data': body_html,
          },
          'Text': {
            'Charset': charset,
            'Data': body_text,
          },
        },
        'Subject': {
          'Charset': charset,
          'Data': subject,
        },
      },
      Source=sender,
    )
  except ClientError as e:
    print(e.response['Error']['Message'])
    return None

  print("Email sent! Message ID:", response['MessageId'])
  return response['MessageId']

In [0]:
def dataframe_to_html_table(df):
    """
    Render a pandas DataFrame as an HTML document holding one styled table.

    The row index is left out, and every plain <th> header cell is given an
    inline style (orange background, white text).
    Returns the markup wrapped in <html><body>...</body></html>.
    """
    header_cell = '<th style="background-color: #FB451D; color: white;">'
    # Swap the unstyled header cells produced by to_html for the styled ones
    table_markup = df.to_html(index=False).replace('<th>', header_cell)
    return '<html><body>' + table_markup + '</body></html>'

In [0]:
# Build the run-status report for this run_id from the run log
subject = f"{service} run status for run id - {run_id}"
df = spark.sql(""" select distinct service as Service, source_path_table as `Source_S3`, source_file_dml as `Source File`, operation as Task,  record_count as Record_Count, target_path_table as Target_Table, Status, Timestamp from run_log where run_id = '{}' order by task """.format(run_id ))
html_table = dataframe_to_html_table(df.toPandas())
body_html = f"<html><body><p>Hi,</p><p>Please find the status of service: {service} with run id: {run_id}</p>{html_table}</body></html>"
displayHTML(body_html)
sender = "noreplyd22snotification@gmail.com"

# Comma-separated recipient list: trim the whitespace around each address and
# drop empty entries so "a@x.com, b@x.com" or a trailing comma do not produce
# malformed addresses.
recipients = [address.strip() for address in notify_recipient.split(',') if address.strip()]

if recipients:
  send_email(subject, body_html, sender, recipients)
else:
  print('No recipients to send email!')

Service,Source_S3,Source File,Task,Record_Count,Target_Table,Status,Timestamp
S3-DB,s3://sdevalla-portfolio/orch_test/sftp_yyyymmdd.txt,sftp_20240323.txt,read,918,reddit.reddit_posts_agg_c,success,2024-03-23 16:25:11.939
S3-DB,s3://sdevalla-portfolio/orch_test/sftp_yyyymmdd.txt,sftp_20240323.txt,write,918,reddit.reddit_posts_agg_c,success,2024-03-23 16:26:03.274


No recipients to send email!
