In [0]:
# Source S3 Access Name: Required
# Source S3 File Path: Required [Formats allowed: s3://bucket/prefix/filename.txt, s3://bucket/prefix/filename_yyyymmdd.txt, _yyyymmddHHMMSS.txt]
# Target S3 Access Name: Required
# Target S3 File Path: Required [Formats allowed: s3://bucket/prefix/filename.txt, s3://bucket/prefix/filename_yyyymmdd.txt, _yyyymmddHHMMSS.txt]
# [Note: If the Source S3 File Path is a folder/prefix rather than an exact file path, every object under it is copied with its original name and renaming is skipped]
# Email Notification: Defaults to skip [Requires email id to send notification]

In [0]:
# #Source Inputs
# dbutils.widgets.removeAll()
# dbutils.widgets.text("Source_S3_Access", "")
# dbutils.widgets.text("Source_S3_File_Path", "")

# #Target Inputs
# dbutils.widgets.text("Target_S3_Access", "")
# dbutils.widgets.text("Target_S3_File_Path", "")
# dbutils.widgets.text("Notification_Recipient","")

In [0]:
%run ./configs

In [0]:
# Collect the widget values supplied by the caller (surrounding whitespace removed)
source_access, source_s3_file, target_access, target_s3_file, notify_recipient = (
  getArgument(widget_name).strip()
  for widget_name in (
    'Source_S3_Access',
    'Source_S3_File_Path',
    'Target_S3_Access',
    'Target_S3_File_Path',
    'Notification_Recipient',
  )
)

# Connection dictionaries, filled in below once the access names are validated
ss3_conn = {}
ts3_conn = {}

# Mandatory inputs, checked in this order; the notebook exits on the first blank one
required_inputs = (
  (source_access, "Source S3 Access name missing!"),
  (source_s3_file, "Source File info missing!"),
  (target_access, "Target S3 Access Name missing!"),
  (target_s3_file, "Target S3 File Path missing!"),
)
missing_message = next((message for value, message in required_inputs if value == ''), None)
if missing_message is not None:
  dbutils.notebook.exit(missing_message)

# Look up the key pair registered for the source access name
if source_access not in s3_access_list:
  dbutils.notebook.exit("Invalid Source S3 Access Name!")
else:
  ss3_conn.update(
    s3_access_key=s3_access_keys[source_access],
    s3_secret_key=s3_secret_keys[source_access],
  )

# Look up the key pair registered for the target access name
if target_access not in s3_access_list:
  dbutils.notebook.exit("Invalid Target S3 Access Name!")
else:
  ts3_conn.update(
    s3_access_key=s3_access_keys[target_access],
    s3_secret_key=s3_secret_keys[target_access],
  )

# Service label used in the notification subject and body
service = 'S3-S3'

In [0]:
import boto3
import uuid
import datetime, time
from datetime import datetime
import logging
import re
# Root logger with its threshold set to INFO, so INFO-level messages are not filtered out by the logger
logger = logging.getLogger()
logger.setLevel(level=logging.INFO)

In [0]:
# Random identifier for this execution; it is written to every run_log row and used to query them back
run_id = uuid.uuid4()
print(f"run_id: {run_id}")

run_id: b9836623-199b-4ad4-a12e-57051337161e


In [0]:
def _sql_literal(value):
  """Render value as a single-quoted SQL string literal, backslash-escaping backslashes and quotes."""
  # Backslash escaping matches Spark SQL's default string-literal parsing
  text = str(value).replace('\\', '\\\\').replace("'", "\\'")
  return f"'{text}'"


def run_logger(service,log_op, opn, srcfilename, rc, target,status):
  """
  Write or update a row of the run_log table for the current run.

  Inputs:
    service     - service label stored with the row (e.g. 'S3-S3')
    log_op      - 'insert' to add a new row, 'update' to change the status of the
                  row matching the current run_id and operation
    opn         - operation name (e.g. 'copy')
    srcfilename - source file name stored with the row (insert only)
    rc          - count stored with the row (insert only); coerced to int
    target      - target path stored with the row (insert only)
    status      - status text (e.g. 'success' / 'failed')
  The notebook-level values run_id, source_access, source_s3_file and
  target_access are also written to the row.
  Output: Returns True once the statement has been executed
  Raises: ValueError if log_op is neither 'insert' nor 'update'
  """

  logging.info(f"Updating run log table for {opn} operation.....")
  if log_op == 'insert' :
    # Every text value is escaped so a quote in a path or status cannot break (or alter) the statement
    values = ','.join([
      _sql_literal(service),
      _sql_literal(run_id),
      _sql_literal(source_access),
      _sql_literal(source_s3_file),
      _sql_literal(srcfilename),
      _sql_literal(opn),
      str(int(rc)),
      _sql_literal(target_access),
      _sql_literal(target),
      _sql_literal(status),
      'current_timestamp()',
    ])
    query = f"INSERT INTO TABLE run_log VALUES({values})"
  elif log_op == 'update':
    query = f"UPDATE run_log SET status={_sql_literal(status)} where run_id = {_sql_literal(run_id)} and operation={_sql_literal(opn)}"
  else:
    # Previously this fell through and failed on an unbound 'query' variable
    raise ValueError(f"Unsupported log_op '{log_op}'; expected 'insert' or 'update'")
  # Executing SQL Query
  spark.sql(query)
  return True

In [0]:
def file_pattern_check(source_s3_path,s3_conn):
  """
  Resolve a date/timestamp file-name placeholder to the newest matching file in S3.

  Inputs:
    source_s3_path - S3 path whose file name ends in a placeholder, e.g.
                     s3://bucket/prefix/name_yyyymmdd.txt or
                     s3://bucket/prefix/name_yyyymmddHHMMSS.txt
    s3_conn        - dict holding 's3_access_key' and 's3_secret_key'
  Output: File name (without the prefix) of the matching object with the highest
          date/timestamp, or None when nothing matches; in that case a 'failed'
          row is written to the run log
  """
  logging.info('Executing File pattern check......')

  path_parts = source_s3_path.split('/')
  bucket_name = path_parts[2]
  prefix = '/'.join(path_parts[3:-1])
  file_part = path_parts[-1]
  # Everything before the last '_' is the fixed stem; the last segment is the placeholder + extension
  file_name_pattern = '_'.join(file_part.split('_')[:-1])
  file_format = source_s3_path.split('.')[-1]

  # The longer placeholder must be tested first because it contains the shorter one
  if 'yyyymmddHHMMSS' in file_part:
    digits = 14
  elif 'yyyymmdd' in file_part:
    digits = 8
  else:
    digits = None

  # Creating boto3 client from the connection inputs
  s3 = boto3.client('s3', aws_access_key_id=s3_conn['s3_access_key'], aws_secret_access_key=s3_conn['s3_secret_key'])

  # Trailing '/' keeps sibling prefixes (e.g. 'data2/' when asking for 'data') out of the listing
  list_prefix = prefix + '/' if prefix else ''

  # Listing files located directly under the prefix; paginated so more than 1000 keys are handled
  obj_req = []
  paginator = s3.get_paginator('list_objects_v2')
  for page in paginator.paginate(Bucket=bucket_name, Prefix=list_prefix):
    # 'Contents' is absent from the response when nothing matches the prefix
    for item in page.get('Contents', []):
      name = item['Key'][len(list_prefix):]
      if '/' not in name and name.endswith(('.txt', '.csv', '.parquet')):
        obj_req.append(name)

  # Listing files with pattern match (whole name must match; stem and extension are taken literally)
  match_list = []
  if digits is not None:
    pattern = rf"{re.escape(file_name_pattern)}_\d{{{digits}}}\.{re.escape(file_format)}"
    matcher = re.compile(pattern)
    match_list = [name for name in obj_req if matcher.fullmatch(name)]

  if len(match_list) == 0:
    logging.info(f"No files found at {source_s3_path} with input pattern")
    # Service label aligned with the other run-log rows written by this notebook
    run_logger('S3-S3','insert','copy','',0,'','failed')
    return None

  # Fixed-width digits after a common stem sort chronologically, so max() is the latest file
  latest_file = max(match_list)
  logging.info(f"Latest file for pattern - {pattern} at s3://{bucket_name}/{prefix}: {latest_file}")
  return latest_file
  

In [0]:
# File extensions that mark a path as pointing at a single file rather than a folder
_S3_FILE_SUFFIXES = ('.txt', '.csv', '.parquet')


def _split_s3_path(s3_path):
  """Split 's3://bucket/some/key' into (bucket, key); bucket is '' when the path has no bucket part."""
  parts = s3_path.split('/')
  if len(parts) < 3:
    return '', ''
  return parts[2], '/'.join(parts[3:])


def _join_s3_key(prefix, name):
  """Join a key prefix and an object name without producing a leading or doubled '/'."""
  if prefix == '' or prefix.endswith('/'):
    return prefix + name
  return prefix + '/' + name


def s3_s3_copy(ss3_conn, ts3_conn, source_s3_file, target_s3_file):
  """
  Copy one file, or every file under a folder, from a source S3 path to a target S3 path.

  Inputs:
    ss3_conn / ts3_conn - dicts holding 's3_access_key' and 's3_secret_key' for source / target
    source_s3_file      - s3://bucket/prefix/file.ext (exact file, or a file name carrying a
                          _yyyymmdd / _yyyymmddHHMMSS placeholder that is resolved to the latest
                          matching file), or a folder path
    target_s3_file      - target file path (optionally with a _yyyymmdd / _yyyymmddHHMMSS
                          placeholder that is replaced by the current date / timestamp) or a
                          target folder
  Outputs:
    [True, file_name, 1] after a single-file copy,
    [True, number_of_objects] after a folder copy,
    None on failure (a 'failed' row is written to the run log)
  """

  # Initialize S3 clients with different credentials
  source_s3 = boto3.client('s3', aws_access_key_id=ss3_conn['s3_access_key'], aws_secret_access_key=ss3_conn['s3_secret_key'])
  target_s3 = boto3.client('s3', aws_access_key_id=ts3_conn['s3_access_key'], aws_secret_access_key=ts3_conn['s3_secret_key'])

  # One clock read so the date and the timestamp can never disagree around midnight
  now = datetime.now()
  timestamp = now.strftime("%Y%m%d%H%M%S")
  date = now.strftime("%Y%m%d")

  # Retrieving buckets and keys
  source_bucket_name, source_key = _split_s3_path(source_s3_file)
  if source_bucket_name == '':
    logging.error("Invalid source file s3 path")
    run_logger('S3-S3','insert','copy','',0,target_s3_file,'failed')
    return None

  target_bucket_name, target_key = _split_s3_path(target_s3_file)
  if target_bucket_name == '':
    logging.error("Invalid target file s3 path")
    run_logger('S3-S3','insert','copy','',0,target_s3_file,'failed')
    return None

  # ---------------------------------------------------------------- single file
  if source_s3_file.endswith(_S3_FILE_SUFFIXES):
    source_prefix, _, source_file = source_key.rpartition('/')
    source_object_key = source_key

    # Only a placeholder in the file name needs resolving; an exact file name is used as given
    # ('yyyymmdd' is also a substring of 'yyyymmddHHMMSS', so one test covers both)
    if 'yyyymmdd' in source_file:
      latest_file = file_pattern_check(source_s3_file,ss3_conn)
      if not latest_file:
        logging.error("Unable to get file of provided pattern...")
        run_logger('S3-S3','insert','copy','',0,target_s3_file,'failed')
        return None
      source_object_key = _join_s3_key(source_prefix, latest_file)
      source_file = latest_file

    # Check if target file path contains target file name
    if target_s3_file.endswith(_S3_FILE_SUFFIXES):
      target_prefix, _, target_file_name = target_key.rpartition('/')
      if '_yyyymmddHHMMSS' in target_file_name:
        file_name = target_file_name.replace('_yyyymmddHHMMSS', '_' + timestamp, 1)
      elif '_yyyymmdd' in target_file_name:
        file_name = target_file_name.replace('_yyyymmdd', '_' + date, 1)
      else:
        # Explicit target name without a placeholder: use it unchanged
        file_name = target_file_name
      target_object_key = _join_s3_key(target_prefix, file_name)
    else:
      # If target name is not present use source file name as key at target S3
      file_name = source_file
      target_object_key = _join_s3_key(target_key, file_name)

    # Copy the object from source bucket to target bucket.
    # NOTE: the copy request is issued by the target client.
    try:
      target_s3.copy_object(Bucket=target_bucket_name, Key=target_object_key, CopySource={'Bucket': source_bucket_name, 'Key': source_object_key})
    except Exception as e:
      logging.error(f"Unable to copy file to target s3:{e}")
      run_logger('S3-S3','insert','copy',source_file,0,target_s3_file,'failed')
      return None

    tgt = 's3://'+target_bucket_name+'/'+target_object_key
    run_logger('S3-S3', 'insert','copy',file_name,1,tgt,'success')
    return [True, file_name,1]

  # --------------------------------------------------------------------- folder
  source_prefix = source_key
  if source_prefix and not source_prefix.endswith('/'):
    source_prefix = source_prefix+'/'

  target_object_key = target_key
  if target_object_key and not target_object_key.endswith('/'):
    target_object_key = target_object_key+'/'

  # List all objects under the source prefix (paginated; folder marker keys ending in '/' are skipped)
  obj_list = []
  paginator = source_s3.get_paginator('list_objects_v2')
  for page in paginator.paginate(Bucket=source_bucket_name, Prefix=source_prefix):
    # 'Contents' is absent from the response when nothing matches the prefix
    for obj in page.get('Contents', []):
      if not obj['Key'].endswith('/'):
        obj_list.append(obj['Key'])

  # If no files present at given S3 Exit
  if len(obj_list) == 0:
    logging.info("No files at source s3 file path")
    run_logger('S3-S3','insert','copy','',0,target_s3_file,'failed')
    return None

  # Copy each object to the destination path, keeping its key relative to the source prefix
  nobj = 0
  try:
    for source_object_key in obj_list:
      destination_object_key = target_object_key + source_object_key[len(source_prefix):]
      target_s3.copy_object(Bucket=target_bucket_name, Key=destination_object_key, CopySource={'Bucket': source_bucket_name, 'Key': source_object_key})
      nobj+=1
  except Exception as e:
    logging.error(f"Unable to copy files to target s3:{e}")
    run_logger('S3-S3', 'insert','copy','MultipleFiles',0,target_s3_file,'failed')
    return None

  # The run log records the last object written as the target of the multi-file copy
  tgt = 's3://'+target_bucket_name+'/'+destination_object_key
  run_logger('S3-S3', 'insert','copy','MultipleFiles',nobj,tgt,'success')
  return [True,nobj]

In [0]:
# Run the transfer; a 3-item result means a single file was copied, a 2-item result a whole folder
result = s3_s3_copy(ss3_conn, ts3_conn, source_s3_file, target_s3_file)
if not result:
  logging.info("File Transfer Failed!")
elif len(result) == 3:
  file_name = result[1]
  logging.info(f"File Transfer Successful to Target S3 path with key {file_name}!")
else:
  n = result[1]
  logging.info(f"{n} objects Transferred from Source S3 to Target S3 path!")

INFO:root:Executing File pattern check......
INFO:root:Latest file for pattern - sftp_\d{8}\.txt at s3://sdevalla-portfolio/orch_test: sftp_20240323.txt
INFO:root:Updating run log table for copy operation.....
INFO:root:File Transfer Successful to Target S3 path with key sftp_20240323.txt!


In [0]:
from botocore.exceptions import ClientError
def send_email(subject, body_html, sender, recipients):
  """
  Send the run-status notification e-mail through Amazon SES.

  Inputs:
    subject    - subject line of the message
    body_html  - HTML body of the message
    sender     - address used as the message Source
    recipients - list of addresses used as ToAddresses
  Output: None. The SES message id is printed on success; when SES raises a
          ClientError its error message is printed instead of being re-raised
  """
  charset = "UTF-8"
  # Plain-text alternative sent alongside the HTML body
  body_text = "This email requires HTML support. Please view in a HTML-compatible email client."

  # boto3 SES client built from the credentials held in ses_conn
  ses_client = boto3.client(
    'ses',
    aws_access_key_id=ses_conn['access_key'],
    aws_secret_access_key=ses_conn['secret_key'],
    region_name='us-east-2',
  )

  # Message payload: HTML and text bodies plus the subject, all in the same charset
  message = {
    'Body': {
      'Html': {'Charset': charset, 'Data': body_html},
      'Text': {'Charset': charset, 'Data': body_text},
    },
    'Subject': {'Charset': charset, 'Data': subject},
  }

  try:
    response = ses_client.send_email(
      Destination={'ToAddresses': recipients},
      Message=message,
      Source=sender,
    )
  except ClientError as e:
    print(e.response['Error']['Message'])
    return
  print("Email sent! Message ID:", response['MessageId'])

In [0]:
def dataframe_to_html_table(df):
    """Render a pandas DataFrame as an HTML document with a styled table header.

    The frame is converted with ``to_html(index=False)``, every plain ``<th>`` tag
    is replaced by one carrying an inline background/text colour style, and the
    table is wrapped in ``<html><body>...</body></html>``.
    """
    header_cell = '<th style="background-color: #FB451D; color: white;">'
    table_markup = df.to_html(index=False).replace('<th>', header_cell)
    return f'<html><body>{table_markup}</body></html>'

In [0]:
# Build the status report for this run from the rows written to run_log
subject = f"{service} run status for run id - {run_id}"
df = spark.sql(""" select distinct service as Service, source_path_table as `Source_S3`, source_file_dml as `Source File`, operation as Task,  record_count as File_Count, target_path_table as Target_S3, Status, Timestamp from run_log where run_id = '{}' order by task """.format(run_id ))
html_table = dataframe_to_html_table(df.toPandas())
body_html = f"<html><body><p>Hi,</p><p>Please find the status of service: {service} with run id: {run_id}</p>{html_table}</body></html>"
displayHTML(body_html)
sender = "noreplyd22snotification@gmail.com"

# Notification_Recipient may hold several comma-separated addresses: trim the
# whitespace around each one and drop blank entries (e.g. from a trailing comma)
# so that only real addresses are handed to SES
recipients = [address.strip() for address in notify_recipient.split(',') if address.strip()]
if recipients:
  send_email(subject, body_html, sender, recipients)
else:
  print('No recipients to send email!')

Service,Source_S3,Source File,Task,File_Count,Target_S3,Status,Timestamp
S3-S3,s3://sdevalla-portfolio/orch_test/sftp_yyyymmdd.txt,sftp_20240323.txt,copy,1,s3://sdevalla-buck/test/sftp_20240323.txt,success,2024-03-23 16:27:37.322


No recipients to send email!
