In [0]:
# Source SFTP Access Name: Required
# Source SFTP File Path: Required [Formats allowed: /filename.txt, /filename_yyyymmdd.txt, /filename_yyyymmddHHMMSS.txt]
# Target S3 Access Name: Required
# Target S3 File Path: Required [Formats allowed: s3://bucket/prefix/filename.txt, s3://bucket/prefix/filename_yyyymmdd.txt, s3://bucket/prefix/filename_yyyymmddHHMMSS.txt]
# Email Notification: Defaults to skip [Requires email id to send notification]

In [0]:
# #Source Inputs
# dbutils.widgets.removeAll()
# dbutils.widgets.text("Source_SFTP_Access", "")
# dbutils.widgets.text("Source_SFTP_File_Path", "")

# #Target Inputs
# dbutils.widgets.text("Target_S3_Access", "")
# dbutils.widgets.text("Target_S3_File_Path", "")
# dbutils.widgets.text("Notification_Recipient","")

In [0]:
%run ./configs

In [0]:
# Reading user inputs (each widget value is trimmed of surrounding whitespace)
source_access, source_sftp_path, target_access, target_s3_path, notify_recipient = [
  getArgument(widget_name).strip()
  for widget_name in (
    'Source_SFTP_Access',
    'Source_SFTP_File_Path',
    'Target_S3_Access',
    'Target_S3_File_Path',
    'Notification_Recipient',
  )
]

sftp_conn, s3_conn = {}, {}

# Validating user inputs: only the first missing required value is reported
if not source_access:
  dbutils.notebook.exit("Source SFTP Access name missing!")
elif not source_sftp_path:
  dbutils.notebook.exit("Source File info missing!")
elif not target_access:
  dbutils.notebook.exit("Target S3 Access Name missing!")
elif not target_s3_path:
  dbutils.notebook.exit("Target S3 File Path missing!")

# Resolve SFTP connection details for the requested access name
if source_access not in sftp_access_list:
  dbutils.notebook.exit("Invalid Source SFTP Access!")
else:
  sftp_conn['host'] = sftp_hosts[source_access]
  sftp_conn['username'] = sftp_username[source_access]
  sftp_conn['password'] = sftp_password[source_access]

# Resolve S3 credentials for the requested access name
if target_access not in s3_access_list:
  dbutils.notebook.exit("Invalid Target S3 Access Name!")
else:
  s3_conn['s3_access_key'] = s3_access_keys[target_access]
  s3_conn['s3_secret_key'] = s3_secret_keys[target_access]

# Service label used in the notification subject and body
service = 'SFTP-S3'

In [0]:
import boto3
import uuid
import datetime, time
import pandas as pd
from datetime import datetime
import logging
import re
import os
!pip install pysftp
import pysftp
# Set the root logger to INFO so the logging.info(...) calls in this notebook are emitted
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger()



Collecting pysftp
  Using cached pysftp-0.2.9-py3-none-any.whl
Collecting paramiko>=1.17
  Using cached paramiko-3.4.0-py3-none-any.whl (225 kB)
Collecting pynacl>=1.5
  Using cached PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (856 kB)
Collecting bcrypt>=3.2
  Using cached bcrypt-4.1.2-cp39-abi3-manylinux_2_28_x86_64.whl (698 kB)
Installing collected packages: pynacl, bcrypt, paramiko, pysftp
Successfully installed bcrypt-4.1.2 paramiko-3.4.0 pynacl-1.5.0 pysftp-0.2.9
You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-5dbb50c4-d90b-478c-ab09-35a1013e7378/bin/python -m pip install --upgrade pip' command.[0m


In [0]:
# Random UUID identifying this execution; run_logger stores it with every run_log row
run_id = uuid.uuid4()
print(f'run_id: {run_id}')

run_id: 1709fe2a-c766-487b-b504-ca9535db0040


In [0]:
def run_logger(service,log_op, opn,srcfilename, rc, target,status):
  """
  Writes one audit row to (or updates one in) the run_log table through Spark SQL.

  Inputs:
    service     - service label stored with the row
    log_op      - 'insert' adds a row; 'update' sets the status of the row that
                  matches the current run_id and operation
    opn         - operation name ('read' / 'write' in this notebook)
    srcfilename - source file name to record
    rc          - record count to record (must be convertible to int)
    target      - target path to record
    status      - status text to record
  The notebook globals run_id, source_access, source_sftp_path and target_access
  are also written on insert.
  Output: Returns True once the statement has been executed
  Raises: ValueError when log_op is neither 'insert' nor 'update'
  """
  def _sql_text(value):
    # Backslash-escape backslashes and single quotes so a value cannot close its literal early
    return str(value).replace('\\', '\\\\').replace("'", "\\'")

  if log_op not in ('insert', 'update'):
    # An unknown log_op used to leave `query` unassigned and fail with UnboundLocalError
    raise ValueError(f"Unsupported log_op: {log_op!r} (expected 'insert' or 'update')")

  logging.info(f"Updating run log table for {opn} operation.....")
  if log_op == 'insert' :
    query = (
      "INSERT INTO TABLE run_log VALUES("
      f"'{_sql_text(service)}','{_sql_text(run_id)}','{_sql_text(source_access)}',"
      f"'{_sql_text(source_sftp_path)}','{_sql_text(srcfilename)}','{_sql_text(opn)}',"
      f"{int(rc)},'{_sql_text(target_access)}','{_sql_text(target)}','{_sql_text(status)}',"
      "current_timestamp())"
    )
  else:
    query = (
      f"UPDATE run_log SET status='{_sql_text(status)}' "
      f"where run_id = '{_sql_text(run_id)}' and operation='{_sql_text(opn)}'"
    )
  # Executing SQL Query
  spark.sql(query)
  return True

In [0]:
def sftp_file_pattern_check(sftp_conn, source_sftp_path):
  """
  Finds the newest file in an SFTP directory whose name follows a date or
  timestamp placeholder pattern.

  Inputs:
    sftp_conn        - dict with 'host', 'username' and 'password'
    source_sftp_path - path whose file name is <stem>_yyyymmdd.<ext> or
                       <stem>_yyyymmddHHMMSS.<ext>; the placeholder is recognised
                       in any letter case
  Output: Name of the matching file that sorts highest, or None when no file in
          the directory matches (or the path carries no placeholder)
  """
  logging.info('Executing File pattern check......')

  path_parts = source_sftp_path.split('/')
  prefix = '/'.join(path_parts[:-1])
  # Everything before the last underscore of the file name is the fixed stem
  file_name_pattern = '_'.join(path_parts[-1].split('_')[:-1])
  file_format = source_sftp_path.split('.')[-1]

  # Create pysftp CnOpts object to handle known host keys
  cnopts = pysftp.CnOpts()
  # NOTE(review): host key checking is disabled, so the server identity is not verified
  cnopts.hostkeys = None

  # Connect to the SFTP server and list the directory that holds the file
  with pysftp.Connection(host=sftp_conn['host'], username=sftp_conn['username'],
                         password=sftp_conn['password'], port=22, cnopts=cnopts) as sftp:
    print('Connection successful to SFTP .........')
    sftp.chdir(prefix)
    objs = sftp.listdir()

  # Case-insensitive placeholder detection; the 14-digit form is tested first
  # because it contains the 8-digit one.
  lowered_path = source_sftp_path.lower()
  if 'yyyymmddhhmmss' in lowered_path:
    digit_count = 14
  elif 'yyyymmdd' in lowered_path:
    digit_count = 8
  else:
    digit_count = None

  match_list = []
  if digit_count is not None:
    # Stem and extension are escaped so regex metacharacters in them match literally
    pattern = re.escape(file_name_pattern) + r"_\d{" + str(digit_count) + r"}\." + re.escape(file_format)
    # fullmatch keeps names with trailing text (e.g. "x_20240101.csv.bak") out of the result
    match_list = [obj for obj in objs if re.fullmatch(pattern, obj)]

  print('match_list:',match_list)
  if len(match_list) == 0:
    logging.info(f"No files found at {source_sftp_path} with input pattern")
    return None

  # Fixed-width digit stamps sort chronologically as plain strings
  latest_file = max(match_list)
  print('latest file:',latest_file)
  logging.info(f"Latest file for pattern - {pattern} at {prefix}: {latest_file}")
  return latest_file
  

In [0]:
def move_and_rename_file_in_s3(s3_conn, target_s3, new_file_name):
  """
  Copies the data file found under the folder <prefix>/<new_file_name>/ to the
  single object <prefix>/<new_file_name>, then deletes everything in that folder.

  Inputs:
    s3_conn       - dict with 's3_access_key' and 's3_secret_key'
    target_s3     - s3://bucket/... target path; when it ends in .csv/.txt/.parquet
                    the last segment is dropped to obtain the prefix
    new_file_name - final object name
  Outputs: True when a data file was found and promoted, otherwise None
  """
  access_key = s3_conn['s3_access_key']
  secret_key = s3_conn['s3_secret_key']

  s3_client = boto3.client('s3', aws_access_key_id=access_key, aws_secret_access_key=secret_key)

  s3_parts = target_s3.split('/')
  bucket_name = s3_parts[2]
  if target_s3.endswith(('.csv','.txt','.parquet')):
    prefix = '/'.join(s3_parts[3:-1])
  else:
    prefix = '/'.join(s3_parts[3:])

  # Final object key. An empty prefix means the bucket root, where adding '/'
  # would produce a key with a leading slash that never matches the written folder.
  if not prefix or prefix.endswith('/'):
    key = prefix + new_file_name
  else:
    key = prefix + '/' + new_file_name
  folder_prefix = key + '/'

  # .txt targets are looked up as .csv (the sibling writer always saves in csv format)
  fformat = '.' + new_file_name.split('.')[-1]
  if fformat in ['.csv','.txt']:
    fformat = '.csv'

  # List objects in the folder and keep the ones with the expected extension
  response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=folder_prefix)
  data_files = [obj['Key'] for obj in response.get('Contents', []) if obj['Key'].endswith(f'{fformat}')]
  if not data_files:
    logging.error("Error during moving & renaming files")
    return None

  # Promote the last listed data file to the final key
  s3_client.copy_object(Bucket=bucket_name, CopySource=f"{bucket_name}/{data_files[-1]}", Key=key)

  # Delete the original folder and everything under it
  s3_resource = boto3.resource('s3', aws_access_key_id=access_key, aws_secret_access_key=secret_key)
  bucket = s3_resource.Bucket(f'{bucket_name}')
  for obj in bucket.objects.filter(Prefix=f'{folder_prefix}'):
    s3_resource.Object(bucket.name, obj.key).delete()
  logging.info("Moved and Renamed files")
  return True

In [0]:
def sftp_get_file(sftp_conn, source_sftp_path):
  """
  Reads source data from SFTP and stages it as a single CSV file on DBFS.

  A path ending in .txt/.csv/.parquet is treated as one file; if it contains a
  yyyymmdd / yyyymmddHHMMSS placeholder, the newest matching file is resolved
  through sftp_file_pattern_check. Any other path is treated as a directory
  whose entries are all read and concatenated into one file.

  Inputs: SFTP connection details (dict with 'host', 'username', 'password'), source file path
  Output: Returns the DBFS path of the staged file, or None when no source file was found
  """
  local_temp_dir = 'dbfs:/FileStore/sftp_store/'
  prefix = '/'.join(source_sftp_path.split('/')[:-1])

  hostname = sftp_conn['host']
  username = sftp_conn['username']
  password = sftp_conn['password']

  # Create pysftp CnOpts object to handle known host keys
  cnopts = pysftp.CnOpts()
  cnopts.hostkeys = None  # Disable host key checking

  if source_sftp_path.endswith(('.txt','.csv', '.parquet')) :
    # 'yyyymmdd' is also a substring of 'yyyymmddHHMMSS', so one test covers both placeholders
    if 'yyyymmdd' in source_sftp_path:
      latest_file = sftp_file_pattern_check(sftp_conn, source_sftp_path)
      if not latest_file:
        logging.error("No latest file found")
        # Same argument layout as the other failed-read log call in this function;
        # the earlier 6-argument call raised TypeError before anything was logged.
        run_logger('SFTP-S3','insert','read','',0,target_s3_path,'failed')
        return None
    else:
      latest_file = source_sftp_path.split('/')[-1]

    # The staging location is a dbfs:/ URI, so it is created through dbutils rather than os
    dbutils.fs.mkdirs(local_temp_dir)
    local_path = local_temp_dir + latest_file
    # Connect to the SFTP server
    with pysftp.Connection(host=hostname, username=username, password=password, port=22, cnopts=cnopts) as sftp:
      print('Connection successful to SFTP .........')
      #Change to the specified directory
      sftp.chdir(prefix)
      remote_file_path = sftp.getcwd() + '/' + latest_file
      with sftp.open(remote_file_path) as remote_file:
        # NOTE(review): every accepted extension, including .parquet, is parsed with
        # read_csv using its default comma separator - confirm this is intended
        df = pd.read_csv(remote_file)
        rc = len(df)
        csv_data = df.to_csv(index=False)
        # Save the CSV data to DBFS
        dbutils.fs.put(local_path, csv_data, overwrite=True)

      run_logger('SFTP-S3','insert','read',latest_file ,rc,target_s3_path,'success')
      logging.info(f"{latest_file} written to local dbfs directory {local_temp_dir} successfully")
      return local_path
  else:
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    dbutils.fs.mkdirs(local_temp_dir)
    with pysftp.Connection(host=hostname, username=username, password=password, port=22, cnopts=cnopts) as sftp:
      print('Connection successful to SFTP .........')
      #Change to the specified directory
      sftp.chdir(source_sftp_path)

      #List files in the directory
      objs = sftp.listdir()

      if len(objs) == 0:
        logging.info(f"No files found at {source_sftp_path}!")
        run_logger('SFTP-S3','insert','read','',0,target_s3_path,'failed')
        return None

      # The combined file is named from the run id and the current timestamp
      tgt_file_name = str(run_id)+'_'+str(timestamp)+'.csv'
      local_path = local_temp_dir + tgt_file_name
      dfs = []
      for file_name in objs:
        remote_file_path = sftp.getcwd() + '/' + file_name
        with sftp.open(remote_file_path) as remote_file:
          dfs.append(pd.read_csv(remote_file))
      final_df = pd.concat(dfs,ignore_index=True)
      rc = len(final_df)
      csv_data = final_df.to_csv(index=False)
      # Save the CSV data to DBFS
      dbutils.fs.put(local_path, csv_data, overwrite=True)
      run_logger('SFTP-S3','insert','read','MultipleFiles',rc,target_s3_path,'success')
      logging.info(f"Files written to {local_path} successfully")
      return local_path
      


In [0]:
def local_s3_transfer(s3_conn,local_path, target_s3_path):
  """
  Reads the staged DBFS file with Spark and writes it to the target S3 location
  as a single object.

  Inputs:
    s3_conn        - dict with 's3_access_key' and 's3_secret_key'
    local_path     - DBFS path of the staged CSV file
    target_s3_path - s3://bucket/... target. When it ends in .csv/.txt the last
                     segment is the file name, and a _yyyymmdd / _yyyymmddHHMMSS
                     placeholder in it (any letter case) is replaced with the
                     current date / timestamp. Otherwise it is a folder and the
                     staged file's name is reused.
  Output: Final S3 object key, or None when the write or the rename step failed
  """
  s3_parts = target_s3_path.split('/')
  bucket_name = s3_parts[2]

  access_key = s3_conn['s3_access_key']
  secret_key = s3_conn['s3_secret_key']

  # Setting Spark configs to access S3
  sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", access_key)
  sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", secret_key)

  # One clock reading feeds both stamps so they cannot disagree across midnight
  now = datetime.now()
  timestamp = now.strftime("%Y%m%d%H%M%S")
  date = now.strftime("%Y%m%d")

  if target_s3_path.endswith(('.csv','.txt')):
    prefix = '/'.join(s3_parts[3:-1])
    target_file_name = s3_parts[-1]
    # Try the longer placeholder first because it contains the shorter one
    file_name, replaced = re.subn('_yyyymmddhhmmss', '_' + timestamp, target_file_name,
                                  count=1, flags=re.IGNORECASE)
    if not replaced:
      file_name = re.sub('_yyyymmdd', '_' + date, target_file_name, count=1, flags=re.IGNORECASE)
  else:
    # Folder target: accept it with or without a trailing slash
    prefix = '/'.join(s3_parts[3:]).rstrip('/')
    file_name = local_path.split('/')[-1]

  # An empty prefix means the bucket root; joining with '/' there would give a
  # key with a leading slash.
  target_object_key = prefix + '/' + file_name if prefix else file_name
  file_path = 's3://' + bucket_name + '/' + target_object_key

  file_format = file_name.split('.')[-1]
  delimiter = '\t' if file_format == 'txt' else ','

  try:
    # NOTE(review): the staged file is read without a header option, so a header
    # line is counted in rc and written through as a data row (the sample run log
    # shows read=918 vs write=919) - confirm this is intended
    input_df = spark.read.csv(local_path)
    rc = input_df.count()
    input_df.coalesce(1).write.format('csv').option('header','False').option("delimiter",delimiter).mode('overwrite').save(file_path)
    moved = move_and_rename_file_in_s3(s3_conn, target_s3_path, file_name)
    if not moved:
      # Without this check the run was logged as successful even though the
      # final object was never created
      raise RuntimeError(f"Written data could not be moved to {target_object_key}")
    run_logger('SFTP-S3','insert','write',file_name,rc, file_path,'success')
    logging.info(f"File transfer successful to {bucket_name} with key {target_object_key}")
    return target_object_key
  except Exception as e:
    run_logger('SFTP-S3','insert','write',file_name,0,target_s3_path,'failed')
    logging.error(f"Unable to write to S3: {e}")
    return None

In [0]:
def sftp_s3(sftp_conn,source_sftp_path,s3_conn,target_s3_path):
  """
  Runs the transfer end to end: stage the SFTP source on DBFS, then write it to S3.

  Inputs: SFTP connection details, source path, S3 connection details, target path
  Output: Whatever local_s3_transfer returns, or None when nothing was staged
  """
  staged_path = sftp_get_file(sftp_conn, source_sftp_path)
  if not staged_path:
    # Nothing was staged, so the write step is recorded as failed
    run_logger('SFTP-S3','insert','write','',0,target_s3_path,'failed')
    return None
  return local_s3_transfer(s3_conn, staged_path, target_s3_path)

In [0]:
# Run the transfer; a falsy result means one of the stages failed
status = sftp_s3(sftp_conn,source_sftp_path,s3_conn,target_s3_path)

# Checking status
if not status:
  logging.info("Failed to transfer file from SFTP to S3")
else:
  logging.info("File transfer Successful")

INFO:root:Executing File pattern check......
INFO:paramiko.transport:Connected (version 2.0, client AzureSSH_1.0.0)
INFO:paramiko.transport:Authentication (password) successful!


Connection successful to SFTP .........


INFO:paramiko.transport.sftp:[chan 0] Opened sftp connection (server version 3)
INFO:paramiko.transport.sftp:[chan 0] sftp session closed.
INFO:root:Latest file for pattern - red_\d{14}\.csv at /Home/Incoming: red_20260101121212.csv
INFO:py4j.clientserver:Closing down clientserver connection
INFO:paramiko.transport:Connected (version 2.0, client AzureSSH_1.0.0)


match_list: ['red_20240101121212.csv', 'red_20260101121212.csv']
latest file: red_20260101121212.csv


INFO:paramiko.transport:Authentication (password) successful!


Connection successful to SFTP .........


INFO:paramiko.transport.sftp:[chan 0] Opened sftp connection (server version 3)
INFO:root:Updating run log table for read operation.....


Wrote 29829 bytes.


INFO:root:red_20260101121212.csv written to local dbfs directory dbfs:/FileStore/sftp_store/ successfully
INFO:paramiko.transport.sftp:[chan 0] sftp session closed.
INFO:root:Moved and Renamed files
INFO:root:Updating run log table for write operation.....
INFO:root:File transfer successful to sdevalla-portfolio with key orch_test/sftp_20240323.txt
INFO:root:File transfer Successful


In [0]:
from botocore.exceptions import ClientError
def send_email(subject, body_html, sender, recipients):
  """
  Sends the run-status notification through an SES client and prints the outcome.

  Inputs:
    subject    - subject line
    body_html  - HTML body
    sender     - address used as the message Source
    recipients - list of destination addresses
  Output: None; prints the message id on success or the error message when the
          call raises ClientError
  """
  charset = "UTF-8"
  # Plain-text alternative sent alongside the HTML body
  body_text = "This email requires HTML support. Please view in a HTML-compatible email client."

  # Credentials come from the notebook-level ses_conn mapping
  ses_client = boto3.client(
    'ses',
    aws_access_key_id=ses_conn['access_key'],
    aws_secret_access_key=ses_conn['secret_key'],
    region_name='us-east-2',
  )

  def _part(data):
    # Every content part carries the same charset
    return {'Charset': charset, 'Data': data}

  message = {
    'Body': {'Html': _part(body_html), 'Text': _part(body_text)},
    'Subject': _part(subject),
  }

  try:
    response = ses_client.send_email(
      Destination={'ToAddresses': recipients},
      Message=message,
      Source=sender,
    )
  except ClientError as e:
    print(e.response['Error']['Message'])
  else:
    print("Email sent! Message ID:", response['MessageId'])

In [0]:
def dataframe_to_html_table(df):
    """Render a pandas DataFrame (index omitted) as an HTML document with a coloured header row."""
    header_style = 'background-color: #FB451D; color: white;'
    # Swap every plain header cell tag for one carrying the inline style
    table_markup = df.to_html(index=False).replace('<th>', f'<th style="{header_style}">')
    return '<html><body>' + table_markup + '</body></html>'

In [0]:
# Build the status report for this run from the run_log rows written above
subject = f"{service} run status for run id - {run_id}"
df = spark.sql(""" select distinct service as Service, source_path_table as `Source SFTP`, source_file_dml as `Source File`, operation as Task,  record_count as Record_Count, target_path_table as Target_S3, Status, Timestamp from run_log where run_id = '{}' order by task """.format(run_id ))
html_table = dataframe_to_html_table(df.toPandas())
body_html = f"<html><body><p>Hi,</p><p>Please find the status of service: {service} with run id: {run_id}</p>{html_table}</body></html>"
displayHTML(body_html)
sender = "noreplyd22snotification@gmail.com"
# Comma-separated widget value: trim each address and drop empty entries so that
# "a@x.com, b@x.com" or a trailing comma does not produce a malformed recipient
recipients = [address.strip() for address in notify_recipient.split(',') if address.strip()]
if recipients:
  send_email(subject, body_html, sender, recipients)
else:
  print('No recipients to send email!')

Service,Source SFTP,Source File,Task,Record_Count,Target_S3,Status,Timestamp
SFTP-S3,/Home/Incoming/red_yyyymmddhhmmss.csv,red_20260101121212.csv,read,918,s3://sdevalla-portfolio/orch_test/sftp_yyyymmdd.txt,success,2024-03-23 16:13:58.907
SFTP-S3,/Home/Incoming/red_yyyymmddhhmmss.csv,sftp_20240323.txt,write,919,s3://sdevalla-portfolio/orch_test/sftp_20240323.txt,success,2024-03-23 16:14:08.636


No recipients to send email!
