In [1]:
import os
import numpy as np
import configparser
import boto3
import pandas as pd
import time
import logging
from botocore.exceptions import ClientError
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
from model_pipeline import model_pipeline

In [2]:
class S3BucketHandler(FileSystemEventHandler):
    def __init__(self, credentials_file='aws_credentials.ini'):
        self.credentials_file = credentials_file
        self.s3 = self._load_credentials()
        self.last_contents = set()
        self.new_csv_files = [] #empty list to hold new files detected.

    def _load_credentials(self):
        config = configparser.ConfigParser()
        config.read(self.credentials_file)
        access_key_id = config.get('aws_credentials', 'aws_access_key_id')
        secret_access_key = config.get('aws_credentials', 'aws_secret_access_key')
        return boto3.client('s3', aws_access_key_id=access_key_id, aws_secret_access_key=secret_access_key)

    def on_any_event(self, event):
        required_columns = [
        'bvn', 'application_id', 'amount_requested', 'date_created', 'airtime_in_90days',
        'bill_payment_in_90days', 'cable_tv_in_90days', 'deposit_in_90days', 'easy_payment_in_90days',
        'farmer_in_90days', 'inter_bank_in_90days', 'mobile_in_90days', 'utility_bills_in_90days',
        'withdrawal_in_90days'
        ]

        column_types = {
            'bvn': 'object',
            'application_id': 'object',
            'amount_requested': 'float',
            'date_created': 'date',
            'airtime_in_90days': 'float',
            'bill_payment_in_90days': 'float',
            'cable_tv_in_90days': 'float',
            'deposit_in_90days': 'float',
            'easy_payment_in_90days': 'float',
            'farmer_in_90days': 'float',
            'inter_bank_in_90days': 'float',
            'mobile_in_90days': 'float',
            'utility_bills_in_90days': 'float',
            'withdrawal_in_90days': 'float',
            }
        bucket_name = "scetru-ml-bucket"
        try:
            current_contents = set(self._list_bucket_contents(bucket_name))
            new_files = current_contents - self.last_contents
            
            for file in new_files:
                if file.endswith('.csv'):
                    #### logging.info(f"Alert: New .csv file detected in bucket {self.bucket_name}: {file}")
                    df = self._read_csv_from_s3(bucket_name, file)
                        
                    # Check for required columns
                    missing_columns = set(required_columns) - set(df.columns)
                    if missing_columns:
                        #### logging.warning(f"Missing required columns in {file}: {missing_columns}")
                        print(f"Missing required columns in {file}: {missing_columns}")
                    else:
                        self.new_csv_files.append(df)
                        #### logging.info(f"All required columns present in {file}")                         
      
            self.last_contents = current_contents #set the last state of the bucket.

            # Merge dataframes if multiple new CSV files were detected
            if self.new_csv_files: 
                trans_data = pd.concat(self.new_csv_files, ignore_index=True) 
                trans_data = trans_data[required_columns]
                trans_data = trans_data[~trans_data['application_id'].isnull()] # excluded records where application_id is null
                trans_data.reset_index(drop=True, inplace=True)
                trans_data['application_id'] = trans_data['application_id'].astype(str)
                trans_data.fillna(0.0, inplace=True)
                #### logging.info(f"Merged {len(self.new_csv_files)} CSV files into a single DataFrame.")

                # Save the merged DataFrame to a CSV file in the working directory 
                trans_data.to_csv("trans_data.csv", index=False) 
                
                # convert columns to the right data types
                trans_data = self.convert_columns(trans_data, column_types)
                #### logging.info(f"Saved merged DataFrame to {output_file}")

                # Clear the list of new CSV files
                self.new_csv_files.clear()

                # merge trans with do_good_table
                merged_df = self.join_trans_with_do_good(trans_data)
                model_outcome = model_pipeline(merged_df)
                model_outcome.to_csv("model_outcome.csv", index=False) 

                # read and update the complete_table
                return_complete_table = self.read_and_update_complete_table(model_outcome)
                return_complete_table.to_csv("return_complete_table.csv", index=False) 
                
        except ClientError as e:
            #### logging.info(f"Error accessing S3 bucket: {e}")
            print(f"Error accessing S3 bucket: {e}")

    def _list_bucket_contents(self, bucket_name):
        try:
            response = self.s3.list_objects_v2(Bucket=bucket_name)
            return [obj['Key'] for obj in response.get('Contents', [])]
        except ClientError as e:
            # Handle bucket not found error here (e.response['Error']['Code'] == 'NoSuchBucket')
            #### logging.error(f"Bucket {self.bucket_name} does not exist or access denied: {e}")
            return []  # Return an empty list to avoid further errors

    def _read_csv_from_s3(self, bucket_name, file_key):
        obj = self.s3.get_object(Bucket=bucket_name, Key=file_key)
        df = pd.read_csv(obj['Body'])
        df['file_key'] = file_key  # Add file_key as a new column
        return df
        
    def join_trans_with_do_good(self, trans_data):
        do_good_table = self.collate_file("scetru-fcmb-do-good-table")
        columns_ = ['bvn', 'applicationID', 'date_of_default', 'outstanding_balance']
        do_good_table = do_good_table[columns_]

        merged_df = trans_data.merge(do_good_table, on='bvn', how='left')
        merged_df['date_of_default'] = pd.to_datetime(merged_df['date_of_default'], errors='coerce')
        current_date = pd.Timestamp.now()
        merged_df['default_in_last_90days'] = np.where(
            ((current_date - merged_df['date_of_default']).dt.days <= 90) & (merged_df['outstanding_balance'] != 0), 
            'Y', 
            'N'
        )
        merged_df['has_it_make_it_good'] = np.where(
            (merged_df['outstanding_balance'] == 0) | (merged_df['default_in_last_90days'] == 'N'), 
            'Y', 
            'N'
        )
        
        merged_df['bvn'] = merged_df['bvn'].astype(str)
        merged_df.drop(columns=['date_of_default', 'outstanding_balance', 'applicationID'], inplace=True)
        return merged_df

    def read_and_update_complete_table(self, outcome_table):
        complete_table = self.collate_file("complete-table")
        # Ensure application_id columns are of the same type (string)
        complete_table['application_id'] = complete_table['application_id'].astype(str)
        complete_table['bvn'] = complete_table['bvn'].astype(str)
        # Excluding transactions previously processed by ml services or streaming process.
        complete_table = complete_table[(complete_table['decline_reason'].isnull()) & (complete_table['amount_approved'].isnull())].reset_index(drop=True)
        complete_table = complete_table.drop(columns=['amount_approved', 'decline_reason'])
        update_complete_table = complete_table.merge(outcome_table[outcome_table['application_id'].isin(complete_table['application_id'].unique())][['bvn', 'application_id', 'amount_approved','decline_reason']],
                               on=['bvn', 'application_id'], how='inner', suffixes=('', '_outcome'))

        # Add new columns and reorder (consider using pipe syntax)
        update_complete_table['updated_date'] = pd.Timestamp.now().floor('min')
        update_complete_table['loan_message'] = 'Completed'
        update_complete_table = update_complete_table[['bvn', 'dob', 'amount_requested', 'application_id', 'loan_tenure','loan_repayment_structure',
                              'internal_id', 'amount_approved','created_date', 'updated_date', 'decline_reason', 'loan_message',
                              'file_key']]
        return update_complete_table

    def collate_file(self, bucket_name):
        new_files = []
        files = set(self._list_bucket_contents(bucket_name))
        for file in files:
            df = self._read_csv_from_s3(bucket_name, file)
            new_files.append(df)
        return pd.concat(new_files, ignore_index=True)

    def convert_columns(self, df, column_types):
        """
        Converts columns in the DataFrame to the specified data types.

        Parameters:
        df (pd.DataFrame): The DataFrame to be converted.
        column_types (dict): A dictionary where keys are column names and values are the desired data types.

        Returns:
        pd.DataFrame: The DataFrame with converted columns.
        """
        for col, dtype in column_types.items():
            if col in df.columns:
                if dtype == 'date':
                    df[col] = pd.to_datetime(df[col]).dt.date
                else:
                    df[col] = df[col].astype(dtype)
            else:
                print(f"Warning: Column '{col}' does not exist in the DataFrame.")
        return df



In [3]:
def monitor_s3_bucket(bucket_name, interval=1):
    handler = S3BucketHandler()
    observer = Observer()
    observer.schedule(handler, path='.', recursive=False)

    try:
        # Check bucket existence and credential validity before starting monitoring
        if not handler._list_bucket_contents(bucket_name):
            #### logging.error(f"Bucket {bucket_name} does not exist or access denied.")
            return

        #### logging.info(f"Bucket {bucket_name} exists and access successful. Starting monitoring.")
        observer.start()
        
        while True:
            time.sleep(interval)
    except KeyboardInterrupt:
        observer.stop()
    observer.join()

In [7]:
monitor_s3_bucket("scetru-ml-bucket")

Missing required columns in 22381849810.csv: {'bill_payment_in_90days', 'cable_tv_in_90days', 'easy_payment_in_90days', 'mobile_in_90days', 'airtime_in_90days', 'amount_requested', 'farmer_in_90days', 'application_id', 'inter_bank_in_90days', 'utility_bills_in_90days', 'deposit_in_90days', 'withdrawal_in_90days'}
Missing required columns in fcmb_20240829/22381849810.csv: {'bill_payment_in_90days', 'cable_tv_in_90days', 'easy_payment_in_90days', 'mobile_in_90days', 'airtime_in_90days', 'amount_requested', 'farmer_in_90days', 'application_id', 'inter_bank_in_90days', 'utility_bills_in_90days', 'deposit_in_90days', 'withdrawal_in_90days'}


In [9]:
trans_df = pd.read_csv("trans_data.csv")
trans_df

Unnamed: 0,bvn,application_id,amount_requested,date_created,airtime_in_90days,bill_payment_in_90days,cable_tv_in_90days,deposit_in_90days,easy_payment_in_90days,farmer_in_90days,inter_bank_in_90days,mobile_in_90days,utility_bills_in_90days,withdrawal_in_90days
0,22207845921,7564669779,340000,2019-06-13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,22207845921,7564669779,340000,2019-06-13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,22207845921,7564669779,340000,2019-06-13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
model_outcome_df = pd.read_csv("model_outcome.csv")
model_outcome_df

Unnamed: 0,bvn,application_id,amount_requested,date_created,airtime_in_90days,bill_payment_in_90days,cable_tv_in_90days,deposit_in_90days,easy_payment_in_90days,farmer_in_90days,inter_bank_in_90days,mobile_in_90days,utility_bills_in_90days,withdrawal_in_90days,default_in_last_90days,has_it_make_it_good,amount_approved,decline_reason
0,22207845921,7564669779,340000.0,2019-06-13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,N,Y,0.0,Loan declined due to low transaction or incomp...
1,22207845921,7564669779,340000.0,2019-06-13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,N,Y,0.0,Loan declined due to low transaction or incomp...
2,22207845921,7564669779,340000.0,2019-06-13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,N,Y,0.0,Loan declined due to low transaction or incomp...


In [13]:
return_complete_table = pd.read_csv("return_complete_table.csv")
return_complete_table

Unnamed: 0,bvn,dob,amount_requested,application_id,loan_tenure,loan_repayment_structure,internal_id,amount_approved,created_date,updated_date,decline_reason,loan_message,file_key
0,22207845921,26-08-2024,340000.0,7564669779,1 year,monthly,sce-50b7f803-5ce4-42c4-871e-dd07278f824f,0.0,2024-08-28 15:41,2024-10-30 23:24:00,Loan declined due to low transaction or incomp...,Completed,fcmb_20240828/7564669779.csv
1,22207845921,26-08-2024,340000.0,7564669779,1 year,monthly,sce-50b7f803-5ce4-42c4-871e-dd07278f824f,0.0,2024-08-28 15:41,2024-10-30 23:24:00,Loan declined due to low transaction or incomp...,Completed,fcmb_20240828/7564669779.csv
2,22207845921,26-08-2024,340000.0,7564669779,1 year,monthly,sce-50b7f803-5ce4-42c4-871e-dd07278f824f,0.0,2024-08-28 15:41,2024-10-30 23:24:00,Loan declined due to low transaction or incomp...,Completed,fcmb_20240828/7564669779.csv


In [None]:
## update the code file
# Upload return_complete_table
# archive the bucket_data
# clean bucket at midnight after 3 days.