In [6]:
from dotenv import load_dotenv # for load from env 
import os
import pandas as pd
from datetime import datetime 
import re # regex module
import logging

from minio import Minio 
from io  import BytesIO

from sqlalchemy import create_engine 
import sqlalchemy 
from pangres import upsert 

In [3]:
# Load settings from the local .env file into the process environment
load_dotenv(".env")

# PostgreSQL connection details and schema names
DB_HOST = os.environ.get("DB_HOST")
DB_USER = os.environ.get("DB_USER")
DB_PASSWORD = os.environ.get("DB_PASSWORD")
DB_NAME = os.environ.get("DB_NAME")
DB_SCHEMA_STG = os.environ.get("DB_SCHEMA_STG")
DB_SCHEMA_LOG = os.environ.get("DB_SCHEMA_LOG")
DB_SCHEMA_DWH = os.environ.get("DB_SCHEMA_DWH")
# Prefix that read_sql() puts in front of "<table_name>.sql"
MODEL_PATH = os.environ.get("MODEL_PATH")

# MinIO credentials used when uploading error data
ACCESS_KEY_MINIO = os.environ.get("ACCESS_KEY_MINIO")
SECRET_KEY_MINIO = os.environ.get("SECRET_KEY_MINIO")

#### Read SQL

In [None]:
def read_sql(table_name) :
    """Return the text of the SQL file that belongs to ``table_name``.

    The path is built by concatenating ``MODEL_PATH``, ``table_name`` and
    the ``.sql`` suffix, so ``MODEL_PATH`` must already end with a path
    separator.

    Args:
        table_name: Base name (without extension) of the SQL file.

    Returns:
        The full content of the file as a string.
    """
    sql_path = f"{MODEL_PATH}{table_name}.sql"
    with open(sql_path, "r") as sql_file:
        return sql_file.read()

#### ETL Log

In [7]:
def etl_log(log_msg: dict) :
    """Append one ETL log record to the ``log.etl_log`` table.

    Args:
        log_msg: Mapping of column name to value; it becomes a single row.

    Raises:
        Exception: Any error raised while connecting or inserting is logged
            and re-raised.
    """
    engine = None
    try :
        # create the database engine
        engine = create_engine(f"postgresql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}/{DB_NAME}")

        # turn the log dictionary into a one-row dataframe
        df_log = pd.DataFrame([log_msg])

        # append the row to the log table
        with engine.connect() as connection :
            df_log.to_sql(
                name = "etl_log",
                con = connection,
                schema = "log",
                if_exists = "append",
                index = False
            )
    except Exception as e :
        logging.error(f"Cant save your log message. Error: {e}", exc_info=True)
        raise
    finally :
        # A fresh engine is built on every call, so release its connection
        # pool explicitly instead of leaving it to garbage collection.
        if engine is not None :
            engine.dispose()



#### Read ETL Log

In [8]:
def read_etl_log(filter_params: dict) :
    """Run the ``log`` SQL model against the database and return its result.

    The query text is loaded with ``read_sql("log")`` and executed with
    ``filter_params`` as its bind parameters.

    Args:
        filter_params: Mapping of bind-parameter name to value.

    Returns:
        A DataFrame holding the rows returned by the query.

    Raises:
        Exception: Any error raised while reading the file, connecting or
            querying is logged and re-raised.
    """
    engine = None
    try :
        # create the database engine
        engine = create_engine(f"postgresql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}/{DB_NAME}")

        # load the query text of the "log" model
        query = sqlalchemy.text(read_sql("log"))

        # Named bind parameters of a text() query take a plain mapping;
        # wrapping the dict in a tuple is not needed.
        with engine.connect() as connection :
            df = pd.read_sql(
                sql = query,
                con = connection,
                params = filter_params
            )
        return df
    except Exception as  e :
        logging.error(f"Cant execute your query. Error: {e}", exc_info=True)
        raise
    finally :
        # A fresh engine is built on every call; release its pool explicitly.
        if engine is not None :
            engine.dispose()

#### Extract Staging 

In [14]:
def extract_staging(table_name:str, schema_name:str) -> pd.DataFrame :
    """Incrementally extract rows from a staging table.

    Looks up the previous successful warehouse load of ``table_name`` through
    ``read_etl_log`` and selects only rows of ``schema_name.table_name`` whose
    ``created_at`` is later than that timestamp. When no previous load is
    recorded, ``1990-01-01`` is used as the lower bound. The outcome is
    written to the ETL log in both the success and the failure case.

    Args:
        table_name: Staging table to read.
        schema_name: Schema that contains the table.

    Returns:
        A DataFrame with the selected rows.

    Raises:
        Exception: Any error is logged, recorded in the ETL log and re-raised.
    """
    engine = None
    try :
        # create the database engine
        engine = create_engine(f"postgresql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}/{DB_NAME}")

        # get the date of the previous successful load from etl_log
        filter_log = {
            "step_name" : "warehouse",
            "table_name" : table_name,
            "status" : "success",
            "process" : "load"
        }
        previous_load = read_etl_log(filter_log)

        # No row at all, or a NULL/NaT aggregate, both mean "never loaded".
        # The column label is the string 'max', not the builtin max().
        last_etl_date = None if previous_load.empty else previous_load['max'].iloc[0]
        if pd.isna(last_etl_date) :
            etl_date = '1990-01-01'
        else :
            etl_date = last_etl_date

        # NOTE(review): schema_name and table_name are interpolated into the
        # SQL text, so they must come from trusted code, never from user input.
        query = f"SELECT * FROM {schema_name}.{table_name} WHERE created_at > %s::timestamp"

        # execute the query with pd.read_sql
        with engine.connect() as connection :
            df = pd.read_sql(
                sql=query,
                con=connection,
                params=(etl_date,)
            )
        log_msg = {
            "step"  : "warehouse",
            "process" : "extraction",
            "status" : "success",
            "source" :  "database",
            "table_name" : table_name,
            "etl_date" : datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }
        return df

    except Exception as  e :
        log_msg = {
            "step"  : "warehouse",
            "process" : "extraction",
            "status" : "failed",
            "source" :  "database",
            "table_name" : table_name,
            "etl_date" : datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "error_msg" : str(e)
        }
        logging.error(f"Cant execute your query. Error: {e}", exc_info=True)
        raise
    finally :
        # A fresh engine is built on every call; release its pool explicitly.
        if engine is not None :
            engine.dispose()
        etl_log(log_msg)

#### Handle Error MINIO

In [None]:
# create function to handle error data and upload it to minio
def handle_error(data, bucket_name: str, table_name: str, process: str) :
    """Upload ``data`` as a timestamped CSV object to a MinIO bucket.

    The bucket is created when it does not exist. The object is named
    ``<process>_<table_name>_<timestamp>.csv``. After the upload, every
    object name in the bucket is written to the log at INFO level.

    Args:
        data: Object exposing ``to_csv()`` (a DataFrame in this notebook).
        bucket_name: Target bucket.
        table_name: Table the data belongs to; part of the object name.
        process: Pipeline stage that failed; part of the object name.

    Raises:
        Exception: Any error is logged and re-raised.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    try :
        minio_client = Minio(
            'localhost:9000',
            access_key=ACCESS_KEY_MINIO,
            secret_key=SECRET_KEY_MINIO,
            secure=False,
        )

        # make sure the target bucket is there before uploading
        bucket_found = minio_client.bucket_exists(bucket_name)
        if not bucket_found :
            minio_client.make_bucket(bucket_name)

        # serialise the frame to CSV bytes once; reuse them for body and length
        payload = data.to_csv().encode('utf-8')

        minio_client.put_object(
            bucket_name=bucket_name,
            object_name=f"{process}_{table_name}_{timestamp}.csv",
            data=BytesIO(payload),
            length=len(payload),
            content_type='application/csv'
        )

        # report what the bucket currently holds
        for stored in minio_client.list_objects(bucket_name, recursive=True) :
            logging.info(f"Object in bucket: {stored.object_name}")
    except Exception as err :
        logging.error(f"Failed to upload error data to Minio. Error: {err}", exc_info=True)
        raise


#### Load Warehouse 

In [17]:
# create function for load data to data warehouse
def load_warehouse(data, schema:str, table_name:str, idx_name:str, source):
    """Upsert ``data`` into a warehouse table and record the outcome.

    ``idx_name`` is set as the frame index, which ``pangres.upsert`` uses as
    the key: rows that already exist are updated, the others are inserted.
    On failure the error is logged, the data is uploaded through
    ``handle_error`` and the function returns without re-raising. The
    outcome is written to the ETL log in both cases.

    Args:
        data: DataFrame to load.
        schema: Target schema.
        table_name: Target table.
        idx_name: Column (or list of columns) to use as the upsert key.
        source: Value stored in the ``source`` field of the log record.
    """
    engine = None
    try :
        # create the database engine
        engine = create_engine(f"postgresql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}/{DB_NAME}")

        # set data index or primary key
        data = data.set_index(idx_name)

        # do upsert ( insert for non existing data, update for existing data )
        with engine.connect() as connection :
            upsert(
                df = data,
                con = connection,
                table_name = table_name,
                schema = schema,
                if_row_exists = "update"
            )
        log_msg = {
            "step" : "warehouse",
            "process":"load",
            "status": "success",
            "source": source,
            "table_name": table_name,
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")  # Current timestamp
        }
    except Exception as e :
        log_msg = {
            "step" : "warehouse",
            "process":"load",
            "status": "failed",
            "source": source,
            "table_name": table_name,
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),  # Current timestamp
            "error_msg": str(e)

        }
        # Leave a trace with traceback in the log output as well; the
        # exception itself is intentionally not re-raised here.
        logging.error(f"Failed to load table {table_name}. Error: {e}", exc_info=True)
        handle_error(data, bucket_name="error-dellstore", table_name=table_name, process="load")
    finally :
        # A fresh engine is built on every call; release its pool explicitly.
        if engine is not None :
            engine.dispose()
        etl_log(log_msg)


#### Extract Target

In [16]:
# create function to get data from data warehouse to obtain value of foreign key
def extract_target(table_name:str) :
    """Read every row of ``public.<table_name>`` into a DataFrame.

    Used by the transform functions to look up surrogate keys.

    Args:
        table_name: Table in the ``public`` schema. It is interpolated into
            the SQL text, so it must come from trusted code.

    Returns:
        A DataFrame with all columns and rows of the table.
    """
    engine = create_engine(f"postgresql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}/{DB_NAME}")
    try :
        query = f"SELECT * FROM public.{table_name}"
        with engine.connect() as connection :
            df = pd.read_sql(
                sql = query,
                con = connection
            )
        return df
    finally :
        # A fresh engine is built on every call; release its pool explicitly.
        engine.dispose()

#### Transform Categories

In [23]:
## this function is for transformation table category from staging to data warehouse
## - rename column from category to category_nk
## - rename column from categoryname to category_name

def transform_categories(data: pd.DataFrame, table_name:str) -> pd.DataFrame :
    """Transform staging category rows into the warehouse layout.

    Renames ``category`` to ``category_nk`` and ``categoryname`` to
    ``category_name``, keeps the first row per ``category_nk`` and drops
    ``created_at``. The outcome is written to the ETL log.

    Args:
        data: Staging rows.
        table_name: Name stored in the log record and in the error object name.

    Returns:
        The transformed DataFrame, or ``None`` when the transformation fails
        (the failure is logged and the data is uploaded via ``handle_error``).
    """
    process = "transformation"
    try :
        # DataFrame.rename expects the keyword `columns`
        data = data.rename(columns={'category' : 'category_nk',
                                    'categoryname' : 'category_name'})

        # remove duplicates based on category_nk
        data = data.drop_duplicates(subset='category_nk')

        # drop column created_at
        data = data.drop(columns=['created_at'])

        log_msg = {
            "step" : "warehouse",
            "process": process,
            "status": "success",
            "source": "staging",
            "table_name": table_name,
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")  # Current timestamp
        }
        return data

    except Exception as e :
        log_msg = {
            "step" : "warehouse",
            "process": process,
            "status": "failed",
            "source": "staging",
            "table_name": table_name,
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),  # Current timestamp
            "error_msg" : str(e)
        }
        logging.error(f"Failed to transform table {table_name}. Error: {e}", exc_info=True)
        # Bucket names may not contain underscores; use the same bucket as
        # the other steps of this pipeline.
        handle_error(data = data,
                     bucket_name = 'error-dellstore',
                     table_name = table_name,
                     process = process)
    finally :
        etl_log(log_msg)


#### Transform Customer

In [24]:
## this function is for transformation table customer from staging to data warehouse
## - rename column from creditcardtype to credit_card_type
## - rename column from creditcard to credit_card
## - rename column from creditcardexpiration to credit_card_expiration
## - rename column from firstname to first_name 
## - rename column from lastname to last_name

def transform_customer(data: pd.DataFrame, table_name:str) -> pd.DataFrame :
    """Transform staging customer rows into the warehouse layout.

    Renames the customer columns to snake_case (``customerid`` becomes
    ``customer_nk``), keeps the first row per ``customer_nk``, masks every
    digit of ``credit_card`` except those in the last four characters and
    drops ``created_at``. The outcome is written to the ETL log.

    Args:
        data: Staging rows.
        table_name: Name stored in the log record and in the error object name.

    Returns:
        The transformed DataFrame, or ``None`` when the transformation fails
        (the failure is logged and the data is uploaded via ``handle_error``).
    """
    process = "transformation"

    def mask_card(number):
        # Missing card numbers stay missing instead of breaking the slice.
        if pd.isna(number) :
            return number
        text = str(number)
        return re.sub(r'\d', 'X', text[:-4]) + text[-4:]

    try :
        data = data.rename(columns={'customerid': 'customer_nk',
                            'firstname': 'first_name',
                            'lastname': 'last_name',
                            'creditcardtype': 'credit_card_type',
                            'creditcard': 'credit_card',
                            'creditcardexpiration': 'credit_card_expiration'})

        # remove duplicates based on customer_nk
        data = data.drop_duplicates(subset='customer_nk')

        # mask the credit card number; assign() returns a new frame, so no
        # write happens on a filtered view
        data = data.assign(credit_card=data['credit_card'].map(mask_card))

        # drop column created_at
        data = data.drop(columns=['created_at'])

        log_msg = {
            "step" : "warehouse",
            "process": process,
            "status": "success",
            "source": "staging",
            "table_name": table_name,
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")  # Current timestamp
        }
        return data

    except Exception as e :
        log_msg = {
            "step" : "warehouse",
            "process": process,
            "status": "failed",
            "source": "staging",
            "table_name": table_name,
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),  # Current timestamp
            "error_msg" : str(e)
        }
        logging.error(f"Failed to transform table {table_name}. Error: {e}", exc_info=True)
        # Bucket names may not contain underscores; use the same bucket as
        # the other steps of this pipeline.
        handle_error(data = data,
                     bucket_name = 'error-dellstore',
                     table_name = table_name,
                     process = process)
    finally :
        etl_log(log_msg)


#### Transform product

In [30]:
## this function is for transformation table products from staging to data warehouse
## - rename column from prod_id to product_nk
## - rename column from category to category_nk
## - lookup category_id from the categories table based on category_nk
def transform_product(data: pd.DataFrame, table_name:str) -> pd.DataFrame :
    """Transform staging product rows into the warehouse layout.

    Renames ``prod_id`` to ``product_nk`` and ``category`` to
    ``category_nk``, keeps the first row per ``product_nk``, resolves
    ``category_id`` from the warehouse ``categories`` table and drops
    ``created_at`` and ``category_nk``. The outcome is written to the ETL log.

    Args:
        data: Staging rows.
        table_name: Name stored in the log record and in the error object name.

    Returns:
        The transformed DataFrame, or ``None`` when the transformation fails
        (the failure is logged and the data is uploaded via ``handle_error``).
    """
    process = "transformation"
    try :
        # rename product columns
        data = data.rename(columns= {
            "prod_id" : "product_nk",
            "category" : "category_nk"
        })

        # remove duplicates based on product_nk
        data = data.drop_duplicates(subset='product_nk')

        # extract data from the 'categories' table
        categories = extract_target('categories')

        # Build the natural-key -> surrogate-key mapping once and apply it in
        # a single vectorised pass; the first match wins on duplicate keys.
        category_ids = (
            categories.drop_duplicates(subset='category_nk')
                      .set_index('category_nk')['category_id']
        )
        data = data.assign(category_id=data['category_nk'].map(category_ids))

        # A key without a match is an error; report which ones are missing.
        unmatched = data.loc[data['category_id'].isna(), 'category_nk']
        if not unmatched.empty :
            raise KeyError(f"category_nk values not found in categories: {unmatched.unique().tolist()}")

        # drop columns that are not part of the warehouse table
        data = data.drop(columns=['created_at','category_nk'])

        log_msg = {
            "step" : "warehouse",
            "process" : process,
            "status" : "success",
            "source" : "staging",
            "table_name"  : table_name,
            "etl_date" : datetime.now().strftime("%Y-%m-%d %H:%M:%S")  # Current timestamp
        }

        return data

    except Exception as e :
        log_msg = {
            "step" : "warehouse",
            "process" : process,
            "status" : "failed",
            "source" : "staging",
            "table_name"  : table_name,
            "etl_date" : datetime.now().strftime("%Y-%m-%d %H:%M:%S"),  # Current timestamp
            "error_msg" : str(e)
        }
        logging.error(f"Failed to transform table {table_name}. Error: {e}", exc_info=True)
        # Bucket names may not contain underscores; use the same bucket as
        # the other steps of this pipeline.
        handle_error(data = data,
                     bucket_name = 'error-dellstore',
                     table_name = table_name,
                     process = process)
    finally :
        etl_log(log_msg)

#### Transform Inventory

In [28]:
## this function is for transformation table inventory from staging to data warehouse
## - rename column quan_in_stock to quantity_in_stock
## - column product_id refers to products table using column product_nk

def transform_inventory(data: pd.DataFrame, table_name: str) -> pd.DataFrame :
    """Transform staging inventory rows into the warehouse layout.

    Renames ``quan_in_stock`` to ``quantity_in_stock`` and ``prod_id`` to
    ``product_nk``, keeps the first row per ``product_nk``, resolves
    ``product_id`` from the warehouse ``products`` table and drops
    ``created_at``. The outcome is written to the ETL log.

    Args:
        data: Staging rows.
        table_name: Name stored in the log record and in the error object name.

    Returns:
        The transformed DataFrame, or ``None`` when the transformation fails
        (the failure is logged and the data is uploaded via ``handle_error``).
    """
    process = "transformation"
    try :
        # rename inventory columns
        data = data.rename(columns={
            "quan_in_stock" : "quantity_in_stock",
            "prod_id" : "product_nk"
        })

        # remove duplicates based on product_nk
        data = data.drop_duplicates(subset='product_nk')

        # extract data from the 'products' table
        products = extract_target('products')

        # Build the natural-key -> surrogate-key mapping once and apply it in
        # a single vectorised pass; the first match wins on duplicate keys.
        product_ids = (
            products.drop_duplicates(subset='product_nk')
                    .set_index('product_nk')['product_id']
        )
        data = data.assign(product_id=data['product_nk'].map(product_ids))

        # A key without a match is an error; report which ones are missing.
        unmatched = data.loc[data['product_id'].isna(), 'product_nk']
        if not unmatched.empty :
            raise KeyError(f"product_nk values not found in products: {unmatched.unique().tolist()}")

        # drop column created_at
        data = data.drop(columns=['created_at'])

        log_msg = {
            "step" : "warehouse",
            "process": process,
            "status": "success",
            "source": "staging",
            "table_name": table_name,
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")  # Current timestamp
        }
        return data
    except Exception as e :
        log_msg = {
            "step" : "warehouse",
            "process": process,
            "status": "failed",
            "source": "staging",
            "table_name": table_name,
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),  # Current timestamp
            "error_msg" : str(e)
        }
        logging.error(f"Failed to transform table {table_name}. Error: {e}", exc_info=True)
        # Bucket names may not contain underscores; use the same bucket as
        # the other steps of this pipeline.
        handle_error(data = data,
                     bucket_name = 'error-dellstore',
                     table_name = table_name,
                     process = process)
    finally :
        etl_log(log_msg)



#### Transform Orders 

In [31]:
## this function is for transformation table orders from staging to data warehouse
## - rename column orderid to order_nk 
## - get customer_id values from customer table by matching customer_nk 
## - rename orderdate to order_date 
## - rename netamount to net_amount 
## - rename totalamount to total_amount 

def transform_orders(data: pd.DataFrame, table_name: str) -> pd.DataFrame :
    """Transform staging order rows into the warehouse layout.

    Renames the order columns to snake_case (``orderid`` becomes
    ``order_nk``, ``customerid`` becomes ``customer_nk``), resolves
    ``customer_id`` from the warehouse ``customers`` table and drops
    ``created_at`` and ``customer_nk``. The outcome is written to the ETL log.

    Args:
        data: Staging rows.
        table_name: Name stored in the log record and in the error object name.

    Returns:
        The transformed DataFrame, or ``None`` when the transformation fails
        (the failure is logged and the data is uploaded via ``handle_error``).
    """
    process = "transformation"
    try :
        # rename columns
        data = data.rename(columns = {
            "orderid" : "order_nk",
            "orderdate" : "order_date",
            "netamount" : "net_amount",
            "totalamount" : "total_amount",
            "customerid" : "customer_nk"
        })

        # extract table customers
        customers = extract_target("customers")

        # Build the natural-key -> surrogate-key mapping once and apply it in
        # a single vectorised pass; the first match wins on duplicate keys.
        customer_ids = (
            customers.drop_duplicates(subset='customer_nk')
                     .set_index('customer_nk')['customer_id']
        )
        data = data.assign(customer_id=data['customer_nk'].map(customer_ids))

        # A key without a match is an error; report which ones are missing.
        unmatched = data.loc[data['customer_id'].isna(), 'customer_nk']
        if not unmatched.empty :
            raise KeyError(f"customer_nk values not found in customers: {unmatched.unique().tolist()}")

        # drop created_at and customer_nk (customer_id identifies the customer)
        data = data.drop(columns = ['created_at', 'customer_nk'])

        log_msg = {
                "step" : "warehouse",
                "process": process,
                "status": "success",
                "source": "staging",
                "table_name": table_name,
                "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")  # Current timestamp
                }
        return data

    except Exception as e :
        log_msg = {
                "step" : "warehouse",
                "process": process,
                "status": "failed",
                "source": "staging",
                "table_name": table_name,
                "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                "error_msg" : str(e)
                }
        logging.error(f"Failed to transform table {table_name}. Error: {e}", exc_info=True)
        handle_error(
            data=data,
            bucket_name='error-dellstore',
            table_name = table_name,
            process=process
        )

    finally :
        etl_log(log_msg)


#### Transform Orderline

In [33]:
## this function transforms the orderlines table from staging to data warehouse 
## - rename column orderlineid to orderline_nk
## - rename column orderid to order_nk and lookup order_id from the orders table based on order_nk
## - rename column prod_id to product_nk and lookup product_id from the products table based on product_nk
## - rename column orderdate to order_date 

def transform_orderlines(data: pd.DataFrame, table_name: str) -> pd.DataFrame :
    """Transform staging orderline rows into the warehouse layout.

    Renames ``orderlineid`` to ``orderline_nk``, the order key to
    ``order_nk``, ``prod_id`` to ``product_nk`` and ``orderdate`` to
    ``order_date``; resolves ``order_id`` from the warehouse ``orders``
    table and ``product_id`` from the warehouse ``products`` table; then
    drops ``created_at``, ``order_nk`` and ``product_nk``. The outcome is
    written to the ETL log.

    Args:
        data: Staging rows.
        table_name: Name stored in the log record and in the error object name.

    Returns:
        The transformed DataFrame, or ``None`` when the transformation fails
        (the failure is logged and the data is uploaded via ``handle_error``).
    """
    process = "transformation"

    def lookup_ids(keys: pd.Series, target: pd.DataFrame, nk_col: str, id_col: str) -> pd.Series :
        # Map natural keys to surrogate keys in one vectorised pass; the
        # first match wins on duplicate keys, a missing key is an error.
        id_by_nk = target.drop_duplicates(subset=nk_col).set_index(nk_col)[id_col]
        ids = keys.map(id_by_nk)
        if ids.isna().any() :
            missing = keys[ids.isna()].unique().tolist()
            raise KeyError(f"{nk_col} values not found in lookup table: {missing}")
        return ids

    try :
        # The sibling transforms and the notes above this function name the
        # staging order key `orderid`; `order_id` is still accepted as a
        # fallback. NOTE(review): confirm against the staging schema.
        order_source_col = "orderid" if "orderid" in data.columns else "order_id"

        # rename columns
        data = data.rename(columns= {
            "orderlineid" : "orderline_nk",
            order_source_col : "order_nk",
            "prod_id" : "product_nk",
            "orderdate"  :"order_date"
        })

        # lookup order_id in the orders table
        orders = extract_target("orders")
        data = data.assign(order_id=lookup_ids(data['order_nk'], orders, 'order_nk', 'order_id'))

        # lookup product_id in the products table (not in orders)
        products = extract_target("products")
        data = data.assign(product_id=lookup_ids(data['product_nk'], products, 'product_nk', 'product_id'))

        # drop unnecessary columns
        data = data.drop(columns = ['created_at', 'order_nk', 'product_nk'])

        log_msg = {
                "step" : "warehouse",
                "process": process,
                "status": "success",
                "source": "staging",
                "table_name": table_name,
                "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")  # Current timestamp
                }

        return data

    except Exception as e :
        log_msg = {
            "step" : "warehouse",
            "process": process,
            "status": "failed",
            "source": "staging",
            "table_name": table_name,
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),  # Current timestamp
            "error_msg": str(e)
            }
        logging.error(f"Failed to transform table {table_name}. Error: {e}", exc_info=True)
        # Bucket names may not contain underscores; use the same bucket as
        # the other steps of this pipeline.
        handle_error(
            data= data,
            bucket_name= 'error-dellstore',
            table_name = table_name,
            process= process
        )

    finally :
        etl_log(log_msg)


#### Transform customer_orders_history

Target:
- Table customers
- Table product
- Table orders
- Table orderlines

In [34]:
def transform_order_hist_cust(data: pd.DataFrame, table_name: str) -> pd.DataFrame:
    """Derive customer rows from the customer order history data.

    Strips the ``customer_`` prefix from the customer columns
    (``customer_id`` becomes ``customer_nk``), keeps only the customer
    columns and keeps the first row per ``customer_nk``. The outcome is
    written to the ETL log.

    Args:
        data: Staging rows.
        table_name: Name stored in the log record and in the error object name.

    Returns:
        The transformed DataFrame, or ``None`` when the transformation fails
        (the failure is logged and the data is uploaded via ``handle_error``).
    """
    process = "transformation"
    try:
        # rename columns for customers
        data = data.rename(columns={
                                'customer_id': 'customer_nk',
                                'customer_firstname': 'first_name',
                                'customer_lastname': 'last_name',
                                'customer_address1': 'address1',
                                'customer_address2': 'address2',
                                'customer_city': 'city',
                                'customer_state': 'state',
                                'customer_zip': 'zip',
                                'customer_country': 'country',
                                'customer_region': 'region',
                                'customer_email': 'email',
                                'customer_phone': 'phone',
                                'customer_creditcardtype': 'credit_card_type',
                                'customer_creditcard': 'credit_card',
                                'customer_creditcardexpiration': 'credit_card_expiration',
                                'customer_username': 'username',
                                'customer_password': 'password',
                                'customer_age': 'age',
                                'customer_income': 'income',
                                'customer_gender': 'gender'
                            })

        columns_to_keep = [
            'customer_nk', 'customer_id', 'first_name', 'last_name',
            'address1', 'address2', 'city', 'state', 'zip',
            'country', 'region', 'email', 'phone',
            'credit_card_type', 'credit_card', 'credit_card_expiration',
            'username', 'password', 'age', 'income', 'gender'
        ]

        # Keep only the customer columns, in their current order.
        # NOTE(review): unlike transform_customer, credit_card is not masked
        # here - confirm whether that is intended.
        data = data.loc[:, data.columns.isin(columns_to_keep)]

        # Deduplication based on customer_nk
        data = data.drop_duplicates(subset='customer_nk')

        log_msg = {
                "step" : "warehouse",
                "process": process,
                "status": "success",
                "source": "staging",
                "table_name": table_name,
                "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")  # Current timestamp
                }

        return data
    except Exception as e:
        log_msg = {
            "step" : "warehouse",
            "process": process,
            "status": "failed",
            "source": "staging",
            "table_name": table_name,
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),  # Current timestamp
            "error_msg": str(e)
            }
        # Report through logging (with traceback) instead of a bare print.
        logging.error(f"Failed to transform table {table_name}. Error: {e}", exc_info=True)
        handle_error(data = data, bucket_name='error-dellstore', table_name= table_name, process=process)
    finally:
        # Save the log message
        etl_log(log_msg)



#### Transform Customer History 

In [36]:
def transform_cust_hist(data: pd.DataFrame, table_name: str) -> pd.DataFrame:
    """Transform staging customer-history rows into the warehouse layout.

    Renames ``customerid``, ``prod_id`` and ``orderid`` to their ``*_nk``
    names, resolves ``customer_id``, ``order_id`` and ``product_id`` from
    the warehouse ``customers``, ``orders`` and ``products`` tables and
    drops the three natural-key columns. The outcome is written to the ETL
    log.

    Args:
        data: Staging rows.
        table_name: Name stored in the log record and in the error object name.

    Returns:
        The transformed DataFrame, or ``None`` when the transformation fails
        (the failure is logged and the data is uploaded via ``handle_error``).
    """
    process = "transformation"

    def lookup_ids(keys: pd.Series, target: pd.DataFrame, nk_col: str, id_col: str) -> pd.Series:
        # Map natural keys to surrogate keys in one vectorised pass; the
        # first match wins on duplicate keys, a missing key is an error.
        id_by_nk = target.drop_duplicates(subset=nk_col).set_index(nk_col)[id_col]
        ids = keys.map(id_by_nk)
        if ids.isna().any():
            missing = keys[ids.isna()].unique().tolist()
            raise KeyError(f"{nk_col} values not found in lookup table: {missing}")
        return ids

    try:
        # rename the key columns
        data = data.rename(columns={'customerid':'customer_nk',
                                    'prod_id':'product_nk',
                                    'orderid':'order_nk'})

        # Lookup `customer_id` in the `customers` table
        customers = extract_target('customers')
        data = data.assign(customer_id=lookup_ids(data['customer_nk'], customers, 'customer_nk', 'customer_id'))

        # Lookup `order_id` in the `orders` table
        orders = extract_target('orders')
        data = data.assign(order_id=lookup_ids(data['order_nk'], orders, 'order_nk', 'order_id'))

        # Lookup `product_id` in the `products` table
        products = extract_target('products')
        data = data.assign(product_id=lookup_ids(data['product_nk'], products, 'product_nk', 'product_id'))

        # drop unnecessary columns
        data = data.drop(columns=['customer_nk','order_nk','product_nk'])

        log_msg = {
                "step" : "warehouse",
                "process": process,
                "status": "success",
                "source": "staging",
                "table_name": table_name,
                "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")  # Current timestamp
                }

        return data
    except Exception as e:
        log_msg = {
            "step" : "warehouse",
            "process": process,
            "status": "failed",
            "source": "staging",
            "table_name": table_name,
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),  # Current timestamp
            "error_msg": str(e)
            }
        logging.error(f"Failed to transform table {table_name}. Error: {e}", exc_info=True)
        handle_error(data = data, bucket_name='error-dellstore', table_name= table_name, process=process)
    finally:
        # Save the log message
        etl_log(log_msg)


#### Transform order status analytics

In [37]:
def transform_order_status_analytic(data: pd.DataFrame, table_name: str) -> pd.DataFrame:
    """Transform staging order-status rows into the warehouse layout.

    Renames ``orderid`` to ``order_nk``, resolves ``order_id`` from the
    warehouse ``orders`` table and drops ``created_at`` (``order_nk`` is
    kept). The outcome is written to the ETL log.

    Args:
        data: Staging rows.
        table_name: Name stored in the log record and in the error object name.

    Returns:
        The transformed DataFrame, or ``None`` when the transformation fails
        (the failure is logged and the data is uploaded via ``handle_error``).
    """
    process = "transformation"
    try:
        # rename the order key
        data = data.rename(columns={'orderid':'order_nk'})

        # Extract data from the `orders` table
        orders = extract_target('orders')

        # Build the natural-key -> surrogate-key mapping once and apply it in
        # a single vectorised pass; the first match wins on duplicate keys.
        order_ids = (
            orders.drop_duplicates(subset='order_nk')
                  .set_index('order_nk')['order_id']
        )
        data = data.assign(order_id=data['order_nk'].map(order_ids))

        # A key without a match is an error; report which ones are missing.
        unmatched = data.loc[data['order_id'].isna(), 'order_nk']
        if not unmatched.empty:
            raise KeyError(f"order_nk values not found in orders: {unmatched.unique().tolist()}")

        # drop unnecessary columns
        data = data.drop(columns='created_at')

        log_msg = {
                "step" : "warehouse",
                "process": process,
                "status": "success",
                "source": "staging",
                "table_name": table_name,
                "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")  # Current timestamp
                }

        return data
    except Exception as e:
        log_msg = {
            "step" : "warehouse",
            "process": process,
            "status": "failed",
            "source": "staging",
            "table_name": table_name,
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),  # Current timestamp
            "error_msg": str(e)
            }
        logging.error(f"Failed to transform table {table_name}. Error: {e}", exc_info=True)
        handle_error(data = data, bucket_name='error-dellstore', table_name= table_name, process=process)
    finally:
        # Save the log message
        etl_log(log_msg)

