Part 3: ETL Strategies
    1. Incremental Data Load:

In [1]:
import pandas as pd
from sqlalchemy import create_engine, text
import sqlalchemy
import logging

# -------------------------------
# Setup logging
# -------------------------------
logging.basicConfig(
    filename="etl_errors.log",
    level=logging.ERROR,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

# -------------------------------
# Database connection
# -------------------------------
engine = create_engine("sqlite:///sales_dw.db", echo=False)

# -------------------------------
# Step 1: Extract new daily data
# -------------------------------
try:
    new_sales = pd.read_csv("../data/sales_data_daily_update.csv")

    # Add TotalSales
    new_sales["TotalSales"] = new_sales["QuantitySold"] * new_sales["Price"]

except Exception as e:
    logging.error(f"Error reading daily update CSV: {e}")
    raise

try:


    # Add TotalSales
    new_sales["TotalSales"] = new_sales["QuantitySold"] * new_sales["Price"]

except Exception as e:
    logging.error(f"Error reading daily update CSV: {e}")
    raise

autohandle_dates = True
# Handle invalid dates gracefully by coercing errors to NaT
if autohandle_dates:
    new_sales["DateID"] = pd.to_datetime(new_sales["Date"], errors="coerce").dt.date

display(new_sales)


Unnamed: 0,Date,ProductID,ProductName,QuantitySold,Price,Category,CustomerID,TotalSales,DateID
0,2023-02-23,1,Widget A,5,2.5,Gadgets,101,12.5,2023-02-23
1,2023-02-24,2,Gadget B,3,3.0,Gadgets,102,9.0,2023-02-24
2,2023-02-25,3,Widget C,8,4.0,Widgets,103,32.0,2023-02-25
3,2023-02-26,4,Gizmo D,2,5.0,Gizmos,104,10.0,2023-02-26
4,2023-02-27,5,Widget E,6,6.0,Widgets,105,36.0,2023-02-27
5,2023-02-28,6,Gadget F,7,7.0,Gadgets,106,49.0,2023-02-28
6,2023-02-29,7,Gizmo G,4,8.0,Gizmos,101,32.0,NaT



Step 2: Data Quality Checks


In [None]:


def validate_sales(df: pd.DataFrame) -> bool:
    try:
        # Required columns check
        required_cols = ["Date", "ProductID", "CustomerID", "QuantitySold", "Price"]
        for col in required_cols:
            if df[col].isnull().any():
                raise ValueError(f"Missing values in column: {col}")

        # Type checks
        if not pd.api.types.is_integer_dtype(df["QuantitySold"]):
            raise TypeError("QuantitySold must be integer")
        if not pd.api.types.is_float_dtype(df["Price"]):
            raise TypeError("Price must be float")

        # Business rules
        if (df["QuantitySold"] < 0).any():
            raise ValueError("Negative QuantitySold found")
        if (df["Price"] < 0).any():
            raise ValueError("Negative Price found")
        if df["DateID"].isnull().any(): # Check for invalid dates
            raise ValueError("Invalid dates found in Date column")

        return True

    except Exception as e:
        logging.error(f"Data quality check failed: {e}")
        return False

if not validate_sales(new_sales):
    print("Data quality check failed. Aborting ETL.")
    exit(1)


Data quality check failed. Aborting ETL.


3. incremental load with Error Handling

In [3]:
# -------------------------------
# Load new data into stage table
# -------------------------------
try:

    # Overwrite staging table each run
    new_sales.to_sql(
        "stg_sales",
        con=engine,
        if_exists="replace",   # staging is always replaced
        index=False,
        dtype={
            "DateID": sqlalchemy.types.Date(),
            "ProductID": sqlalchemy.types.Integer(),
            "ProductName": sqlalchemy.types.String(),
            "QuantitySold": sqlalchemy.types.Integer(),
            "Price": sqlalchemy.types.Float(),
            "Category": sqlalchemy.types.String(),
            "CustomerID": sqlalchemy.types.Integer(),
            "TotalSales": sqlalchemy.types.Float()
        }
    )

except Exception as e:
    logging.error(f"Error loading staging table: {e}")
    raise

# -------------------------------
# Incremental insert into fact_sales
# -------------------------------
try:
    with engine.begin() as conn:  # auto-commit transaction
        # Insert only rows that do not exist yet
        conn.execute(text("""
            INSERT INTO fact_sales (DateID, ProductID, ProductName, QuantitySold, Price, Category, CustomerID, TotalSales)
            SELECT s.DateID, s.ProductID, s.ProductName, s.QuantitySold, s.Price, s.Category, s.CustomerID, s.TotalSales
            FROM stg_sales s
            LEFT JOIN fact_sales f
              ON s.DateID = f.DateID
             AND s.ProductID = f.ProductID
             AND s.CustomerID = f.CustomerID
            WHERE f.DateID IS NULL and s.DateID IS NOT NULL
        """))

    print("Incremental load complete.")

except Exception as e:
    logging.error(f"Error during incremental insert: {e}")
    raise

Incremental load complete.


In [5]:
query = """

SELECT ProductID,	ProductName,	QuantitySold,	Price,	Category,	CustomerID,	TotalSales,	DateID
FROM fact_sales
  
"""

result_df = pd.read_sql_query(query, con=engine)
display(result_df)

Unnamed: 0,ProductID,ProductName,QuantitySold,Price,Category,CustomerID,TotalSales,DateID
0,9,Gadget I,10,10.0,Gadgets,102,100.0,2023-02-01
1,10,Gizmo J,3,11.0,Gizmos,103,33.0,2023-02-02
2,1,Widget A,6,2.5,Gadgets,104,15.0,2023-02-03
3,2,Gadget B,9,3.0,Gadgets,105,27.0,2023-02-04
4,3,Widget C,8,4.0,Widgets,101,32.0,2023-02-05
5,4,Gizmo D,6,5.0,Gizmos,102,30.0,2023-02-06
6,5,Widget E,7,6.0,Widgets,103,42.0,2023-02-07
7,6,Gadget F,5,7.0,Gadgets,104,35.0,2023-02-08
8,7,Gizmo G,2,8.0,Gizmos,105,16.0,2023-02-09
9,8,Widget H,11,9.0,Widgets,101,99.0,2023-02-10
