
# Real-World Modular ETL Pipeline (Python)

Notebook ini menunjukkan contoh implementasi ETL nyata yang dimodularisasi dengan Python. Struktur ini dapat digunakan untuk pipeline skala kecil hingga menengah, dan dapat diadaptasi untuk penggunaan cloud, warehouse, atau batch jobs.


In [None]:

# Struktur Folder Modular ETL (untuk produksi):
# ├── etl/
# │   ├── __init__.py
# │   ├── extract.py
# │   ├── transform.py
# │   └── load.py
# ├── config/
# │   └── settings.py
# ├── main.py
# └── data/
#     └── raw_data.csv

# Untuk keperluan notebook, kita akan simulasikan semua modulnya.


In [None]:

import pandas as pd

# Simulated raw CSV data, written row by row (one tuple per customer).
# Dates and purchase amounts are stored as strings here.
df = pd.DataFrame(
    [
        (101, 'Alice', '2021-06-01', '100.5'),
        (102, 'Bob', '2022-01-15', '200'),
        (103, 'Charlie', '2020-09-30', '300.75'),
    ],
    columns=['customer_id', 'name', 'join_date', 'purchase_amount'],
)
# Write the frame out without the index column.
df.to_csv("raw_data.csv", index=False)


In [None]:

# etl/extract.py
import pandas as pd

def extract_from_csv(path: str) -> pd.DataFrame:
    """Read the CSV at ``path`` into a DataFrame.

    Best-effort: any exception raised while reading is caught and reported
    on stdout, and an empty ``pd.DataFrame()`` is returned in its place, so
    this function itself does not raise.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The parsed frame on success, otherwise an empty DataFrame.
    """
    try:
        frame = pd.read_csv(path)
        print("✅ Data berhasil diekstrak")
    except Exception as err:
        print(f"❌ Gagal extract: {err}")
        # Fall back to an empty frame so the caller always gets a DataFrame.
        frame = pd.DataFrame()
    return frame


In [None]:

# etl/transform.py
import pandas as pd


def transform_customer_data(df: pd.DataFrame, *, reference_date=None) -> pd.DataFrame:
    """Clean and enrich the raw customer table.

    Steps applied to a copy of ``df``:
      * ``join_date`` is parsed with ``pd.to_datetime``;
      * ``purchase_amount`` is cast to ``float``;
      * ``name`` is upper-cased;
      * ``customer_lifetime`` is added as the whole number of days between
        ``join_date`` and the reference date.

    The input frame is never modified. If any step fails, the error is
    reported on stdout and the original, untouched ``df`` is returned, so the
    caller never receives a half-transformed frame.

    Args:
        df: Raw customer data with ``join_date``, ``purchase_amount`` and
            ``name`` columns.
        reference_date: Optional date (anything ``pd.Timestamp`` accepts) used
            as "now" when computing ``customer_lifetime``. Defaults to the
            current time, which makes the result depend on when it runs.

    Returns:
        A new transformed DataFrame on success, otherwise ``df`` unchanged.
    """
    try:
        # Work on a copy: assigning columns on `df` itself would leak partial
        # results to the caller when a later step raises.
        result = df.copy()
        result['join_date'] = pd.to_datetime(result['join_date'])
        result['purchase_amount'] = result['purchase_amount'].astype(float)
        result['name'] = result['name'].str.upper()
        as_of = pd.Timestamp.now() if reference_date is None else pd.Timestamp(reference_date)
        result['customer_lifetime'] = (as_of - result['join_date']).dt.days
        print("✅ Data berhasil ditransformasi")
        return result
    except Exception as e:
        # Best-effort contract: report the problem and hand back the input.
        print(f"❌ Gagal transform: {e}")
        return df


In [None]:

# etl/load.py
def load_to_csv(df, output_path) -> bool:
    """Write ``df`` to ``output_path`` as CSV without the index column.

    Best-effort: any exception raised while writing is caught and reported on
    stdout instead of propagating. The return value tells the caller whether
    the write actually happened, so a failed load no longer goes unnoticed.

    Args:
        df: DataFrame to persist.
        output_path: Destination handed to ``DataFrame.to_csv``.

    Returns:
        ``True`` if the file was written, ``False`` if writing failed.
    """
    try:
        df.to_csv(output_path, index=False)
        print(f"✅ Data berhasil dimuat ke {output_path}")
        return True
    except Exception as e:
        print(f"❌ Gagal load: {e}")
        return False


In [None]:

# config/settings.py
# Pipeline configuration: file locations shared by the ETL steps.

# Path of the raw CSV file that the pipeline reads (relative path).
INPUT_PATH = "raw_data.csv"
# Path of the CSV file that the pipeline writes its result to (relative path).
OUTPUT_PATH = "cleaned_data.csv"


In [None]:

# main.py (ETL pipeline entry point)

# Import paths follow the project layout sketched for this pipeline: the ETL
# steps live in the `etl` package and the paths in `config/settings.py`.
from etl.extract import extract_from_csv
from etl.transform import transform_customer_data
from etl.load import load_to_csv
from config.settings import INPUT_PATH, OUTPUT_PATH


def main_etl():
    """Run the extract -> transform -> load pipeline once.

    Reads ``INPUT_PATH``, transforms the customer data and writes the result
    to ``OUTPUT_PATH``. If extraction yields a frame with no columns at all,
    the run stops before the load step so that an existing output file is not
    overwritten with an empty one.
    """
    df_raw = extract_from_csv(INPUT_PATH)
    # extract_from_csv returns a bare `pd.DataFrame()` when reading fails; a
    # column-less frame therefore means there is nothing to transform or load.
    if df_raw.columns.empty:
        print("⚠️ Pipeline dihentikan: ekstraksi tidak menghasilkan data")
        return
    df_clean = transform_customer_data(df_raw)
    load_to_csv(df_clean, OUTPUT_PATH)


# Run only when executed as a script (or notebook cell), not when imported.
if __name__ == "__main__":
    main_etl()


In [None]:

# Simulate running the ETL pipeline directly inside the notebook.
# NOTE(review): these imports only resolve if extract.py, transform.py and
# load.py are importable as top-level modules; the folder layout sketched
# earlier places them under etl/ — confirm before running.
from extract import extract_from_csv
from transform import transform_customer_data
from load import load_to_csv

# Extract -> transform -> load, using literal file paths.
df = extract_from_csv("raw_data.csv")
df_clean = transform_customer_data(df)
load_to_csv(df_clean, "cleaned_data.csv")
