In [1]:
import importlib
import dataingestion.DataIngestion
import datastorage.DataStorage
import datavalidation.DataValidation
import datapreparation.DataPreparation
import datatransformationandstorage.DataTransformationAndStorage
import featurestore.FeatureStore
import dataversioning.DataVersioning
import modelbuild.ModelBuild

# Reload each pipeline module, in the same order they were imported, so the
# `from ... import ...` statements that follow bind the freshly loaded code.
for _stage_module in (
    dataingestion.DataIngestion,
    datastorage.DataStorage,
    datavalidation.DataValidation,
    datapreparation.DataPreparation,
    datatransformationandstorage.DataTransformationAndStorage,
    featurestore.FeatureStore,
    dataversioning.DataVersioning,
    modelbuild.ModelBuild,
):
    importlib.reload(_stage_module)


from prefect import task, flow, get_run_logger
from dataingestion.DataIngestion import load_csv, load_api, load_db
from datastorage.DataStorage import save_csv_or_db, save_api
from datavalidation.DataValidation import validate_churn_data
from datapreparation.DataPreparation import preprocess_and_eda
from datatransformationandstorage.DataTransformationAndStorage import transform_and_store
from featurestore.FeatureStore  import create_feature_store, sample_feature_queries
from dataversioning.DataVersioning import save_and_version_both
from modelbuild.ModelBuild import run_training
import sqlite3  

@task
def ingest_data():
    """Load the churn dataset from its hosted CSV and return it.

    Returns:
        The object produced by ``load_csv``; its ``.shape`` is logged, so it
        is presumably a pandas DataFrame (TODO confirm against ``load_csv``).
    """
    run_log = get_run_logger()
    source_url = (
        "https://synapseaisolutionsa.z13.web.core.windows.net"
        "/data/bankcustomerchurn/churn.csv"
    )
    run_log.info(f"📥 Ingesting data from {source_url}")
    # NOTE(review): the URL is passed for both positional arguments; confirm
    # this matches the signature of ``load_csv``.
    raw_frame = load_csv(source_url, source_url)
    run_log.info(f"✅ Data ingestion complete. Shape: {raw_frame.shape}")
    return raw_frame

@task
def store_data(df_csv):
    """Hand the ingested data to ``save_csv_or_db`` and report the target.

    Args:
        df_csv: The ingested dataset to store.

    Returns:
        str: The storage directory name, ``"datastorage"``.
    """
    run_log = get_run_logger()
    target_dir = "datastorage"
    # The "csv" argument presumably selects the storage backend -- verify
    # against ``save_csv_or_db``.
    save_csv_or_db(df_csv, target_dir, "csv")
    run_log.info(f"✅ Data stored at {target_dir}")
    return target_dir

@task
def validate_data(df_csv):
    """Validate the ingested data and log a summary of the outcome.

    Args:
        df_csv: The ingested dataset to validate.

    Returns:
        tuple: ``(issues, metadata)`` as produced by ``validate_churn_data``.
    """
    run_log = get_run_logger()
    report_dir = "datavalidation/reports"
    # "pdf" is presumably the report format -- verify against
    # ``validate_churn_data``.
    issues, metadata = validate_churn_data(df_csv, report_dir, "pdf")
    issue_count = len(issues)
    run_log.info(f"🔍 Validation complete. Issues: {issue_count} Metadata: {metadata}")
    return issues, metadata

@task
def prepare_data(df_csv):
    """Run ``preprocess_and_eda`` on the ingested data.

    Args:
        df_csv: The ingested dataset.

    Returns:
        The processed dataset returned by ``preprocess_and_eda``; its
        ``.shape`` is logged.
    """
    run_log = get_run_logger()
    output_dir = "datapreparation/prepared"
    prepared = preprocess_and_eda(df_csv, output_dir)
    run_log.info(f"✅ Data preparation complete. Shape: {prepared.shape}")
    return prepared

@task
def transform_data(df_processed):
    """Run ``transform_and_store`` on the prepared data.

    Args:
        df_processed: The dataset produced by the preparation step.

    Returns:
        The transformed dataset returned by ``transform_and_store``; its
        ``.shape`` is logged.
    """
    run_log = get_run_logger()
    output_dir = "datatransformationandstorage/transformationandstorage"
    # "churn" is passed through unchanged; its meaning is defined by
    # ``transform_and_store`` (TODO confirm).
    transformed = transform_and_store(df_processed, output_dir, "churn")
    run_log.info(f"✅ Data transformation complete. Shape: {transformed.shape}")
    return transformed

@task
def build_feature_store(df_txfnstr):
    """Create the feature store, run the sample queries, and release the connection.

    Args:
        df_txfnstr: The transformed dataset to load into the feature store.

    Returns:
        tuple: ``(df_feature, db_path)`` as returned by
        ``create_feature_store`` (the connection itself is not returned).
    """
    logger = get_run_logger()
    base_path = "featurestore/featurestore"
    df_feature, conn, db_path = create_feature_store(df_txfnstr, base_path)
    try:
        sample_feature_queries(conn, base_path)
    finally:
        # The connection is not handed back to the caller, so close it here
        # instead of leaving it open until garbage collection.
        # NOTE(review): assumes ``conn`` is a DB-API style connection exposing
        # ``close()`` -- confirm against ``create_feature_store``.
        conn.close()
    logger.info(f"✅ Feature store created at {base_path}, DB path: {db_path}")
    return df_feature, db_path

@task
def version_data(df_csv, df_feature):
    """Pass the raw and feature datasets to ``save_and_version_both``.

    Args:
        df_csv: The raw ingested dataset.
        df_feature: The dataset returned by the feature-store step.
    """
    run_log = get_run_logger()
    raw_target = "dataversioning/raw/churn_raw.csv"
    transformed_target = "dataversioning/transformed/churn_transformed_v1.csv"
    # The last two arguments are forwarded as-is; presumably a tracked file
    # name and a commit message -- verify against ``save_and_version_both``.
    save_and_version_both(
        df_csv,
        df_feature,
        raw_target,
        transformed_target,
        "churn_raw.csv",
        "Changes_Commited",
    )
    run_log.info("✅ Data versioning complete.")

@task
def train_model(db_path):
    """Invoke ``run_training`` with the feature-store database path.

    Args:
        db_path: Database path returned by the feature-store step.
    """
    run_log = get_run_logger()
    run_training(db_path)
    run_log.info("✅ Model training complete.")


def _build_pipeline_graph():
    """Build the Graphviz diagram of the pipeline's task dependencies.

    Returns:
        graphviz.Digraph: A graph with one node per task and one edge per
        dependency, created with ``format="png"``.
    """
    # Function-scope import: graphviz is only needed for the diagram and is
    # not imported at the top of this cell.
    from graphviz import Digraph

    dot = Digraph(comment="Churn ML Pipeline", format="png")

    # Nodes: task name -> display label
    node_labels = {
        "ingest_data": "📥 Ingest Data",
        "store_data": "💾 Store Data",
        "validate_data": "🔍 Validate Data",
        "prepare_data": "⚙️ Prepare Data",
        "transform_data": "🔄 Transform Data",
        "build_feature_store": "🏗️ Build Feature Store",
        "version_data": "🗂️ Version Data",
        "train_model": "🤖 Train Model",
    }
    for node_id, label in node_labels.items():
        dot.node(node_id, label)

    # Edges (dependencies): (upstream, downstream)
    dependencies = [
        ("ingest_data", "store_data"),
        ("ingest_data", "validate_data"),
        ("ingest_data", "prepare_data"),
        ("prepare_data", "transform_data"),
        ("transform_data", "build_feature_store"),
        ("build_feature_store", "train_model"),
        ("ingest_data", "version_data"),
        ("build_feature_store", "version_data"),
    ]
    for upstream, downstream in dependencies:
        dot.edge(upstream, downstream)

    return dot


@flow(name="Churn ML Pipeline Orchestration")
def churn_pipeline():
    """Run every pipeline task in order, then show the dependency diagram.

    The tasks run sequentially: ingest, store, validate, prepare, transform,
    build the feature store, version the data, and train the model. Once they
    have finished, the task-dependency graph is built and displayed inline
    when IPython is available.
    """
    df_csv = ingest_data()
    store_data(df_csv)
    validate_data(df_csv)
    df_processed = prepare_data(df_csv)
    df_txfnstr = transform_data(df_processed)
    df_feature, db_path = build_feature_store(df_txfnstr)
    version_data(df_csv, df_feature)
    train_model(db_path)

    print("✅ Pipeline complete!")

    dot = _build_pipeline_graph()

    # Show graph inline in Jupyter. A bare ``dot`` expression is only rendered
    # when it is the last statement of a cell; inside a function body it does
    # nothing, so the graph has to be displayed explicitly.
    try:
        from IPython.display import display
    except ImportError:
        # No IPython available (e.g. run as a plain script): nothing to
        # render inline.
        return
    display(dot)


# Entry point: run the flow only when this code executes as the main module,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    churn_pipeline()

ModuleNotFoundError: No module named 'sqlalchemy'

In [2]:
!prefect orion start

Usage: prefect [OPTIONS] COMMAND [ARGS]...
Try 'prefect --help' for help.
╭─ Error ──────────────────────────────────────────────────────────────────────╮
│ No such command 'orion'.                                                     │
╰──────────────────────────────────────────────────────────────────────────────╯
