# Notebook cell In [21]
import importlib
import dataingestion.DataIngestion
import datastorage.DataStorage
import datavalidation.DataValidation
import datapreparation.DataPreparation
import datatransformationandstorage.DataTransformationAndStorage
import featurestore.FeatureStore
import dataversioning.DataVersioning
import modelbuild.ModelBuild
import os
# Re-execute the already-imported pipeline modules (same order as before) so
# that edits made to them on disk are picked up without restarting the kernel.
for _pipeline_module in (
    dataingestion.DataIngestion,
    datastorage.DataStorage,
    datavalidation.DataValidation,
    datapreparation.DataPreparation,
    datatransformationandstorage.DataTransformationAndStorage,
    featurestore.FeatureStore,
    dataversioning.DataVersioning,
    modelbuild.ModelBuild,
):
    importlib.reload(_pipeline_module)
del _pipeline_module  # do not leave the loop variable behind in the namespace


from prefect import task, flow, get_run_logger
from prefect.tasks import Task
from dataingestion.DataIngestion import load_csv, load_api, load_db
from datastorage.DataStorage import save_csv_or_db, save_api
from datavalidation.DataValidation import validate_churn_data
from datapreparation.DataPreparation import preprocess_and_eda
from datatransformationandstorage.DataTransformationAndStorage import transform_and_store
from featurestore.FeatureStore  import create_feature_store, sample_feature_queries
from dataversioning.DataVersioning import save_and_version_both
from modelbuild.ModelBuild import run_training
import sqlite3  
from graphviz import Digraph
import os
# NOTE(review): this assigns a plain class attribute on Digraph. In the graphviz
# package `format` appears to be a property, so this assignment may shadow it
# instead of changing the default output format -- confirm against the installed
# graphviz version. draw_dag() passes format="png" explicitly in any case.
Digraph.format = "png"   # intended to make the graph render inline as PNG


# Edges of the pipeline diagram: each key is an upstream task and its list holds
# the tasks that consume its output in churn_pipeline(). The mapping is only
# used for drawing (draw_dag); it does not drive execution order.
dag_dependencies = {
    # df_csv feeds storage, validation, preparation and the raw side of
    # versioning (version_data receives both df_csv and df_feature).
    "ingest_data": ["store_data", "validate_data", "prepare_data", "version_data"],
    "prepare_data": ["transform_data"],
    "transform_data": ["build_feature_store"],
    # df_feature goes to versioning; db_path goes to training.
    "build_feature_store": ["train_model", "version_data"],
}



def draw_dag(dependencies, title="Churn ML Pipeline"):
    """Build a Graphviz digraph of the pipeline and save its DOT source.

    Args:
        dependencies: Mapping of parent task name to an iterable of child
            task names. Every name on either side becomes a node.
        title: Passed to ``Digraph`` as the graph comment.

    Returns:
        The constructed ``Digraph``. As a side effect, its DOT source is
        written to ``results/orchestor.dot`` (the folder is created if needed).
    """
    dot = Digraph(comment=title, format="png")

    # Sort the node names: iterating a set of strings directly yields a
    # different order between interpreter runs, which made the saved DOT file
    # change from run to run even when the graph did not.
    node_names = sorted(
        set(dependencies)
        | {child for children in dependencies.values() for child in children}
    )
    for node_name in node_names:  # not named `task`: that is the Prefect decorator
        dot.node(node_name, node_name)

    # One edge per (parent, child) pair, in the mapping's own order.
    for parent, children in dependencies.items():
        for child in children:
            dot.edge(parent, child)

    output_folder = "results"
    os.makedirs(output_folder, exist_ok=True)  # create the folder if it doesn't exist
    # File name kept exactly as before so existing consumers still find it.
    output_file = os.path.join(output_folder, "orchestor.dot")
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(dot.source)

    return dot
    
@task
def ingest_data():
    """Load the churn CSV from its hard-coded URL via ``load_csv``.

    Returns:
        The object returned by ``load_csv``; its ``.shape`` is logged.
    """
    csv_url = "https://synapseaisolutionsa.z13.web.core.windows.net/data/bankcustomerchurn/churn.csv"
    log = get_run_logger()
    log.info(f"📥 Ingesting data from {csv_url}")
    # NOTE(review): the URL is passed for both arguments; confirm against
    # load_csv's signature that the second one is really meant to be the URL.
    df_csv = load_csv(csv_url, csv_url)
    log.info(f"✅ Data ingestion complete. Shape: {df_csv.shape}")
    return df_csv

@task
def store_data(df_csv):
    """Hand the ingested data to ``save_csv_or_db`` with the ``"csv"`` option.

    Args:
        df_csv: The data produced by ``ingest_data``.

    Returns:
        The target directory string, ``"results/store_data"``.
    """
    log = get_run_logger()
    base_dir = "results/store_data"
    save_csv_or_db(df_csv, base_dir, "csv")
    log.info(f"✅ Data stored at {base_dir}")
    return base_dir

@task
def validate_data(df_csv):
    """Run ``validate_churn_data`` on the ingested data and log the outcome.

    Args:
        df_csv: The data produced by ``ingest_data``.

    Returns:
        The ``(issues, metadata)`` pair returned by ``validate_churn_data``.
    """
    log = get_run_logger()
    report_dir = "results/validate_data_reports"
    # The call is expected to yield exactly two values; unpacking enforces that.
    issues, metadata = validate_churn_data(df_csv, report_dir, "pdf")
    log.info(f"🔍 Validation complete. Issues: {len(issues)} Metadata: {metadata}")
    return issues, metadata

@task
def prepare_data(df_csv):
    """Pass the ingested data through ``preprocess_and_eda``.

    Args:
        df_csv: The data produced by ``ingest_data``.

    Returns:
        The object returned by ``preprocess_and_eda``; its ``.shape`` is logged.
    """
    log = get_run_logger()
    output_dir = "results/prepared_data"
    df_processed = preprocess_and_eda(df_csv, output_dir)
    log.info(f"✅ Data preparation complete. Shape: {df_processed.shape}")
    return df_processed

@task
def transform_data(df_processed):
    """Pass the prepared data through ``transform_and_store``.

    Args:
        df_processed: The data produced by ``prepare_data``.

    Returns:
        The object returned by ``transform_and_store``; its ``.shape`` is logged.
    """
    log = get_run_logger()
    output_dir = "results/transformation_and_storage"
    df_txfnstr = transform_and_store(df_processed, output_dir, "churn")
    log.info(f"✅ Data transformation complete. Shape: {df_txfnstr.shape}")
    return df_txfnstr

@task
def build_feature_store(df_txfnstr):
    """Create the feature store and run the sample queries against it.

    Args:
        df_txfnstr: The data produced by ``transform_data``.

    Returns:
        A ``(df_feature, db_path)`` pair taken from ``create_feature_store``.
        The connection it also returns is used only inside this task.
    """
    logger = get_run_logger()
    base_path = "results/featurestore"
    df_feature, conn, db_path = create_feature_store(df_txfnstr, base_path)
    try:
        sample_feature_queries(conn, base_path)
    finally:
        # The connection is neither returned nor stored, so nobody else can
        # release it; close it here even if the sample queries fail.
        # NOTE(review): assumes `conn` exposes close() (e.g. a
        # sqlite3.Connection) -- confirm against create_feature_store.
        conn.close()
    logger.info(f"✅ Feature store created at {base_path}, DB path: {db_path}")
    return df_feature, db_path

@task
def version_data(df_csv, df_feature):
    """Forward the raw and feature data to ``save_and_version_both``.

    Args:
        df_csv: The data produced by ``ingest_data``.
        df_feature: The feature data produced by ``build_feature_store``.
    """
    log = get_run_logger()
    raw_csv_path = "results/dataversion/churn_raw.csv"
    transformed_csv_path = "results/dataversion/churn_transformed_v1.csv"
    # Positional order matters here; the last two strings are passed through
    # unchanged (their meaning is defined by save_and_version_both).
    save_and_version_both(
        df_csv,
        df_feature,
        raw_csv_path,
        transformed_csv_path,
        "churn_raw",
        "Pipeline_runnning_updates",
    )
    log.info("✅ Data versioning complete.")

@task
def train_model(db_path):
    """Invoke ``run_training`` with the feature-store database path.

    Args:
        db_path: The path returned by ``build_feature_store``.
    """
    log = get_run_logger()
    run_training(db_path)
    log.info("✅ Model training complete.")


@flow(name="Churn ML Pipeline Orchestration")
def churn_pipeline():
    """Run the churn pipeline tasks in order, then draw the pipeline DAG.

    The raw data is ingested once and handed to storage, validation and
    preparation; the prepared data is transformed, turned into a feature
    store, versioned together with the raw data, and used for training.
    """
    df_csv = ingest_data()
    store_data(df_csv)
    validate_data(df_csv)  # result is not used to gate the following steps
    df_processed = prepare_data(df_csv)
    df_txfnstr = transform_data(df_processed)
    df_feature, db_path = build_feature_store(df_txfnstr)
    version_data(df_csv, df_feature)
    train_model(db_path)
    # Called for its side effect (writes the DOT file); the returned graph was
    # previously bound to an unused local, which has been removed.
    draw_dag(dag_dependencies)
    print("✅ Pipeline complete!")

# The DAG is drawn at the end of churn_pipeline() via draw_dag(dag_dependencies).
   





# Run the whole flow when this file is executed as a script (or when the
# notebook cell runs with __name__ set to "__main__").
if __name__ == "__main__":
    churn_pipeline()

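# Recorded notebook output for this cell. It does not match the code above,
# where the `with` block in draw_dag has an indented body:
# IndentationError: expected an indented block after 'with' statement on line 65 (1117930757.py, line 66)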