In [4]:
from prefect import task, flow, get_run_logger
from dataingestion.DataIngestion import load_csv, load_api, load_db
from datastorage.DataStorage import save_csv_or_db, save_api
from datavalidation.DataValidation import validate_churn_data
from datapreparation.DataPreparation import preprocess_and_eda
from datatransformationandstorage.DataTransformationAndStorage import transform_and_store
from featurestore.FeatureStore  import create_feature_store, sample_feature_queries
from dataversioning.DataVersioning import save_and_version_both
from modelbuild.ModelBuild import run_training

@task
def ingest_data():
    """Fetch the bank-churn CSV from its hosted URL via ``load_csv``.

    Logs the source URL before loading and the ``.shape`` of the loaded
    object afterwards.

    Returns:
        Whatever ``load_csv`` returns (presumably a pandas DataFrame, since
        ``.shape`` is read from it -- confirm against DataIngestion).
    """
    source_url = "https://synapseaisolutionsa.z13.web.core.windows.net/data/bankcustomerchurn/churn.csv"
    run_log = get_run_logger()
    run_log.info(f"📥 Ingesting data from {source_url}")
    # NOTE(review): the same URL is supplied for both positional arguments;
    # load_csv's signature is not visible here -- confirm this is intended.
    churn_raw = load_csv(source_url, source_url)
    run_log.info(f"✅ Data ingestion complete. Shape: {churn_raw.shape}")
    return churn_raw

@task
def store_data(df_csv):
    """Hand the ingested data to ``save_csv_or_db`` using the "csv" mode.

    Args:
        df_csv: Object produced by the ingestion step.

    Returns:
        str: The directory name ("datastorage") that was passed to
        ``save_csv_or_db``.
    """
    run_log = get_run_logger()
    target_dir = "datastorage"
    save_csv_or_db(df_csv, target_dir, "csv")
    run_log.info(f"✅ Data stored at {target_dir}")
    return target_dir

@task
def validate_data(df_csv):
    """Run ``validate_churn_data`` on the ingested data and log a summary.

    The call is given the report directory "datavalidation/reports" and the
    format string "pdf".

    Args:
        df_csv: Object produced by the ingestion step.

    Returns:
        tuple: ``(issues, metadata)`` exactly as unpacked from
        ``validate_churn_data``.
    """
    run_log = get_run_logger()
    report_dir = "datavalidation/reports"
    validation_result = validate_churn_data(df_csv, report_dir, "pdf")
    # Unpack into exactly two values, mirroring the helper's return contract.
    found_issues, report_meta = validation_result
    run_log.info(f"🔍 Validation complete. Issues: {len(found_issues)} Metadata: {report_meta}")
    return found_issues, report_meta

@task
def prepare_data(df_csv):
    """Pass the ingested data through ``preprocess_and_eda``.

    Args:
        df_csv: Object produced by the ingestion step.

    Returns:
        The value returned by ``preprocess_and_eda``; its ``.shape`` is
        logged before returning.
    """
    run_log = get_run_logger()
    output_dir = "datapreparation/prepared"
    prepared = preprocess_and_eda(df_csv, output_dir)
    run_log.info(f"✅ Data preparation complete. Shape: {prepared.shape}")
    return prepared

@task
def transform_data(df_processed):
    """Apply ``transform_and_store`` to the prepared data.

    The helper is called with the output directory
    "datatransformationandstorage/transformationandstorage" and the
    label "churn".

    Args:
        df_processed: Object produced by the preparation step.

    Returns:
        The value returned by ``transform_and_store``; its ``.shape`` is
        logged before returning.
    """
    run_log = get_run_logger()
    output_dir = "datatransformationandstorage/transformationandstorage"
    transformed = transform_and_store(df_processed, output_dir, "churn")
    run_log.info(f"✅ Data transformation complete. Shape: {transformed.shape}")
    return transformed

@task
def build_feature_store(df_txfnstr):
    """Create the feature store from transformed data and run sample queries.

    Args:
        df_txfnstr: Object produced by the transformation step.

    Returns:
        tuple: ``(df_feature, db_path)`` as returned by
        ``create_feature_store`` (the connection it also returns is closed
        here and not handed back).
    """
    logger = get_run_logger()
    base_path = "featurestore/featurestore"
    df_feature, conn, db_path = create_feature_store(df_txfnstr, base_path)
    try:
        sample_feature_queries(conn, base_path)
    finally:
        # Always release the connection, even when the sample queries fail,
        # so a failed run does not leave the handle open.
        conn.close()
    logger.info(f"✅ Feature store created at {base_path}, DB path: {db_path}")
    return df_feature, db_path

@task
def version_data(df_csv, df_feature):
    """Version the raw and feature datasets via ``save_and_version_both``.

    Args:
        df_csv: Object produced by the ingestion step.
        df_feature: Feature object produced by the feature-store step.

    Returns:
        None.
    """
    run_log = get_run_logger()
    # Positional arguments for save_and_version_both, kept in call order.
    raw_target = "dataversioning/raw/churn_raw.csv"
    transformed_target = "dataversioning/transformed/churn_transformed_v1.csv"
    dataset_label = "churn_raw.csv"
    change_note = "Changes_Commited"
    save_and_version_both(
        df_csv, df_feature, raw_target, transformed_target, dataset_label, change_note
    )
    run_log.info("✅ Data versioning complete.")

@task
def train_model(db_path):
    """Invoke ``run_training`` with the feature-store database path.

    Args:
        db_path: Database path returned by the feature-store step.

    Returns:
        None.
    """
    run_log = get_run_logger()
    # NOTE(review): the saved output of this cell ends with
    # "NameError: name 'sqlite3' is not defined" after the versioning step's
    # output; possibly raised from within run_training -- confirm, and add the
    # missing `import sqlite3` to whichever module needs it.
    run_training(db_path)
    run_log.info("✅ Model training complete.")


@flow(name="Churn ML Pipeline Orchestration")
def churn_pipeline():
    """Run the churn pipeline end to end.

    Steps, in order: ingest, store, validate, prepare, transform, build the
    feature store, version the raw and feature data, then train the model.
    The return values of the store and validate steps are not used by later
    steps.

    Returns:
        None.
    """
    logger = get_run_logger()

    df_csv = ingest_data()
    store_data(df_csv)
    validate_data(df_csv)
    df_processed = prepare_data(df_csv)
    df_txfnstr = transform_data(df_processed)
    df_feature, db_path = build_feature_store(df_txfnstr)
    version_data(df_csv, df_feature)
    train_model(db_path)

    # Use the run logger (like every task does) rather than print(), so the
    # completion message is recorded with the flow run's logs.
    logger.info("✅ Pipeline complete!")


# Run the flow only when this code executes as the main module, not on import.
if __name__ == "__main__":
    churn_pipeline()

[main 4558816] Dataset update: churn_raw.csv (raw + transformed) - Changes_Commited
 258 files changed, 616 insertions(+), 315851 deletions(-)
 rename {dataingestion/.ipynb_checkpoints => .ipynb_checkpoints}/Untitled-checkpoint.ipynb (100%)
 delete mode 100644 Churn
 create mode 100644 Untitled.ipynb
 create mode 100644 data/.ipynb_checkpoints/model_results-checkpoint.txt
 create mode 100644 data/.ipynb_checkpoints/model_versions-checkpoint.json
 create mode 100644 data/.ipynb_checkpoints/version_metadata-checkpoint.json
 create mode 100644 data/model_results.txt
 create mode 100644 data/model_versions.json
 delete mode 100644 dataingestion/Untitled.ipynb
 delete mode 100644 datapreparation/.ipynb_checkpoints/Untitled-checkpoint.ipynb
 delete mode 100644 datapreparation/Untitled.ipynb
 delete mode 100644 datapreparation/prepared/.ipynb_checkpoints/eda_report_20250821_165634-checkpoint.pdf
 delete mode 100644 datapreparation/prepared/cleaned_data_20250821_165636.csv
 delete mode 100644 

From https://github.com/dhairyas87/dmml-bank-churn-pipeline
 * branch            main       -> FETCH_HEAD


Current branch main is up to date.
✅ Raw + Transformed datasets for churn_raw.csv saved, versioned, and pushed under commit 4558816118b126cb41f9d26ea5c6839651711a01


To https://github.com/dhairyas87/dmml-bank-churn-pipeline.git
   a21ff23..59f50d3  main -> main


NameError: name 'sqlite3' is not defined

In [2]:
# Use the %pip magic so the install targets the running kernel's environment;
# pinned to the version the recorded output resolved.
%pip install prefect==3.4.14

Collecting prefect
  Downloading prefect-3.4.14-py3-none-any.whl.metadata (13 kB)
Collecting aiosqlite<1.0.0,>=0.17.0 (from prefect)
  Downloading aiosqlite-0.21.0-py3-none-any.whl.metadata (4.3 kB)
Collecting alembic<2.0.0,>=1.7.5 (from prefect)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting anyio<5.0.0,>=4.4.0 (from prefect)
  Using cached anyio-4.10.0-py3-none-any.whl.metadata (4.0 kB)
Collecting apprise<2.0.0,>=1.1.0 (from prefect)
  Downloading apprise-1.9.4-py3-none-any.whl.metadata (55 kB)
Collecting asgi-lifespan<3.0,>=1.0 (from prefect)
  Downloading asgi_lifespan-2.1.0-py3-none-any.whl.metadata (10 kB)
Collecting asyncpg<1.0.0,>=0.23 (from prefect)
  Downloading asyncpg-0.30.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (5.0 kB)
Collecting coolname<3.0.0,>=1.0.4 (from prefect)
  Downloading coolname-2.2.0-py2.py3-none-any.whl.metadata (6.2 kB)
Collecting dateparser<2.0.0,>=1.1.1 (from prefect)
  Downloading dateparser-1.2.2-py3-none-any.whl.metadata