In [30]:
# Data Ingestion, Processing, and MLflow Model Logging
import io, os
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix
import mlflow
from mlflow.models import infer_signature

from domino.data_sources import DataSourceClient
from domino_data.datasets import DatasetClient

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from ydata_profiling import ProfileReport

%matplotlib inline

# Resolve Domino workspace settings from the environment, falling back to
# local defaults so the notebook can also run outside a Domino workspace.
domino_working_dir = os.getenv("DOMINO_WORKING_DIR", ".")
domino_project_name = os.getenv("DOMINO_PROJECT_NAME", "my-local-project")

# The data and artifact locations are derived from the working directory by
# swapping the substring "code" for "data" / "artifacts".
domino_datasource_dir = domino_working_dir.replace("code", "data")
domino_artifact_dir = domino_working_dir.replace("code", "artifacts")

## Data Ingestion, Processing, and MLflow Model Logging

def _save_and_log_current_plot(filename: str) -> None:
    """Save the active matplotlib figure to the artifact dir, close it, and log it to MLflow."""
    plot_path = f"{domino_artifact_dir}/{filename}"
    plt.savefig(plot_path)
    plt.close()
    mlflow.log_artifact(plot_path, artifact_path="plots")


def run_data_ingestion_and_processing(raw_filename: str, pca_filename: str, n_components: int = 28):
    """Download raw transactions, fit an OHE -> scale -> PCA pipeline, and log everything to MLflow.

    Steps performed:
      1. Download ``raw_filename`` from the ``credit_card_fraud_detection``
         data source and parse it as CSV.
      2. Drop every row that contains a missing value.
      3. One-hot encode the categorical columns, standard-scale all remaining
         non-``Class`` columns, and project the result with PCA.
      4. Write the principal components plus ``Time``, ``Amount`` and ``Class``
         to ``{domino_datasource_dir}/{domino_project_name}/{pca_filename}``.
      5. Inside one MLflow run, log parameters, the CSV, the fitted pipeline,
         diagnostic plots, and a ydata-profiling HTML report.

    Args:
        raw_filename: Object name of the raw CSV in the data source.
        pca_filename: File name of the PCA CSV to write.
        n_components: Value forwarded to ``PCA(n_components=...)``.

    Returns:
        A ``(df, pca_df)`` tuple: the raw frame after ``dropna`` (original
        index preserved) and the PCA frame (fresh 0..n-1 index).

    Raises:
        ValueError: If no rows remain after dropping rows with missing data.
    """
    # 1) Download the raw file
    ds = DataSourceClient().get_datasource("credit_card_fraud_detection")
    buf = io.BytesIO()
    ds.download_fileobj(raw_filename, buf)
    buf.seek(0)
    df = pd.read_csv(buf)
    print(f"🔍 Loaded {len(df):,} rows from {raw_filename}")

    # 2) Drop missing rows
    before = len(df)
    df = df.dropna()
    print(f"🧹 Dropped {before - len(df):,} rows with missing data")
    if df.empty:
        raise ValueError(f"No rows left in {raw_filename} after dropping rows with missing data")

    # 3) Define columns
    cat_cols = ["TxType", "DeviceType", "MerchantCat", "Channel"]
    num_cols = [c for c in df.columns if c not in cat_cols + ["Class"]]
    X = df[cat_cols + num_cols]

    # 4) Build and fit Pipeline: OHE -> Scale -> PCA
    preprocessor = ColumnTransformer(
        transformers=[
            ("ohe", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), cat_cols),
            ("scale", StandardScaler(), num_cols),
        ],
        remainder="drop",
    )
    pipeline = Pipeline([
        ("preproc", preprocessor),
        ("pca", PCA(n_components=n_components, random_state=0)),
    ])

    # Fit the pipeline and get PCs
    PCs = pipeline.fit_transform(X)
    pca_model = pipeline.named_steps["pca"]

    # 5) Reassemble and save PCA DataFrame
    # Name columns from the actual output width rather than n_components, so
    # the frame is valid whatever number of components PCA produced.
    pca_df = pd.DataFrame(PCs, columns=[f"V{i+1}" for i in range(PCs.shape[1])])
    # df keeps its pre-dropna index (with gaps) while pca_df has a fresh
    # 0..n-1 index. Assigning Series would align on labels, attaching values
    # to the wrong rows and introducing NaN; assign positionally instead.
    pca_df["Time"] = df["Time"].astype(int).to_numpy()
    pca_df["Amount"] = df["Amount"].to_numpy()
    pca_df["Class"] = df["Class"].astype(int).to_numpy()

    full_path = f"{domino_datasource_dir}/{domino_project_name}/{pca_filename}"
    # Make sure the output locations exist before anything is written to them.
    os.makedirs(os.path.dirname(full_path), exist_ok=True)
    os.makedirs(domino_artifact_dir, exist_ok=True)
    pca_df.to_csv(full_path, index=False)
    print(f"✅ Wrote {len(pca_df):,} rows to: {pca_filename}")

    # 6) Start MLflow run and log everything
    mlflow.set_experiment('CC Fraud PCA Training [testing]')
    with mlflow.start_run(run_name="PCA Pipeline"):
        mlflow.log_param("n_components", n_components)
        mlflow.log_param("raw_filename", raw_filename)
        mlflow.log_param("pca_filename", pca_filename)
        mlflow.log_param("num_rows_loaded", before)
        mlflow.log_param("num_rows_after_dropna", len(df))
        mlflow.log_param("num_cat_features", len(cat_cols))
        mlflow.log_param("num_num_features", len(num_cols))

        # Log the PCA CSV
        mlflow.log_artifact(full_path, artifact_path="data")

        # Log the pipeline as a single model
        # Ensure numeric columns are float64 for signature
        X_sig = X.copy()
        for col in num_cols:
            if np.issubdtype(X_sig[col].dtype, np.integer):
                X_sig[col] = X_sig[col].astype("float64")
        signature = infer_signature(X_sig.iloc[:5], pipeline.transform(X_sig.iloc[:5]))
        mlflow.sklearn.log_model(
            pipeline,
            artifact_path="preproc_pca_pipeline",
            signature=signature,
        )
        mlflow.set_tag("pipeline", "full_preproc_pca")

        # 7) Generate and log artifacts (corr, scatter, scree, etc.)
        num_df = df.select_dtypes(include="number").drop(columns=["Time", "Class"], errors="ignore")

        # Correlation heatmap
        plt.figure(figsize=(14, 12))
        sns.heatmap(num_df.corr(), annot=True, fmt=".2f", cmap="vlag")
        plt.title("Correlation Matrix")
        _save_and_log_current_plot("raw_correlation_matrix.png")

        # Scatter matrix on a reproducible sample; cap the sample size so
        # datasets with fewer than 500 rows do not make sample() raise.
        sample_df = num_df.sample(n=min(500, len(num_df)), random_state=0)
        scatter_matrix(sample_df, alpha=0.2, diagonal="hist", figsize=(15, 15))
        _save_and_log_current_plot("raw_scatter_plots.png")

        # Scree and cumulative
        evr = pca_model.explained_variance_ratio_
        pcs = np.arange(1, len(evr) + 1)

        # Scree
        plt.figure(figsize=(8, 5))
        plt.plot(pcs, evr, marker='o')
        plt.xlabel("PC")
        plt.ylabel("Explained Var Ratio")
        plt.title("Scree Plot")
        _save_and_log_current_plot("pca_scree.png")

        # Cumulative
        cumvar = np.cumsum(evr)
        plt.figure(figsize=(8, 5))
        plt.plot(pcs, cumvar, marker='o')
        plt.axhline(0.9, linestyle='--', label='90%')
        plt.xlabel("# Components")
        plt.ylabel("Cumulative Var")
        plt.title("Cumulative Variance")
        plt.legend()
        _save_and_log_current_plot("pca_cumulative_variance.png")

        # 8) EDA HTML
        profile = ProfileReport(df, title="EDA Report", explorative=True, minimal=True)
        eda_path = f"{domino_artifact_dir}/eda_report.html"
        profile.to_file(eda_path)
        mlflow.log_artifact(eda_path, artifact_path="eda")

    return df, pca_df

# Usage: run the full ingestion -> PCA -> MLflow logging flow on the raw
# transactions file, keeping both the cleaned raw frame and the PCA frame.
raw_df, pca_df = run_data_ingestion_and_processing(
    "raw_cc_transactions.csv",
    "cleaned_cc_transactions.csv",
    n_components=28,
)


🔍 Loaded 478,324 rows from raw_cc_transactions.csv
🧹 Dropped 64,997 rows with missing data
✅ Wrote 413,327 rows to: cleaned_cc_transactions.csv




Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/19 [00:00<?, ?it/s][A
  5%|▌         | 1/19 [00:01<00:20,  1.16s/it][A
 11%|█         | 2/19 [00:02<00:16,  1.02it/s][A
 16%|█▌        | 3/19 [00:02<00:11,  1.42it/s][A
 21%|██        | 4/19 [00:02<00:06,  2.15it/s][A
100%|██████████| 19/19 [00:03<00:00,  6.08it/s][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

🏃 View run PCA Pipeline at: http://127.0.0.1:8768/#/experiments/1537/runs/71a47a53d096430b9487dd2ca8910159
🧪 View experiment at: http://127.0.0.1:8768/#/experiments/1537
