In [None]:
# Data Ingestion, Processing, and MLflow Model Logging
import io, os
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix
import mlflow
from mlflow.models import infer_signature

from domino.data_sources import DataSourceClient
from domino_data.datasets import DatasetClient

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from ydata_profiling import ProfileReport

import time
import yaml

%matplotlib inline

# Project name and working directory come from the Domino environment,
# with local fallbacks so the notebook can also run outside Domino.
domino_project_name = os.getenv("DOMINO_PROJECT_NAME", "my-local-project")
domino_working_dir = os.getenv("DOMINO_WORKING_DIR", ".")

# Data and artifact locations are derived by substituting "code" in the
# working-directory path.
# NOTE(review): str.replace swaps every occurrence of "code" in the path, not
# only a trailing path segment, and leaves the "." fallback unchanged.
domino_artifact_dir = domino_working_dir.replace("code", "artifacts")
domino_datasource_dir = domino_working_dir.replace("code", "data")

## Data Ingestion, Processing, and MLflow Model Logging

def _save_and_log_current_figure(path: str) -> None:
    """Save the active matplotlib figure to *path*, close it, and log it to MLflow under ``plots``."""
    plt.savefig(path)
    plt.close()
    mlflow.log_artifact(path, artifact_path="plots")


def run_data_ingestion_and_processing(raw_filename: str, pca_filename: str, n_components: int = 8):
    """Ingest raw transactions, fit a preprocessing + PCA pipeline, and log the run to MLflow.

    The function:

    1. Downloads ``raw_filename`` from the ``credit_card_fraud_detection`` data
       source and parses it as CSV.
    2. Drops every row that contains a missing value.
    3. One-hot encodes the categorical columns, standardizes the numeric
       columns, and projects the result with PCA.
    4. Writes a CSV holding the principal components (``V1``, ``V2``, ...) plus
       the ``Time``, ``Amount`` and ``Class`` columns of the cleaned data.
    5. Logs parameters, a YAML parameter file, the CSV, the fitted pipeline
       (as a registered model), metrics, diagnostic plots and an EDA report to
       an MLflow run.

    Args:
        raw_filename: Object name of the raw CSV inside the data source.
        pca_filename: File name of the PCA CSV, written under
            ``{domino_datasource_dir}/{domino_project_name}/``.
        n_components: Value forwarded to ``PCA(n_components=...)``.

    Returns:
        A tuple ``(df, pca_df)``: the cleaned raw DataFrame (rows with missing
        values removed) and the DataFrame that was written to the PCA CSV.

    Raises:
        ValueError: If no rows remain after dropping rows with missing values.
    """
    # 1) Download the raw file into memory and parse it
    ds = DataSourceClient().get_datasource("credit_card_fraud_detection")
    buf = io.BytesIO()
    ds.download_fileobj(raw_filename, buf)
    buf.seek(0)
    df = pd.read_csv(buf)
    print(f"🔍 Loaded {len(df):,} rows from {raw_filename}")

    # 2) Drop missing rows
    before = len(df)
    df = df.dropna()
    after = len(df)
    pct_removed = 100 * (before - after) / before if before > 0 else 0
    print(f"🧹 Dropped {before - after:,} rows with missing data")
    if after == 0:
        # Fail early with a clear message instead of deep inside the estimators.
        raise ValueError(
            f"No rows left in {raw_filename} after dropping rows with missing data"
        )

    # 3) Define columns
    cat_cols = ["TxType", "DeviceType", "MerchantCat", "Channel", "CardPresent"]
    num_cols = [
        "Amount", "Age", "Tenure", "MerchantRisk", "DeviceTrust",
        "Txn24h", "Avg30d", "IPReputation", "Latitude", "Longitude", "DistFromHome"
    ]
    X = df[cat_cols + num_cols]

    # 4) Build and fit Pipeline: OHE categoricals, scale numerics, then PCA on all
    preprocessor = ColumnTransformer(
        transformers=[
            ("ohe", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), cat_cols),
            ("scale", StandardScaler(), num_cols)
        ],
        remainder="drop"
    )
    pipeline = Pipeline([
        ("preproc", preprocessor),
        ("pca", PCA(n_components=n_components, random_state=0))
    ])
    start_time = time.perf_counter()
    PCs = pipeline.fit_transform(X)
    fit_time = time.perf_counter() - start_time
    pca_model = pipeline.named_steps["pca"]
    evr = pca_model.explained_variance_ratio_

    # 5) Reassemble and save PCA DataFrame
    # Name the columns from the actual output width so the frame is valid for
    # any n_components value that PCA accepts.
    pca_df = pd.DataFrame(PCs, columns=[f"V{i+1}" for i in range(PCs.shape[1])])
    # `df` keeps its pre-dropna index while `pca_df` has a fresh RangeIndex.
    # Assign by position (.to_numpy()) so label alignment cannot shift values
    # onto the wrong rows or introduce NaNs when rows were dropped.
    pca_df["Time"] = df["Time"].astype(int).to_numpy()
    pca_df["Amount"] = df["Amount"].to_numpy()
    pca_df["Class"] = df["Class"].astype(int).to_numpy()

    full_path = f"{domino_datasource_dir}/{domino_project_name}/{pca_filename}"
    # Make sure the output locations exist before anything is written to them.
    os.makedirs(os.path.dirname(full_path), exist_ok=True)
    if domino_artifact_dir:
        os.makedirs(domino_artifact_dir, exist_ok=True)
    pca_df.to_csv(full_path, index=False)
    print(f"✅ Wrote {len(pca_df):,} rows to: {pca_filename}")

    # 6) Start MLflow run and log everything
    mlflow.set_experiment('CC Fraud PCA Training [testing]')
    with mlflow.start_run(run_name="PCA Pipeline"):
        # Log parameters
        run_params = {
            "n_components": n_components,
            "raw_filename": raw_filename,
            "pca_filename": pca_filename,
            "num_rows_loaded": before,
            "num_rows_after_dropna": after,
            "num_cat_features": len(cat_cols),
            "num_num_features": len(num_cols),
        }
        mlflow.log_params(run_params)

        # Log human-readable pipeline parameters as YAML (adds the column lists)
        pipeline_params = {
            **run_params,
            "categorical_columns": cat_cols,
            "numerical_columns": num_cols,
        }
        params_yaml_path = f"{domino_artifact_dir}/pipeline_params.yaml"
        with open(params_yaml_path, "w", encoding="utf-8") as f:
            yaml.dump(pipeline_params, f, default_flow_style=False)
        mlflow.log_artifact(params_yaml_path, artifact_path="params")

        # Log the PCA CSV
        mlflow.log_artifact(full_path, artifact_path="data")

        # Log the pipeline as a single model.
        # Integer numeric columns are cast to float64 before inferring the
        # signature, so the logged schema declares them as floating point.
        X_sig = X.copy()
        for col in num_cols:
            if np.issubdtype(X_sig[col].dtype, np.integer):
                X_sig[col] = X_sig[col].astype("float64")
        signature = infer_signature(X_sig.iloc[:5], pipeline.transform(X_sig.iloc[:5]))
        mlflow.sklearn.log_model(
            pipeline,
            artifact_path="preproc_pca_pipeline",
            registered_model_name="CC Fraud Preprocessing & PCA",
            signature=signature
        )
        mlflow.set_tag("pipeline", "full_preproc_pca")

        # Log metrics
        mlflow.log_metric("pct_data_removed", pct_removed)
        mlflow.log_metric("num_rows_removed", before - after)
        mlflow.log_metric("pca_fit_time_sec", fit_time)
        mlflow.log_metric("explained_variance_pc1", float(evr[0]) if len(evr) > 0 else 0)
        mlflow.log_metric("explained_variance_total", float(np.sum(evr)))

        # 7) Generate and log artifacts (corr, scatter, scree, etc.)
        num_df = df.select_dtypes(include="number").drop(columns=["Time", "Class"], errors="ignore")

        # Correlation heatmap
        plt.figure(figsize=(14, 12))
        sns.heatmap(num_df.corr(), annot=True, fmt=".2f", cmap="vlag")
        plt.title("Correlation Matrix")
        _save_and_log_current_figure(f"{domino_artifact_dir}/raw_correlation_matrix.png")

        # Scatter matrix on a reproducible sample; cap the sample size at the
        # row count because DataFrame.sample raises when n exceeds it.
        sample_df = num_df.sample(n=min(500, len(num_df)), random_state=0)
        scatter_matrix(sample_df, alpha=0.2, diagonal="hist", figsize=(15, 15))
        _save_and_log_current_figure(f"{domino_artifact_dir}/raw_scatter_plots.png")

        # Scree plot
        pcs = np.arange(1, len(evr) + 1)
        plt.figure(figsize=(8, 5))
        plt.plot(pcs, evr, marker='o')
        plt.xlabel("PC")
        plt.ylabel("Explained Var Ratio")
        plt.title("Scree Plot")
        _save_and_log_current_figure(f"{domino_artifact_dir}/pca_scree.png")

        # Cumulative explained variance with a 90% reference line
        cumvar = np.cumsum(evr)
        plt.figure(figsize=(8, 5))
        plt.plot(pcs, cumvar, marker='o')
        plt.axhline(0.9, linestyle='--', label='90%')
        plt.xlabel("# Components")
        plt.ylabel("Cumulative Var")
        plt.title("Cumulative Variance")
        plt.legend()
        _save_and_log_current_figure(f"{domino_artifact_dir}/pca_cumulative_variance.png")

        # 8) EDA HTML
        profile = ProfileReport(df, title="EDA Report", explorative=True, minimal=True)
        eda_path = f"{domino_artifact_dir}/eda_report.html"
        profile.to_file(eda_path)
        mlflow.log_artifact(eda_path, artifact_path="eda")

    return df, pca_df

# Run the ingestion, PCA and MLflow logging flow on the raw transactions file,
# keeping both the cleaned raw frame and the PCA frame for later cells.
raw_df, pca_df = run_data_ingestion_and_processing(
    "raw_cc_transactions.csv",
    "cleaned_cc_transactions.csv",
    n_components=8,
)
