In [None]:
import pandas as pd
import numpy as np

# Pull the two example CSVs (customer data and churn data) straight from the
# Aqueduct GitHub repository. Both reads require network access.
customers_table = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/aqueducthq/aqueduct/main/examples/churn_prediction/data/customers.csv"
)
churn_table = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/aqueducthq/aqueduct/main/examples/churn_prediction/data/churn_data.csv"
)

# Preview the two tables joined on their shared `cust_id` key.
customers_table.merge(churn_table, on="cust_id").head()

In [None]:
from zenml.steps import step

In [None]:
# The @step decorator (imported from zenml.steps) wraps this function as a
# ZenML step so that it can be handed to the pipeline defined later in this
# notebook.
@step
def log_featurize(cust: pd.DataFrame) -> pd.DataFrame:
    """
    Add log-scaled copies of the numeric customer columns.

    For every column of `cust` other than cust_id, using_deep_learning and
    using_dbt, a new column named "log_<column>" is added that holds
    numpy.log(value + 1.0). For example, company_size gains a companion
    log_company_size column.

    The original columns are kept unchanged, except that cust_id is dropped
    from the returned frame. The input frame itself is never modified; all
    work happens on a copy.
    """
    excluded = ["cust_id", "using_deep_learning", "using_dbt"]
    result = cust.copy()

    # Resolve the columns to transform once, before any "log_*" columns are
    # added, so the loop never picks up the columns it creates.
    to_transform = result.columns.difference(excluded)
    for name in to_transform:
        shifted = result[name] + 1.0  # +1 keeps zero values finite under log
        result["log_" + name] = np.log(shifted)

    return result.drop(columns="cust_id")

In [None]:
# `log_featurize` is wrapped by ZenML's @step decorator, so it is not called
# directly here. Instead, `.entrypoint` is used to run the featurization on
# the in-memory customers table in this kernel, outside of any pipeline run,
# so the resulting features can be previewed and used to fit the models below.
# NOTE(review): presumably `.entrypoint` is the undecorated Python function;
# confirm against the installed ZenML version.
features_table = log_featurize.entrypoint(customers_table)
features_table.head()

### Training the Model

In this example, we will train and ensemble two basic classifiers. In practice, you would probably do something more interesting, but this helps illustrate post-processing logic (the ensemble function).

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression churn classifier on the log-featurized customers.
# NOTE(review): the labels come from churn_table while the features come from
# customers_table, with no join on cust_id. This assumes both tables list the
# customers in the same row order; confirm against the data.
linear_model = LogisticRegression(max_iter=10_000)
linear_model.fit(X=features_table, y=churn_table["churn"])

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a depth-limited decision tree on the same features and labels as the
# logistic-regression model.
# random_state is pinned because the tree shuffles candidate features while
# searching for splits. Without a fixed seed, two "Restart & Run All"
# executions can produce different trees and different ensemble predictions.
# NOTE(review): as with the linear model, this assumes churn_table rows are in
# the same order as customers_table; confirm against the data.
decision_tree_model = DecisionTreeClassifier(
    max_depth=10,
    min_samples_split=3,
    random_state=0,
)
decision_tree_model.fit(features_table, churn_table["churn"])

In [None]:
@step
def data_loader() -> pd.DataFrame:
    """
    Download the example customers table and return it as a DataFrame.

    The CSV is read over HTTPS from the Aqueduct GitHub repository on every
    call, so this step needs network access.
    """
    customers_csv_url = "https://raw.githubusercontent.com/aqueducthq/aqueduct/main/examples/churn_prediction/data/customers.csv"
    return pd.read_csv(customers_csv_url)

@step
def predict_linear(features_table: pd.DataFrame) -> pd.DataFrame:
    """
    Score `features_table` with the notebook-level logistic regression model.

    Returns a new single-column DataFrame named `linear`, with one row per
    input row, holding column 1 of `linear_model.predict_proba(...)`. That
    column is treated as the likelihood of the customer churning.

    Relies on the global `linear_model` having been fitted earlier in the
    notebook. The result has a fresh default index, not the input's index.
    """
    class_probabilities = linear_model.predict_proba(features_table)
    churn_probability = class_probabilities[:, 1]
    return pd.DataFrame({"linear": churn_probability})

@step
def predict_tree(features_table: pd.DataFrame) -> pd.DataFrame:
    """
    Score `features_table` with the notebook-level decision tree model.

    Returns a new single-column DataFrame named `tree`, with one row per
    input row, holding column 1 of `decision_tree_model.predict_proba(...)`.
    That column is treated as the likelihood of the customer churning.

    Relies on the global `decision_tree_model` having been fitted earlier in
    the notebook. The result has a fresh default index, not the input's index.
    """
    class_probabilities = decision_tree_model.predict_proba(features_table)
    churn_probability = class_probabilities[:, 1]
    return pd.DataFrame({"tree": churn_probability})

@step
def predict_ensemble(customers_table: pd.DataFrame, linear_pred_table: pd.DataFrame, tree_pred_table: pd.DataFrame) -> pd.DataFrame:
    """
    Average the two models' churn probabilities and attach them to the
    customers table.

    The two prediction tables are joined on their index, the row-wise mean of
    the joined columns is taken, and that mean is returned as a new
    `prob_churn` column on a copy of `customers_table`. Rows are matched by
    index label at both the join and the assignment, so all three inputs are
    expected to share the same index.
    """
    model_probabilities = linear_pred_table.join(tree_pred_table)
    averaged = model_probabilities.mean(axis=1)
    return customers_table.assign(prob_churn=averaged)

In [None]:
from zenml.pipelines import pipeline


@pipeline(enable_cache=False)
def churn_ensemble_cgwu(
    data_loader,
    log_featurize,
    predict_linear,
    predict_tree,
    predict_ensemble,
):
    """
    Wire the churn-ensemble steps into a single pipeline.

    Data flow: `data_loader` produces the customers table, `log_featurize`
    turns it into model features, and `predict_linear` and `predict_tree`
    each score those features. `predict_ensemble` then combines both
    prediction tables with the original customers table.

    Each parameter is the step object to use for the stage of the same name.
    The pipeline is declared with `enable_cache=False`.
    """
    raw_customers = data_loader()
    model_inputs = log_featurize(raw_customers)
    predict_ensemble(
        raw_customers,
        predict_linear(model_inputs),
        predict_tree(model_inputs),
    )

In [None]:
# Instantiate every step, bind each one to the pipeline argument of the same
# name, and run the pipeline. The pipeline object is `churn_ensemble_cgwu`,
# the only pipeline defined in this notebook; calling any other name raises
# NameError on a fresh kernel.
churn_ensemble_cgwu(
    data_loader=data_loader(),
    log_featurize=log_featurize(),
    predict_linear=predict_linear(),
    predict_tree=predict_tree(),
    predict_ensemble=predict_ensemble(),
).run()