# (All Data) Get At-Risk Customers Using Business Metrics

In [None]:
import os
from datetime import datetime
from io import BytesIO, StringIO
from pathlib import Path

import boto3
import botocore.exceptions
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from IPython.display import Markdown

In [None]:
# Project root, taken as the parent of the current working directory
PROJ_ROOT = Path.cwd().parent

In [None]:
# Load environment variables from the .env file one level above PROJ_ROOT.
# The call is made outside of an `assert` so that it still runs when Python
# is started with -O (which strips assert statements and, with them, any
# call made inside the assertion).
_dotenv_path = PROJ_ROOT.parent / '.env'
if not load_dotenv(dotenv_path=_dotenv_path):
    raise RuntimeError(
        f"No environment variables were loaded from {_dotenv_path}"
    )

In [None]:
import cc_churn.costs as costs
import cc_churn.visualization as vzu

## About

Get the at-risk customers and predicted ROI using all available data.

## User Inputs

In [None]:
# R2 data bucket details
bucket_name = 'cc-churn-splits'
# # key (file name) prefix of all data with predictions in private R2 bucket
r2_key_all_partial = 'all_predictions__logisticregression__'
# # key (file name) prefix of validation data with predictions in private R2
# # bucket
r2_key_val_partial = 'validation_predictions__logisticregression__'

# columns to load
columns = [
    'clientnum',
    'card_category',
    'total_revolv_bal',
    'total_trans_amt',
    'model_name',
    'y_pred_proba',
    'y_pred',
    'best_decision_threshold',
    'is_churned',
]

# costs
# # revenue from transactions (bank earns #% of transaction volume)
interchange_rate = 0.02
# # revenue from revolving balance (18% interest)
apr = 0.18
# # fee revenue from credit card exposure (modeled from card type)
card_fees = {"Blue": 0, "Silver": 50, "Gold": 100, "Platinum": 200}
# # number of years and discount factor; both feed the `multiplier` that is
# # computed as (1 - discount**tenure_years) / (1 - discount)
tenure_years = 3
discount = 0.9
# # percentage of churners who can be convinced to stay (i.e. success rate
# # of saving a churning customer)
success_rate = 0.40
# # cost of intervention to get a single customer to not churn (discounts,
# # call center time, retention offers, etc.)
intervention_cost = 50
# # maximum number of customers that can be targeted based on client's budget
num_customers_max = 100

In [None]:
# R2 credentials, read from the environment
account_id = os.getenv('ACCOUNT_ID')
access_key_id = os.getenv('ACCESS_KEY_ID')
secret_access_key = os.getenv('SECRET_ACCESS_KEY')

# Fail fast when the account id is missing; otherwise the endpoint below
# would silently become https://None.r2.cloudflarestorage.com
if not account_id:
    raise RuntimeError("Environment variable ACCOUNT_ID is not set")

# S3-compatible client pointed at the account's Cloudflare R2 endpoint
s3_client = boto3.client(
    's3',
    endpoint_url=f'https://{account_id}.r2.cloudflarestorage.com',
    aws_access_key_id=access_key_id,
    aws_secret_access_key=secret_access_key,
    region_name='auto'
)

# costs
# # closed form of 1 + discount + ... + discount**(tenure_years - 1); it
# # divides by zero when discount == 1, where the sum is simply tenure_years
multiplier = (
    float(tenure_years)
    if discount == 1
    else (1 - discount**tenure_years) / (1 - discount)
)

In [None]:
def pandas_read_parquet_r2(bucket_name, r2_key, columns):
    """Load one parquet object from the private R2 bucket as a DataFrame.

    Uses the module-level ``s3_client``. ``columns`` is forwarded to
    ``pd.read_parquet`` and the result uses the pyarrow dtype backend.
    """
    response = s3_client.get_object(Bucket=bucket_name, Key=r2_key)
    # Read the whole object body into an in-memory buffer for pandas
    parquet_buffer = BytesIO(response['Body'].read())
    return pd.read_parquet(
        parquet_buffer, columns=columns, dtype_backend='pyarrow'
    )


def pandas_read_filtered_parquets_r2(bucket_name, key_prefix, cols_to_load):
    """Read parquet files using partial filename from private R2 bucket.

    The listing request is capped at ``MaxKeys=1``, so at most one key that
    starts with ``key_prefix`` is loaded.

    Parameters
    ----------
    bucket_name : str
        Name of the bucket to list and read from.
    key_prefix : str
        Leading part of the key (file name) to match.
    cols_to_load : list of str or None
        Columns forwarded to ``pandas_read_parquet_r2``.

    Returns
    -------
    pandas.DataFrame
        Concatenation (with a fresh index) of the matched parquet files.

    Raises
    ------
    FileNotFoundError
        If no key in the bucket starts with ``key_prefix``.
    """
    s3_objects = s3_client.list_objects_v2(
        Bucket=bucket_name, Prefix=key_prefix, MaxKeys=1
    )
    assert s3_objects['ResponseMetadata']['HTTPStatusCode'] == 200
    # 'Contents' is left out of the response when nothing matches the prefix,
    # so look it up defensively and report the miss explicitly
    matching_objects = s3_objects.get('Contents', [])
    if not matching_objects:
        raise FileNotFoundError(
            f"No key with prefix {key_prefix} found in bucket {bucket_name}"
        )
    df = pd.concat(
        [
            pandas_read_parquet_r2(
                bucket_name, obj['Key'], columns=cols_to_load
            )
            for obj in matching_objects
        ],
        ignore_index=True,
    )
    return df


def export_df_to_r2(df, bucket_name, r2_key):
    """Upload DataFrame as gzip parquet to private R2 bucket if key is absent.

    The key is probed with ``head_object`` first. An existing key, a 403 and
    any other client error are reported with ``print`` and nothing is
    uploaded; only a 404 triggers the upload.
    """
    try:
        s3_client.head_object(Bucket=bucket_name, Key=r2_key)
    except botocore.exceptions.ClientError as e:
        error_code = e.response["Error"]["Code"]
        if error_code == "403":
            print(f"Access denied to bucket {bucket_name} or key {r2_key}")
            return
        if error_code != "404":
            print(f"An unexpected error occurred: {e}")
            return

        # 404: the key is not there yet, so serialize and upload
        print(f"Key {r2_key} does not exist in bucket {bucket_name}")
        parquet_buffer = BytesIO()
        df.to_parquet(
            parquet_buffer,
            index=False,
            engine='pyarrow',
            compression='gzip',
        )
        upload_response = s3_client.put_object(
            Bucket=bucket_name, Key=r2_key, Body=parquet_buffer.getvalue()
        )
        assert upload_response['ResponseMetadata']['HTTPStatusCode'] == 200
        print(f"Exported {len(df):,} rows to key: {r2_key}")
    else:
        print(f"Key {r2_key} already exists in bucket {bucket_name}")

## Load Data with Predictions

In [None]:
%%time
from io import BytesIO

import numpy as np
import pandas as pd
import sklearn.ensemble as skens
import sklearn.metrics as mtr
import sklearn.preprocessing as pp
import sklearn.utils as skut
from IPython.display import display
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# R2 data bucket details (same value as in the User Inputs section)
bucket_name = "cc-churn-splits"

# columns to keep in the predictions frame (same list as in User Inputs)
columns = [
    "clientnum",
    "card_category",
    "total_revolv_bal",
    "total_trans_amt",
    "model_name",
    "y_pred_proba",
    "y_pred",
    "best_decision_threshold",
    "is_churned",
]

# features handled by the ordinal encoder
ordinal_features = [
    "income_category",
    "education_level",
]
# features handled by the one-hot encoder
categorical_features = [
    # 'card_category',
    "marital_status",
]
# features handled by the min-max scaler (commented-out names are excluded)
numeric_features = [
    # 'customer_age',
    # # 'dependent_count',
    "months_on_book",
    "num_products",
    "months_inactive_12_mon",
    "contacts_count_12_mon",
    "total_revolv_bal",
    # 'avg_open_to_buy',
    "total_amt_chng_q4_q1",
    # 'total_trans_amt',
    "total_trans_ct",
    "total_ct_chng_q4_q1",
    # 'avg_utilization_ratio',
]
# all model input features, in numeric / ordinal / categorical order
features = numeric_features + ordinal_features + categorical_features


def pandas_read_parquet_r2_c(s3_client, bucket_name, r2_key, columns):
    """Load one parquet object from the private R2 bucket as a DataFrame.

    Same as reading through the module-level client, except that the client
    is passed in explicitly. ``columns`` is forwarded to ``pd.read_parquet``
    and the result uses the pyarrow dtype backend.
    """
    response = s3_client.get_object(Bucket=bucket_name, Key=r2_key)
    # Read the whole object body into an in-memory buffer for pandas
    parquet_buffer = BytesIO(response["Body"].read())
    return pd.read_parquet(
        parquet_buffer, columns=columns, dtype_backend="pyarrow"
    )


# Load the train, validation and test splits from the bucket (all columns)
df_train = pandas_read_parquet_r2_c(
    s3_client=s3_client,
    bucket_name=bucket_name,
    r2_key="train_data.parquet.gzip",
    columns=None,
)
df_val = pandas_read_parquet_r2_c(
    s3_client=s3_client,
    bucket_name=bucket_name,
    r2_key="validation_data.parquet.gzip",
    columns=None,
)
df_test = pandas_read_parquet_r2_c(
    s3_client=s3_client,
    bucket_name=bucket_name,
    r2_key="test_data.parquet.gzip",
    columns=None,
)

# Stack the three splits, then separate the churn label from the rest
all_splits = [df_train, df_val, df_test]
df = pd.concat(all_splits)
y = df["is_churned"]
X = df.drop(columns=["is_churned"])

# Numeric features: min-max scaling
numeric_transformer = Pipeline(steps=[("scaler", pp.MinMaxScaler())])

# Nominal features: one-hot encoding that ignores categories unseen during
# fit and keeps a single column for binary features
one_hot_encoder = pp.OneHotEncoder(handle_unknown="ignore", drop="if_binary")
categorical_transformer = Pipeline(steps=[("ohe", one_hot_encoder)])

# Ordinal features: explicit level ordering per feature, listed in the same
# order as `ordinal_features` (income_category, then education_level)
income_levels = [
    "Unknown",
    "Less than $40K",
    "$40K - $60K",
    "$60K - $80K",
    "$80K - $120K",
    "$120K +",
]
education_levels = [
    "Unknown",
    "Uneducated",
    "High School",
    "College",
    "Graduate",
    "Post-Graduate",
    "Doctorate",
]
# Values outside the listed levels are encoded as NaN
ordinal_encoder = pp.OrdinalEncoder(
    categories=[income_levels, education_levels],
    handle_unknown="use_encoded_value",
    dtype=np.float64,
    unknown_value=np.nan,
)
ordinal_transformer = Pipeline(steps=[("oe", ordinal_encoder)])

# Route each feature group to its transformer; any other column is dropped
feature_transformers = [
    ("num", numeric_transformer, numeric_features),
    ("ord", ordinal_transformer, ordinal_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(
    transformers=feature_transformers,
    remainder="drop",
    verbose_feature_names_out=False,
    n_jobs=-1,
)

# Alternative classifiers that were tried are kept below as comments
# clf = LogisticRegression(
#     class_weight='balanced', random_state=42, n_jobs=-1
# )
clf = skens.HistGradientBoostingClassifier(
    # # VERSION 1
    # max_depth=3,
    # max_bins=255,
    # l2_regularization=0.25,
    # learning_rate=0.1,
    # max_iter=250,
    # class_weight='balanced',
    # random_state=42,
    # VERSION 2
    max_depth=3,
    l2_regularization=0.25,
    class_weight="balanced",
    random_state=42,
)
# clf = skens.RandomForestClassifier(
#     n_estimators=600,
#     max_depth=3,
#     class_weight='balanced',
#     random_state=42,
#     n_jobs=-1,
# )

# Full model: preprocessing followed by the classifier
pipe = Pipeline(steps=[("pre", preprocessor), ("clf", clf)])

# Probability cutoff at or above which a row is labelled as predicted churn
best_decision_threshold = 0.5

# Fit on every available row, then score those same rows
pipe.fit(X, y)
proba_second_class = pipe.predict_proba(X)[:, 1]
y_pred_proba = pd.Series(
    proba_second_class, index=X.index, name="y_pred_proba"
)
y_pred = (
    y_pred_proba.ge(best_decision_threshold).astype(int).rename('y_pred')
)

# Attach predictions and model metadata, then keep only the wanted columns
df_all_pred = pd.concat([df, y_pred_proba, y_pred], axis=1)
df_all_pred = df_all_pred.assign(
    model_name=type(clf).__name__,
    best_decision_threshold=best_decision_threshold,
)
df_all_pred = df_all_pred[columns]

# No missing values are expected in the selected columns
assert df_all_pred.isna().sum().sum() == 0

n_rows, n_cols = df_all_pred.shape
print(f"Size of all data = {n_rows:,} rows X {n_cols:,} columns")

Load all available data with predictions

In [None]:
%%time
# Loading precomputed predictions from the R2 bucket is disabled here, so
# `df_all_pred` is the frame built earlier in this notebook from the model
# fitted on all splits.
# df_all_pred = pandas_read_filtered_parquets_r2(bucket_name, r2_key_all_partial, columns)
print(f"Got {len(df_all_pred):,} rows of all available data")
# Show the frame without truncating the column list
with pd.option_context('display.max_columns', None):
    display(df_all_pred)

Extract best decision threshold and name of best ML model from model predictions of the validation data

In [None]:
%%time
# The first row of the validation predictions supplies the decision threshold
# and the model name (in that column order).
# NOTE(review): this rebinds `best_decision_threshold`, which was 0.5 when
# `y_pred` was computed earlier in this notebook — confirm both are meant to
# agree.
df_val_meta = pandas_read_filtered_parquets_r2(
    bucket_name,
    r2_key_val_partial,
    ['best_decision_threshold', 'model_name'],
)
best_decision_threshold, best_model_name = (
    df_val_meta.head(1).squeeze().to_list()
)

## All Available Data

### Class Imbalance

Get the true and predicted class imbalance for all available data

In [None]:
%%time
# Fraction of rows per predicted class
pred_share = (
    df_all_pred['y_pred']
    .value_counts(normalize=True)
    .rename('predicted')
    .to_frame()
)
# Fraction of rows per true class
true_share = (
    df_all_pred['is_churned']
    .value_counts(normalize=True)
    .rename('true')
    .to_frame()
)
# Line the two up on the class label
df_true_pred_class_imbalance = pred_share.merge(
    true_share, left_index=True, right_index=True
)
class_labels = {0: 'No Churn', 1: 'Churn'}
df_true_pred_class_imbalance.index = (
    df_true_pred_class_imbalance.index.map(class_labels)
)
# Churn shares expressed in percent
churn_pct = df_true_pred_class_imbalance.loc['Churn'].mul(100)
churn_true = churn_pct['true']
churn_pred = churn_pct['predicted']
df_true_pred_class_imbalance

In [None]:
# `churn_true` and `churn_pred` are already percentages (both were multiplied
# by 100 when they were extracted), so they are formatted as-is; scaling
# `churn_true` by 100 again would overstate it a hundredfold.
# NOTE(review): the wording refers to the test/training splits although the
# values come from `df_all_pred` — confirm the text.
Markdown(
    "**Observations**\n"
    "1. The class imbalance in the test split is approximately the same as that "
    "in the training split, which was seen in the EDA notebook. "
    f"~{churn_true:.2f}% of customers showed churn in the test data.\n"
    f"2. Due to the inaccuracy of the model, ~{churn_pred:.2f}% instead of "
    f"~{churn_true:.2f}% of customers are predicted to churn."
)

Show the class imbalance and distribution of prediction probabilities for all available data

In [None]:
%%time
# Plot the true vs predicted class shares next to the distribution of
# predicted probabilities (scaled to percent).
# NOTE(review): both titles are hard-coded text ("Test Split", "~10%",
# "~90%") — confirm they still describe all available data.
# NOTE(review): the keyword `df_clasS_imbalance` has an upper-case "S" —
# confirm it matches the parameter name in cc_churn.visualization.
vzu.plot_class_imbalance_proba_distribution(
    # column names are title-cased for display
    df_clasS_imbalance=df_true_pred_class_imbalance.rename(columns=str.title),
    df_probabilities=(df_all_pred['y_pred_proba']*100),
    ptitle1='~10% Higher Churn Predicted in Test Split',
    title1_xloc=-0.3,
    ptitle2=(
        'Predicted Probabilities show Right Skew with Weak Peak Above ~90%'
    ),
    vline_label=f'Optimized Churn Cutoff ({best_decision_threshold*100:.0f}%)',
    decision_threshold=best_decision_threshold,
    subfigure_width_ratios=[1.15, 3],
    fig_size=(12, 4)
)

**Observations**

1. Similar to the test split, the distribution of predicted probabilities shows a right-skew.

### Costs

Calculate the true savings, expected (predicted) savings and error in predicted savings (cost) using all available data

In [None]:
%%time
# Compute costs from the predictions and the cost inputs defined earlier.
# `get_cost` returns three values; only the first one is kept.
df_costs_all, _, _ = costs.get_cost(
    df_all_pred,
    best_decision_threshold,
    interchange_rate,
    apr,
    card_fees,
    multiplier,
    success_rate,
    intervention_cost,
)
# Show the costs without truncating the column list
with pd.option_context('display.max_columns', None):
    display(df_costs_all)

### Get True and Predicted ROI

Plot true and predicted expected savings and ROI curves to visualize the following

1. true ROI
2. predicted ROI

using all available data

In [None]:
%%time
# Plot cumulative true/predicted savings and true/predicted ROI against the
# number of customers contacted.
# NOTE(review): the title hard-codes "Top 74", while the next code cell only
# considers n >= 750 — confirm the title is still accurate.
vzu.plot_roi_curves(
    df_costs_all['n'],
    df_costs_all['cum_true_savings'],
    df_costs_all['cum_pred_savings'],
    df_costs_all['ROI_percent'],
    df_costs_all['ROI_percent_pred'],
    # empty dict passed positionally; its meaning is defined by plot_roi_curves
    {},
    ptitle=(
        'Excluding initial Noisy Period, ROI is Maximized after Selecting Top '
        '74 At-Risk Customers'
    ),
    legend_loc='upper left',
    xlabel=f"Number of Predicted Churners to Contact (Top-N)",
    ylabel="Expected Net Savings ($)",
    fig_size=(12, 8),
)

**Observations**

1. Customers with a high `total_revolv_bal` account for the sharp increase in ROI, resulting in a peak at ~500 customers. Between ~500 and ~900 customers, there are few such customers, so further sharp increases are not seen. As mentioned in the previous notebook, selecting as many high `total_revolv_bal` customers as possible captures steep increases in ROI. Between ~900 and ~1,400 customers, the high `total_revolv_bal` customers appear again. After selecting the top ~1,400 customers, ROI shows a weak downward trend.
2. If the budget allows for targeting at most the top 1,000 customers (~10% of all customers in the random sample) then the optimal number of customers is ~500.
3. If there is room in the budget to target all possible at-risk customers, then the optimal number of customers is ~1,400. Here, we will assume this is true. So, the optimal number of customers to be targeted is ~1,400.

Find the optimal number of customers to target in order to maximize true ROI, using the costs on all available data

In [None]:
%%time
# Candidate rows: positive intervention cost, restricted to n >= 750 in order
# to capture the second peak in ROI
roi_candidates = df_costs_all.query(
    "(total_intervention_cost > 0) & (n >= 750)"
)
# Rank by highest ROI, breaking ties by lowest ROI_error and then lowest n
roi_ranked = roi_candidates.sort_values(
    by=['ROI', 'ROI_error', 'n'],
    ascending=[False, True, True],
    ignore_index=True,
)
df_costs_optimal = roi_ranked.head(1)
optimal_N_roi = df_costs_optimal['n'].squeeze()

cols_costs = [
    'n',
    'cum_true_savings',
    'cum_pred_savings',
    'ROI_error',
    'ROI_percent',
    'ROI_percent_pred',
]
# Highlight the error and the predicted ROI in the displayed row
highlight_css = {'background-color': 'yellow', 'color': 'black'}
df_costs_optimal[cols_costs].style.set_properties(
    subset=['ROI_error', 'ROI_percent_pred'], **highlight_css
)

In [None]:
# ROI error and predicted ROI of the single optimal row, as scalars
roi_error_optimal_val = df_costs_optimal['ROI_error'].squeeze()
predicted_roi_optimal_val = df_costs_optimal['ROI_percent_pred'].squeeze()
# NOTE(review): the text below says "test data", while the costs were
# computed from `df_all_pred` — confirm the wording.
Markdown(
    "**Observations**\n"
    "1. In order to minimize the error in predicted ROI while also maximizing "
    f"true ROI, the optimal number of customers to target is {optimal_N_roi:,}. "
    "This is consistent with observations from the chart above.\n"
    f"2. If the top {optimal_N_roi:,} customers from the test data are "
    "targeted, then the\n"
    "   - error in the predicted ROI is approximately "
    f"{roi_error_optimal_val:.1f}%\n"
    f"   - predicted ROI is approximately {predicted_roi_optimal_val:.1f}%"
)

In [None]:
%%time
# Take the cost row(s) at the optimal N, reshape the savings/ROI columns to
# long form and back to wide form, then compute the percentage error of the
# predicted ROI relative to the true ROI
df_costs_all_true_pred = (
    df_costs_all
    .query(f"n == {optimal_N_roi}")
    # give the two ROI columns parallel true/pred names
    .rename(
        columns={
            "ROI_percent": "ROI_true_percent",
            'ROI_percent_pred': 'ROI_pred_percent',
        }
    )
    .melt(
        id_vars=['n', 'y_pred_proba'],
        value_vars=[
            'cum_true_savings',
            'cum_pred_savings',
            'ROI_true_percent',
            'ROI_pred_percent',
        ],
        var_name='variable',
        value_name='value',
    )
    # `metric` is the third '_'-separated token of the variable name; it is
    # not referenced by the pivot that follows
    .assign(metric=lambda df: df['variable'].str.split('_', expand=True)[2])
    .pivot(
        index=['n', 'y_pred_proba'], columns=['variable'], values='value'
    )
    .reset_index()
    # percentage difference between predicted and true ROI
    .assign(
        pct_pred_error=lambda df: 100*(
            (df['ROI_pred_percent']-df['ROI_true_percent'])
            /df['ROI_true_percent']
        )
    )
)
error_pred_roi = df_costs_all_true_pred['pct_pred_error'].squeeze()
# Highlight every cell of the pct_pred_error column
display(
    df_costs_all_true_pred
    .style
    .apply(
        lambda x: [
            'background: yellow' if x.name == 'pct_pred_error' else ''
            for i in x
        ]
    )
)

In [None]:
# Report the percentage error of the predicted ROI at the optimal N
Markdown(
    "**Observations**\n"
    "1. If we apply the recommendations from predicted ROI and contact "
    f"(target) the top {optimal_N_roi} customers, then the client is "
    f"incorrectly reported a gain of approximately {error_pred_roi:.2f}% "
    "of the maximum possible true ROI."
)

Append column to costs indicating if targeting customer maximizes ROI

In [None]:
# Flag every row whose rank n is within the optimal top-N
df_costs_all = df_costs_all.assign(
    maximizes_roi=df_costs_all['n'].le(optimal_N_roi)
)
# Preview the columns most relevant to the new flag
cols_preview = [
    'clientnum',
    'n',
    'y_pred_proba',
    'y_pred',
    'clv',
    'ROI_percent_pred',
    'maximizes_roi',
]
df_costs_all[cols_preview]

### At-Risk Customers

As mentioned in the previous notebook, the `y_pred` column indicates if a customer is at-risk (1) or not (0), so it will now be renamed to `is_at_risk`

In [None]:
%%time
# Rename the prediction flag so that it reads as an at-risk indicator
df_costs_all = df_costs_all.rename(columns={'y_pred': 'is_at_risk'})
# Show the renamed frame without truncating the column list
with pd.option_context('display.max_columns', None):
    display(df_costs_all)

## Export Project Deliverables to Private R2 Bucket

Get the current timestamp in the format `YYYYmmdd_HHMMSS`

In [None]:
# Timestamp formatted as YYYYmmdd_HHMMSS (four-digit year); datetime.now() is
# called without a timezone, so this is naive local time
curr_timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

### All Customers with Indicator of At-Risk and Maximizing ROI

Combine costs (predicted churners) with predicted non-churners

In [None]:
%%time
# Stack the cost rows on top of the rows not predicted to churn. The cost
# frame had `y_pred` renamed to `is_at_risk`, so `y_pred=1` is added back to
# it; rows taken from `df_all_pred` have no cost columns and therefore hold
# missing values in them after the concat.
df_all_pred_with_costs = (
    pd.concat(
        [
            df_costs_all.assign(y_pred=1),
            df_all_pred.query("y_pred != 1"),
        ],
        ignore_index=True
    )
    # Only `is_at_risk` gets a real fill value (0); the NaN entries appear to
    # leave missing values in the cost columns as they are — TODO confirm
    .fillna(
        {
            'interchange_rev': np.nan,
            'interest_rev': np.nan,
            'fee_rev': np.nan,
            'annual_rev': np.nan,
            'clv': np.nan,
            'success_rate': np.nan,
            'expected_savings': np.nan,
            'true_savings': np.nan,
            'cum_pred_savings': np.nan,
            'cum_true_savings': np.nan,
            'n': np.nan,
            'random_savings': np.nan,
            'total_intervention_cost': np.nan,
            'ROI': np.nan,
            'ROI_pred': np.nan,
            'ROI_error': np.nan,
            'ROI_percent': np.nan,
            'ROI_percent_pred': np.nan,
            'maximizes_roi': np.nan,
            'is_at_risk': 0,
        }
    )
    # switch to pyarrow-backed dtypes
    .convert_dtypes(dtype_backend='pyarrow')
)
# Show the combined frame without truncating the column list
with pd.option_context('display.max_columns', None):
    display(df_all_pred_with_costs)

Next, export to a file in the R2 bucket with the following file name format `all_predictions_with_business_metrics__logisticregression__<current-timestamp-YYYYmmdd_HHMMSS>.parquet.gzip`

In [None]:
# %%time
# export_df_to_r2(
#     df_all_pred_with_costs,
#     bucket_name,
#     (
#         f"all_predictions_with_business_metrics__{best_model_name.lower()}__"
#         f"{curr_timestamp}.parquet.gzip"
#     ),
# )