# (All Data) Get At-Risk Customers Using Business Metrics

## <font color='red'>**Notebook is Incomplete**</font>

In [None]:
import os
from datetime import datetime
from io import BytesIO, StringIO
from pathlib import Path

import boto3
import botocore.exceptions
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from IPython.display import Markdown

In [None]:
# Project root: the parent of the current working directory (assumes the
# notebook is launched from a sub-directory of the project — TODO confirm).
PROJ_ROOT = Path.cwd().parent

In [None]:
# Load environment variables from the `.env` file located one directory above
# PROJ_ROOT; the assert stops the notebook if load_dotenv() returns a falsy
# value.
assert load_dotenv(dotenv_path=PROJ_ROOT.parent / '.env')

In [None]:
import cc_churn.costs as costs
import cc_churn.visualization as vzu

## About

Get the at-risk customers and determine how many customers should be selected in order to maximize true ROI while minimizing error in predicted ROI.

## User Inputs

In [None]:
# R2 data bucket details
bucket_name = 'cc-churn-splits'
# # key (file) prefix of all available data with predictions in private R2
# # bucket
r2_key_all_partial = 'all_predictions__logisticregression__'
# # key (file) prefix of validation data with predictions in private R2 bucket
r2_key_val_partial = 'validation_predictions__logisticregression__'

# columns to load
columns = [
    'clientnum',
    'card_category',
    'total_revolv_bal',
    'total_trans_amt',
    'model_name',
    'y_pred_proba',
    'y_pred',
    'best_decision_threshold',
    'is_churned',
]

# costs
# # revenue from transactions (bank earns #% of transaction volume)
interchange_rate = 0.02
# # revenue from revolving balance (~20% interest)
apr = 0.18
# # fee revenue from credit card exposure (modeled from card type)
card_fees = {"Blue": 0, "Silver": 50, "Gold": 100, "Platinum": 200}
# # tenure_years and discount feed the revenue multiplier computed later as
# # (1 - discount**tenure_years) / (1 - discount)
tenure_years = 3
discount = 0.9
# # percentage of churners who can be convinced to stay (i.e. success rate
# # of saving a churning customer)
success_rate = 0.40
# # cost of intervention to get a single customer to not churn (discounts,
# # call center time, retention offers, etc.)
intervention_cost = 50
# # maximum number of customers that can be targeted based on client's budget
# # NOTE(review): currently only referenced by a commented-out filter clause
# # in the cell that selects the optimal number of customers
num_customers_max = 100

In [None]:
# R2 credentials come from environment variables (presumably populated by the
# `.env` file loaded earlier); each value is None when the variable is unset
account_id = os.environ.get('ACCOUNT_ID')
access_key_id = os.environ.get('ACCESS_KEY_ID')
secret_access_key = os.environ.get('SECRET_ACCESS_KEY')

# S3-compatible client pointed at the account's Cloudflare R2 endpoint
s3_client = boto3.client(
    's3',
    region_name='auto',
    aws_access_key_id=access_key_id,
    aws_secret_access_key=secret_access_key,
    endpoint_url=f'https://{account_id}.r2.cloudflarestorage.com',
)

# costs: closed form of the geometric series
# 1 + discount + ... + discount**(tenure_years - 1)
multiplier = (1 - discount**tenure_years) / (1 - discount)

In [None]:
def pandas_read_parquet_r2(bucket_name, r2_key, columns):
    """Read a single parquet file from the private R2 bucket.

    Parameters
    ----------
    bucket_name
        Name of the bucket holding the object.
    r2_key
        Key (file name) of the parquet object to download.
    columns
        Columns to load; forwarded to ``pd.read_parquet``.

    Returns
    -------
    pd.DataFrame
        Requested columns, read with ``dtype_backend='pyarrow'``.
    """
    response = s3_client.get_object(Bucket=bucket_name, Key=r2_key)
    # the whole object body is pulled into memory before parsing
    parquet_bytes = response['Body'].read()
    return pd.read_parquet(
        BytesIO(parquet_bytes),
        columns=columns,
        dtype_backend='pyarrow',
    )


def pandas_read_filtered_parquets_r2(bucket_name, key_prefix, cols_to_load):
    """Read parquet files using partial filename from private R2 bucket.

    Parameters
    ----------
    bucket_name
        Name of the bucket to list.
    key_prefix
        Leading part of the object key(s) to match.
    cols_to_load
        Columns to load from each matching parquet file.

    Returns
    -------
    pd.DataFrame
        Row-wise concatenation (with a fresh index) of the matching files.

    Raises
    ------
    FileNotFoundError
        If no object in the bucket matches ``key_prefix``.
    """
    # NOTE(review): MaxKeys=1 means at most one matching key is listed, so
    # only one file is ever read even though the results are concatenated —
    # confirm this is intended.
    s3_objects = s3_client.list_objects_v2(
        Bucket=bucket_name, Prefix=key_prefix, MaxKeys=1
    )
    assert s3_objects['ResponseMetadata']['HTTPStatusCode'] == 200

    # the listing response has no 'Contents' entry when nothing matches
    matched_objects = s3_objects.get('Contents', [])
    if not matched_objects:
        raise FileNotFoundError(
            f"No objects found in bucket {bucket_name} with key prefix "
            f"{key_prefix}"
        )

    df = pd.concat(
        [
            pandas_read_parquet_r2(
                bucket_name, obj['Key'], columns=cols_to_load
            )
            for obj in matched_objects
        ],
        ignore_index=True,
    )
    return df


def export_df_to_r2(df, bucket_name, r2_key):
    """Upload a DataFrame as gzip-compressed parquet, unless the key exists.

    The key is probed with ``head_object`` first. The upload only happens
    when that probe fails with error code "404"; an existing key, a "403"
    or any other client error is reported with ``print`` and nothing is
    written.
    """
    try:
        s3_client.head_object(Bucket=bucket_name, Key=r2_key)
    except botocore.exceptions.ClientError as e:
        error_code = e.response["Error"]["Code"]
        if error_code == "403":
            print(f"Access denied to bucket {bucket_name} or key {r2_key}")
        elif error_code != "404":
            print(f"An unexpected error occurred: {e}")
        else:
            # key is missing, so it is safe to create it
            print(f"Key {r2_key} does not exist in bucket {bucket_name}")
            parquet_buffer = BytesIO()
            df.to_parquet(
                parquet_buffer,
                index=False,
                engine='pyarrow',
                compression='gzip',
            )
            put_response = s3_client.put_object(
                Bucket=bucket_name,
                Key=r2_key,
                Body=parquet_buffer.getvalue(),
            )
            status_code = put_response['ResponseMetadata']['HTTPStatusCode']
            assert status_code == 200
            print(f"Exported {len(df):,} rows to key: {r2_key}")
    else:
        print(f"Key {r2_key} already exists in bucket {bucket_name}")

## Load Data with Predictions

Load all available data with predictions

In [None]:
%%time
df_all_pred = pandas_read_filtered_parquets_r2(bucket_name, r2_key_all_partial, columns)
print(f"Got {len(df):,} rows of all available data")
with pd.option_context('display.max_columns', None):
    display(df_all_pred)

Extract best decision threshold and name of best ML model from model predictions of the validation data

In [None]:
%%time
best_decision_threshold, best_model_name = pandas_read_filtered_parquets_r2(
    bucket_name, r2_key_val_partial, ['best_decision_threshold', 'model_name']
).head(1).squeeze().to_list()

## All Available Data

### Class Imbalance

Get the true and predicted class imbalance for all available data

In [None]:
%%time
df_true_pred_class_imbalance = (
    (
        df_all_pred['y_pred']
        .value_counts(normalize=True)
        .rename('predicted')
        .to_frame()
    )
    .merge(
        (
            df_all_pred['is_churned']
            .value_counts(normalize=True)
            .rename('true')
            .to_frame()
        ),
        left_index=True,
        right_index=True,
    )
)
df_true_pred_class_imbalance.index = df_true_pred_class_imbalance.index.map(
    {0: 'No Churn', 1: 'Churn'}
)
df_true_pred_class_imbalance

**Observations**

1. The class imbalance in the test split is approximately the same as that in the training split (~84%:16%).
2. Due to the inaccuracy of the model, the class imbalance of the predictions is ~75%:25% which means ~25% instead of ~16% of customers are predicted to churn (i.e. ~25% of customers are predicted to cancel their credit card).

Show the class imbalance and distribution of prediction probabilities for all available data

In [None]:
%%time
# Plot the true vs. predicted class shares next to the distribution of
# predicted churn probabilities (in percent), with the tuned decision
# threshold marked on the distribution.
# NOTE(review): the keyword `df_clasS_imbalance` has an unusual capital "S";
# confirm it matches the parameter name in cc_churn.visualization.
# NOTE(review): the first title mentions the "Test Split" although the data
# passed in is `df_all_pred` — confirm the wording.
vzu.plot_class_imbalance_proba_distribution(
    df_clasS_imbalance=df_true_pred_class_imbalance.rename(columns=str.title),
    df_probabilities=(df_all_pred['y_pred_proba']*100),
    ptitle1='~10% Higher Churn Predicted in Test Split',
    title1_xloc=-0.3,
    ptitle2=(
        'Predicted Probabilities show Right Skew with Weak Peak Above ~88%'
    ),
    vline_label=f'Optimized Churn Cutoff ({best_decision_threshold*100:.0f}%)',
    decision_threshold=best_decision_threshold,
    subfigure_width_ratios=[1.15, 3],
    fig_size=(12, 4)
)

**Observations**

1. As expected from the predicted class imbalance, the distribution of predicted probabilities is right-skewed and a small fraction of customers have a predicted probability above 50% (the tuned classification decision threshold).

### Costs

Calculate the true savings, expected (predicted) savings and error in predicted savings (cost) using all available data

In [None]:
%%time
df_costs_all, _, _ = costs.get_cost(
    df_all_pred,
    best_decision_threshold,
    interchange_rate,
    apr,
    card_fees,
    multiplier,
    success_rate,
    intervention_cost,
)
with pd.option_context('display.max_columns', None):
    display(df_costs_all)

### Use True ROI and Error in Predicted ROI to Optimize Number of Targeted Customers (`N`)

Plot true and predicted expected savings and ROI curves to visualize the following

1. true ROI
2. predicted ROI

using all available data

In [None]:
%%time
vzu.plot_roi_curves(
    df_costs_all['n'],
    df_costs_all['cum_true_savings'],
    df_costs_all['cum_pred_savings'],
    df_costs_all['ROI_percent'],
    df_costs_all['ROI_percent_pred'],
    {},
    ptitle=(
        'Excluding initial Noisy Period, ROI is Maximized after Selecting Top '
        '74 At-Risk Customers'
    ),
    legend_loc='upper left',
    xlabel=f"Number of Predicted Churners to Contact (Top-N)",
    ylabel="Expected Net Savings ($)",
    fig_size=(12, 8),
)

**Notes**

1. The model cost is the error in predicted ROI.

**Observations**

1. The true and predicted ROI are noisy when selecting a small number of churners (fewer than approximately 50 churners). As explained below, individual customers make an outsized contribution to the overall ROI for `N` < ~50.
2. Excluding the initial ROI increases, the peak in optimal `N` is at ~75 customers (churners), the true ROI value reaches a maximum of ~150% at ~75, then drops to ~105% at ~180 and then increases to ~140% at ~210.
3. Based on the observations in this chart, if the value of `N` is selected to be ~75, the ROI is maximized and the error in predicted ROI from the best ML model is minimal. This choice of `N` is an optimal balance between true ROI and error in ROI predicted by the best ML model.

**Observations**

1. There are three terms that cause fluctuations in the calculation of annual revenue (and consequently in the calculation of expected savings and ROI) when a small number of customers are selected
   - `interchange_rev` (depending on `total_trans_amt`)
   - `interest_rev` (depending on `total_revolv_bal`)
   - `fee_rev` (depending on `card_category`)

   This results in strong fluctuations in ROI if selecting a small number of customers. The overall ROI is highly sensitive to contributions from a small number of customers here. This causes the noisy patterns in both true and predicted ROI in the above chart when targeting less than approximately 55 customers.

Find the optimal number of customers to target in order to maximize true ROI while minimizing the error in predicted ROI, using the costs calculated on all available data

In [None]:
%%time
df_costs_optimal = (
    df_costs_all
    .query(
        "(total_intervention_cost > 0) & "
        # avoid initial noisy period where few customers have dominant
        # impact on cumulative ROI
        "(n >= 150)"
        # enforce limit based on client's budget
        # f"(n <= {num_customers_max})"
    )
    .sort_values(
        by=['ROI', 'ROI_error', 'n'], ascending=[False, True, True],
        ignore_index=True,
    )
    .head(1)
)
optimal_N_roi = df_costs_optimal['n'].squeeze()
cols_costs = [
    'n',
    'cum_true_savings',
    'cum_pred_savings',
    'ROI_error',
    'ROI_percent',
    'ROI_percent_pred',
]
(
    df_costs_optimal[cols_costs]
    .style
    .set_properties(
        subset=['ROI_error', 'ROI_percent_pred'],
        **{'background-color': 'yellow', 'color': 'black'}
    )
)

In [None]:
# Pull the ROI error and predicted ROI out of the single-row optimal frame
roi_error_optimal_val = df_costs_optimal['ROI_error'].squeeze()
predicted_roi_optimal_val = df_costs_optimal['ROI_percent_pred'].squeeze()
# Render the observations as Markdown so the numbers track the data.
# NOTE(review): the text says "test data" although the costs were computed
# from all available data — confirm the wording.
Markdown(
    "**Observations**\n"
    "1. In order to maximize true ROI and minimize the error in predicted "
    f"ROI, the optimal number of customers to target is {optimal_N_roi}. "
    "This is consistent with observations from the chart above.\n"
    f"2. If the top {optimal_N_roi} customers from the test data are "
    "targeted, then the\n"
    "   - error in the predicted ROI is approximately "
    f"{roi_error_optimal_val:.1f}%\n"
    f"   - predicted ROI is approximately {predicted_roi_optimal_val:.1f}%"
)

### Get Loss in Predicted ROI Using Optimized Number of Targeted Customers (`N`)

Append column to costs indicating if targeting customer maximizes ROI

In [None]:
# Flag the customers that fall within the optimal top-N selection
df_costs_all = df_costs_all.assign(
    maximizes_roi=lambda costs_frame: costs_frame['n'] <= optimal_N_roi
)
# show only the columns relevant to the selection
df_costs_all.loc[
    :,
    [
        'clientnum',
        'n',
        'y_pred_proba',
        'y_pred',
        'clv',
        'ROI_percent_pred',
        'maximizes_roi',
    ],
]

### At-Risk Customers

In order to identify at-risk customers from the `y_pred_proba` (predicted probability) column, we must pick an optimal decision threshold based on the business goal (catching true churners). For the current business use-case, we need to prioritize recall. The optimal decision threshold was determined during ML development.

This decision threshold was optimized to maximize F2 Score, since it prioritizes recall over precision, which is in line with the business goal. The tuned threshold is stored in the `best_decision_threshold` column of the predictions data (loaded here as `df_all_pred`). The `y_pred` column was created by comparing `y_pred_proba` to the best decision threshold. With this in mind, the `y_pred` column already indicates if a customer is at-risk (1) or not (0).

So, the `y_pred` column will now be renamed to `is_at_risk`

In [None]:
%%time
df_costs_test = df_costs_test.rename(columns={'y_pred': 'is_at_risk'})
with pd.option_context('display.max_columns', None):
    display(df_costs_test)

## Export Project Deliverables to Private R2 Bucket

Get the current timestamp in the format `YYYYmmdd_HHMMSS`

In [None]:
# Local time formatted as YYYYmmdd_HHMMSS; used as a suffix for exported files
curr_timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"

### Unseen Data (Test Split) Customers with Indicator of At-Risk and Maximizing ROI

Combine costs (predicted churners) with predicted non-churners

In [None]:
%%time
df_test_pred_with_costs = (
    pd.concat(
        [
            df_costs_test.assign(y_pred=1),
            df_test_pred.query("y_pred != 1"),
        ],
        ignore_index=True
    )
    .fillna(
        {
            'interchange_rev': np.nan,
            'interest_rev': np.nan,
            'fee_rev': np.nan,
            'annual_rev': np.nan,
            'clv': np.nan,
            'success_rate': np.nan,
            'expected_savings': np.nan,
            'true_savings': np.nan,
            'cum_pred_savings': np.nan,
            'cum_true_savings': np.nan,
            'n': np.nan,
            'random_savings': np.nan,
            'total_intervention_cost': np.nan,
            'ROI': np.nan,
            'ROI_pred': np.nan,
            'ROI_error': np.nan,
            'ROI_percent': np.nan,
            'ROI_percent_pred': np.nan,
            'maximizes_roi': np.nan,
            'is_at_risk': 0,
        }
    )
    .convert_dtypes(dtype_backend='pyarrow')
)
with pd.option_context('display.max_columns', None):
    display(df_test_pred_with_costs)

Next, export to a file in the R2 bucket with the following file name format `test_predictions_with_business_metrics__logisticregression__<current-timestamp-YYYYmmdd_HHMMSS>.parquet.gzip`

In [None]:
%%time
export_df_to_r2(
    df_test_pred_with_costs,
    bucket_name,
    (
        f"test_predictions_with_business_metrics__{best_model_name.lower()}__"
        f"{curr_timestamp}.parquet.gzip"
    ),
)