# Linear Model Cost-Sensitive Learning

In [1]:
import os
import tempfile
from datetime import datetime
from io import BytesIO, StringIO
from pathlib import Path

import boto3
import botocore.exceptions
import joblib
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix, roc_auc_score
)
from sklearn.preprocessing import StandardScaler

In [2]:
PROJ_ROOT = Path.cwd().parent

In [3]:
assert load_dotenv(dotenv_path=PROJ_ROOT.parent / '.env')

## About

Machine Learning (ML) model training using a linear model (`LogisticRegression`).

### Outputs

Based on the project deliverables in [this project's scoping document](https://github.com/edesz/credit-card-churn/blob/main/references/01_proposal.md#project-deliverables), this notebook produces following outputs

1. The best trained ML model is used to make predictions on the
   - validation data split and these predictions and probabilities are exported to a file. These predictions will be used in the next step to calculate business metrics on the validation split.
   - test data split these predictions and probabilities are also exported to a file in order to calculate business metrics on the unseen data (test split).
2. The best ML model object is exported after training it on
   - combined train and validation data
   - all available data

## User Inputs

In [4]:
# R2 data bucket details
bucket_name = 'cc-churn-splits'
# # name of train data key (file) in private R2 bucket
r2_key_train = 'train_data.parquet.gzip'
# # name of validation data key (file) in private R2 bucket
r2_key_val = 'validation_data.parquet.gzip'
# # name of test data key (file) in private R2 bucket
r2_key_test = 'test_data.parquet.gzip'

# datatypes for categorical columns
dtypes_categoricals = {
    "gender": 'string[pyarrow]',
    "marital_status": 'string[pyarrow]',
    "income_category": 'string[pyarrow]',
    "card_category": 'string[pyarrow]',
    "education_level": 'string[pyarrow]',
}

label = 'is_churned'

In [5]:
account_id = os.getenv('ACCOUNT_ID')
access_key_id = os.getenv('ACCESS_KEY_ID_USER2')
secret_access_key = os.getenv('SECRET_ACCESS_KEY_USER2')

s3_client = boto3.client(
    's3',
    endpoint_url=f'https://{account_id}.r2.cloudflarestorage.com',
    aws_access_key_id=access_key_id,
    aws_secret_access_key=secret_access_key,
    region_name='auto'
)

In [6]:
def pandas_read_parquet_r2(bucket_name, r2_key):
    """Read parquet file from private R2 bucket."""
    s3_object = s3_client.get_object(Bucket=bucket_name, Key=r2_key)
    df = pd.read_parquet(
        BytesIO(s3_object['Body'].read()), dtype_backend='pyarrow'
    )
    return df


def export_df_to_r2(df, bucket_name, r2_key):
    """Export DataFrame to file in private R2 bucket, if not present."""
    try:
        s3_client.head_object(Bucket=bucket_name, Key=r2_key)
        print(f"Key {r2_key} already exists in bucket {bucket_name}")
    except botocore.exceptions.ClientError as e:
        if e.response["Error"]["Code"] == "404":
            print(f"Key {r2_key} does not exist in bucket {bucket_name}")
            buffer = BytesIO()
            df.to_parquet(
                buffer,
                index=False,
                engine='pyarrow',
                compression='gzip',
            )
            response = s3_client.put_object(
                Bucket=bucket_name, Key=r2_key, Body=buffer.getvalue()
            )
            assert response['ResponseMetadata']['HTTPStatusCode'] == 200
            print(f"Exported {len(df):,} rows to key: {r2_key}")
        elif e.response["Error"]["Code"] == "403":
            print(f"Access denied to bucket {bucket_name} or key {r2_key}")
        else:
            print(f"An unexpected error occurred: {e}")

## Load Data

### Data for Model Validation

Load the training data

In [7]:
%%time
df_train = (
    pandas_read_parquet_r2(bucket_name, r2_key_train)
    .astype(dtypes_categoricals)
)
print(f"Loaded {len(df_train):,} rows of training data")
with pd.option_context('display.max_columns', None):
    display(df_train.head())

Loaded 6,285 rows of training data


Unnamed: 0,clientnum,is_churned,customer_age,gender,dependent_count,education_level,marital_status,income_category,card_category,months_on_book,num_products,months_inactive_12_mon,contacts_count_12_mon,credit_limit,total_revolv_bal,avg_open_to_buy,total_amt_chng_q4_q1,total_trans_amt,total_trans_ct,total_ct_chng_q4_q1,avg_utilization_ratio
0,714283458,0,40,M,2,College,Single,$80K - $120K,Blue,36,5,1,4,14544.0,0.0,14544.0,0.768,4064.0,92,0.769,0.0
1,787587033,0,42,F,4,Graduate,Single,Less than $40K,Blue,32,3,1,2,2996.0,1992.0,1004.0,0.948,4463.0,87,0.74,0.665
2,714672933,0,52,F,3,Graduate,Married,$40K - $60K,Blue,36,6,4,2,3143.0,2268.0,875.0,0.801,4417.0,84,0.68,0.722
3,714974658,0,48,F,4,College,Married,Less than $40K,Blue,36,6,1,4,2464.0,1867.0,597.0,0.6,1219.0,35,1.333,0.758
4,712049208,0,56,M,3,Post-Graduate,Married,$60K - $80K,Blue,39,3,1,2,3955.0,2517.0,1438.0,0.484,1238.0,25,1.083,0.636


CPU times: user 91.5 ms, sys: 14.2 ms, total: 106 ms
Wall time: 230 ms


### Data for Model Evaluation

Load the validation data

In [8]:
%%time
df_val = (
    pandas_read_parquet_r2(bucket_name, r2_key_val)
    .astype(dtypes_categoricals)
)
print(f"Loaded {len(df_val):,} rows of validation data")
with pd.option_context('display.max_columns', None):
    display(df_val.head())

Loaded 1,693 rows of validation data


Unnamed: 0,clientnum,is_churned,customer_age,gender,dependent_count,education_level,marital_status,income_category,card_category,months_on_book,num_products,months_inactive_12_mon,contacts_count_12_mon,credit_limit,total_revolv_bal,avg_open_to_buy,total_amt_chng_q4_q1,total_trans_amt,total_trans_ct,total_ct_chng_q4_q1,avg_utilization_ratio
0,823840458,0,47,F,1,High School,Married,Unknown,Blue,43,4,2,2,1828.0,1517.0,311.0,0.661,4542.0,82,0.577,0.83
1,716328558,0,46,M,2,Uneducated,Married,$80K - $120K,Blue,38,4,5,2,11434.0,0.0,11434.0,0.84,4520.0,83,0.844,0.0
2,714735033,0,45,F,5,Doctorate,Married,Less than $40K,Blue,34,4,2,2,1438.3,491.0,947.3,0.708,4376.0,84,0.787,0.341
3,712163433,1,47,M,3,Uneducated,Married,$40K - $60K,Blue,29,1,2,4,1684.0,644.0,1040.0,0.723,2164.0,36,0.714,0.382
4,720307683,0,43,F,4,Graduate,Divorced,Less than $40K,Blue,36,3,2,2,1438.3,743.0,695.3,0.624,4484.0,78,0.625,0.517


CPU times: user 52.5 ms, sys: 6.09 ms, total: 58.6 ms
Wall time: 160 ms


Get the combined training+validation data split

In [9]:
%%time
df_train_val = pd.concat([df_train, df_val], ignore_index=True)
print(f"Obtained {len(df_train_val):,} rows of training+validation data")
with pd.option_context('display.max_columns', None):
    display(df_train_val.head())

Obtained 7,978 rows of training+validation data


Unnamed: 0,clientnum,is_churned,customer_age,gender,dependent_count,education_level,marital_status,income_category,card_category,months_on_book,num_products,months_inactive_12_mon,contacts_count_12_mon,credit_limit,total_revolv_bal,avg_open_to_buy,total_amt_chng_q4_q1,total_trans_amt,total_trans_ct,total_ct_chng_q4_q1,avg_utilization_ratio
0,714283458,0,40,M,2,College,Single,$80K - $120K,Blue,36,5,1,4,14544.0,0.0,14544.0,0.768,4064.0,92,0.769,0.0
1,787587033,0,42,F,4,Graduate,Single,Less than $40K,Blue,32,3,1,2,2996.0,1992.0,1004.0,0.948,4463.0,87,0.74,0.665
2,714672933,0,52,F,3,Graduate,Married,$40K - $60K,Blue,36,6,4,2,3143.0,2268.0,875.0,0.801,4417.0,84,0.68,0.722
3,714974658,0,48,F,4,College,Married,Less than $40K,Blue,36,6,1,4,2464.0,1867.0,597.0,0.6,1219.0,35,1.333,0.758
4,712049208,0,56,M,3,Post-Graduate,Married,$60K - $80K,Blue,39,3,1,2,3955.0,2517.0,1438.0,0.484,1238.0,25,1.083,0.636


CPU times: user 16.3 ms, sys: 99 μs, total: 16.4 ms
Wall time: 15.2 ms


Load the test data

In [10]:
%%time
df_test = (
    pandas_read_parquet_r2(bucket_name, r2_key_test)
    .astype(dtypes_categoricals)
)
print(f"Loaded {len(df_test):,} rows of test data")
with pd.option_context('display.max_columns', None):
    display(df_test)

Loaded 2,149 rows of test data


Unnamed: 0,clientnum,is_churned,customer_age,gender,dependent_count,education_level,marital_status,income_category,card_category,months_on_book,num_products,months_inactive_12_mon,contacts_count_12_mon,credit_limit,total_revolv_bal,avg_open_to_buy,total_amt_chng_q4_q1,total_trans_amt,total_trans_ct,total_ct_chng_q4_q1,avg_utilization_ratio
0,708223383,1,47,F,3,High School,Married,Unknown,Blue,39,6,3,4,11410.0,979.0,10431.0,1.049,2736.0,38,0.462,0.086
1,715052583,0,45,M,3,Doctorate,Single,$60K - $80K,Silver,34,3,3,1,27494.0,879.0,26615.0,0.671,14375.0,112,0.672,0.032
2,719910333,0,43,F,3,Unknown,Unknown,Less than $40K,Blue,36,4,4,2,5853.0,1190.0,4663.0,0.936,3595.0,80,0.667,0.203
3,785328408,0,37,F,4,Graduate,Married,Less than $40K,Blue,31,5,2,3,1758.0,1180.0,578.0,0.744,2013.0,41,0.367,0.671
4,716321583,0,50,F,3,Unknown,Single,$40K - $60K,Blue,36,5,3,3,12740.0,1173.0,11567.0,1.04,3441.0,55,0.719,0.092
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2144,817247658,0,45,M,3,High School,Unknown,$60K - $80K,Blue,27,6,2,3,21317.0,0.0,21317.0,0.833,3814.0,67,1.03,0.0
2145,816834933,0,34,F,4,Graduate,Single,Unknown,Silver,29,6,2,2,30702.0,673.0,30029.0,0.721,2652.0,64,0.524,0.022
2146,790221633,0,44,F,4,Graduate,Single,$40K - $60K,Blue,38,3,3,3,3897.0,2296.0,1601.0,0.746,4627.0,77,0.791,0.589
2147,719535558,0,47,M,4,College,Divorced,$40K - $60K,Blue,36,5,1,1,5570.0,1858.0,3712.0,0.783,4129.0,72,0.636,0.334


CPU times: user 54.6 ms, sys: 5.89 ms, total: 60.5 ms
Wall time: 160 ms


## Separate Features from Target

In [11]:
# model validation
X_train = df_train.drop(columns=[label])
y_train = df_train[label]

X_val = df_val.drop(columns=[label])
y_val = df_val[label]

# model evaluation
X_train_val = df_train_val.drop(columns=[label])
y_train_val = df_train_val[label]

X_test = df_test.drop(columns=[label])
y_test = df_test[label]

# model inference
X_all = pd.concat([X_train, X_val, X_test])
y_all = pd.concat([y_train, y_val, y_test])

## Clean Data

In [12]:
# model validation
X_train_clean = X_train.copy()
X_val_clean = X_val.copy()

# model evaluation
X_train_val_clean = X_train_val.copy()
X_test_clean = X_test.copy()

# model inference
X_all_clean = X_all.copy()

### Handling Missing or Unknown Values

Several categorical columns, like Education_Level, Marital_Status, and Income_Category, contained the value "Unknown". This is a placeholder for missing or unavailable data. We replaced these with NaN to treat them as missing values properly. This allows us to later apply imputation techniques to fill them in appropriately. Treating unknowns as real values can distort model behavior, so identifying them as missing is a crucial cleaning step.

In [13]:
categorical_cols = ["education_level", "marital_status", "income_category"]

# model validation
X_train_clean[categorical_cols] = X_train_clean[categorical_cols].replace("Unknown", pd.NA)
X_val_clean[categorical_cols] = X_val_clean[categorical_cols].replace("Unknown", pd.NA)

# model evaluation
X_train_val_clean[categorical_cols] = X_train_val_clean[categorical_cols].replace("Unknown", pd.NA)
X_test_clean[categorical_cols] = X_test_clean[categorical_cols].replace("Unknown", pd.NA)

# model inference
X_all_clean[categorical_cols] = X_all_clean[categorical_cols].replace("Unknown", pd.NA)

## Model Validation

Model validation uses the validation data split.

Model validation is performed below using the training data split (`df_train`) to find the best model, decision threshold and (optional) hyperparameters.

### Encoding Categorical Variables

Machine learning models require all features to be numeric. I handled categorical data in two ways:

**Ordinal Encoding for Education_Level**: Since education has a natural order (e.g., High School < Graduate < Doctorate), we mapped it to integers from 0 to 5.

**One-Hot Encoding for Other Categories**: For nominal variables like Gender, Marital_Status, Income_Category, and Card_Category, we used one-hot encoding. This creates separate binary columns for each category, allowing the model to treat them independently without assuming any order.

This transformation makes the data fully numeric and model-friendly.

In [14]:
education_map = {
    "Uneducated": 0,
    "High School": 1,
    "College": 2,
    "Graduate": 3,
    "Post-Graduate": 4,
    "Doctorate": 5
}

X_train_clean["education_level"] = X_train_clean["education_level"].map(education_map)
X_train_clean = pd.get_dummies(
    X_train_clean,
    columns=["gender", "marital_status", "income_category", "card_category"],
    drop_first=True,
)

X_val_clean["education_level"] = X_val_clean["education_level"].map(education_map)
X_val_clean = pd.get_dummies(
    X_val_clean,
    columns=["gender", "marital_status", "income_category", "card_category"],
    drop_first=True,
)

### Model Selection

Here, we assume the best model is `LogisticRegression`

In [15]:
best_model_name = 'LogisticRegression'
model_best = LogisticRegression(
    max_iter=1000, class_weight="balanced", random_state=42
)

**Notes**

1. In the next notebook, multiple types of ML models will be compared to eachother using the `prauc` scoring metric to determine the best model. See [phase 1 from the metrics section of the project scope](https://github.com/edesz/credit-card-churn/blob/main/references/01_proposal.md#final-choice-of-metrics) for details about the choice of metric during model selection.

### Model Training and Prediction

As mentioned above, here we do not validate the model by comparing different types of models using cross-validation on the training data (`X_train_scaled` and `y_train`). Instead, we just use the model to make predictions for the validation split (`df_val`)

In [16]:
# Impute missing values
imputer = SimpleImputer(strategy="most_frequent")
X_train_imputed = imputer.fit_transform(X_train_clean)
X_val_imputed = imputer.transform(X_val_clean)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_val_scaled = scaler.transform(X_val_imputed)

# Train Logistic Regression
model_best.fit(X_train_scaled, y_train)

# Predictions
y_pred_val = model_best.predict(X_val_scaled)
y_prob_val = model_best.predict_proba(X_val_scaled)[:, 1]

### Decision Threshold Tuning

Here, we use the default decision threshold of 0.5

In [17]:
best_decision_threshold = 0.5

**Notes**

1. In the next notebook, multiple decision thresholds will be compared to eachother using the `f2_score` scoring metric to determine the best model. See [phase 2 from the metrics section of the project scope](https://github.com/edesz/credit-card-churn/blob/main/references/01_proposal.md#final-choice-of-metrics) for details about the choice of metric for tuning the threshold.

## Model Evaluation

Model evaluation uses the test data split.

Model evaluation is performed to evaluate the performance of the model on unseen data (test split). The best model is trained on the combined training and validation data. It is then used the make predictions of the test split and these predictions are scored. See [phase 3 from the metrics section of the project scope](https://github.com/edesz/credit-card-churn/blob/main/references/01_proposal.md#final-choice-of-metrics) for details about the choice of metric for model evaluation.

### Encoding Categorical Variables

In [18]:
X_train_val_clean["education_level"] = X_train_val_clean["education_level"].map(education_map)
X_train_val_clean = pd.get_dummies(
    X_train_val_clean,
    columns=["gender", "marital_status", "income_category", "card_category"],
    drop_first=True,
)

X_test_clean["education_level"] = X_test_clean["education_level"].map(education_map)
X_test_clean = pd.get_dummies(
    X_test_clean,
    columns=["gender", "marital_status", "income_category", "card_category"],
    drop_first=True,
)

### Model Training and Prediction

The best model is now trained on the combined train+validation data (`df_train_val`). It is then used to make predictions for the test split (`df_test`) and score these predictions.

In [19]:
# Impute missing values
imputer = SimpleImputer(strategy="most_frequent")
X_train_val_imputed = imputer.fit_transform(X_train_val_clean)
X_test_imputed = imputer.transform(X_test_clean)

# Scale features
scaler = StandardScaler()
X_train_val_scaled = scaler.fit_transform(X_train_val_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# Train best model
model_best.fit(X_train_val_scaled, y_train_val)

# Predictions
y_prob_test = model_best.predict_proba(X_test_scaled)[:, 1]
y_pred_test = (y_prob_test >= best_decision_threshold).astype(int)

# Evaluation
acc = accuracy_score(y_test, y_pred_test)
auc = roc_auc_score(y_test, y_prob_test)
report = classification_report(y_test, y_pred_test)
cm = confusion_matrix(y_test, y_pred_test)

acc, auc, cm, report

(0.8506281991624011,
 0.9323419775699734,
 array([[1535,  269],
        [  52,  293]]),
 '              precision    recall  f1-score   support\n\n         0.0       0.97      0.85      0.91      1804\n         1.0       0.52      0.85      0.65       345\n\n    accuracy                           0.85      2149\n   macro avg       0.74      0.85      0.78      2149\nweighted avg       0.90      0.85      0.86      2149\n')

### Conclusion

The model achieved an **accuracy of 85.1%** and an excellent **ROC-AUC of 93%**, showing it can effectively separate churners from non-churners. The confusion matrix reveals that it correctly identified most customers, including **85% of actual churners (high recall)**, which is crucial for customer retention. However, the **precision for churners is only 52%**, meaning nearly half of the customers flagged as churners are actually loyal. This trade-off is acceptable in churn prediction since it is better to mistakenly target some loyal customers with retention offers than to miss real churners. Overall, the model provides strong recall and good overall performance, though improving precision with advanced models like Random Forest or XGBoost could make it more practical for business use.

## Model Inference

Prepare model for inference using live data in production.

### Encoding Categorical Variables

In [20]:
X_all_clean["education_level"] = X_all_clean["education_level"].map(education_map)
X_all_clean = pd.get_dummies(
    X_all_clean,
    columns=["gender", "marital_status", "income_category", "card_category"],
    drop_first=True,
)

### Model Training and Prediction

The best model is now trained on all available data (train+validation+test data), so it can be used to make inference predictions on live data during production

In [21]:
# Impute missing values
imputer = SimpleImputer(strategy="most_frequent")
X_all_imputed = imputer.fit_transform(X_all_clean)

# Scale features
scaler = StandardScaler()
X_all_scaled = scaler.fit_transform(X_all_imputed)

# Train best model
model_best.fit(X_all_scaled, y_all)

# Predictions
y_prob_all = model_best.predict_proba(X_all_scaled)[:, 1]
y_pred_all = (y_prob_all >= best_decision_threshold).astype(int)

## Export Project Deliverables to Private R2 Bucket

Get the current timestamp in the format `YYmmdd_HHMMSS`

In [22]:
curr_timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

### Data Files with Churn Predictions

The following will be appended to the validation split (`df_val`) and test split (`df_test`)

1. ML model predictions (in the `y_pred` column)
2. ML model prediction probabilities (in the `y_pred_proba` column)
3. name of best model (in the `model_name` column)
4. best decision threshold ( in the `best_decision_threshold` column)

#### Validation Split

Append to the validation split (`df_val`)

In [23]:
%%time
df_val_pred = (
    df_val
    .assign(
        # 4.best model name
        model_name=best_model_name,
        # 1. ML model predictions
        y_pred_proba=pd.Series(
            y_prob_val, index=X_val_clean.index, dtype='float64[pyarrow]'
        ),
        # 2. ML model prediction probabilities
        y_pred=pd.Series(
            y_pred_val, index=X_val_clean.index, dtype='int16[pyarrow]'
        ),
        # 3. best decision threshold
        best_decision_threshold=best_decision_threshold,
    )
    .astype(
        {
            "model_name": 'category',
            'best_decision_threshold': 'float64[pyarrow]',
        }
    )
    .astype({k: 'category' for k in list(dtypes_categoricals)})
)
print(f"Got {len(df_val_pred):,} rows of validation split predictions")
with pd.option_context('display.max_columns', None):
    display(df_val_pred.head())
df_val_pred.info(memory_usage='deep')

Got 1,693 rows of validation split predictions


Unnamed: 0,clientnum,is_churned,customer_age,gender,dependent_count,education_level,marital_status,income_category,card_category,months_on_book,num_products,months_inactive_12_mon,contacts_count_12_mon,credit_limit,total_revolv_bal,avg_open_to_buy,total_amt_chng_q4_q1,total_trans_amt,total_trans_ct,total_ct_chng_q4_q1,avg_utilization_ratio,model_name,y_pred_proba,y_pred,best_decision_threshold
0,823840458,0,47,F,1,High School,Married,Unknown,Blue,43,4,2,2,1828.0,1517.0,311.0,0.661,4542.0,82,0.577,0.83,LogisticRegression,0.010016,0,0.5
1,716328558,0,46,M,2,Uneducated,Married,$80K - $120K,Blue,38,4,5,2,11434.0,0.0,11434.0,0.84,4520.0,83,0.844,0.0,LogisticRegression,0.073041,0,0.5
2,714735033,0,45,F,5,Doctorate,Married,Less than $40K,Blue,34,4,2,2,1438.3,491.0,947.3,0.708,4376.0,84,0.787,0.341,LogisticRegression,0.034725,0,0.5
3,712163433,1,47,M,3,Uneducated,Married,$40K - $60K,Blue,29,1,2,4,1684.0,644.0,1040.0,0.723,2164.0,36,0.714,0.382,LogisticRegression,0.914736,1,0.5
4,720307683,0,43,F,4,Graduate,Divorced,Less than $40K,Blue,36,3,2,2,1438.3,743.0,695.3,0.624,4484.0,78,0.625,0.517,LogisticRegression,0.17757,0,0.5


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1693 entries, 0 to 1692
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype          
---  ------                   --------------  -----          
 0   clientnum                1693 non-null   int32[pyarrow] 
 1   is_churned               1693 non-null   int8[pyarrow]  
 2   customer_age             1693 non-null   int8[pyarrow]  
 3   gender                   1693 non-null   category       
 4   dependent_count          1693 non-null   int8[pyarrow]  
 5   education_level          1693 non-null   category       
 6   marital_status           1693 non-null   category       
 7   income_category          1693 non-null   category       
 8   card_category            1693 non-null   category       
 9   months_on_book           1693 non-null   int16[pyarrow] 
 10  num_products             1693 non-null   int16[pyarrow] 
 11  months_inactive_12_mon   1693 non-null   int16[pyarrow] 
 12  contacts_count_12_mo

Next, export to a file in the R2 bucket with the following file name format `validation_predictions__logisticregression__<current-timestamp-YYmmdd_HHMMSS>.parquet.gzip`

In [24]:
%%time
export_df_to_r2(
    df_val_pred,
    bucket_name,
    (
        f"validation_predictions__{best_model_name.lower()}__"
        f"{curr_timestamp}.parquet.gzip"
    ),
)

Key validation_predictions__logisticregression__20251010_140733.parquet.gzip does not exist in bucket cc-churn-splits
Exported 1,693 rows to key: validation_predictions__logisticregression__20251010_140733.parquet.gzip
CPU times: user 486 ms, sys: 270 μs, total: 486 ms
Wall time: 333 ms


#### Test Split

Append to the test split (`df_test`)

In [25]:
%%time
df_test_pred = (
    df_test
    .assign(
        # 4.best model name
        model_name=best_model_name,
        # 1. ML model predictions
        y_pred_proba=pd.Series(
            y_prob_test, index=X_test_clean.index, dtype='float64[pyarrow]'
        ),
        # 2. ML model prediction probabilities
        y_pred=pd.Series(
            y_pred_test, index=X_test_clean.index, dtype='int16[pyarrow]'
        ),
        # 3. best decision threshold
        best_decision_threshold=best_decision_threshold,
    )
    .astype(
        {
            "model_name": 'category',
            'best_decision_threshold': 'float64[pyarrow]',
        }
    )
    .astype({k: 'category' for k in list(dtypes_categoricals)})
)
print(f"Got {len(df_test_pred):,} rows of test split predictions")
with pd.option_context('display.max_columns', None):
    display(df_test_pred.head())
df_test_pred.info(memory_usage='deep')

Got 2,149 rows of test split predictions


Unnamed: 0,clientnum,is_churned,customer_age,gender,dependent_count,education_level,marital_status,income_category,card_category,months_on_book,num_products,months_inactive_12_mon,contacts_count_12_mon,credit_limit,total_revolv_bal,avg_open_to_buy,total_amt_chng_q4_q1,total_trans_amt,total_trans_ct,total_ct_chng_q4_q1,avg_utilization_ratio,model_name,y_pred_proba,y_pred,best_decision_threshold
0,708223383,1,47,F,3,High School,Married,Unknown,Blue,39,6,3,4,11410.0,979.0,10431.0,1.049,2736.0,38,0.462,0.086,LogisticRegression,0.906784,1,0.5
1,715052583,0,45,M,3,Doctorate,Single,$60K - $80K,Silver,34,3,3,1,27494.0,879.0,26615.0,0.671,14375.0,112,0.672,0.032,LogisticRegression,0.182446,0,0.5
2,719910333,0,43,F,3,Unknown,Unknown,Less than $40K,Blue,36,4,4,2,5853.0,1190.0,4663.0,0.936,3595.0,80,0.667,0.203,LogisticRegression,0.081685,0,0.5
3,785328408,0,37,F,4,Graduate,Married,Less than $40K,Blue,31,5,2,3,1758.0,1180.0,578.0,0.744,2013.0,41,0.367,0.671,LogisticRegression,0.685992,1,0.5
4,716321583,0,50,F,3,Unknown,Single,$40K - $60K,Blue,36,5,3,3,12740.0,1173.0,11567.0,1.04,3441.0,55,0.719,0.092,LogisticRegression,0.466255,0,0.5


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2149 entries, 0 to 2148
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype          
---  ------                   --------------  -----          
 0   clientnum                2149 non-null   int32[pyarrow] 
 1   is_churned               2149 non-null   int8[pyarrow]  
 2   customer_age             2149 non-null   int8[pyarrow]  
 3   gender                   2149 non-null   category       
 4   dependent_count          2149 non-null   int8[pyarrow]  
 5   education_level          2149 non-null   category       
 6   marital_status           2149 non-null   category       
 7   income_category          2149 non-null   category       
 8   card_category            2149 non-null   category       
 9   months_on_book           2149 non-null   int16[pyarrow] 
 10  num_products             2149 non-null   int16[pyarrow] 
 11  months_inactive_12_mon   2149 non-null   int16[pyarrow] 
 12  contacts_count_12_mo

Next, export to a file in the R2 bucket with the following file name format `test_predictions__logisticregression__<current-timestamp-YYmmdd_HHMMSS>.parquet.gzip`

In [26]:
%%time
export_df_to_r2(
    df_test_pred,
    bucket_name,
    (
        f"test_predictions__{best_model_name.lower()}__"
        f"{curr_timestamp}.parquet.gzip"
    ),
)

Key test_predictions__logisticregression__20251010_140733.parquet.gzip does not exist in bucket cc-churn-splits
Exported 2,149 rows to key: test_predictions__logisticregression__20251010_140733.parquet.gzip
CPU times: user 36.1 ms, sys: 1.04 ms, total: 37.2 ms
Wall time: 353 ms


#### All Available Data

Append to all available data

In [27]:
%%time
df_all_pred = (
    pd.concat([df_train, df_val, df_test])
    .assign(
        # 4.best model name
        model_name=best_model_name,
        # 1. ML model predictions
        y_pred_proba=pd.Series(
            y_prob_all, index=X_all_clean.index, dtype='float64[pyarrow]'
        ),
        # 2. ML model prediction probabilities
        y_pred=pd.Series(
            y_pred_all, index=X_all_clean.index, dtype='int16[pyarrow]'
        ),
        # 3. best decision threshold
        best_decision_threshold=best_decision_threshold,
    )
    .astype(
        {
            "model_name": 'category',
            'best_decision_threshold': 'float64[pyarrow]',
        }
    )
    .astype({k: 'category' for k in list(dtypes_categoricals)})
)
print(f"Got {len(df_all_pred):,} rows of predictions on all available data")
with pd.option_context('display.max_columns', None):
    display(df_all_pred.head())
df_all_pred.info(memory_usage='deep')

Got 10,127 rows of predictions on all available data


Unnamed: 0,clientnum,is_churned,customer_age,gender,dependent_count,education_level,marital_status,income_category,card_category,months_on_book,num_products,months_inactive_12_mon,contacts_count_12_mon,credit_limit,total_revolv_bal,avg_open_to_buy,total_amt_chng_q4_q1,total_trans_amt,total_trans_ct,total_ct_chng_q4_q1,avg_utilization_ratio,model_name,y_pred_proba,y_pred,best_decision_threshold
0,714283458,0,40,M,2,College,Single,$80K - $120K,Blue,36,5,1,4,14544.0,0.0,14544.0,0.768,4064.0,92,0.769,0.0,LogisticRegression,0.011299,0,0.5
1,787587033,0,42,F,4,Graduate,Single,Less than $40K,Blue,32,3,1,2,2996.0,1992.0,1004.0,0.948,4463.0,87,0.74,0.665,LogisticRegression,0.005139,0,0.5
2,714672933,0,52,F,3,Graduate,Married,$40K - $60K,Blue,36,6,4,2,3143.0,2268.0,875.0,0.801,4417.0,84,0.68,0.722,LogisticRegression,0.006612,0,0.5
3,714974658,0,48,F,4,College,Married,Less than $40K,Blue,36,6,1,4,2464.0,1867.0,597.0,0.6,1219.0,35,1.333,0.758,LogisticRegression,0.134924,0,0.5
4,712049208,0,56,M,3,Post-Graduate,Married,$60K - $80K,Blue,39,3,1,2,3955.0,2517.0,1438.0,0.484,1238.0,25,1.083,0.636,LogisticRegression,0.248991,0,0.5


<class 'pandas.core.frame.DataFrame'>
Index: 10127 entries, 0 to 2148
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype          
---  ------                   --------------  -----          
 0   clientnum                10127 non-null  int32[pyarrow] 
 1   is_churned               10127 non-null  int8[pyarrow]  
 2   customer_age             10127 non-null  int8[pyarrow]  
 3   gender                   10127 non-null  category       
 4   dependent_count          10127 non-null  int8[pyarrow]  
 5   education_level          10127 non-null  category       
 6   marital_status           10127 non-null  category       
 7   income_category          10127 non-null  category       
 8   card_category            10127 non-null  category       
 9   months_on_book           10127 non-null  int16[pyarrow] 
 10  num_products             10127 non-null  int16[pyarrow] 
 11  months_inactive_12_mon   10127 non-null  int16[pyarrow] 
 12  contacts_count_12_mon   

Next, export to a file in the R2 bucket with the following file name format `all_predictions__logisticregression__<current-timestamp-YYmmdd_HHMMSS>.parquet.gzip`

In [28]:
%%time
export_df_to_r2(
    df_all_pred,
    bucket_name,
    (
        f"all_predictions__{best_model_name.lower()}__"
        f"{curr_timestamp}.parquet.gzip"
    ),
)

Key all_predictions__logisticregression__20251010_140733.parquet.gzip does not exist in bucket cc-churn-splits
Exported 10,127 rows to key: all_predictions__logisticregression__20251010_140733.parquet.gzip
CPU times: user 67.6 ms, sys: 4.93 ms, total: 72.5 ms
Wall time: 413 ms


### Best Trained ML Model Object

#### Trained on Combined Training and Validataion Data (Train+Validation)

Train the best model on the combined training and validation data (`X_train_val_scaled` and `y_train_val`)

In [29]:
%%time
model_best.fit(X_train_val_scaled, y_train_val)

CPU times: user 65.1 ms, sys: 2.77 ms, total: 67.8 ms
Wall time: 20.4 ms


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'lbfgs'
,max_iter,1000


Export to `.joblib` file in R2 bucket

In [30]:
%%time
with tempfile.TemporaryFile() as fp:
    # Dump the object to the in-memory file
    joblib.dump(model_best, fp)

    # Seek to the beginning of the file before uploading
    fp.seek(0)

    # Upload the in-memory file to our team's R2 bucket
    s3_client.put_object(
        Body=fp.read(),
        Bucket=bucket_name,
        Key=(
            f"best_model__{best_model_name}__train_val__{curr_timestamp}.joblib"
        ),
    )

CPU times: user 851 ms, sys: 1.12 ms, total: 852 ms
Wall time: 434 ms


#### Trained on All Available Data (Train+Validation+Test)

Train the best model on all available data (`X_all_scaled` and `y_all`)

In [31]:
%%time
model_best.fit(X_all_scaled, y_all)

CPU times: user 90.4 ms, sys: 2.81 ms, total: 93.2 ms
Wall time: 22.9 ms


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'lbfgs'
,max_iter,1000


Export to `.joblib` file in R2 bucket

In [32]:
%%time
with tempfile.TemporaryFile() as fp:
    # Dump the object to the in-memory file
    joblib.dump(model_best, fp)

    # Seek to the beginning of the file before uploading
    fp.seek(0)

    # Upload the in-memory file to our team's R2 bucket
    s3_client.put_object(
        Body=fp.read(),
        Bucket=bucket_name,
        Key=(
            f"best_model__{best_model_name}__all__{curr_timestamp}.joblib"
        ),
    )

CPU times: user 854 ms, sys: 378 μs, total: 854 ms
Wall time: 245 ms
