# Model Development - `LogisticRegression`

In [1]:
import os
import tempfile
from datetime import datetime
from io import BytesIO, StringIO
from pathlib import Path

import boto3
import botocore.exceptions
import joblib
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix, roc_auc_score
)
from sklearn.preprocessing import StandardScaler

In [2]:
PROJ_ROOT = Path.cwd().parent

In [3]:
assert load_dotenv(dotenv_path=PROJ_ROOT.parent / '.env')

## About

Machine Learning (ML) model training using `LogisticRegression`.

### Outputs

Based on the project deliverables in [this project's scoping document](https://github.com/edesz/credit-card-churn/blob/main/references/01_proposal.md#project-deliverables), this notebook produces following outputs

1. The best trained ML model is used to make predictions on the
   - validation data split and these predictions and probabilities are exported to a file. These predictions will be used in the next step to calculate business metrics on the validation split.
   - test data split these predictions and probabilities are also exported to a file in order to calculate business metrics on the unseen data (test split).
2. The best ML model object is exported after training it on
   - combined train and validation data
   - all available data

## User Inputs

In [4]:
# R2 data bucket details
bucket_name = 'cc-churn-splits'
# # name of train data key (file) in private R2 bucket
r2_key_train = 'train_data.parquet.gzip'
# # name of validation data key (file) in private R2 bucket
r2_key_val = 'validation_data.parquet.gzip'
# # name of test data key (file) in private R2 bucket
r2_key_test = 'test_data.parquet.gzip'

# datatypes for categorical columns
dtypes_categoricals = {
    "gender": 'string[pyarrow]',
    "marital_status": 'string[pyarrow]',
    "income_category": 'string[pyarrow]',
    "card_category": 'string[pyarrow]',
    "education_level": 'string[pyarrow]',
}

label = 'is_churned'

In [5]:
account_id = os.getenv('ACCOUNT_ID')
access_key_id = os.getenv('ACCESS_KEY_ID_USER2')
secret_access_key = os.getenv('SECRET_ACCESS_KEY_USER2')

s3_client = boto3.client(
    's3',
    endpoint_url=f'https://{account_id}.r2.cloudflarestorage.com',
    aws_access_key_id=access_key_id,
    aws_secret_access_key=secret_access_key,
    region_name='auto'
)

In [6]:
def pandas_read_parquet_r2(bucket_name, r2_key):
    """Read parquet file from private R2 bucket."""
    s3_object = s3_client.get_object(Bucket=bucket_name, Key=r2_key)
    df = pd.read_parquet(
        BytesIO(s3_object['Body'].read()), dtype_backend='pyarrow'
    )
    return df


def export_df_to_r2(df, bucket_name, r2_key):
    """Export DataFrame to file in private R2 bucket, if not present."""
    try:
        s3_client.head_object(Bucket=bucket_name, Key=r2_key)
        print(f"Key {r2_key} already exists in bucket {bucket_name}")
    except botocore.exceptions.ClientError as e:
        if e.response["Error"]["Code"] == "404":
            print(f"Key {r2_key} does not exist in bucket {bucket_name}")
            buffer = BytesIO()
            df.to_parquet(
                buffer,
                index=False,
                engine='pyarrow',
                compression='gzip',
            )
            response = s3_client.put_object(
                Bucket=bucket_name, Key=r2_key, Body=buffer.getvalue()
            )
            assert response['ResponseMetadata']['HTTPStatusCode'] == 200
            print(f"Exported {len(df):,} rows to key: {r2_key}")
        elif e.response["Error"]["Code"] == "403":
            print(f"Access denied to bucket {bucket_name} or key {r2_key}")
        else:
            print(f"An unexpected error occurred: {e}")

## Load Data

### Data for Model Validation

Load the training data

In [7]:
%%time
df_train = (
    pandas_read_parquet_r2(bucket_name, r2_key_train)
    .astype(dtypes_categoricals)
)
print(f"Loaded {len(df_train):,} rows of training data")
with pd.option_context('display.max_columns', None):
    display(df_train.head())

Loaded 6,982 rows of training data


Unnamed: 0,clientnum,is_churned,customer_age,gender,dependent_count,education_level,marital_status,income_category,card_category,months_on_book,num_products,months_inactive_12_mon,contacts_count_12_mon,credit_limit,total_revolv_bal,avg_open_to_buy,total_amt_chng_q4_q1,total_trans_amt,total_trans_ct,total_ct_chng_q4_q1,avg_utilization_ratio
0,710115108,0,45,F,2,Graduate,Unknown,$40K - $60K,Blue,37,5,2,3,5562.0,1556.0,4006.0,0.751,4568.0,76,0.767,0.28
1,713668158,1,31,F,1,Uneducated,Single,Unknown,Blue,36,2,3,3,21067.0,0.0,21067.0,0.995,9212.0,71,0.821,0.0
2,710180958,0,43,M,3,High School,Single,$40K - $60K,Blue,33,5,2,3,3040.0,2517.0,523.0,0.493,1598.0,31,0.476,0.828
3,711204483,1,56,M,4,High School,Married,$60K - $80K,Blue,46,3,3,3,2340.0,1930.0,410.0,0.987,781.0,15,0.364,0.825
4,712797258,0,43,M,3,Graduate,Married,$120K +,Blue,33,3,1,3,34516.0,2129.0,32387.0,0.587,1463.0,38,0.583,0.062


CPU times: user 58.3 ms, sys: 19 ms, total: 77.3 ms
Wall time: 372 ms


### Data for Model Evaluation

Load the validation data

In [8]:
%%time
df_val = (
    pandas_read_parquet_r2(bucket_name, r2_key_val)
    .astype(dtypes_categoricals)
)
print(f"Loaded {len(df_val):,} rows of validation data")
with pd.option_context('display.max_columns', None):
    display(df_val.head())

Loaded 1,427 rows of validation data


Unnamed: 0,clientnum,is_churned,customer_age,gender,dependent_count,education_level,marital_status,income_category,card_category,months_on_book,num_products,months_inactive_12_mon,contacts_count_12_mon,credit_limit,total_revolv_bal,avg_open_to_buy,total_amt_chng_q4_q1,total_trans_amt,total_trans_ct,total_ct_chng_q4_q1,avg_utilization_ratio
0,816086508,0,47,F,2,Unknown,Married,Less than $40K,Blue,42,1,3,3,1438.3,0.0,1438.3,0.677,4624.0,73,0.587,0.0
1,715749858,0,30,M,1,Unknown,Married,$40K - $60K,Blue,18,5,1,3,4726.0,1380.0,3346.0,0.622,1723.0,31,0.55,0.292
2,795991758,0,47,F,3,Unknown,Married,$40K - $60K,Blue,41,5,3,3,7553.0,660.0,6893.0,0.549,4079.0,67,0.457,0.087
3,771594783,1,56,F,1,College,Married,Unknown,Blue,44,2,4,4,1438.3,0.0,1438.3,0.859,2264.0,37,0.423,0.0
4,771502383,0,54,F,1,High School,Married,Less than $40K,Blue,42,3,5,4,2474.0,1539.0,935.0,0.649,4884.0,80,0.509,0.622


CPU times: user 56.9 ms, sys: 2.8 ms, total: 59.7 ms
Wall time: 184 ms


Get the combined training+validation data split

In [9]:
%%time
df_train_val = pd.concat([df_train, df_val], ignore_index=True)
print(f"Obtained {len(df_train_val):,} rows of training+validation data")
with pd.option_context('display.max_columns', None):
    display(df_train_val.head())

Obtained 8,409 rows of training+validation data


Unnamed: 0,clientnum,is_churned,customer_age,gender,dependent_count,education_level,marital_status,income_category,card_category,months_on_book,num_products,months_inactive_12_mon,contacts_count_12_mon,credit_limit,total_revolv_bal,avg_open_to_buy,total_amt_chng_q4_q1,total_trans_amt,total_trans_ct,total_ct_chng_q4_q1,avg_utilization_ratio
0,710115108,0,45,F,2,Graduate,Unknown,$40K - $60K,Blue,37,5,2,3,5562.0,1556.0,4006.0,0.751,4568.0,76,0.767,0.28
1,713668158,1,31,F,1,Uneducated,Single,Unknown,Blue,36,2,3,3,21067.0,0.0,21067.0,0.995,9212.0,71,0.821,0.0
2,710180958,0,43,M,3,High School,Single,$40K - $60K,Blue,33,5,2,3,3040.0,2517.0,523.0,0.493,1598.0,31,0.476,0.828
3,711204483,1,56,M,4,High School,Married,$60K - $80K,Blue,46,3,3,3,2340.0,1930.0,410.0,0.987,781.0,15,0.364,0.825
4,712797258,0,43,M,3,Graduate,Married,$120K +,Blue,33,3,1,3,34516.0,2129.0,32387.0,0.587,1463.0,38,0.583,0.062


CPU times: user 16 ms, sys: 1 ms, total: 17 ms
Wall time: 15.8 ms


Load the test data

In [10]:
%%time
df_test = (
    pandas_read_parquet_r2(bucket_name, r2_key_test)
    .astype(dtypes_categoricals)
)
print(f"Loaded {len(df_test):,} rows of test data")
with pd.option_context('display.max_columns', None):
    display(df_test)

Loaded 1,718 rows of test data


Unnamed: 0,clientnum,is_churned,customer_age,gender,dependent_count,education_level,marital_status,income_category,card_category,months_on_book,num_products,months_inactive_12_mon,contacts_count_12_mon,credit_limit,total_revolv_bal,avg_open_to_buy,total_amt_chng_q4_q1,total_trans_amt,total_trans_ct,total_ct_chng_q4_q1,avg_utilization_ratio
0,721418133,0,45,F,3,Graduate,Single,Less than $40K,Blue,33,6,2,3,2016.0,972.0,1044.0,0.685,4535.0,82,0.745,0.482
1,719513733,0,48,M,3,Doctorate,Married,$80K - $120K,Blue,41,3,3,4,8739.0,0.0,8739.0,0.715,4276.0,69,0.769,0.0
2,721043058,0,58,M,4,Unknown,Married,$60K - $80K,Blue,44,5,1,3,12010.0,2149.0,9861.0,0.801,1700.0,35,1.5,0.179
3,718435158,0,50,M,3,High School,Married,$120K +,Blue,40,5,1,4,34516.0,0.0,34516.0,0.986,1930.0,36,0.44,0.0
4,711336033,0,26,F,0,Unknown,Single,Unknown,Blue,36,3,3,2,7595.0,1032.0,6563.0,1.035,4080.0,72,0.8,0.136
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1713,719674533,0,48,M,3,Unknown,Single,$80K - $120K,Blue,44,5,1,3,16794.0,1527.0,15267.0,0.912,4040.0,76,0.689,0.091
1714,757024233,0,34,M,2,Graduate,Single,$120K +,Blue,27,3,1,4,31313.0,2152.0,29161.0,0.679,2555.0,62,0.676,0.069
1715,710784333,0,57,M,2,Unknown,Married,$120K +,Blue,36,3,2,2,23700.0,1314.0,22386.0,1.487,1644.0,28,0.647,0.055
1716,795613233,0,48,F,2,Graduate,Married,Unknown,Blue,42,4,2,0,10514.0,1494.0,9020.0,0.649,4949.0,76,0.81,0.142


CPU times: user 63.7 ms, sys: 869 μs, total: 64.6 ms
Wall time: 156 ms


## Separate Features from Target

In [11]:
# model validation
X_train = df_train.drop(columns=[label])
y_train = df_train[label]

X_val = df_val.drop(columns=[label])
y_val = df_val[label]

# model evaluation
X_train_val = df_train_val.drop(columns=[label])
y_train_val = df_train_val[label]

X_test = df_test.drop(columns=[label])
y_test = df_test[label]

# model inference
X_all = pd.concat([X_train, X_val, X_test])
y_all = pd.concat([y_train, y_val, y_test])

## Clean Data

In [12]:
# model validation
X_train_clean = X_train.copy()
X_val_clean = X_val.copy()

# model evaluation
X_train_val_clean = X_train_val.copy()
X_test_clean = X_test.copy()

# model inference
X_all_clean = X_all.copy()

### Handling Missing or Unknown Values

Several categorical columns, like Education_Level, Marital_Status, and Income_Category, contained the value "Unknown". This is a placeholder for missing or unavailable data. We replaced these with NaN to treat them as missing values properly. This allows us to later apply imputation techniques to fill them in appropriately. Treating unknowns as real values can distort model behavior, so identifying them as missing is a crucial cleaning step.

In [13]:
categorical_cols = ["education_level", "marital_status", "income_category"]

# model validation
X_train_clean[categorical_cols] = X_train_clean[categorical_cols].replace("Unknown", pd.NA)
X_val_clean[categorical_cols] = X_val_clean[categorical_cols].replace("Unknown", pd.NA)

# model evaluation
X_train_val_clean[categorical_cols] = X_train_val_clean[categorical_cols].replace("Unknown", pd.NA)
X_test_clean[categorical_cols] = X_test_clean[categorical_cols].replace("Unknown", pd.NA)

# model inference
X_all_clean[categorical_cols] = X_all_clean[categorical_cols].replace("Unknown", pd.NA)

## Model Validation

Model validation uses the validation data split.

Model validation is performed below using the training data split (`df_train`) to find the best model, decision threshold and (optional) hyperparameters.

### Encoding Categorical Variables

Machine learning models require all features to be numeric. I handled categorical data in two ways:

**Ordinal Encoding for Education_Level**: Since education has a natural order (e.g., High School < Graduate < Doctorate), we mapped it to integers from 0 to 5.

**One-Hot Encoding for Other Categories**: For nominal variables like Gender, Marital_Status, Income_Category, and Card_Category, we used one-hot encoding. This creates separate binary columns for each category, allowing the model to treat them independently without assuming any order.

This transformation makes the data fully numeric and model-friendly.

In [14]:
education_map = {
    "Uneducated": 0,
    "High School": 1,
    "College": 2,
    "Graduate": 3,
    "Post-Graduate": 4,
    "Doctorate": 5
}

X_train_clean["education_level"] = X_train_clean["education_level"].map(education_map)
X_train_clean = pd.get_dummies(
    X_train_clean,
    columns=["gender", "marital_status", "income_category", "card_category"],
    drop_first=True,
)

X_val_clean["education_level"] = X_val_clean["education_level"].map(education_map)
X_val_clean = pd.get_dummies(
    X_val_clean,
    columns=["gender", "marital_status", "income_category", "card_category"],
    drop_first=True,
)

### Model Selection

Here, we assume the best model is `LogisticRegression`

In [15]:
best_model_name = 'LogisticRegression'
model_best = LogisticRegression(
    max_iter=1000, class_weight="balanced", random_state=42
)

**Notes**

1. In the next notebook, multiple types of ML models will be compared to eachother using the `prauc` scoring metric to determine the best model. See [phase 1 from the metrics section of the project scope](https://github.com/edesz/credit-card-churn/blob/main/references/01_proposal.md#final-choice-of-metrics) for details about the choice of metric during model selection.

### Model Training and Prediction

As mentioned above, here we do not validate the model by comparing different types of models using cross-validation on the training data (`X_train_scaled` and `y_train`). Instead, we just use the model to make predictions for the validation split (`df_val`)

In [16]:
# Impute missing values
imputer = SimpleImputer(strategy="most_frequent")
X_train_imputed = imputer.fit_transform(X_train_clean)
X_val_imputed = imputer.transform(X_val_clean)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_val_scaled = scaler.transform(X_val_imputed)

# Train Logistic Regression
model_best.fit(X_train_scaled, y_train)

# Predictions
y_pred_val = model_best.predict(X_val_scaled)
y_prob_val = model_best.predict_proba(X_val_scaled)[:, 1]

### Decision Threshold Tuning

Here, we use the default decision threshold of 0.5

In [17]:
best_decision_threshold = 0.5

**Notes**

1. In the next notebook, multiple decision thresholds will be compared to eachother using the `f2_score` scoring metric to determine the best model. See [phase 2 from the metrics section of the project scope](https://github.com/edesz/credit-card-churn/blob/main/references/01_proposal.md#final-choice-of-metrics) for details about the choice of metric for tuning the threshold.

## Model Evaluation

Model evaluation uses the test data split.

Model evaluation is performed to evaluate the performance of the model on unseen data (test split). The best model is trained on the combined training and validation data. It is then used the make predictions of the test split and these predictions are scored. See [phase 3 from the metrics section of the project scope](https://github.com/edesz/credit-card-churn/blob/main/references/01_proposal.md#final-choice-of-metrics) for details about the choice of metric for model evaluation.

### Encoding Categorical Variables

In [18]:
X_train_val_clean["education_level"] = X_train_val_clean["education_level"].map(education_map)
X_train_val_clean = pd.get_dummies(
    X_train_val_clean,
    columns=["gender", "marital_status", "income_category", "card_category"],
    drop_first=True,
)

X_test_clean["education_level"] = X_test_clean["education_level"].map(education_map)
X_test_clean = pd.get_dummies(
    X_test_clean,
    columns=["gender", "marital_status", "income_category", "card_category"],
    drop_first=True,
)

### Model Training and Prediction

The best model is now trained on the combined train+validation data (`df_train_val`). It is then used to make predictions for the test split (`df_test`) and score these predictions.

In [19]:
# Impute missing values
imputer = SimpleImputer(strategy="most_frequent")
X_train_val_imputed = imputer.fit_transform(X_train_val_clean)
X_test_imputed = imputer.transform(X_test_clean)

# Scale features
scaler = StandardScaler()
X_train_val_scaled = scaler.fit_transform(X_train_val_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# Train best model
model_best.fit(X_train_val_scaled, y_train_val)

# Predictions
y_pred_test = model_best.predict(X_test_scaled)
y_prob_test = model_best.predict_proba(X_test_scaled)[:, 1]

# Evaluation
acc = accuracy_score(y_test, y_pred_test)
auc = roc_auc_score(y_test, y_prob_test)
report = classification_report(y_test, y_pred_test)
cm = confusion_matrix(y_test, y_pred_test)

acc, auc, cm, report

(0.8544819557625145,
 0.9300186938430923,
 array([[1238,  204],
        [  46,  230]]),
 '              precision    recall  f1-score   support\n\n         0.0       0.96      0.86      0.91      1442\n         1.0       0.53      0.83      0.65       276\n\n    accuracy                           0.85      1718\n   macro avg       0.75      0.85      0.78      1718\nweighted avg       0.89      0.85      0.87      1718\n')

### Conclusion

The model achieved an **accuracy of 85.4%** and an excellent **ROC-AUC of 93%**, showing it can effectively separate churners from non-churners. The confusion matrix reveals that it correctly identified most customers, including **83% of actual churners (high recall)**, which is crucial for customer retention. However, the **precision for churners is only 53%**, meaning nearly half of the customers flagged as churners are actually loyal. This trade-off is acceptable in churn prediction since it is better to mistakenly target some loyal customers with retention offers than to miss real churners. Overall, the model provides strong recall and good overall performance, though improving precision with advanced models like Random Forest or XGBoost could make it more practical for business use.

## Model Inference

Prepare model for inference using live data in production.

### Encoding Categorical Variables

In [20]:
X_all_clean["education_level"] = X_all_clean["education_level"].map(education_map)
X_all_clean = pd.get_dummies(
    X_all_clean,
    columns=["gender", "marital_status", "income_category", "card_category"],
    drop_first=True,
)

### Model Training and Prediction

The best model is now trained on all available data train+validation+test data, so it can be used to make inference predictions on live data during production

In [21]:
# Impute missing values
imputer = SimpleImputer(strategy="most_frequent")
X_all_imputed = imputer.fit_transform(X_all_clean)

# Scale features
scaler = StandardScaler()
X_all_scaled = scaler.fit_transform(X_all_imputed)

# Train best model
model_best.fit(X_all_scaled, y_all)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'lbfgs'
,max_iter,1000


## Export Project Deliverables to Private R2 Bucket

Get the current timestamp in the format `YYmmdd_HHMMSS`

In [22]:
curr_timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

### Data Files with Churn Predictions

The following will be appended to the validation split (`df_val`) and test split (`df_test`)

1. ML model predictions (in the `y_pred` column)
2. ML model prediction probabilities (in the `y_pred_proba` column)
3. name of best model (in the `model_name` column)
4. best decision threshold ( in the `best_decision_threshold` column)

#### Validation Split

Append to the validation split (`df_val`)

In [23]:
%%time
df_val_pred = (
    df_val
    .assign(
        # 4.best model name
        model_name=best_model_name,
        # 1. ML model predictions
        y_pred_proba=pd.Series(
            y_prob_val, index=X_val_clean.index, dtype='float64[pyarrow]'
        ),
        # 2. ML model prediction probabilities
        y_pred=pd.Series(
            y_pred_val, index=X_val_clean.index, dtype='int16[pyarrow]'
        ),
        # 3. best decision threshold
        best_decision_threshold=best_decision_threshold,
    )
    .astype(
        {
            "model_name": 'category',
            'best_decision_threshold': 'float64[pyarrow]',
        }
    )
    .astype({k: 'category' for k in list(dtypes_categoricals)})
)
print(f"Got {len(df_val_pred):,} rows of validation split predictions")
with pd.option_context('display.max_columns', None):
    display(df_val_pred.head())
df_val_pred.info(memory_usage='deep')

Got 1,427 rows of validation split predictions


Unnamed: 0,clientnum,is_churned,customer_age,gender,dependent_count,education_level,marital_status,income_category,card_category,months_on_book,num_products,months_inactive_12_mon,contacts_count_12_mon,credit_limit,total_revolv_bal,avg_open_to_buy,total_amt_chng_q4_q1,total_trans_amt,total_trans_ct,total_ct_chng_q4_q1,avg_utilization_ratio,model_name,y_pred_proba,y_pred,best_decision_threshold
0,816086508,0,47,F,2,Unknown,Married,Less than $40K,Blue,42,1,3,3,1438.3,0.0,1438.3,0.677,4624.0,73,0.587,0.0,LogisticRegression,0.565682,1,0.5
1,715749858,0,30,M,1,Unknown,Married,$40K - $60K,Blue,18,5,1,3,4726.0,1380.0,3346.0,0.622,1723.0,31,0.55,0.292,LogisticRegression,0.56938,1,0.5
2,795991758,0,47,F,3,Unknown,Married,$40K - $60K,Blue,41,5,3,3,7553.0,660.0,6893.0,0.549,4079.0,67,0.457,0.087,LogisticRegression,0.273788,0,0.5
3,771594783,1,56,F,1,College,Married,Unknown,Blue,44,2,4,4,1438.3,0.0,1438.3,0.859,2264.0,37,0.423,0.0,LogisticRegression,0.989876,1,0.5
4,771502383,0,54,F,1,High School,Married,Less than $40K,Blue,42,3,5,4,2474.0,1539.0,935.0,0.649,4884.0,80,0.509,0.622,LogisticRegression,0.311991,0,0.5


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1427 entries, 0 to 1426
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype          
---  ------                   --------------  -----          
 0   clientnum                1427 non-null   int32[pyarrow] 
 1   is_churned               1427 non-null   int8[pyarrow]  
 2   customer_age             1427 non-null   int8[pyarrow]  
 3   gender                   1427 non-null   category       
 4   dependent_count          1427 non-null   int8[pyarrow]  
 5   education_level          1427 non-null   category       
 6   marital_status           1427 non-null   category       
 7   income_category          1427 non-null   category       
 8   card_category            1427 non-null   category       
 9   months_on_book           1427 non-null   int16[pyarrow] 
 10  num_products             1427 non-null   int16[pyarrow] 
 11  months_inactive_12_mon   1427 non-null   int16[pyarrow] 
 12  contacts_count_12_mo

Next, export to a file in the R2 bucket with the following file name format `validation_predictions__logisticregression__<current-timestamp-YYmmdd_HHMMSS>.parquet.gzip`

In [24]:
%%time
export_df_to_r2(
    df_val_pred,
    bucket_name,
    (
        f"validation_predictions__{best_model_name.lower()}__"
        f"{curr_timestamp}.parquet.gzip"
    ),
)

Key validation_predictions__logisticregression__20251007_111315.parquet.gzip does not exist in bucket cc-churn-splits
Exported 1,427 rows to key: validation_predictions__logisticregression__20251007_111315.parquet.gzip
CPU times: user 454 ms, sys: 2.34 ms, total: 456 ms
Wall time: 453 ms


#### Test Split

Append to the test split (`df_test`)

In [25]:
%%time
df_test_pred = (
    df_test
    .assign(
        # 4.best model name
        model_name=best_model_name,
        # 1. ML model predictions
        y_pred_proba=pd.Series(
            y_prob_test, index=X_test_clean.index, dtype='float64[pyarrow]'
        ),
        # 2. ML model prediction probabilities
        y_pred=pd.Series(
            y_pred_test, index=X_test_clean.index, dtype='int16[pyarrow]'
        ),
        # 3. best decision threshold
        best_decision_threshold=best_decision_threshold,
    )
    .astype(
        {
            "model_name": 'category',
            'best_decision_threshold': 'float64[pyarrow]',
        }
    )
    .astype({k: 'category' for k in list(dtypes_categoricals)})
)
print(f"Got {len(df_test_pred):,} rows of test split predictions")
with pd.option_context('display.max_columns', None):
    display(df_test_pred.head())
df_test_pred.info(memory_usage='deep')

Got 1,718 rows of test split predictions


Unnamed: 0,clientnum,is_churned,customer_age,gender,dependent_count,education_level,marital_status,income_category,card_category,months_on_book,num_products,months_inactive_12_mon,contacts_count_12_mon,credit_limit,total_revolv_bal,avg_open_to_buy,total_amt_chng_q4_q1,total_trans_amt,total_trans_ct,total_ct_chng_q4_q1,avg_utilization_ratio,model_name,y_pred_proba,y_pred,best_decision_threshold
0,721418133,0,45,F,3,Graduate,Single,Less than $40K,Blue,33,6,2,3,2016.0,972.0,1044.0,0.685,4535.0,82,0.745,0.482,LogisticRegression,0.028622,0,0.5
1,719513733,0,48,M,3,Doctorate,Married,$80K - $120K,Blue,41,3,3,4,8739.0,0.0,8739.0,0.715,4276.0,69,0.769,0.0,LogisticRegression,0.495681,0,0.5
2,721043058,0,58,M,4,Unknown,Married,$60K - $80K,Blue,44,5,1,3,12010.0,2149.0,9861.0,0.801,1700.0,35,1.5,0.179,LogisticRegression,0.042856,0,0.5
3,718435158,0,50,M,3,High School,Married,$120K +,Blue,40,5,1,4,34516.0,0.0,34516.0,0.986,1930.0,36,0.44,0.0,LogisticRegression,0.82207,1,0.5
4,711336033,0,26,F,0,Unknown,Single,Unknown,Blue,36,3,3,2,7595.0,1032.0,6563.0,1.035,4080.0,72,0.8,0.136,LogisticRegression,0.151568,0,0.5


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1718 entries, 0 to 1717
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype          
---  ------                   --------------  -----          
 0   clientnum                1718 non-null   int32[pyarrow] 
 1   is_churned               1718 non-null   int8[pyarrow]  
 2   customer_age             1718 non-null   int8[pyarrow]  
 3   gender                   1718 non-null   category       
 4   dependent_count          1718 non-null   int8[pyarrow]  
 5   education_level          1718 non-null   category       
 6   marital_status           1718 non-null   category       
 7   income_category          1718 non-null   category       
 8   card_category            1718 non-null   category       
 9   months_on_book           1718 non-null   int16[pyarrow] 
 10  num_products             1718 non-null   int16[pyarrow] 
 11  months_inactive_12_mon   1718 non-null   int16[pyarrow] 
 12  contacts_count_12_mo

Next, export to a file in the R2 bucket with the following file name format `test_predictions__logisticregression__<current-timestamp-YYmmdd_HHMMSS>.parquet.gzip`

In [26]:
%%time
export_df_to_r2(
    df_test_pred,
    bucket_name,
    (
        f"test_predictions__{best_model_name.lower()}__"
        f"{curr_timestamp}.parquet.gzip"
    ),
)

Key test_predictions__logisticregression__20251007_111315.parquet.gzip does not exist in bucket cc-churn-splits
Exported 1,718 rows to key: test_predictions__logisticregression__20251007_111315.parquet.gzip
CPU times: user 34.6 ms, sys: 1.05 ms, total: 35.6 ms
Wall time: 618 ms


### Best Trained ML Model Object

#### Trained on Combined Training and Validataion Data (Train+Validation)

Train the best model on the combined training and validation data (`X_train_val_scaled` and `y_train_val`)

In [27]:
%%time
model_best.fit(X_train_val_scaled, y_train_val)

CPU times: user 66.3 ms, sys: 0 ns, total: 66.3 ms
Wall time: 15.2 ms


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'lbfgs'
,max_iter,1000


Export to `.joblib` file in R2 bucket

In [28]:
%%time
with tempfile.TemporaryFile() as fp:
    # Dump the object to the in-memory file
    joblib.dump(model_best, fp)

    # Seek to the beginning of the file before uploading
    fp.seek(0)

    # Upload the in-memory file to our team's R2 bucket
    s3_client.put_object(
        Body=fp.read(),
        Bucket=bucket_name,
        Key=(
            f"best_model__{best_model_name}__train_val__{curr_timestamp}.joblib"
        ),
    )

CPU times: user 855 ms, sys: 4.84 ms, total: 860 ms
Wall time: 238 ms


#### Trained on All Available Data (Train+Validation+Test)

Train the best model on all available data (`X_all_scaled` and `y_all`)

In [29]:
%%time
model_best.fit(X_all_scaled, y_all)

CPU times: user 88.1 ms, sys: 1.89 ms, total: 90 ms
Wall time: 20.6 ms


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'lbfgs'
,max_iter,1000


Export to `.joblib` file in R2 bucket

In [30]:
%%time
with tempfile.TemporaryFile() as fp:
    # Dump the object to the in-memory file
    joblib.dump(model_best, fp)

    # Seek to the beginning of the file before uploading
    fp.seek(0)

    # Upload the in-memory file to our team's R2 bucket
    s3_client.put_object(
        Body=fp.read(),
        Bucket=bucket_name,
        Key=(
            f"best_model__{best_model_name}__all__{curr_timestamp}.joblib"
        ),
    )

CPU times: user 861 ms, sys: 1.06 ms, total: 862 ms
Wall time: 233 ms
