In [11]:
import pandas as pd
import numpy as np
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.xgboost import XGBoost
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns

# Setup: create the SageMaker session, then resolve the execution role and the
# session's default bucket that later cells refer to as `role` and `bucket`.
sagemaker_session = sagemaker.Session()
role = get_execution_role()
bucket = sagemaker_session.default_bucket()

print(f"SageMaker role: {role}")
print(f"S3 bucket: {bucket}")

SageMaker role: arn:aws:iam::158878148642:role/ChurnPredictorSageMakerRole
S3 bucket: sagemaker-us-east-1-158878148642


In [12]:
import boto3

# Initialize S3 client
s3 = boto3.client('s3')

# Location of the feature-engineered data
bucket_name = 'churn-predictor-bucket'
prefix = 'processed/featured_data/'

print("Files in the processed data folder:")

# Walk every result page: one list_objects_v2 call returns at most 1000 keys,
# so a one-shot listing can miss the CSV when the prefix holds many objects.
csv_files = []
paginator = s3.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
    for obj in page.get('Contents', []):
        filename = obj['Key']
        if filename.endswith('.csv'):
            csv_files.append(filename)
            print(f"Found CSV: {filename}")

if not csv_files:
    # Every later cell needs `df`, so stop here with a clear error instead of
    # printing a message and failing further down with a NameError.
    raise FileNotFoundError(f"No CSV files found under s3://{bucket_name}/{prefix}")

if len(csv_files) > 1:
    # Only one file is read below; make a silent partial load visible.
    print(f"\nWARNING: {len(csv_files)} CSV files found; only the first one is loaded")

# Use the first (or only) CSV file
csv_file_path = f"s3://{bucket_name}/{csv_files[0]}"
print(f"\nReading file: {csv_file_path}")

# Read the CSV
df = pd.read_csv(csv_file_path)

print(f"Data shape: {df.shape}")
print(f"Columns ({len(df.columns)}): {list(df.columns)}")
print(f"\nFirst few rows:")
print(df.head())

# Check for our engineered features; present and missing columns get distinct
# markers so the two cases can be told apart in the output.
engineered_features = ['churn_binary', 'tenure_years', 'charges_per_tenure', 'total_services']
print(f"\nEngineered features present:")
for feature in engineered_features:
    marker = "[present]" if feature in df.columns else "[MISSING]"
    print(f" {marker} {feature}")

Files in the processed data folder:
Found CSV: processed/featured_data/part-00000-f645e2f0-0a7c-40ec-809b-8ba1df393048-c000.csv

Reading file: s3://churn-predictor-bucket/processed/featured_data/part-00000-f645e2f0-0a7c-40ec-809b-8ba1df393048-c000.csv
Data shape: (7043, 33)
Columns (33): ['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn', 'churn_binary', 'tenure_years', 'charges_per_tenure', 'avg_monthly_charges', 'total_services', 'service_adoption_rate', 'high_risk_payment', 'month_to_month_risk', 'has_fiber_optic', 'is_senior_citizen', 'fiber_high_charges', 'senior_short_tenure']

First few rows:
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     

In [13]:
# Data preprocessing for training
print("=== DATA PREPROCESSING ===")

# Shape and target balance
print(f"Dataset shape: {df.shape}")
print(f"Target distribution:")
print(df['churn_binary'].value_counts())
print(f"Churn rate: {df['churn_binary'].mean():.3f}")

# Total count of missing cells across the whole frame
print(f"\nMissing values:")
print(df.isnull().sum().sum())

# Partition columns by dtype. The encoded target is dropped from the numeric
# list; the raw target and the customer identifier are dropped from the
# categorical list.
categorical_cols = [
    name for name in df.select_dtypes(include=['object']).columns.tolist()
    if name not in ('Churn', 'customerID')
]
numerical_cols = [
    name for name in df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    if name != 'churn_binary'
]

print(f"\nCategorical columns ({len(categorical_cols)}): {categorical_cols}")
print(f"Numerical columns ({len(numerical_cols)}): {numerical_cols}")

=== DATA PREPROCESSING ===
Dataset shape: (7043, 33)
Target distribution:
churn_binary
0    5174
1    1869
Name: count, dtype: int64
Churn rate: 0.265

Missing values:
0

Categorical columns (15): ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
Numerical columns (15): ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges', 'tenure_years', 'charges_per_tenure', 'avg_monthly_charges', 'total_services', 'service_adoption_rate', 'high_risk_payment', 'month_to_month_risk', 'has_fiber_optic', 'is_senior_citizen', 'fiber_high_charges', 'senior_short_tenure']


In [14]:
# Prepare features for machine learning
from sklearn.preprocessing import LabelEncoder

# Work on a copy so the loaded frame `df` is left untouched
df_ml = df.copy()

# Integer-encode every categorical column that exists in the frame, keeping
# the fitted encoder for each column in `label_encoders`.
label_encoders = {}
for col in (name for name in categorical_cols if name in df_ml.columns):
    encoder = LabelEncoder()
    df_ml[col] = encoder.fit_transform(df_ml[col].astype(str))
    label_encoders[col] = encoder

# Features are all columns except the ID, the original target and the
# encoded target.
exclude_cols = ['customerID', 'Churn']
non_feature_cols = set(exclude_cols) | {'churn_binary'}
feature_cols = [name for name in df_ml.columns if name not in non_feature_cols]

y = df_ml['churn_binary']
X = df_ml[feature_cols]

print(f"Features for training ({len(feature_cols)}): {feature_cols}")
print(f"Feature matrix shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Final sanity summary
print(f"\nFinal dataset info:")
print(f"Features: {X.shape[1]} columns, {X.shape[0]} rows")
print(f"Target distribution: {y.value_counts().to_dict()}")

Features for training (30): ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'tenure_years', 'charges_per_tenure', 'avg_monthly_charges', 'total_services', 'service_adoption_rate', 'high_risk_payment', 'month_to_month_risk', 'has_fiber_optic', 'is_senior_citizen', 'fiber_high_charges', 'senior_short_tenure']
Feature matrix shape: (7043, 30)
Target shape: (7043,)

Final dataset info:
Features: 30 columns, 7043 rows
Target distribution: {0: 5174, 1: 1869}


In [15]:
# Calculate class imbalance ratio
from sklearn.model_selection import train_test_split

# scale_pos_weight for XGBoost = number of negatives per positive
positive_samples = (y == 1).sum()  # Churn
negative_samples = (y == 0).sum()  # No churn
scale_pos_weight = negative_samples / positive_samples

print(f"=== CLASS IMBALANCE ANALYSIS ===")
print(f"No churn (0): {negative_samples} samples ({negative_samples/len(y)*100:.1f}%)")
print(f"Churn (1): {positive_samples} samples ({positive_samples/len(y)*100:.1f}%)")
print(f"Imbalance ratio: {scale_pos_weight:.2f}")
print(f"We'll use scale_pos_weight = {scale_pos_weight:.2f} in XGBoost")

# 80/20 split; stratifying on y keeps the churn rate the same in both parts,
# and the fixed random_state makes the split repeatable.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"\n=== DATA SPLIT ===")
print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Training churn rate: {y_train.mean():.3f}")
print(f"Test churn rate: {y_test.mean():.3f}")

=== CLASS IMBALANCE ANALYSIS ===
No churn (0): 5174 samples (73.5%)
Churn (1): 1869 samples (26.5%)
Imbalance ratio: 2.77
We'll use scale_pos_weight = 2.77 in XGBoost

=== DATA SPLIT ===
Training set: 5634 samples
Test set: 1409 samples
Training churn rate: 0.265
Test churn rate: 0.265


In [16]:
# Prepare data for SageMaker XGBoost (needs CSV format with target as first column)
import os

# Put the target in the first column, followed by the features (SageMaker
# XGBoost requirement). Indexes are reset so the two parts line up row by row.
train_data = pd.concat(
    [part.reset_index(drop=True) for part in (y_train, X_train)], axis=1
)
test_data = pd.concat(
    [part.reset_index(drop=True) for part in (y_test, X_test)], axis=1
)

print(f"Training data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")

# Write header-less, index-less CSVs locally before uploading
train_data.to_csv('train.csv', index=False, header=False)
test_data.to_csv('test.csv', index=False, header=False)

# S3 destinations, reused later when the training channels are defined
s3_train_path = f's3://{bucket_name}/training-data/train.csv'
s3_test_path = f's3://{bucket_name}/training-data/test.csv'

# Upload both files through a single bucket handle
upload_bucket = boto3.Session().resource('s3').Bucket(bucket_name)
upload_bucket.Object('training-data/train.csv').upload_file('train.csv')
upload_bucket.Object('training-data/test.csv').upload_file('test.csv')

print(f"Training data uploaded to: {s3_train_path}")
print(f"Test data uploaded to: {s3_test_path}")

Training data shape: (5634, 31)
Test data shape: (1409, 31)
Training data uploaded to: s3://churn-predictor-bucket/training-data/train.csv
Test data uploaded to: s3://churn-predictor-bucket/training-data/test.csv


In [17]:
from sagemaker.xgboost import XGBoost
from sagemaker.inputs import TrainingInput

# Set up XGBoost estimator with imbalance handling.
#
# Script-mode hyperparameters reach train.py as `--name value` command-line
# arguments, so only names declared by train.py's argparse are passed here.
# The objective ('binary:logistic') and eval metrics ('auc', 'error') are
# fixed inside train.py itself; sending them as hyperparameters would be
# rejected by its parser.
estimator_scale_pos_weight = round(float(scale_pos_weight), 2)  # computed from the class counts

xgb_estimator = XGBoost(
    entry_point='train.py',  # We'll create this next
    role=role,
    instance_type='ml.m5.xlarge',
    instance_count=1,
    framework_version='1.5-1',
    py_version='py3',
    hyperparameters={
        'scale_pos_weight': estimator_scale_pos_weight,  # Handle class imbalance
        'max_depth': 6,
        'eta': 0.1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'num_round': 100,
        'early_stopping_rounds': 10
    }
)

# Define training and validation inputs
train_input = TrainingInput(s3_train_path, content_type='text/csv')
test_input = TrainingInput(s3_test_path, content_type='text/csv')

print("XGBoost estimator configured with class imbalance handling")
print(f"   - scale_pos_weight: {estimator_scale_pos_weight:.2f}")
print(f"   - objective: binary:logistic")
print(f"   - eval_metric: auc,error")

XGBoost estimator configured with class imbalance handling
   - scale_pos_weight: 2.77
   - objective: binary:logistic
   - eval_metric: auc,error


In [18]:
%%writefile train.py
import argparse
import os
import pandas as pd
import xgboost as xgb
import joblib
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

def model_fn(model_dir):
    """Return the model deserialized from ``model.joblib`` inside ``model_dir``.

    Used for SageMaker inference; the file name matches the one written at the
    end of training in this script.
    """
    model_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(model_path)

def parse_args(argv=None):
    """Parse hyperparameters and data locations for the training script.

    Args:
        argv: Optional list of argument strings; ``None`` means ``sys.argv[1:]``.

    Returns:
        argparse.Namespace holding the hyperparameters and channel directories.

    Besides the baseline hyperparameters, this accepts the ones searched by
    the tuning job (colsample_bylevel, lambda, alpha, min_child_weight,
    gamma). Their defaults are XGBoost's own defaults, so a run that does not
    pass them trains exactly as before. Unrecognized arguments are reported
    and ignored instead of aborting the job.
    """
    parser = argparse.ArgumentParser(allow_abbrev=False)

    # Hyperparameters
    parser.add_argument('--max_depth', type=int, default=6)
    parser.add_argument('--eta', type=float, default=0.1)
    parser.add_argument('--subsample', type=float, default=0.8)
    parser.add_argument('--colsample_bytree', type=float, default=0.8)
    parser.add_argument('--colsample_bylevel', type=float, default=1.0)
    parser.add_argument('--num_round', type=int, default=100)
    parser.add_argument('--scale_pos_weight', type=float, default=2.77)
    parser.add_argument('--early_stopping_rounds', type=int, default=10)
    # `lambda` is a Python keyword, so both regularization terms are stored
    # under explicit attribute names.
    parser.add_argument('--lambda', dest='reg_lambda', type=float, default=1.0)
    parser.add_argument('--alpha', dest='reg_alpha', type=float, default=0.0)
    parser.add_argument('--min_child_weight', type=float, default=1.0)
    parser.add_argument('--gamma', type=float, default=0.0)

    # Data directories
    parser.add_argument('--model_dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--validation', type=str, default=os.environ.get('SM_CHANNEL_VALIDATION'))

    args, unknown = parser.parse_known_args(argv)
    if unknown:
        print(f"Ignoring unrecognized arguments: {unknown}")
    return args


def build_params(args):
    """Translate parsed arguments into the parameter dict given to xgb.train."""
    return {
        'objective': 'binary:logistic',
        # NOTE(review): per the XGBoost docs, early stopping tracks the LAST
        # metric listed here ('error') on the last eval set - confirm that is
        # intended, since the tuning job optimizes validation:auc.
        'eval_metric': ['auc', 'error'],
        'max_depth': args.max_depth,
        'eta': args.eta,
        'subsample': args.subsample,
        'colsample_bytree': args.colsample_bytree,
        'colsample_bylevel': args.colsample_bylevel,
        'scale_pos_weight': args.scale_pos_weight,
        'lambda': args.reg_lambda,
        'alpha': args.reg_alpha,
        'min_child_weight': args.min_child_weight,
        'gamma': args.gamma,
        'seed': 42
    }


if __name__ == '__main__':
    args = parse_args()

    # Load training data (header-less CSV, label in the first column)
    train_data = pd.read_csv(os.path.join(args.train, 'train.csv'), header=None)
    train_y = train_data.iloc[:, 0]
    train_X = train_data.iloc[:, 1:]

    # Load validation data (same layout)
    val_data = pd.read_csv(os.path.join(args.validation, 'test.csv'), header=None)
    val_y = val_data.iloc[:, 0]
    val_X = val_data.iloc[:, 1:]

    # Create DMatrix for XGBoost
    dtrain = xgb.DMatrix(train_X, label=train_y)
    dval = xgb.DMatrix(val_X, label=val_y)

    # Set parameters
    params = build_params(args)

    # Train model; both sets are evaluated each round, validation listed last
    model = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=args.num_round,
        evals=[(dtrain, 'train'), (dval, 'validation')],
        early_stopping_rounds=args.early_stopping_rounds,
        verbose_eval=True
    )

    # Make predictions
    train_pred = model.predict(dtrain)
    val_pred = model.predict(dval)

    # Calculate metrics
    train_auc = roc_auc_score(train_y, train_pred)
    val_auc = roc_auc_score(val_y, val_pred)

    print(f"\n=== TRAINING RESULTS ===")
    print(f"Training AUC: {train_auc:.4f}")
    print(f"Validation AUC: {val_auc:.4f}")

    # Convert to binary predictions for classification report
    val_pred_binary = (val_pred > 0.5).astype(int)

    print(f"\n=== VALIDATION CLASSIFICATION REPORT ===")
    print(classification_report(val_y, val_pred_binary))

    # Save model under the file name model_fn loads
    joblib.dump(model, os.path.join(args.model_dir, "model.joblib"))
    print(f"\n Model saved to {args.model_dir}")

Writing train.py


In [20]:
# Update XGBoost estimator with correct paths
from sagemaker.xgboost import XGBoost
from sagemaker.inputs import TrainingInput

# Configure estimator. Only hyperparameters that train.py's argparse declares
# are passed. scale_pos_weight comes from the class counts computed earlier
# rather than a copied literal, so it follows the data on a re-run.
estimator_scale_pos_weight = round(float(scale_pos_weight), 2)

xgb_estimator = XGBoost(
    entry_point='train.py',
    role=role,
    instance_type='ml.m5.xlarge',
    instance_count=1,
    framework_version='1.5-1',
    py_version='py3',
    hyperparameters={
        'max_depth': 6,
        'eta': 0.1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'num_round': 100,
        'scale_pos_weight': estimator_scale_pos_weight,
        'early_stopping_rounds': 10
    }
)

# Set up training inputs from the exact S3 locations the CSVs were uploaded to
train_input = TrainingInput(s3_train_path, content_type='text/csv')
validation_input = TrainingInput(s3_test_path, content_type='text/csv')

print(" XGBoost training job configured!")
print(f"   - Using {bucket_name}")
print(f"   - scale_pos_weight: {estimator_scale_pos_weight:.2f} for imbalance handling")
print("   - Early stopping enabled")

 XGBoost training job configured!
   - Using churn-predictor-bucket
   - scale_pos_weight: 2.77 for imbalance handling
   - Early stopping enabled


In [21]:
# Launch the training job with the 'train' and 'validation' channels
print("Starting XGBoost training job...")
print("This will take about 5-10 minutes")

training_channels = {
    'train': train_input,
    'validation': validation_input,
}
xgb_estimator.fit(training_channels)

print("Training job completed!")

Starting XGBoost training job...
This will take about 5-10 minutes


INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2025-07-26-00-23-35-166


2025-07-26 00:23:36 Starting - Starting the training job...
2025-07-26 00:24:10 Downloading - Downloading input data...
2025-07-26 00:24:36 Downloading - Downloading the training image......
  from pandas import MultiIndex, Int64Index[0m
[34m[2025-07-26 00:25:29.252 ip-10-0-133-23.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-07-26 00:25:29.274 ip-10-0-133-23.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-07-26:00:25:29:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-07-26:00:25:29:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-07-26:00:25:29:INFO] Invoking user training script.[0m
[34m[2025-07-26:00:25:29:INFO] Module train does not provide a setup.py. [0m
[34mGenerating setup.py[0m
[34m[2025-07-26:00:25:29:INFO] Generating setup.cfg[0m
[34m[2025-07-26:00:25:29:INFO] Generating MANIFEST.in[0m
[34m[2025-07-26:00:25:29:INFO] Installing module

In [24]:
from sagemaker.tuner import HyperparameterTuner, IntegerParameter, ContinuousParameter

print("Setting up Hyperparameter Tuning Job (Fixed)...")

# Search space handed to the tuner (AWS approved only)
hyperparameter_ranges = {
    'max_depth': IntegerParameter(3, 10),
    'eta': ContinuousParameter(0.01, 0.3),
    'subsample': ContinuousParameter(0.5, 1.0),
    'colsample_bytree': ContinuousParameter(0.5, 1.0),
    'colsample_bylevel': ContinuousParameter(0.5, 1.0),
    'num_round': IntegerParameter(50, 200),
    'lambda': ContinuousParameter(0, 10),  # L2 regularization
    'alpha': ContinuousParameter(0, 10),   # L1 regularization
    'min_child_weight': ContinuousParameter(0.5, 10),
    'gamma': ContinuousParameter(0, 5)     # Minimum loss reduction
}

# Values held constant for every tuning trial
fixed_hyperparameters = {
    'scale_pos_weight': 2.77,  # Keep our calculated value FIXED
    'early_stopping_rounds': 10
}

# Artifacts of each trial are written below this prefix
tuned_output_path = f's3://{bucket_name}/models/tuned/'

# Estimator shared by all trials
xgb_tuning_estimator = XGBoost(
    entry_point='train.py',
    role=role,
    framework_version='1.7-1',
    instance_type='ml.m5.large',
    instance_count=1,
    base_job_name='churn-xgb-tuning',
    output_path=tuned_output_path,
    hyperparameters=fixed_hyperparameters
)

print("Tuning estimator configured with AWS-approved parameters!")

INFO:sagemaker.image_uris:Ignoring unnecessary Python version: py3.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: ml.m5.large.


Setting up Hyperparameter Tuning Job (Fixed)...
Tuning estimator configured with AWS-approved parameters!


In [27]:
# Tuner: at most 10 trials, 3 at a time, maximizing validation AUC
tuner_settings = {
    'estimator': xgb_tuning_estimator,
    'objective_metric_name': 'validation:auc',
    'objective_type': 'Maximize',
    'hyperparameter_ranges': hyperparameter_ranges,
    'max_jobs': 10,
    'max_parallel_jobs': 3,
    'base_tuning_job_name': 'churn-hyperparameter-tuning-fnal',
}
tuner_fixed = HyperparameterTuner(**tuner_settings)

print("Starting FIXED Hyperparameter Tuning Job...")
print("This will take about 15-25 minutes (10 training jobs)")

# Same channel names the training script reads its data from
tuning_channels = {
    'train': train_input,
    'validation': validation_input,
}
tuner_fixed.fit(tuning_channels)

print("Hyperparameter tuning completed!")

INFO:sagemaker:Creating hyperparameter tuning job with name: churn-hyperparameter-250726-0050


Starting FIXED Hyperparameter Tuning Job...
This will take about 15-25 minutes (10 training jobs)
................................................................................!
Hyperparameter tuning completed!


In [30]:
import boto3

# Check the status of tuning job first
sm_client = boto3.client('sagemaker')

# Prefer the job name held by the in-memory tuner. Only the two expected
# failures trigger the fallback: `tuner_fixed` not defined in this kernel
# (NameError) or no tuning job attached to it (AttributeError). Anything else
# is a real error and should surface.
try:
    tuning_job_name = tuner_fixed.latest_tuning_job.name
    print(f"Tuning Job Name: {tuning_job_name}")
except (NameError, AttributeError):
    print("Getting latest tuning job...")
    tuning_jobs = sm_client.list_hyper_parameter_tuning_jobs(
        SortBy='CreationTime',
        SortOrder='Descending',
        MaxResults=1
    )
    job_summaries = tuning_jobs['HyperParameterTuningJobSummaries']
    if not job_summaries:
        raise RuntimeError("No hyperparameter tuning jobs found in this account/region")
    tuning_job_name = job_summaries[0]['HyperParameterTuningJobName']
    print(f"Found tuning job: {tuning_job_name}")

# Check job status
tuning_job_desc = sm_client.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuning_job_name
)

status = tuning_job_desc['HyperParameterTuningJobStatus']
print(f"Tuning Job Status: {status}")

if status == 'Completed':
    print("Tuning job completed successfully!")

    # Get best training job info; these names are reused by later cells
    best_training_job_info = tuning_job_desc['BestTrainingJob']
    best_job_name = best_training_job_info['TrainingJobName']
    best_metric = best_training_job_info['FinalHyperParameterTuningJobObjectiveMetric']
    best_hyperparams = best_training_job_info['TunedHyperParameters']

    print(f"\nBest Training Job: {best_job_name}")
    print(f"Best Validation AUC: {best_metric['Value']:.4f}")

    print(f"\nBest Hyperparameters:")
    print("="*50)
    for param, value in best_hyperparams.items():
        print(f"   {param}: {value}")

    # Compare with baseline (validation AUC of the untuned training job)
    baseline_auc = 0.8402
    improvement = best_metric['Value'] - baseline_auc
    print(f"\nPerformance Comparison:")
    print(f"   Baseline AUC: {baseline_auc}")
    print(f"   Tuned AUC: {best_metric['Value']:.4f}")
    # Signed format so a regression prints as "-0.0010" rather than "+-0.0010"
    print(f"   Improvement: {improvement:+.4f}")

elif status == 'InProgress':
    print("Tuning job still running...")
    print(f"Training Jobs Completed: {tuning_job_desc['TrainingJobStatusCounters']['Completed']}")
    print(f"Training Jobs In Progress: {tuning_job_desc['TrainingJobStatusCounters']['InProgress']}")

    # Show current best if available
    if 'BestTrainingJob' in tuning_job_desc:
        current_best = tuning_job_desc['BestTrainingJob']['FinalHyperParameterTuningJobObjectiveMetric']
        print(f"Current Best AUC: {current_best['Value']:.4f}")

elif status == 'Failed':
    print("Tuning job failed!")
    print(f"Failure Reason: {tuning_job_desc.get('FailureReason', 'Unknown')}")

else:
    print(f"Tuning job status: {status}")

Tuning Job Name: churn-hyperparameter-250726-0050
Tuning Job Status: Completed
Tuning job completed successfully!

Best Training Job: churn-hyperparameter-250726-0050-002-ad20805b
Best Validation AUC: 0.8455

Best Hyperparameters:
   alpha: 0.4044029206673527
   colsample_bylevel: 0.6305152280242621
   colsample_bytree: 0.7258087420433601
   eta: 0.1451735633349137
   gamma: 2.0410273295911328
   lambda: 9.474743922415719
   max_depth: 9
   min_child_weight: 4.047031166565295
   num_round: 159
   subsample: 0.8099786964323195

Performance Comparison:
   Baseline AUC: 0.8402
   Tuned AUC: 0.8455
   Improvement: +0.0053


In [31]:
# Get the best model artifacts location
best_training_job_details = sm_client.describe_training_job(
    TrainingJobName=best_job_name
)

model_artifacts_uri = best_training_job_details['ModelArtifacts']['S3ModelArtifacts']
print(f"Best Model Artifacts: {model_artifacts_uri}")

# Resolve the XGBoost serving image for the session's region instead of
# hard-coding one account/region-specific URI. The version matches the
# framework_version used by the tuning estimator.
inference_image_uri = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=sagemaker_session.boto_region_name,
    version='1.7-1'
)

# Create model package for registry
model_package_group_name = "churn-prediction-models"

print(f"Creating model package in group: {model_package_group_name}")

# First, create model package group if it doesn't exist
try:
    response = sm_client.create_model_package_group(
        ModelPackageGroupName=model_package_group_name,
        ModelPackageGroupDescription="Churn prediction models for customer retention"
    )
    print("Model package group created successfully")
except sm_client.exceptions.ValidationException as e:
    if "already exists" in str(e):
        print("Model package group already exists")
    else:
        # Bare raise keeps the original traceback
        raise

# Create the model package
from datetime import datetime
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")

# NOTE: the statistics.json referenced under ModelMetrics is uploaded by the
# following cell; until that cell has run, this S3Uri points at a missing object.
model_package_response = sm_client.create_model_package(
    ModelPackageGroupName=model_package_group_name,
    ModelPackageDescription=f"XGBoost churn prediction model - Tuned {timestamp}",
    InferenceSpecification={
        'Containers': [{
            'Image': inference_image_uri,
            'ModelDataUrl': model_artifacts_uri
        }],
        'SupportedContentTypes': ['text/csv'],
        'SupportedResponseMIMETypes': ['text/csv']
    },
    ModelMetrics={
        'ModelQuality': {
            'Statistics': {
                'ContentType': 'application/json',
                'S3Uri': f's3://{bucket}/model-metrics/statistics.json'
            }
        }
    },
    # New versions start unapproved; approval is a separate, explicit step
    ModelApprovalStatus='PendingManualApproval'
)

model_package_arn = model_package_response['ModelPackageArn']
print(f"Model package created: {model_package_arn}")

Best Model Artifacts: s3://churn-predictor-bucket/models/tuned/churn-hyperparameter-250726-0050-002-ad20805b/output/model.tar.gz
Creating model package in group: churn-prediction-models
Model package group created successfully
Model package created: arn:aws:sagemaker:us-east-1:158878148642:model-package/churn-prediction-models/1


In [32]:
# Create model metrics file
import json

# Build the metrics document from the values computed earlier in the notebook
# (tuning result, baseline, class counts) rather than hand-copied literals, so
# the record cannot drift from the run that produced the model.
model_metrics = {
    "binary_classification_metrics": {
        "validation_auc": {
            "value": round(float(best_metric['Value']), 4),
            "standard_deviation": 0.0
        },
        "baseline_auc": {
            "value": baseline_auc,
            "standard_deviation": 0.0
        },
        "improvement": {
            "value": round(float(improvement), 4),
            "standard_deviation": 0.0
        }
    },
    "hyperparameters": best_hyperparams,
    "training_job": best_job_name,
    # The train/test CSVs were uploaded to `bucket_name`, not to the session's
    # default bucket (`bucket`), so that is the location recorded here.
    "training_data_location": f"s3://{bucket_name}/training-data/",
    "feature_engineering_job": "feature-engineering-job",
    "model_type": "XGBoost Binary Classification",
    "target_variable": "churn_binary",
    "class_balance": {
        # Cast to built-in types: json.dumps cannot serialize numpy integers
        "negative_class": int(negative_samples),
        "positive_class": int(positive_samples),
        "scale_pos_weight": round(float(scale_pos_weight), 2)
    }
}

# Save metrics to S3
metrics_json = json.dumps(model_metrics, indent=2)
s3_client = boto3.client('s3')

# Upload to the key that the model package's ModelMetrics S3Uri refers to
s3_client.put_object(
    Bucket=bucket,
    Key='model-metrics/statistics.json',
    Body=metrics_json,
    ContentType='application/json'
)

print("Model metrics uploaded to S3")
print("Model successfully registered in Model Registry!")

# Get model package details
model_package_details = sm_client.describe_model_package(
    ModelPackageName=model_package_arn
)

print(f"\nModel Package Status: {model_package_details['ModelApprovalStatus']}")
print(f"Model Package ARN: {model_package_arn}")

Model metrics uploaded to S3
Model successfully registered in Model Registry!

Model Package Status: PendingManualApproval
Model Package ARN: arn:aws:sagemaker:us-east-1:158878148642:model-package/churn-prediction-models/1


In [33]:
# Approve the model for deployment.
# `model_package_arn` is the ARN returned when the package was registered; it
# is deliberately not overwritten with a literal, so a re-run approves the
# version that was just created instead of always version 1.
print("Approving model for deployment...")
print(f"Model Package ARN: {model_package_arn}")

response = sm_client.update_model_package(
    ModelPackageArn=model_package_arn,
    ModelApprovalStatus='Approved',
    ApprovalDescription=(
        f"Model approved for deployment - AUC {best_metric['Value']:.4f}, "
        f"improved from baseline {baseline_auc}"
    )
)

print("Model approved for deployment!")

# Verify the approval
model_details = sm_client.describe_model_package(ModelPackageName=model_package_arn)
print(f"New Status: {model_details['ModelApprovalStatus']}")

Approving model for deployment...
Model approved for deployment!
New Status: Approved


In [34]:
# Create SageMaker model from the registered model package
from datetime import datetime
# The timestamp suffix is also reused for the endpoint config and endpoint names
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
model_name = f"churn-prediction-model-{timestamp}"

print(f"Creating SageMaker model: {model_name}")

# The model is backed by the registered model package
primary_container = {'ModelPackageName': model_package_arn}
create_model_response = sm_client.create_model(
    ModelName=model_name,
    PrimaryContainer=primary_container,
    ExecutionRoleArn=role
)

print(f"Model created successfully: {model_name}")
print(f"Model ARN: {create_model_response['ModelArn']}")

Creating SageMaker model: churn-prediction-model-20250726-011802
Model created successfully: churn-prediction-model-20250726-011802
Model ARN: arn:aws:sagemaker:us-east-1:158878148642:model/churn-prediction-model-20250726-011802


In [35]:
# Endpoint configuration: one production variant, with every request and
# response captured to S3
endpoint_config_name = f"churn-prediction-config-{timestamp}"
print(f"Creating endpoint configuration: {endpoint_config_name}")

# Single variant receiving all traffic
primary_variant = {
    'VariantName': 'primary',
    'ModelName': model_name,
    'InitialInstanceCount': 1,
    'InstanceType': 'ml.m5.large',
    'InitialVariantWeight': 1.0
}

# Capture 100% of inputs and outputs under the default bucket
data_capture_config = {
    'EnableCapture': True,
    'InitialSamplingPercentage': 100,
    'DestinationS3Uri': f's3://{bucket}/endpoint-data-capture/',
    'CaptureOptions': [
        {'CaptureMode': 'Input'},
        {'CaptureMode': 'Output'}
    ]
}

create_endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[primary_variant],
    DataCaptureConfig=data_capture_config
)

print(f"Endpoint configuration created: {endpoint_config_name}")

Creating endpoint configuration: churn-prediction-config-20250726-011802
Endpoint configuration created: churn-prediction-config-20250726-011802


In [36]:
# Request endpoint creation; the call returns before the endpoint is ready,
# so readiness is polled separately below
endpoint_name = f"churn-prediction-endpoint-{timestamp}"
print(f"Creating endpoint: {endpoint_name}")
print("This will take about 5-8 minutes...")

endpoint_request = {
    'EndpointName': endpoint_name,
    'EndpointConfigName': endpoint_config_name,
}
create_endpoint_response = sm_client.create_endpoint(**endpoint_request)

print(f"Endpoint creation started: {endpoint_name}")
print(f"Endpoint ARN: {create_endpoint_response['EndpointArn']}")

# Monitor endpoint creation
import time

def wait_for_endpoint(endpoint_name, max_wait_time=600, poll_interval=30, client=None):
    """Poll an endpoint until it is in service, fails, or the wait times out.

    Args:
        endpoint_name: Name of the SageMaker endpoint to watch.
        max_wait_time: Maximum number of seconds to keep polling.
        poll_interval: Seconds to sleep between status checks.
        client: Object exposing ``describe_endpoint(EndpointName=...)``.
            Defaults to the notebook-level ``sm_client``.

    Returns:
        True once the status is 'InService'; False if the status is 'Failed'
        or ``max_wait_time`` elapses first.
    """
    client = sm_client if client is None else client
    # Monotonic clock: the timeout is unaffected by system clock adjustments
    deadline = time.monotonic() + max_wait_time

    while time.monotonic() < deadline:
        response = client.describe_endpoint(EndpointName=endpoint_name)
        status = response['EndpointStatus']

        print(f"Endpoint Status: {status}")

        if status == 'InService':
            print("Endpoint is ready for predictions!")
            return True
        elif status == 'Failed':
            print(f"Endpoint creation failed: {response.get('FailureReason', 'Unknown')}")
            return False

        print(f"Waiting {poll_interval} seconds...")
        time.sleep(poll_interval)

    print("Timeout waiting for endpoint")
    return False

# Wait for endpoint to be ready. The call is left as the cell's last
# expression, so its boolean result (True = InService, False = failed or
# timed out) is shown as the cell output.
wait_for_endpoint(endpoint_name)

Creating endpoint: churn-prediction-endpoint-20250726-011802
This will take about 5-8 minutes...
Endpoint creation started: churn-prediction-endpoint-20250726-011802
Endpoint ARN: arn:aws:sagemaker:us-east-1:158878148642:endpoint/churn-prediction-endpoint-20250726-011802
Endpoint Status: Creating
Waiting 30 seconds...
Endpoint Status: Creating
Waiting 30 seconds...
Endpoint Status: Creating
Waiting 30 seconds...
Endpoint Status: Creating
Waiting 30 seconds...
Endpoint Status: Creating
Waiting 30 seconds...
Endpoint Status: Creating
Waiting 30 seconds...
Endpoint Status: Creating
Waiting 30 seconds...
Endpoint Status: InService
Endpoint is ready for predictions!


True

In [38]:
from io import StringIO

# Build a small CSV payload from the uploaded test file
print("Preparing sample data for testing...")

# First five rows of the header-less test CSV; column 0 holds the true label
sample_data = pd.read_csv(
    f's3://{bucket_name}/training-data/test.csv',
    header=None,
    nrows=5,
)

# Everything after the first column is a feature
sample_features = sample_data.iloc[:, 1:]
print(f"Sample data shape: {sample_features.shape}")
print("Sample features:")
print(sample_features.head())

# With no target path, to_csv returns the CSV text directly
csv_data = sample_features.to_csv(header=False, index=False)

print(f"\nSample CSV data for inference:")
print(csv_data[:200] + "...")

Preparing sample data for testing...
Sample data shape: (5, 30)
Sample features:
   1   2   3   4   5   6   7   8   9   10  ...         21          22  23  \
0   1   0   1   1  72   1   2   1   2   2  ...   1.562329  117.613889   8   
1   0   1   0   0   8   1   2   1   0   0  ...  11.127778  113.568750   5   
2   0   0   1   1  41   1   2   0   2   2  ...   1.865476   78.321951   6   
3   1   0   1   0  18   1   0   1   0   0  ...   4.115789   81.597222   3   
4   0   0   1   0  72   1   2   0   2   2  ...   1.132192   82.213194   7   

      24  25  26  27  28  29  30  
0  1.000   0   0   1   0   1   0  
1  0.625   0   1   1   1   1   1  
2  0.750   0   0   0   0   0   0  
3  0.375   1   1   1   0   0   0  
4  0.875   0   0   0   0   0   0  

[5 rows x 30 columns]

Sample CSV data for inference:
1,0,1,1,72,1,2,1,2,2,2,2,2,2,2,1,1,114.05,8468.2,6.0,1.5623287671232875,117.6138888888889,8,1.0,0,0,1,0,1,0
0,1,0,0,8,1,2,1,0,0,0,2,2,2,0,1,1,100.15,908.55,0.6666666666666666,11.1277777777777

In [39]:
# Import required libraries for inference
import boto3
from io import StringIO

# Create SageMaker runtime client for inference
runtime_client = boto3.client('sagemaker-runtime')

# Single decision threshold used for every HIGH/LOW and CHURN/STAY label below
CHURN_THRESHOLD = 0.5

print("Making predictions on sample data...")

# Invoke the endpoint
response = runtime_client.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType='text/csv',
    Body=csv_data
)

# Get predictions. Accept newline- or comma-separated scores so the parsing
# does not depend on which separator the serving container emits.
predictions = response['Body'].read().decode('utf-8')
prediction_values = [float(token) for token in predictions.replace(',', '\n').split()]

# True labels are in the first column of sample_data
actual_labels = sample_data.iloc[:, 0].values

if len(prediction_values) != len(actual_labels):
    # zip() below would silently drop the extras; make the mismatch visible
    print(f"WARNING: received {len(prediction_values)} predictions for {len(actual_labels)} rows")

print("Predictions (churn probabilities):")
print("="*50)
for i, prob in enumerate(prediction_values):
    churn_risk = "HIGH" if prob > CHURN_THRESHOLD else "LOW"
    print(f"Customer {i+1}: {prob:.4f} ({churn_risk} risk)")

print(f"\nActual vs Predicted:")
print("="*50)
for i, (actual, predicted) in enumerate(zip(actual_labels, prediction_values)):
    actual_churn = actual == 1
    predicted_churn = predicted > CHURN_THRESHOLD
    actual_label = "CHURN" if actual_churn else "STAY"
    predicted_label = "CHURN" if predicted_churn else "STAY"
    match = "CORRECT" if actual_churn == predicted_churn else "WRONG"
    print(f"Customer {i+1}: Actual={actual_label}, Predicted={predicted_label} ({match})")

Making predictions on sample data...
Predictions (churn probabilities):
Customer 1: 0.4210 (LOW risk)
Customer 2: 0.5993 (HIGH risk)
Customer 3: 0.4460 (LOW risk)
Customer 4: 0.5554 (HIGH risk)
Customer 5: 0.3866 (LOW risk)

Actual vs Predicted:
Customer 1: Actual=STAY, Predicted=STAY (CORRECT)
Customer 2: Actual=STAY, Predicted=CHURN (WRONG)
Customer 3: Actual=STAY, Predicted=STAY (CORRECT)
Customer 4: Actual=STAY, Predicted=CHURN (WRONG)
Customer 5: Actual=STAY, Predicted=STAY (CORRECT)


In [40]:
# Delete SageMaker endpoint and related resources.
# Each deletion is best-effort, so one failure does not block the others, but
# failures are collected so the summary never claims success after one.
print("Deleting SageMaker resources to avoid charges...")

cleanup_failures = []

try:
    # Delete the endpoint
    print(f"Deleting endpoint: {endpoint_name}")
    sm_client.delete_endpoint(EndpointName=endpoint_name)
    print("Endpoint deleted")
except Exception as e:
    print(f"Error deleting endpoint: {e}")
    cleanup_failures.append("endpoint")

try:
    # Delete endpoint configuration
    print(f"Deleting endpoint configuration: {endpoint_config_name}")
    sm_client.delete_endpoint_config(EndpointConfigName=endpoint_config_name)
    print("Endpoint configuration deleted")
except Exception as e:
    print(f"Error deleting endpoint config: {e}")
    cleanup_failures.append("endpoint configuration")

try:
    # Delete the model
    print(f"Deleting model: {model_name}")
    sm_client.delete_model(ModelName=model_name)
    print("Model deleted")
except Exception as e:
    print(f"Error deleting model: {e}")
    cleanup_failures.append("model")

print("\n" + "="*50)
if cleanup_failures:
    print("CLEANUP INCOMPLETE!")
    print("="*50)
    print(f"Could not delete: {', '.join(cleanup_failures)}")
    print("Check the SageMaker console - an endpoint that still exists keeps incurring charges")
else:
    print("CLEANUP COMPLETE!")
    print("="*50)
    print("No ongoing SageMaker endpoint charges")
print("Model remains in Model Registry for future use")
print("S3 data preserved for retraining")
print("Training artifacts saved")

Deleting SageMaker resources to avoid charges...
Deleting endpoint: churn-prediction-endpoint-20250726-011802
Endpoint deleted
Deleting endpoint configuration: churn-prediction-config-20250726-011802
Endpoint configuration deleted
Deleting model: churn-prediction-model-20250726-011802
Model deleted

CLEANUP COMPLETE!
No ongoing SageMaker endpoint charges
Model remains in Model Registry for future use
S3 data preserved for retraining
Training artifacts saved
