# Churn Mini Project

In [None]:
# Notebook shell escape: quietly (-q) install boto3, the SageMaker SDK, pandas and scikit-learn.
!pip install -q boto3 sagemaker pandas scikit-learn

In [None]:
import boto3, sagemaker, time, os, json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sagemaker import image_uris
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import CSVDeserializer

# ---- AWS configuration (EDIT THESE IF YOU'RE NOT IN SAGEMAKER STUDIO) ----
# The region comes from the AWS_REGION environment variable, falling back to us-east-1.
AWS_REGION = os.getenv("AWS_REGION", "us-east-1")

# One boto3 session is shared by the S3 handles and the SageMaker session.
SESSION = boto3.Session(region_name=AWS_REGION)
S3 = SESSION.resource("s3")
S3_CLIENT = SESSION.client("s3")
SM_SESSION = sagemaker.Session(boto_session=SESSION)

# IAM role: ask the SageMaker SDK for the execution role first. If that lookup
# fails for any reason, the placeholder ARN below is used instead -- it is not a
# valid role until the account id and role name are filled in by hand.
try:
    from sagemaker import get_execution_role
    ROLE = get_execution_role()
except Exception:
    ROLE = "arn:aws:iam::<YOUR-AWS-ACCOUNT-ID>:role/<YOUR-SAGEMAKER-ROLE-NAME>"  # <-- EDIT

# Account id of the credentials currently in use (looked up through STS).
sts_client = SESSION.client("sts")
ACCOUNT_ID = sts_client.get_caller_identity()["Account"]

# S3 bucket: the S3_BUCKET environment variable wins; otherwise the SageMaker
# session's default bucket is used. The default bucket is looked up either way.
default_bucket = SM_SESSION.default_bucket()
BUCKET = os.getenv("S3_BUCKET", default_bucket)
print("Using S3 bucket:", BUCKET)


In [None]:
# Simple, synthetic features for demo; replace with your real data if you have it.
# The generator is seeded and the draws below happen in a fixed order, so the
# dataset is reproducible from run to run.
rng = np.random.default_rng(42)
N = 20000
df = pd.DataFrame({
    "tenure_months": rng.integers(1, 60, size=N),  # upper bound is exclusive: 1..59
    "monthly_charges": rng.normal(65, 25, size=N).clip(5, 200),
    "support_calls_last_90d": rng.poisson(1.8, size=N).clip(0, 20),
    "is_promo": rng.integers(0, 2, size=N),
    "contract_type": rng.integers(0, 3, size=N),  # 0=month-to-month, 1=1yr, 2=2yr
})

# total_charges is derived from two other columns, so it must be computed once
# those columns exist. (A callable placed directly in the DataFrame constructor
# dict is never called -- it is stored as an object in every row.) It is
# inserted at position 2 so the column order stays tenure, monthly, total, ...
df.insert(
    2,
    "total_charges",
    (df["tenure_months"] * df["monthly_charges"]).clip(10, 10000),
)

# Churn log-odds: rise with support calls, with no promo and with a
# month-to-month contract; fall as tenure and monthly charges grow.
logit = (
    -2.0
    + 0.03*(200 - df["monthly_charges"].values)/10
    + 0.08*(5 - df["tenure_months"].values)/2
    + 0.35*df["support_calls_last_90d"].values
    + 0.25*(1 - df["is_promo"].values)
    + 0.30*(df["contract_type"].values == 0).astype(float)
)
# Sigmoid turns the log-odds into a per-customer churn probability, and one
# uniform draw per row turns that probability into a 0/1 label.
p = 1 / (1 + np.exp(-logit))
df["churn"] = (rng.random(N) < p).astype(int)

# Train/val/test split: hold out 30% of the rows first, stratified on the label ...
train_df, temp_df = train_test_split(
    df,
    test_size=0.3,
    stratify=df["churn"],
    random_state=8,
)
# ... then cut the hold-out in half to get validation and test, stratified again.
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df["churn"],
    random_state=8,
)

print("Train/Val/Test sizes:", len(train_df), len(val_df), len(test_df))


In [None]:
# XGBoost expects label in first column for CSV input
def to_smxgb_csv(d: pd.DataFrame, label_col="churn"):
    """Return *d* with ``label_col`` moved to the first column.

    All other columns follow in their existing relative order. The input
    frame itself is not modified.
    """
    ordered = [label_col]
    ordered.extend(name for name in d.columns if name != label_col)
    return d[ordered]

prefix = "churn-simple-xgb"
local_train, local_val, local_test = "train.csv", "validation.csv", "test.csv"

# Write every split locally as a label-first CSV with no header and no index column.
for split_frame, split_file in (
    (train_df, local_train),
    (val_df, local_val),
    (test_df, local_test),
):
    to_smxgb_csv(split_frame).to_csv(split_file, index=False, header=False)

# S3 URIs of the three files; each one lives under <prefix>/data/ in the bucket.
train_s3 = f"s3://{BUCKET}/{prefix}/data/{local_train}"
val_s3   = f"s3://{BUCKET}/{prefix}/data/{local_val}"
test_s3  = f"s3://{BUCKET}/{prefix}/data/{local_test}"

# Upload only after all three local files have been written.
for split_file in (local_train, local_val, local_test):
    S3_CLIENT.upload_file(split_file, BUCKET, f"{prefix}/data/{split_file}")

print("Uploaded:")
for uri in (train_s3, val_s3, test_s3):
    print(uri)


In [None]:
# Where the training job writes its output, and which XGBoost image
# (version 1.5-1, resolved for the configured region) runs it.
output_path = f"s3://{BUCKET}/{prefix}/output"
xgb_image = image_uris.retrieve(framework="xgboost", region=AWS_REGION, version="1.5-1")

# Training infrastructure: a single ml.m5.xlarge instance.
estimator_settings = {
    "image_uri": xgb_image,
    "role": ROLE,
    "instance_count": 1,
    "instance_type": "ml.m5.xlarge",
    "output_path": output_path,
    "sagemaker_session": SM_SESSION,
}
xgb = sagemaker.estimator.Estimator(**estimator_settings)

# Reasonable starter hyperparameters (tweak later): binary classification
# with AUC as the evaluation metric.
starter_hyperparameters = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "max_depth": 6,
    "eta": 0.2,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "num_round": 200,
    "min_child_weight": 1,
    "verbosity": 1,
}
xgb.set_hyperparameters(**starter_hyperparameters)


In [None]:
# One CSV training channel per split, keyed by channel name: "train" first, then "validation".
s3_inputs = {
    channel: sagemaker.inputs.TrainingInput(s3_data=uri, content_type="text/csv")
    for channel, uri in (("train", train_s3), ("validation", val_s3))
}
# Launch the training job with log output enabled.
xgb.fit(inputs=s3_inputs, logs=True)


In [None]:
# The current Unix time (whole seconds) is appended to the endpoint name.
endpoint_name = f"churn-xgb-{int(time.time())}"

# Deploy on one ml.m5.large instance; the returned predictor sends and receives CSV.
predictor = xgb.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.large",
    endpoint_name=endpoint_name,
    serializer=CSVSerializer(),
    deserializer=CSVDeserializer(),
)
print("Endpoint:", endpoint_name)


In [None]:
# Prepare test without label for inference (drop 'churn')
test_features = test_df.drop(columns=["churn"])
# XGBoost expects features only for prediction, in the same order used for training
feature_order = [c for c in to_smxgb_csv(train_df).columns if c != "churn"]
test_matrix = test_features[feature_order].astype(float)

# Take a small sample to invoke: one CSV line per row, rows joined by newlines
sample = test_matrix.head(10)
payload = "\n".join([",".join(map(str, row)) for row in sample.values.tolist()])

preds = predictor.predict(payload)
# preds is a list of CSV rows (lists of strings). The scores may arrive one per
# row ([['0.123'], ['0.876'], ...]) or all on a single comma-separated row
# ([['0.123', '0.876', ...]]), so flatten every non-blank cell instead of
# reading only the first cell of each row.
scores = np.array([float(value) for row in preds for value in row if value.strip()])
# Fail loudly if the endpoint did not return exactly one score per input row.
if len(scores) != len(sample):
    raise ValueError(
        f"Expected {len(sample)} scores from the endpoint, got {len(scores)}"
    )
pd.DataFrame({"score": scores})


In [None]:
from sklearn.metrics import roc_auc_score, average_precision_score, brier_score_loss

# Score a larger chunk (e.g., 500 rows). Features and labels are both taken
# from the head of the test split, so they stay aligned row for row.
eval_chunk = test_matrix.head(500)
true_y = test_df["churn"].head(500).values
payload = "\n".join([",".join(map(str, row)) for row in eval_chunk.values.tolist()])
preds = predictor.predict(payload)
# The deserialized response is a list of CSV rows. Scores may come one per row
# or all on a single comma-separated row, so flatten every non-blank cell
# rather than reading only the first cell of each row.
scores = np.array([float(value) for row in preds for value in row if value.strip()])
# The metrics below are meaningless unless there is exactly one score per label.
if len(scores) != len(true_y):
    raise ValueError(
        f"Expected {len(true_y)} scores from the endpoint, got {len(scores)}"
    )

# Ranking quality (AUC, average precision) and probability calibration (Brier score).
auc = roc_auc_score(true_y, scores)
aps = average_precision_score(true_y, scores)
brier = brier_score_loss(true_y, scores)
print(f"AUC: {auc:.4f} | AP: {aps:.4f} | Brier: {brier:.4f}")


In [None]:
# Best-effort teardown: remove the endpoint together with its endpoint config.
# Any failure is printed rather than raised, so the cell always runs to completion.
try:
    predictor.delete_endpoint(delete_endpoint_config=True)
    print("Deleted endpoint:", endpoint_name)
except Exception as cleanup_err:
    print("Cleanup error:", cleanup_err)
