# CrossFit Retention Project — 02 Modeling
Build a simple churn model to identify at-risk members using PushPress exports.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Load the four PushPress exports; each file names one column to parse as dates.
members, attendance, sales, cancellations = (
    pd.read_csv(csv_path, parse_dates=[date_col])
    for csv_path, date_col in (
        (r"/mnt/data/Members.csv", "Join_Date"),
        (r"/mnt/data/Attendance.csv", "Date"),
        (r"/mnt/data/Store_Sales.csv", "Purchase_Date"),
        (r"/mnt/data/Cancellations.csv", "Cancel_Date"),
    )
)

## Feature Engineering

In [None]:
# Snapshot date = most recent attendance date
snapshot_date = attendance["Date"].max()

# Attendance-based features: only rows whose Status is "Checked In" count as visits.
att_ok = attendance[attendance["Status"].eq("Checked In")].copy()
last_seen = att_ok.groupby("Member_ID")["Date"].max().rename("Last_Checkin")
first_seen = att_ok.groupby("Member_ID")["Date"].min().rename("First_Checkin")
totals = att_ok.groupby("Member_ID").size().rename("Total_Checkins")
by_month = att_ok.assign(month=att_ok["Date"].dt.to_period("M").dt.to_timestamp())
visits_monthly = by_month.groupby(["Member_ID", "month"]).size().rename("Visits_Month")

# Variability of attendance (std is NaN for members seen in a single month;
# that is filled with 0 further down).
var_att = visits_monthly.groupby("Member_ID").std().rename("Visits_Month_STD")

# Recency + frequency. The joins are left joins from `members`, so a member
# with no check-in rows gets NaT/NaN in the attendance columns.
feats = members.set_index("Member_ID").join(last_seen).join(first_seen).join(totals).reset_index()
# NOTE(review): recency is measured to snapshot_date for every member, including
# those listed in Cancellations.csv. If cancelled members stop checking in, this
# feature partly encodes the label -- consider measuring to Cancel_Date for them.
feats["Days_Since_Last"] = (snapshot_date - feats["Last_Checkin"]).dt.days

# Inclusive count of calendar months between first and last check-in.
# Year/month arithmetic propagates NaN for members without check-ins, whereas
# taking `.n` of a Period difference raises AttributeError on NaT.
months_span = (
    (feats["Last_Checkin"].dt.year - feats["First_Checkin"].dt.year) * 12
    + (feats["Last_Checkin"].dt.month - feats["First_Checkin"].dt.month)
    + 1
)
feats["Months_Active"] = months_span.clip(lower=1)
feats["Visits_per_Month"] = (feats["Total_Checkins"] / feats["Months_Active"]).fillna(0)

# Members who never checked in: zero visits over zero active months, so the
# estimators downstream do not receive NaNs in these columns.
feats["Total_Checkins"] = feats["Total_Checkins"].fillna(0)
feats["Months_Active"] = feats["Months_Active"].fillna(0)

# Retail spend
spend = sales.groupby("Member_ID")["Amount_USD"].sum().rename("Retail_Spend_USD")
feats = feats.set_index("Member_ID").join(spend).reset_index().fillna({"Retail_Spend_USD": 0})

# Membership dummies
feats = pd.get_dummies(feats, columns=["Membership_Type", "Referral_Source"], drop_first=True)

# Label churned (historical): any member that appears in the cancellations export.
churn_ids = set(cancellations["Member_ID"])
feats["Churned"] = feats["Member_ID"].isin(churn_ids).astype(int)

# Attach attendance variability. The column must not exist before the join:
# DataFrame.join raises "columns overlap but no suffix specified" when the
# frame already has a column with the joined Series' name.
feats = feats.set_index("Member_ID").join(var_att).reset_index()

# Fill remaining NaNs
feats["Visits_Month_STD"] = feats["Visits_Month_STD"].fillna(0)
# Never-seen members get the longest gap the data allows: snapshot minus the
# earliest join date on file.
feats["Days_Since_Last"] = feats["Days_Since_Last"].fillna(
    (snapshot_date - members["Join_Date"].min()).days
)
feats["Visits_per_Month"] = feats["Visits_per_Month"].fillna(0)

# Drop the raw date columns; the derived recency/frequency features replace them.
drop_cols = ["First_Checkin", "Last_Checkin", "Join_Date"]
feats = feats.drop(columns=[c for c in drop_cols if c in feats.columns])
feats.head()

## Train/Test Split & Baselines

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    PrecisionRecallDisplay,
    RocCurveDisplay,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split

target = "Churned"
# Member_ID is a row identifier, not a behavioural signal. Leaving it in the
# feature matrix lets the models memorise individual members (and breaks
# fitting outright if the IDs are strings), so it is excluded here. It stays
# in `feats` so scores can be mapped back to members later.
X = feats.drop(columns=[target, "Member_ID"])
y = feats[target]

# Stratify so both splits keep the overall churn rate.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Logistic Regression
lr = LogisticRegression(max_iter=2000)
lr.fit(X_train, y_train)
lr_probs = lr.predict_proba(X_test)[:, 1]  # column 1 = probability of Churned == 1
lr_auc = roc_auc_score(y_test, lr_probs)
print(f"Logistic ROC AUC: {lr_auc:.3f}")

# Random Forest
rf = RandomForestClassifier(n_estimators=300, random_state=42)
rf.fit(X_train, y_train)
rf_probs = rf.predict_proba(X_test)[:, 1]
rf_auc = roc_auc_score(y_test, rf_probs)
print(f"RandomForest ROC AUC: {rf_auc:.3f}")

## Curves (one plot per figure)

In [None]:
# One figure per (curve type, model) pair, drawn in the order listed.
curve_specs = (
    (RocCurveDisplay, lr_probs, "Logistic Regression ROC Curve"),
    (RocCurveDisplay, rf_probs, "Random Forest ROC Curve"),
    (PrecisionRecallDisplay, lr_probs, "Logistic Regression Precision-Recall"),
    (PrecisionRecallDisplay, rf_probs, "Random Forest Precision-Recall"),
)
for display_cls, scores, heading in curve_specs:
    display_cls.from_predictions(y_test, scores)
    plt.title(heading)
    plt.tight_layout()
    plt.show()

## Thresholding & Confusion Matrix

In [None]:
import numpy as np
from sklearn.metrics import ConfusionMatrixDisplay


def evaluate_threshold(probs, y_true, threshold=0.5, title="Model"):
    """Print a classification report and plot the confusion matrix at a cutoff.

    Scores at or above ``threshold`` are labelled 1 and the rest 0; those
    labels are compared against ``y_true``.

    Args:
        probs: Scores to threshold. Must support ``>=`` against a scalar and
            ``.astype`` (presumably a NumPy array of positive-class
            probabilities).
        y_true: True labels aligned with ``probs``.
        threshold: Cutoff applied to ``probs``.
        title: Model name used in the printed header and the plot title.
    """
    hard_labels = (probs >= threshold).astype(int)

    print(title, "Threshold:", threshold)
    print(classification_report(y_true, hard_labels, digits=3))

    matrix = confusion_matrix(y_true, hard_labels)
    ConfusionMatrixDisplay(confusion_matrix=matrix).plot()
    plt.title(f"{title} Confusion Matrix (thr={threshold})")
    plt.tight_layout()
    plt.show()


# Evaluate both models at the same 0.4 cutoff, Random Forest first.
for model_title, model_probs in (
    ("Random Forest", rf_probs),
    ("Logistic Regression", lr_probs),
):
    evaluate_threshold(model_probs, y_test, threshold=0.4, title=model_title)

## Feature Importance & Top Risk List

In [None]:
# Random Forest feature importance
import pandas as pd

from sklearn.model_selection import StratifiedKFold, cross_val_predict

# Importances come from the forest fitted on the training split; X.columns
# supplies the matching feature names.
fi = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)
display(fi.head(15).to_frame("Importance"))

plt.figure()
# Re-sort ascending so the largest importance ends up at the top of the barh plot.
fi.head(15).sort_values().plot(kind="barh")
plt.title("Top Feature Importances (Random Forest)")
plt.tight_layout()
plt.show()

# Create a scored list of current members.
# Scoring with rf.predict_proba(X) would rate rows the forest was trained on:
# it has already seen that active training members carry label 0 and returns
# near-zero risk for them, so the ranking would favour whoever landed in the
# test split. Out-of-fold predictions score every member with a model that
# never saw that member. cross_val_predict clones `rf` per fold, so the fitted
# forest above is left untouched.
# NOTE: 5 stratified folds need at least 5 members in each class.
oof_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rf_all_probs = cross_val_predict(rf, X, y, cv=oof_cv, method="predict_proba")[:, 1]
scored = feats.copy()
scored["Churn_Risk_Score"] = rf_all_probs
# Example outreach list: the top 10 at-risk active members
# (active = present in Members.csv and absent from Cancellations.csv).
active_ids = set(members["Member_ID"]) - set(cancellations["Member_ID"])
outreach = (
    scored[scored["Member_ID"].isin(active_ids)]
    .sort_values("Churn_Risk_Score", ascending=False)
    .head(10)
)
display(
    outreach[
        ["Member_ID", "Days_Since_Last", "Visits_per_Month", "Retail_Spend_USD", "Churn_Risk_Score"]
    ]
)

### Notes for the Owner
- Members with **many days since their last visit** and **few visits per month** tend to be at higher risk.
- Consider personal coach outreach at 10–14 days inactive, plus a community nudge (buddy text, event invite).