# Churn Prediction Model

This notebook creates a machine learning model to predict user churn risk based on behavioral features.

**Churn Prediction Features:**
- `days_since_signup`: Days since user signed up
- `total_sessions`: Total number of sessions
- `avg_session_duration`: Average session duration (minutes)
- `streak_length`: Current streak length (days)
- `last_login_days_ago`: Days since last login
- `content_completion_rate`: Rate of content completion (0-1)
- `notification_response_rate`: Response rate to notifications (0-1)
- `goal_progress_percentage`: Progress towards goals (0-100)

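The trained model is saved as `churn_model.joblib` for use by the application, together with the fitted feature scaler (`churn_scaler.joblib`) that must be applied to inputs before prediction, a feature-importance CSV, and a JSON metadata file.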

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score, 
    precision_score, recall_score, f1_score, roc_auc_score, roc_curve
)
import joblib
import os
import warnings
# Silence all library warnings so cell output stays readable
warnings.filterwarnings(action='ignore')

# Seed NumPy's global RNG so repeated runs produce the same results
np.random.seed(42)

# Plot appearance: matplotlib defaults combined with seaborn's "husl" palette
plt.style.use('default')
sns.set_palette("husl")

for status_line in (
    "Libraries imported successfully!",
    "Ready to build churn prediction model...",
):
    print(status_line)

In [None]:
# Load the training dataset
data_path = "../data/training_dataset.csv"

# Fail fast: every later cell reads `df`, so a missing file must stop
# "Run All" here with a clear error instead of surfacing as a NameError
# several cells further down.
if not os.path.exists(data_path):
    print("❌ Training dataset not found!")
    print("Please run the data_creation.ipynb notebook first to generate the dataset.")
    raise FileNotFoundError(f"Training dataset not found at {data_path}")

df = pd.read_csv(data_path)
print(f"✅ Loaded dataset with {len(df):,} users")
print(f"Dataset shape: {df.shape}")
print(f"Churn rate: {df['churn'].mean():.2%}")
display(df.head())

In [None]:
# Analyze churn distribution
print("=== Churn Distribution Analysis ===")

# value_counts() sorts by frequency, not by label. Pin the order to
# [active (0), churned (1)] so the bar labels and colors below always match
# the counts, and so a class with no rows shows up as 0 instead of a KeyError.
churn_counts = df['churn'].value_counts().reindex([0, 1], fill_value=0)
print(f"Active users (0): {churn_counts[0]:,} ({churn_counts[0]/len(df)*100:.1f}%)")
print(f"Churned users (1): {churn_counts[1]:,} ({churn_counts[1]/len(df)*100:.1f}%)")

# Churn by user type
print("\nChurn by user type:")
churn_by_type = df.groupby('user_type')['churn'].agg(['count', 'sum', 'mean'])
churn_by_type.columns = ['total_users', 'churned_users', 'churn_rate']
# Format the rate as a percentage string for display only
churn_by_type['churn_rate'] = churn_by_type['churn_rate'].apply(lambda x: f"{x:.1%}")
display(churn_by_type)

# Visualize churn distribution
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Overall churn distribution (counts are in fixed Active, Churned order)
axes[0].bar(['Active', 'Churned'], churn_counts.values, color=['green', 'red'], alpha=0.7)
axes[0].set_title('Overall Churn Distribution')
axes[0].set_ylabel('Number of Users')
for i, v in enumerate(churn_counts.values):
    axes[0].text(i, v + 10, f'{v:,}\n({v/len(df)*100:.1f}%)', ha='center')

# Churn by user type. fill_value/reindex guarantee both churn columns exist,
# in [0, 1] order, with 0 instead of NaN for empty (user_type, churn) cells,
# so the colors and the legend labels below stay aligned.
user_type_churn = (
    df.groupby(['user_type', 'churn'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=[0, 1], fill_value=0)
)
user_type_churn.plot(kind='bar', stacked=True, color=['green', 'red'], alpha=0.7, ax=axes[1])
axes[1].set_title('Churn Distribution by User Type')
axes[1].set_xlabel('User Type')
axes[1].set_ylabel('Number of Users')
axes[1].legend(['Active', 'Churned'])
axes[1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Feature columns fed to the churn model; the order here is the column order
# of the training matrix (kept identical to FeaturePreparator)
churn_features = [
    "days_since_signup",
    "total_sessions",
    "avg_session_duration",
    "streak_length",
    "last_login_days_ago",
    "content_completion_rate",
    "notification_response_rate",
    "goal_progress_percentage",
]

# Independent copies so later steps never write back into df
X = df.loc[:, churn_features].copy()
y = df['churn'].copy()

print("Churn prediction features:")
for position, column_name in enumerate(churn_features, start=1):
    print(f"  {position}. {column_name}")

target_distribution = y.value_counts().to_dict()
print(f"\nFeature matrix shape: {X.shape}")
print(f"Target distribution: {target_distribution}")
print("\nFeature statistics:")
display(X.describe())

In [None]:
# Correlation of each feature with the churn label, strongest first
corr_matrix = df[churn_features + ['churn']].corr()
feature_churn_corr = (
    corr_matrix['churn']
    .drop('churn')
    .sort_values(key=abs, ascending=False)
)

print("Feature correlations with churn:")
for feature, corr in feature_churn_corr.items():
    # Up arrow for a positive correlation, down arrow otherwise
    if corr > 0:
        direction = "↑"
    else:
        direction = "↓"
    print(f"  {feature}: {corr:.3f} {direction}")

# Bar length is the absolute correlation; color keeps the sign
# (red = positive, blue = non-positive)
bar_colors = ['red' if value > 0 else 'blue' for value in feature_churn_corr.values]

plt.figure(figsize=(10, 6))
sns.barplot(
    x=feature_churn_corr.abs().values,
    y=feature_churn_corr.index,
    palette=bar_colors,
)
plt.xlabel('Absolute Correlation with Churn')
plt.title('Feature Correlations with Churn (Absolute Values)')
plt.tight_layout()
plt.show()

In [None]:
# Hold out 20% of users for testing, stratified so both splits keep the
# same churn rate
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

n_train, n_test = X_train.shape[0], X_test.shape[0]
print(f"Training set: {n_train:,} samples")
print(f"Test set: {n_test:,} samples")
print(f"Training churn rate: {y_train.mean():.2%}")
print(f"Test churn rate: {y_test.mean():.2%}")

# Standardize features: statistics are learned from the training split only
# and then applied unchanged to the test split
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\n✅ Features scaled successfully")

In [None]:
# Baseline: an untuned 100-tree Random Forest to compare the tuned model against
print("Training baseline Random Forest model...")

# class_weight='balanced' reweights classes to handle class imbalance
rf_baseline = RandomForestClassifier(
    class_weight='balanced',
    n_estimators=100,
    random_state=42,
)
rf_baseline.fit(X_train_scaled, y_train)

# Hard labels plus the probability of the positive (churned) class
y_pred_baseline = rf_baseline.predict(X_test_scaled)
y_pred_proba_baseline = rf_baseline.predict_proba(X_test_scaled)[:, 1]

# Score the baseline on the held-out test split
baseline_accuracy = accuracy_score(y_test, y_pred_baseline)
baseline_precision = precision_score(y_test, y_pred_baseline)
baseline_recall = recall_score(y_test, y_pred_baseline)
baseline_f1 = f1_score(y_test, y_pred_baseline)
baseline_auc = roc_auc_score(y_test, y_pred_proba_baseline)

print("\n=== Baseline Model Performance ===")
print(f"Accuracy: {baseline_accuracy:.3f}")
print(f"Precision: {baseline_precision:.3f}")
print(f"Recall: {baseline_recall:.3f}")
print(f"F1-Score: {baseline_f1:.3f}")
print(f"ROC-AUC: {baseline_auc:.3f}")

print("\nClassification Report:")
baseline_report = classification_report(
    y_test, y_pred_baseline, target_names=['Active', 'Churned']
)
print(baseline_report)

In [None]:
# Tune the Random Forest with an exhaustive grid search
print("Performing hyperparameter tuning...")

# 2 x 3 x 2 x 2 = 24 candidate settings, each scored with 5-fold CV
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 15, 20],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [2, 4],
}

rf_grid = RandomForestClassifier(class_weight='balanced', random_state=42)

grid_search = GridSearchCV(
    estimator=rf_grid,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train_scaled, y_train)

print(f"\n✅ Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation ROC-AUC: {grid_search.best_score_:.3f}")

# No extra training step is needed: with GridSearchCV's default refit=True,
# best_estimator_ has already been refit on the whole training split using
# the winning parameters.
rf_final = grid_search.best_estimator_
y_pred_proba_final = rf_final.predict_proba(X_test_scaled)[:, 1]
y_pred_final = rf_final.predict(X_test_scaled)

In [None]:
# Score the tuned model on the held-out test split
final_accuracy = accuracy_score(y_test, y_pred_final)
final_precision = precision_score(y_test, y_pred_final)
final_recall = recall_score(y_test, y_pred_final)
final_f1 = f1_score(y_test, y_pred_final)
final_auc = roc_auc_score(y_test, y_pred_proba_final)

print("=== Final Model Performance ===")
print(f"Accuracy: {final_accuracy:.3f}")
print(f"Precision: {final_precision:.3f}")
print(f"Recall: {final_recall:.3f}")
print(f"F1-Score: {final_f1:.3f}")
print(f"ROC-AUC: {final_auc:.3f}")

print("\nClassification Report:")
final_report = classification_report(
    y_test, y_pred_final, target_names=['Active', 'Churned']
)
print(final_report)

# Confusion matrix (rows: true label, columns: predicted label); `cm` is
# reused by the heatmap in the visualization cell
cm = confusion_matrix(y_test, y_pred_final)
print("\nConfusion Matrix:")
print(cm)

# 5-fold cross-validated ROC-AUC on the training split, reported as
# mean +/- two standard deviations
cv_scores = cross_val_score(
    rf_final, X_train_scaled, y_train, scoring='roc_auc', cv=5
)
cv_mean = cv_scores.mean()
cv_spread = cv_scores.std() * 2
print(f"\nCross-validation ROC-AUC: {cv_mean:.3f} (+/- {cv_spread:.3f})")

In [None]:
# Rank features by the forest's importances, most important first
importance_by_feature = {
    'feature': churn_features,
    'importance': rf_final.feature_importances_,
}
feature_importance = pd.DataFrame(importance_by_feature).sort_values(
    by='importance', ascending=False
)

print("=== Feature Importance ===")
display(feature_importance)

# Horizontal bar chart of the ranking
plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=feature_importance, palette='viridis')
plt.xlabel('Importance Score')
plt.title('Feature Importance for Churn Prediction')
plt.tight_layout()
plt.show()

# Call out the three strongest predictors
top_features = feature_importance.head(3)
print(f"\n🏆 Top 3 Most Important Features:")
for rank, entry in enumerate(top_features.itertuples(index=False), start=1):
    print(f"  {rank}. {entry.feature}: {entry.importance:.3f}")

In [None]:
# Model performance visualizations: a 2x2 panel with the ROC curve, the
# confusion matrix, predicted-probability histograms, and risk-level counts
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_pred_proba_final)
auc_score = roc_auc_score(y_test, y_pred_proba_final)
axes[0,0].plot(fpr, tpr, label=f'ROC Curve (AUC = {auc_score:.3f})', color='blue')
axes[0,0].plot([0, 1], [0, 1], 'k--', label='Random Classifier')
axes[0,0].set_xlabel('False Positive Rate')
axes[0,0].set_ylabel('True Positive Rate')
axes[0,0].set_title('ROC Curve')
axes[0,0].legend()
axes[0,0].grid(True)

# 2. Confusion Matrix Heatmap (`cm` comes from the evaluation cell)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Active', 'Churned'],
            yticklabels=['Active', 'Churned'], ax=axes[0,1])
axes[0,1].set_title('Confusion Matrix')
axes[0,1].set_ylabel('True Label')
axes[0,1].set_xlabel('Predicted Label')

# 3. Prediction Probability Distribution, split by the true label
axes[1,0].hist(y_pred_proba_final[y_test==0], bins=20, alpha=0.7, label='Active Users', color='green')
axes[1,0].hist(y_pred_proba_final[y_test==1], bins=20, alpha=0.7, label='Churned Users', color='red')
axes[1,0].set_xlabel('Churn Probability')
axes[1,0].set_ylabel('Count')
axes[1,0].set_title('Churn Probability Distribution')
axes[1,0].legend()

# 4. Risk Level Distribution (matching application logic):
#    High >= 0.7, Medium >= 0.4, otherwise Low
risk_levels = [
    "High" if prob >= 0.7 else "Medium" if prob >= 0.4 else "Low"
    for prob in y_pred_proba_final
]

# value_counts() orders by frequency, which would pair the positional colors
# below with the wrong risk levels. Pin the order to Low/Medium/High so
# green/orange/red always land on the matching bar; an absent level shows as 0.
risk_order = ["Low", "Medium", "High"]
risk_counts = pd.Series(risk_levels).value_counts().reindex(risk_order, fill_value=0)
colors_risk = ['green', 'orange', 'red']
axes[1,1].bar(risk_counts.index, risk_counts.values, color=colors_risk, alpha=0.7)
axes[1,1].set_title('Risk Level Distribution')
axes[1,1].set_ylabel('Number of Users')
for i, v in enumerate(risk_counts.values):
    axes[1,1].text(i, v + 5, str(v), ha='center')

plt.tight_layout()
plt.show()

# The boundaries are inclusive on the lower edge (>=), so the text says so
print("Risk level thresholds (matching application logic):")
print("  • Low risk: < 0.4 churn probability")
print("  • Medium risk: 0.4 - 0.7 churn probability")
print("  • High risk: >= 0.7 churn probability")

In [None]:
# Attach the test-set predictions to the original rows so they can be
# broken down by user type
df_test = df.loc[X_test.index].assign(
    predicted_churn_prob=y_pred_proba_final,
    predicted_churn=y_pred_final,
)

print("=== Model Performance by User Type ===")

for user_type in df_test['user_type'].unique():
    in_segment = df_test['user_type'] == user_type
    subset = df_test[in_segment]

    # Actual vs. predicted churn within this segment
    actual_churn_rate = subset['churn'].mean()
    predicted_churn_rate = subset['predicted_churn'].mean()
    avg_churn_prob = subset['predicted_churn_prob'].mean()
    accuracy = accuracy_score(subset['churn'], subset['predicted_churn'])

    print(f"\n{user_type.title()} Users:")
    print(f"  Sample size: {len(subset):,}")
    print(f"  Actual churn rate: {actual_churn_rate:.1%}")
    print(f"  Predicted churn rate: {predicted_churn_rate:.1%}")
    print(f"  Average churn probability: {avg_churn_prob:.3f}")
    print(f"  Accuracy: {accuracy:.3f}")

In [None]:
# Persist the tuned model and everything needed to use it
import json

data_dir = "../data"
os.makedirs(data_dir, exist_ok=True)

# 1. The tuned Random Forest
churn_model_path = os.path.join(data_dir, "churn_model.joblib")
joblib.dump(rf_final, churn_model_path)
print(f"✅ Saved churn model to {churn_model_path}")

# 2. The fitted scaler, so new inputs get the same transform
churn_scaler_path = os.path.join(data_dir, "churn_scaler.joblib")
joblib.dump(scaler, churn_scaler_path)
print(f"✅ Saved feature scaler to {churn_scaler_path}")

# 3. The feature-importance ranking
feature_importance_path = os.path.join(data_dir, "churn_feature_importance.csv")
feature_importance.to_csv(feature_importance_path, index=False)
print(f"✅ Saved feature importance to {feature_importance_path}")

# 4. Metadata: feature order, split sizes, tuned parameters, test-set metrics
#    (cast to plain floats for JSON), and the risk-level thresholds
test_metrics = {
    "accuracy": float(accuracy_score(y_test, y_pred_final)),
    "precision": float(precision_score(y_test, y_pred_final)),
    "recall": float(recall_score(y_test, y_pred_final)),
    "f1_score": float(f1_score(y_test, y_pred_final)),
    "roc_auc": float(roc_auc_score(y_test, y_pred_proba_final)),
}
model_metadata = {
    "model_type": "RandomForestClassifier",
    "feature_names": churn_features,
    "n_features": len(churn_features),
    "training_samples": len(X_train),
    "test_samples": len(X_test),
    "best_params": grid_search.best_params_,
    "performance_metrics": test_metrics,
    "risk_thresholds": {
        "low": "< 0.4",
        "medium": "0.4 - 0.7",
        "high": "> 0.7",
    },
}

metadata_path = os.path.join(data_dir, "churn_model_metadata.json")
with open(metadata_path, 'w') as f:
    json.dump(model_metadata, f, indent=2)
print(f"✅ Saved model metadata to {metadata_path}")

# Closing summary: artifacts written and headline numbers
print(f"\n🎉 Churn prediction model training completed successfully!")
print(f"\n📁 Files created:")
for artifact_path in (
    churn_model_path,
    churn_scaler_path,
    feature_importance_path,
    metadata_path,
):
    print(f"   • {artifact_path}")

summary_accuracy = test_metrics["accuracy"]
summary_auc = test_metrics["roc_auc"]
print(f"\n📈 Final Model Performance:")
print(f"   • Algorithm: Random Forest with {rf_final.n_estimators} trees")
print(f"   • Features: {len(churn_features)} behavioral features")
print(f"   • Accuracy: {summary_accuracy:.3f}")
print(f"   • ROC-AUC: {summary_auc:.3f}")
print(f"   • Training samples: {len(X_train):,} users")
print(f"   • Test samples: {len(X_test):,} users")

In [None]:
# Test the saved model: reload every artifact from disk and score two
# hand-written users end to end
print("=== Testing Saved Model ===")

# Load the model, the scaler, and the metadata written by the save cell
loaded_churn_model = joblib.load(churn_model_path)
loaded_scaler = joblib.load(churn_scaler_path)
with open(metadata_path, 'r') as f:
    loaded_metadata = json.load(f)

print("✅ All models and metadata loaded successfully")

# Column order comes from the persisted metadata rather than a second
# hand-maintained list, so this smoke test cannot drift from the order the
# model was trained with.
feature_names = loaded_metadata["feature_names"]

# Test with sample user data (matching the expected API format)
test_users = [
    {
        "name": "High-risk user",
        "data": {
            "days_since_signup": 15,
            "total_sessions": 3,
            "avg_session_duration": 2.5,
            "streak_length": 0,
            "last_login_days_ago": 10,
            "content_completion_rate": 0.2,
            "notification_response_rate": 0.1,
            "goal_progress_percentage": 15.0
        }
    },
    {
        "name": "Low-risk user",
        "data": {
            "days_since_signup": 120,
            "total_sessions": 45,
            "avg_session_duration": 12.0,
            "streak_length": 15,
            "last_login_days_ago": 1,
            "content_completion_rate": 0.85,
            "notification_response_rate": 0.75,
            "goal_progress_percentage": 80.0
        }
    }
]

print(f"\n🧪 Testing with sample users:")

for test_user in test_users:
    # Prepare features (simulate FeaturePreparator logic). Indexing by name
    # raises KeyError if a sample is missing a feature. The one-row DataFrame
    # carries the column names the scaler was fitted with, so scikit-learn can
    # validate them instead of receiving an anonymous array.
    feature_row = [[test_user["data"][name] for name in feature_names]]
    test_features = pd.DataFrame(feature_row, columns=feature_names)

    # Scale features (returns a plain array, which is what the model was fit on)
    test_features_scaled = loaded_scaler.transform(test_features)

    # Probability of the positive (churned) class for the single row
    churn_probability = loaded_churn_model.predict_proba(test_features_scaled)[0][1]

    # Determine risk level (matching application logic)
    if churn_probability >= 0.7:
        risk_level = "high"
    elif churn_probability >= 0.4:
        risk_level = "medium"
    else:
        risk_level = "low"

    print(f"\n{test_user['name']}:")
    print(f"  Churn probability: {churn_probability:.3f}")
    print(f"  Risk level: {risk_level}")
    print(f"  Input features: {test_user['data']}")

print(f"\n✅ Model is working correctly and ready for production use!")
print(f"\n🔧 Model compatible with:")
print(f"   • src/models/churn_model.py (ChurnPredictor)")
print(f"   • src/utils/feature_prep.py (FeaturePreparator)")
print(f"   • src/utils/model_loader.py (ModelLoader)")