In [41]:
# E-commerce Customer Analytics Project
# Notebook 03: Machine Learning for Customer Lifetime Value Prediction

"""
🎯 What we'll accomplish in this notebook:

1. Load RFM analysis results from Notebook 02
2. Engineer advanced features for machine learning
3. Build Customer Lifetime Value (CLV) prediction model
4. Create customer churn prediction model
5. Evaluate model performance and business impact
6. Generate automated customer scoring system
7. Create predictive business recommendations

🧠 Machine Learning Concepts We'll Learn:
- Supervised Learning (predicting outcomes with known data)
- Feature Engineering (creating predictive variables)
- Random Forest (ensemble learning algorithm)
- Model Training, Validation, and Testing
- Performance Evaluation (R², RMSE, Classification Reports)
- Business Application of ML Results

💰 Business Impact:
- Predict which new customers will become high-value
- Identify customers likely to churn before they leave
- Automate customer scoring for marketing campaigns
- Optimize marketing budget allocation based on predictions
- Create early warning systems for customer retention
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

# Machine Learning libraries
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans

import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including pandas/scikit-learn deprecation and convergence warnings.
warnings.filterwarnings('ignore')

# Plot defaults shared by all figures in this notebook.
plt.style.use('default')
sns.set_palette("husl")

# Banner confirming that the imports above succeeded.
print("✅ Machine Learning Environment Ready!")
print("🤖 Let's build predictive models for customer analytics...")
print("📚 We'll start simple and build up to advanced predictions!")

✅ Machine Learning Environment Ready!
🤖 Let's build predictive models for customer analytics...
📚 We'll start simple and build up to advanced predictions!


In [42]:
# Load data from previous notebooks
#
# Reads the three raw tables written by Notebook 01 and, when available, the
# RFM segment table written by Notebook 02. Date columns are parsed to
# datetime64 so later cells can do date arithmetic on them.
print("📂 Loading customer data and RFM analysis results...")

try:
    # Raw tables (required)
    customers_df = pd.read_csv('../data/raw/customers.csv')
    orders_df = pd.read_csv('../data/raw/orders.csv')
    order_items_df = pd.read_csv('../data/raw/order_items.csv')

    # RFM results from Notebook 02 (optional).
    # Only a missing file is tolerated here: a bare `except:` would also hide
    # parse errors, permission problems and KeyboardInterrupt behind the
    # "not found" message.
    try:
        rfm_segments_df = pd.read_csv('../data/processed/rfm_customer_segments.csv')
        print("✅ RFM segments loaded from Notebook 02")
    except FileNotFoundError:
        print("⚠️ RFM segments not found - we'll recreate them")
        rfm_segments_df = None

    # Convert dates
    orders_df['order_date'] = pd.to_datetime(orders_df['order_date'])
    customers_df['registration_date'] = pd.to_datetime(customers_df['registration_date'])

    print(f"✅ Loaded {len(customers_df):,} customers")
    print(f"✅ Loaded {len(orders_df):,} orders")
    print(f"✅ Loaded {len(order_items_df):,} order items")

    # Quick business overview
    total_revenue = orders_df['order_total'].sum()
    avg_order_value = orders_df['order_total'].mean()

    print("\n📊 Business Overview:")
    print(f"Total Revenue: ${total_revenue:,.2f}")
    print(f"Average Order Value: ${avg_order_value:.2f}")
    print(f"Date Range: {orders_df['order_date'].min().date()} to {orders_df['order_date'].max().date()}")

except FileNotFoundError:
    # NOTE(review): execution continues after this message, so later cells
    # will fail with NameError on the missing frames.
    print("❌ Data files not found! Please run Notebooks 01 and 02 first.")

📂 Loading customer data and RFM analysis results...
✅ RFM segments loaded from Notebook 02
✅ Loaded 1,000 customers
✅ Loaded 2,894 orders
✅ Loaded 5,695 order items

📊 Business Overview:
Total Revenue: $1,566,776.24
Average Order Value: $541.39
Date Range: 2023-06-15 to 2025-06-08


In [43]:
# Create comprehensive features for machine learning - SIMPLE VERSION
print("🔧 MACHINE LEARNING FEATURE ENGINEERING")
print("=" * 50)

def create_ml_features_simple(orders_df, customers_df, order_items_df, analysis_date=None):
    """
    Build one row of numeric features and target columns per customer.

    Parameters
    ----------
    orders_df : DataFrame
        Needs columns customer_id, order_id, order_date, order_total, num_items.
    customers_df : DataFrame
        Needs columns customer_id, age, gender.
    order_items_df : DataFrame
        Needs columns order_id, product_category.
    analysis_date : date-like, optional
        Reference day used to compute recency. Defaults to the day after the
        most recent order in ``orders_df``, so results depend only on the data
        and not on when the notebook is executed. (Using the wall clock makes
        every customer look "at risk" once the data is more than 60 days old,
        which leaves the churn label with a single class.)

    Returns
    -------
    DataFrame indexed by customer_id with the columns
    recency, frequency, total_spent, avg_order_value, spending_std,
    total_items, avg_items, age, gender_num, category_diversity,
    clv_current, clv_potential, is_at_risk, value_tier.

    Notes
    -----
    * Only customers with at least one order appear in the result.
    * Missing values (e.g. spending_std of single-order customers) become 0.
    * ``clv_potential`` and ``is_at_risk`` are rule-based formulas over the
      feature columns themselves, not independently observed outcomes.
    """
    # Work on a derived frame so the caller's orders table is never modified,
    # and make sure the date column really is datetime64.
    orders = orders_df.assign(order_date=pd.to_datetime(orders_df['order_date']))

    if analysis_date is None:
        analysis_date = orders['order_date'].max().normalize() + pd.Timedelta(days=1)
    else:
        analysis_date = pd.Timestamp(analysis_date).normalize()
    print(f"Analysis date: {analysis_date.date()}")

    print("\n1️⃣ Creating basic customer metrics...")

    # Basic RFM metrics via named aggregation (column names are set here, so
    # no separate flattening step is needed).
    customer_metrics = orders.groupby('customer_id').agg(
        last_order=('order_date', 'max'),
        frequency=('order_date', 'count'),        # Frequency: number of orders
        total_spent=('order_total', 'sum'),       # Monetary: total order value
        avg_order_value=('order_total', 'mean'),
        spending_std=('order_total', 'std'),      # Variability in spending
        total_items=('num_items', 'sum'),
        avg_items=('num_items', 'mean'),          # Cart size
    )

    # Recency: whole days between the reference day and the last order day.
    # Time-of-day is dropped first so the result matches a date-to-date difference.
    recency = (analysis_date - customer_metrics['last_order'].dt.normalize()).dt.days
    customer_metrics.insert(0, 'recency', recency)
    customer_metrics = customer_metrics.drop(columns='last_order').round(2)

    print("2️⃣ Adding customer demographics...")

    customer_demo = customers_df.set_index('customer_id')[['age', 'gender']].copy()

    # 1 = 'Female'; every other value (including missing) maps to 0.
    customer_demo['gender_num'] = (customer_demo['gender'] == 'Female').astype(int)

    print("3️⃣ Creating product category features...")

    # Number of distinct product categories each customer has bought from.
    order_with_categories = order_items_df.merge(orders_df[['order_id', 'customer_id']], on='order_id')
    category_diversity = order_with_categories.groupby('customer_id')['product_category'].nunique()

    print("4️⃣ Combining features...")

    ml_features = (
        customer_metrics
        .join(customer_demo[['age', 'gender_num']], how='left')
        .join(category_diversity.rename('category_diversity'), how='left')
        .fillna(0)  # simple approach: any missing value becomes 0
    )

    print("5️⃣ Creating target variables...")

    ml_features['clv_current'] = ml_features['total_spent']

    # Heuristic future value: six average orders, capped at twice what the
    # customer has spent so far.
    ml_features['clv_potential'] = (ml_features['avg_order_value'] * 6).clip(upper=ml_features['total_spent'] * 2)

    # At-risk customers (haven't bought in 60+ days)
    ml_features['is_at_risk'] = (ml_features['recency'] > 60).astype(int)

    # Customer value tier (0=Low, 1=Medium, 2=High) by total_spent tercile
    ml_features['value_tier'] = pd.qcut(ml_features['total_spent'], 3, labels=[0, 1, 2])

    return ml_features

# Build the modelling table, then print a short profile of it.
ml_data = create_ml_features_simple(orders_df, customers_df, order_items_df)

n_rows, n_cols = ml_data.shape
at_risk_flags = ml_data['is_at_risk']

print("\n✅ Machine Learning Dataset Created!")
print(f"📊 Shape: {n_rows:,} customers × {n_cols} features")

print("\n🎯 Key Metrics:")
print(f"Average CLV: ${ml_data['clv_current'].mean():.2f}")
print(f"Average Future Potential: ${ml_data['clv_potential'].mean():.2f}")
print(f"At-risk customers: {at_risk_flags.sum():,} ({at_risk_flags.mean()*100:.1f}%)")

# First five customers as a sanity check of the engineered columns.
print("\n👀 Dataset Preview:")
print(ml_data.head())

print("\n📋 All Features Created:")
print(ml_data.columns.tolist())

🔧 MACHINE LEARNING FEATURE ENGINEERING
Analysis date: 2025-06-09

1️⃣ Creating basic customer metrics...
2️⃣ Adding customer demographics...
3️⃣ Creating product category features...
4️⃣ Combining features...
5️⃣ Creating target variables...

✅ Machine Learning Dataset Created!
📊 Shape: 1,000 customers × 14 features

🎯 Key Metrics:
Average CLV: $1566.78
Average Future Potential: $2567.37
At-risk customers: 512 (51.2%)

👀 Dataset Preview:
             recency  frequency  total_spent  avg_order_value  spending_std  \
customer_id                                                                   
1                  7          6      1575.32           262.55        199.09   
2                 96          2       464.72           232.36        108.92   
3                 70          4      1781.83           445.46        238.16   
4                  6          6      3948.70           658.12        338.57   
5                 44          4      3453.65           863.41        607.95   

    

In [44]:
# Build Customer Lifetime Value Prediction Model
#
# Fits a RandomForestRegressor on an 80/20 split, reports R² / RMSE for both
# parts, lists feature importances and writes in-sample predictions back to
# ml_data['clv_predicted'] for the scoring cells below.
#
# NOTE(review): the target `clv_potential` is computed in
# create_ml_features_simple as min(6 * avg_order_value, 2 * total_spent), and
# total_spent is frequency * avg_order_value. Both inputs are in
# feature_columns, so the model is re-learning a known formula; the R² printed
# here does not measure the ability to forecast future spend.
print("🤖 BUILDING CLV PREDICTION MODEL")
print("=" * 50)

# Predictor columns (also reused by the churn model cells)
feature_columns = [
    'recency', 'frequency', 'avg_order_value', 'total_items',
    'avg_items', 'age', 'gender_num', 'category_diversity'
]

X = ml_data[feature_columns].copy()
y = ml_data['clv_potential'].copy()  # regression target

print("🎯 Predicting: Customer Lifetime Value Potential")
print(f"📊 Using {len(feature_columns)} features to predict CLV")
print(f"🔢 Training on {len(X):,} customers")

# Hold out 20% of customers for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

print(f"📚 Training set: {len(X_train):,} customers")
print(f"🧪 Testing set: {len(X_test):,} customers")

print("\n🏗️ Training Random Forest Model...")

clv_model = RandomForestRegressor(
    n_estimators=100,      # number of trees
    max_depth=10,          # depth cap to limit overfitting
    min_samples_split=5,   # smallest node that may still be split
    random_state=42,       # reproducible forest
)
clv_model.fit(X_train, y_train)

# Predictions on both partitions
y_pred_train = clv_model.predict(X_train)
y_pred_test = clv_model.predict(X_test)

print("✅ Model training complete!")

print("\n📊 MODEL PERFORMANCE:")
print("=" * 30)

# R² and RMSE for train and test
train_r2 = r2_score(y_train, y_pred_train)
train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
test_r2 = r2_score(y_test, y_pred_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

print(f"Training R² Score: {train_r2:.3f}")
print(f"Testing R² Score: {test_r2:.3f}")
print(f"Training RMSE: ${train_rmse:.2f}")
print(f"Testing RMSE: ${test_rmse:.2f}")

# Verdict: first threshold the test R² exceeds wins; otherwise fall through.
for r2_floor, verdict in (
    (0.7, "🎉 Excellent model performance! (R² > 0.7)"),
    (0.5, "✅ Good model performance! (R² > 0.5)"),
):
    if test_r2 > r2_floor:
        print(verdict)
        break
else:
    print("⚠️ Model needs improvement (R² < 0.5)")

print("\n🔍 FEATURE IMPORTANCE:")
print("=" * 25)

feature_importance = pd.DataFrame({
    'feature': feature_columns,
    'importance': clv_model.feature_importances_
}).sort_values('importance', ascending=False)

print("What drives Customer Lifetime Value:")
for feature_name, weight in zip(feature_importance['feature'], feature_importance['importance']):
    print(f"{feature_name:<20}: {weight:.3f}")

# Predictions for every customer (training rows included) feed the scoring system
ml_data['clv_predicted'] = clv_model.predict(X)

print("\n✅ CLV predictions added to dataset!")

🤖 BUILDING CLV PREDICTION MODEL
🎯 Predicting: Customer Lifetime Value Potential
📊 Using 8 features to predict CLV
🔢 Training on 1,000 customers
📚 Training set: 800 customers
🧪 Testing set: 200 customers

🏗️ Training Random Forest Model...
✅ Model training complete!

📊 MODEL PERFORMANCE:
Training R² Score: 0.999
Testing R² Score: 0.991
Training RMSE: $55.09
Testing RMSE: $154.53
🎉 Excellent model performance! (R² > 0.7)

🔍 FEATURE IMPORTANCE:
What drives Customer Lifetime Value:
avg_order_value     : 0.836
frequency           : 0.146
total_items         : 0.016
recency             : 0.001
age                 : 0.000
category_diversity  : 0.000
avg_items           : 0.000
gender_num          : 0.000

✅ CLV predictions added to dataset!


In [45]:
# Build Churn Prediction Model
#
# Trains a RandomForestClassifier on the same feature_columns as the CLV model
# to predict ml_data['is_at_risk'], prints a classification report and the top
# five importances, stores churn_probability / churn_predicted on ml_data and
# lists above-median spenders whose churn probability exceeds 0.7.
#
# NOTE(review): `is_at_risk` is defined as recency > 60 and `recency` is one of
# the input features, so the label can be read directly from an input. The
# perfect scores in the report follow from that definition.
print("🚨 BUILDING CHURN PREDICTION MODEL")
print("=" * 50)

X_churn = ml_data[feature_columns].copy()
y_churn = ml_data['is_at_risk'].copy()  # 1 = at risk

print("🎯 Predicting: Customer Churn Risk")
print(f"📊 At-risk customers: {y_churn.sum():,} out of {len(y_churn):,} total")
print(f"📈 Churn rate: {y_churn.mean()*100:.1f}%")

# Stratified 80/20 split keeps the class ratio equal in both parts
X_churn_train, X_churn_test, y_churn_train, y_churn_test = train_test_split(
    X_churn, y_churn, stratify=y_churn, random_state=42, test_size=0.2
)

print("\n🏗️ Training Churn Prediction Model...")

churn_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    class_weight='balanced',  # reweights classes inversely to their frequency
    random_state=42,
)
churn_model.fit(X_churn_train, y_churn_train)

# Hard labels for both partitions, plus P(at risk) on the test part
churn_pred_train = churn_model.predict(X_churn_train)
churn_pred_test = churn_model.predict(X_churn_test)
churn_pred_proba = churn_model.predict_proba(X_churn_test)[:, 1]

print("✅ Churn model training complete!")

print("\n📊 CHURN MODEL PERFORMANCE:")
print("=" * 35)

print("Classification Report:")
print(classification_report(y_churn_test, churn_pred_test))

churn_importance = pd.DataFrame({
    'feature': feature_columns,
    'importance': churn_model.feature_importances_
}).sort_values('importance', ascending=False)

print("\n🔍 CHURN PREDICTION - MOST IMPORTANT FEATURES:")
print("What predicts if a customer will leave:")
top_churn_drivers = churn_importance.head(5)
for feature_name, weight in zip(top_churn_drivers['feature'], top_churn_drivers['importance']):
    print(f"{feature_name:<20}: {weight:.3f}")

# Score every customer (training rows included)
ml_data['churn_probability'] = churn_model.predict_proba(X_churn)[:, 1]
ml_data['churn_predicted'] = churn_model.predict(X_churn)

# Valuable (above-median spend) customers with churn probability above 0.7
likely_to_churn = ml_data['churn_probability'] > 0.7
above_median_spend = ml_data['total_spent'] > ml_data['total_spent'].median()
high_risk_customers = ml_data[likely_to_churn & above_median_spend]

print("\n🚨 HIGH-RISK VALUABLE CUSTOMERS (Need immediate attention):")
print(f"Found {len(high_risk_customers)} customers at high risk of leaving")
if not high_risk_customers.empty:
    print(high_risk_customers[['recency', 'frequency', 'total_spent', 'churn_probability']].round(3))

🚨 BUILDING CHURN PREDICTION MODEL
🎯 Predicting: Customer Churn Risk
📊 At-risk customers: 512 out of 1,000 total
📈 Churn rate: 51.2%

🏗️ Training Churn Prediction Model...
✅ Churn model training complete!

📊 CHURN MODEL PERFORMANCE:
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        98
           1       1.00      1.00      1.00       102

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200


🔍 CHURN PREDICTION - MOST IMPORTANT FEATURES:
What predicts if a customer will leave:
recency             : 0.885
avg_order_value     : 0.028
age                 : 0.020
total_items         : 0.020
frequency           : 0.017

🚨 HIGH-RISK VALUABLE CUSTOMERS (Need immediate attention):
Found 227 customers at high risk of leaving
             recency  frequency  total_spent  churn_probability
customer_id            

In [46]:
# Build Churn Prediction Model
#
# NOTE(review): this cell repeats the previous churn cell with the same data,
# split seed and hyper-parameters, so it reproduces the same model and the same
# ml_data columns. It is kept so the notebook's cell sequence is unchanged;
# consider deleting one of the two.
#
# Changes from the original cell: the mid-notebook re-import of
# classification_report (already imported in the first cell) and the second,
# identical churn_model.predict(X_churn_test) call were removed.
print("🚨 BUILDING CHURN PREDICTION MODEL")
print("=" * 50)

# Use same features for churn prediction
X_churn = ml_data[feature_columns].copy()
y_churn = ml_data['is_at_risk'].copy()  # Predict if customer is at risk

print("🎯 Predicting: Customer Churn Risk")
print(f"📊 At-risk customers: {y_churn.sum():,} out of {len(y_churn):,} total")
print(f"📈 Churn rate: {y_churn.mean()*100:.1f}%")

# Stratified split keeps the at-risk share equal in train and test
X_churn_train, X_churn_test, y_churn_train, y_churn_test = train_test_split(
    X_churn, y_churn, test_size=0.2, random_state=42, stratify=y_churn
)

# Build churn prediction model
print("\n🏗️ Training Churn Prediction Model...")

churn_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    max_depth=10,
    min_samples_split=5,
    class_weight='balanced'  # reweights classes inversely to their frequency
)

# Train the model
churn_model.fit(X_churn_train, y_churn_train)

# Hard-label predictions for both partitions (computed once)
churn_pred_train = churn_model.predict(X_churn_train)
churn_pred_test = churn_model.predict(X_churn_test)

print("✅ Churn model training complete!")

# Score every customer; the scoring system below reads these two columns
ml_data['churn_probability'] = churn_model.predict_proba(X_churn)[:, 1]
ml_data['churn_predicted'] = churn_model.predict(X_churn)

print("\n✅ Churn predictions added to dataset!")

# Quick evaluation on the held-out 20%
print("\n📊 CHURN MODEL PERFORMANCE:")
print(classification_report(y_churn_test, churn_pred_test))

🚨 BUILDING CHURN PREDICTION MODEL
🎯 Predicting: Customer Churn Risk
📊 At-risk customers: 512 out of 1,000 total
📈 Churn rate: 51.2%

🏗️ Training Churn Prediction Model...
✅ Churn model training complete!

✅ Churn predictions added to dataset!

📊 CHURN MODEL PERFORMANCE:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        98
           1       1.00      1.00      1.00       102

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200



In [47]:
# Create Comprehensive Customer Scoring System
print("🎯 AUTOMATED CUSTOMER SCORING SYSTEM")
print("=" * 50)

def create_customer_scores(ml_data):
    """
    Derive 0-100 business scores and a tier label for every customer.

    Parameters
    ----------
    ml_data : DataFrame
        Must contain clv_predicted, frequency, category_diversity, avg_items
        and churn_probability. The input frame is not modified.

    Returns
    -------
    DataFrame
        Copy of ``ml_data`` with five extra columns:
        clv_score        - min-max scaled clv_predicted (0 when all values are equal)
        engagement_score - mean of the three engagement columns, each divided
                           by its own maximum (a column whose maximum is not
                           positive contributes 0)
        risk_score       - churn_probability * 100
        overall_score    - 0.4*clv + 0.3*engagement + 0.3*(100 - risk)
        customer_tier    - 'Platinum' (>=80), 'Gold' (>=60), 'Silver' (>=40),
                           otherwise 'Bronze'
    """

    print("🔢 Calculating customer scores...")

    scores_df = ml_data.copy()

    # 1. CLV Score (0-100). A constant column has zero range; dividing by it
    #    would turn every score into NaN, so that case is scored as 0.
    clv = scores_df['clv_predicted']
    clv_min = clv.min()
    clv_range = clv.max() - clv_min
    if clv_range > 0:
        scores_df['clv_score'] = ((clv - clv_min) / clv_range * 100).round(0)
    else:
        scores_df['clv_score'] = 0.0

    # 2. Engagement Score (0-100): each component is scaled by its maximum.
    engagement_components = ['frequency', 'category_diversity', 'avg_items']
    engagement_normalized = scores_df[engagement_components].astype(float)

    for col in engagement_components:
        col_max = engagement_normalized[col].max()
        if col_max > 0:
            engagement_normalized[col] = engagement_normalized[col] / col_max
        else:
            # 0/0 would give NaN, which mean() skips, silently re-weighting
            # the remaining components.
            engagement_normalized[col] = 0.0

    scores_df['engagement_score'] = (engagement_normalized.mean(axis=1) * 100).round(0)

    # 3. Risk Score (0-100, where 100 = highest risk)
    scores_df['risk_score'] = (scores_df['churn_probability'] * 100).round(0)

    # 4. Overall Customer Score (weighted combination)
    scores_df['overall_score'] = (
        scores_df['clv_score'] * 0.4 +           # 40% CLV
        scores_df['engagement_score'] * 0.3 +    # 30% Engagement
        (100 - scores_df['risk_score']) * 0.3    # 30% Retention (inverse of risk)
    ).round(0)

    # 5. Customer Tier: first matching threshold wins; anything else
    #    (including a missing score) is 'Bronze'.
    overall = scores_df['overall_score']
    scores_df['customer_tier'] = np.select(
        [overall >= 80, overall >= 60, overall >= 40],
        ['Platinum', 'Gold', 'Silver'],
        default='Bronze',
    )

    return scores_df

# Score all customers, then summarise the resulting tiers.
customer_scores = create_customer_scores(ml_data)

print("✅ Customer scoring system created!")

print("\n📊 CUSTOMER SCORING SUMMARY:")
print("=" * 35)

# How many customers landed in each tier
print("Customer Tier Distribution:")
print(customer_scores['customer_tier'].value_counts())

# Historical revenue per tier, largest first
print("\n💰 Revenue by Customer Tier:")
tier_revenue = (
    customer_scores
    .groupby('customer_tier')['total_spent']
    .sum()
    .sort_values(ascending=False)
)
for tier, revenue in tier_revenue.items():
    count = int((customer_scores['customer_tier'] == tier).sum())
    print(f"{tier}: ${revenue:,.0f} from {count} customers")

# Three best-scoring customers of every non-empty tier
print("\n🏆 TOP CUSTOMERS BY TIER:")
top_customer_columns = ['recency', 'frequency', 'total_spent', 'overall_score', 'clv_score', 'risk_score']
for tier in ['Platinum', 'Gold', 'Silver', 'Bronze']:
    tier_customers = customer_scores[customer_scores['customer_tier'] == tier]
    if tier_customers.empty:
        continue
    top_in_tier = tier_customers.nlargest(3, 'overall_score')
    print(f"\n{tier} Tier (Top 3):")
    print(top_in_tier[top_customer_columns].round(1))

🎯 AUTOMATED CUSTOMER SCORING SYSTEM
🔢 Calculating customer scores...
✅ Customer scoring system created!

📊 CUSTOMER SCORING SUMMARY:
Customer Tier Distribution:
customer_tier
Bronze      456
Silver      312
Gold        216
Platinum     16
Name: count, dtype: int64

💰 Revenue by Customer Tier:
Gold: $569,335 from 216 customers
Bronze: $467,874 from 456 customers
Silver: $461,071 from 312 customers
Platinum: $68,496 from 16 customers

🏆 TOP CUSTOMERS BY TIER:

Platinum Tier (Top 3):
             recency  frequency  total_spent  overall_score  clv_score  \
customer_id                                                              
374               38          3       4638.6           90.0      100.0   
538               10          4       5198.2           88.0       89.0   
889               40          4       5268.1           88.0       90.0   

             risk_score  
customer_id              
374                 6.0  
538                 2.0  
889                 1.0  

Gold Tier (T

In [48]:
# Generate ML-Based Business Action Plan
print("💼 ML-POWERED BUSINESS ACTION PLAN")
print("=" * 50)

def generate_ml_action_plan(customer_scores):
    """
    Print a tier-by-tier action plan derived from the scored customers.

    The report has three parts: per-tier statistics with a fixed list of
    recommended actions, two "special attention" checks (Platinum customers
    with risk_score above 50, and customers with clv_score above 70 but
    engagement_score below 30), and a suggested marketing budget split.

    Reads the columns customer_tier, clv_predicted, risk_score, total_spent,
    clv_score and engagement_score. Nothing is returned; output goes to stdout.
    """

    def rows_for(tier_name):
        # All customers assigned to the given tier
        return customer_scores[customer_scores['customer_tier'] == tier_name]

    # Focus statement and recommended actions per tier
    strategies = {
        'Platinum': {
            'focus': 'Retain at all costs',
            'actions': [
                '💎 White-glove customer service',
                '🎁 Exclusive products and early access',
                '📞 Dedicated account manager',
                '⭐ Brand ambassador programs'
            ]
        },
        'Gold': {
            'focus': 'Upsell and expand relationship',
            'actions': [
                '📈 Premium product recommendations',
                '💳 Credit line increases',
                '🎯 Cross-category promotions',
                '🏆 Loyalty program upgrades'
            ]
        },
        'Silver': {
            'focus': 'Increase engagement and frequency',
            'actions': [
                '📱 Personalized email campaigns',
                '🎈 Free shipping incentives',
                '🔄 Subscription program offers',
                '📊 Behavior tracking and optimization'
            ]
        },
        'Bronze': {
            'focus': 'Basic retention and education',
            'actions': [
                '📧 Educational content and tips',
                '💰 Discount offers for next purchase',
                '🆕 New product announcements',
                '📋 Feedback surveys'
            ]
        }
    }

    # Suggested share of the marketing budget; tiers not listed get 5%
    budget_share = {'Platinum': 40, 'Gold': 35, 'Silver': 20}

    print("🎯 TIER-BASED ACTION STRATEGIES:")
    print("=" * 35)

    for tier, strategy in strategies.items():
        tier_data = rows_for(tier)
        if tier_data.empty:
            continue  # tiers without customers are left out of the report

        avg_clv = tier_data['clv_predicted'].mean()
        avg_risk = tier_data['risk_score'].mean()
        total_revenue = tier_data['total_spent'].sum()

        print(f"\n🏷️  {tier.upper()} TIER")
        print(f"   👥 Customers: {len(tier_data)}")
        print(f"   💰 Revenue: ${total_revenue:,.0f}")
        print(f"   📈 Avg Predicted CLV: ${avg_clv:.0f}")
        print(f"   ⚠️  Avg Risk Score: {avg_risk:.0f}%")
        print(f"   🎯 Focus: {strategy['focus']}")
        print("   📋 Actions:")
        for action in strategy['actions']:
            print(f"      {action}")

    print("\n🚨 SPECIAL ATTENTION CUSTOMERS:")
    print("=" * 35)

    # High value, high risk
    is_platinum = customer_scores['customer_tier'] == 'Platinum'
    is_risky = customer_scores['risk_score'] > 50
    platinum_at_risk = customer_scores[is_platinum & is_risky]

    if platinum_at_risk.empty:
        print("✅ No Platinum customers at high risk")
    else:
        print(f"⚠️  HIGH-VALUE AT-RISK: {len(platinum_at_risk)} Platinum customers with risk >50%")
        print("   → Immediate CEO/VP intervention required")

    # High potential, low engagement
    has_high_clv = customer_scores['clv_score'] > 70
    has_low_engagement = customer_scores['engagement_score'] < 30
    high_potential_low_engagement = customer_scores[has_high_clv & has_low_engagement]

    if not high_potential_low_engagement.empty:
        print(f"💎 UNTAPPED POTENTIAL: {len(high_potential_low_engagement)} customers with high CLV but low engagement")
        print("   → Focus on activation campaigns")

    print("\n💰 MARKETING BUDGET ALLOCATION:")
    print("=" * 35)

    total_customers = len(customer_scores)
    overall_revenue = customer_scores['total_spent'].sum()
    for tier in ['Platinum', 'Gold', 'Silver', 'Bronze']:
        tier_rows = rows_for(tier)
        tier_count = len(tier_rows)
        tier_revenue = tier_rows['total_spent'].sum()
        tier_percentage = (tier_count / total_customers) * 100
        revenue_percentage = (tier_revenue / overall_revenue) * 100
        suggested_budget = budget_share.get(tier, 5)

        print(f"{tier}: {tier_percentage:.1f}% customers, {revenue_percentage:.1f}% revenue → {suggested_budget}% budget")

# Print the tier action plan for the scored customers
generate_ml_action_plan(customer_scores)

# ROI Calculations
# NOTE(review): `clv_predicted` estimates the rule-based `clv_potential`
# column, so "growth" here is that heuristic minus historical spend, not a
# forecast validated against future revenue.
print("\n📈 PREDICTED ROI FROM ML INSIGHTS:")
print("=" * 40)

total_current_clv = customer_scores['clv_current'].sum()
total_predicted_clv = customer_scores['clv_predicted'].sum()
potential_growth = total_predicted_clv - total_current_clv
growth_pct = (potential_growth/total_current_clv)*100

print(f"Current Total CLV: ${total_current_clv:,.0f}")
print(f"Predicted Total CLV: ${total_predicted_clv:,.0f}")
print(f"Growth Potential: ${potential_growth:,.0f} ({growth_pct:.1f}% increase)")

# Historical spend of customers whose risk score exceeds 70
high_risk_rows = customer_scores['risk_score'] > 70
at_risk_revenue = customer_scores.loc[high_risk_rows, 'total_spent'].sum()
print(f"Revenue at Risk (high churn probability): ${at_risk_revenue:,.0f}")
print(f"Potential Recovery (30% success rate): ${at_risk_revenue * 0.3:,.0f}")

💼 ML-POWERED BUSINESS ACTION PLAN
🎯 TIER-BASED ACTION STRATEGIES:

🏷️  PLATINUM TIER
   👥 Customers: 16
   💰 Revenue: $68,496
   📈 Avg Predicted CLV: $7105
   ⚠️  Avg Risk Score: 3%
   🎯 Focus: Retain at all costs
   📋 Actions:
      💎 White-glove customer service
      🎁 Exclusive products and early access
      📞 Dedicated account manager
      ⭐ Brand ambassador programs

🏷️  GOLD TIER
   👥 Customers: 216
   💰 Revenue: $569,335
   📈 Avg Predicted CLV: $3835
   ⚠️  Avg Risk Score: 3%
   🎯 Focus: Upsell and expand relationship
   📋 Actions:
      📈 Premium product recommendations
      💳 Credit line increases
      🎯 Cross-category promotions
      🏆 Loyalty program upgrades

🏷️  SILVER TIER
   👥 Customers: 312
   💰 Revenue: $461,071
   📈 Avg Predicted CLV: $2458
   ⚠️  Avg Risk Score: 25%
   🎯 Focus: Increase engagement and frequency
   📋 Actions:
      📱 Personalized email campaigns
      🎈 Free shipping incentives
      🔄 Subscription program offers
      📊 Behavior tracking and op

In [52]:
# Save ML Results and Project Conclusion
#
# Writes three kinds of CSV under ../data/processed/: the full scored table,
# one trimmed list per non-empty tier, and a list of above-median spenders
# with a risk score above 60.
print("💾 SAVING MACHINE LEARNING RESULTS")
print("=" * 50)

# customer_id is the index of customer_scores; turn it into a regular column
customer_scores_save = customer_scores.reset_index()

customer_scores_save.to_csv('../data/processed/ml_customer_scores.csv', index=False)
print("✅ Complete customer scores saved")

# Per-tier lists for marketing teams (only the columns they need)
tier_export_columns = [
    'customer_id', 'recency', 'frequency', 'total_spent',
    'clv_predicted', 'churn_probability', 'overall_score',
]
for tier in ['Platinum', 'Gold', 'Silver', 'Bronze']:
    tier_customers = customer_scores_save[customer_scores_save['customer_tier'] == tier]
    if tier_customers.empty:
        continue
    tier_customers[tier_export_columns].to_csv(
        f'../data/processed/ml_{tier.lower()}_customers.csv', index=False
    )
    print(f"✅ {tier} customer list saved ({len(tier_customers)} customers)")

# High-priority action list.
# NOTE(review): the cut-off here is 60, while the churn cell uses
# churn_probability > 0.7 and the ROI cell uses risk_score > 70 — confirm
# which threshold is intended.
risk_above_60 = customer_scores_save['risk_score'] > 60
spend_above_median = customer_scores_save['total_spent'] > customer_scores_save['total_spent'].median()
high_risk_valuable = customer_scores_save[risk_above_60 & spend_above_median]

if high_risk_valuable.empty:
    print("✅ No high-risk valuable customers found")
else:
    high_risk_valuable.to_csv('../data/processed/high_risk_valuable_customers.csv', index=False)
    print(f"✅ High-risk valuable customers list saved ({len(high_risk_valuable)} customers)")

# Closing banner: a static recap of what this notebook produced.
print("\n🎉 NOTEBOOK 03 COMPLETE!")
print("=" * 40)
print("""
📊 What We Accomplished:
✅ Built Customer Lifetime Value prediction model
✅ Created customer churn prediction model  
✅ Generated automated customer scoring system
✅ Created ML-powered business action plans
✅ Identified high-value and at-risk customers
✅ Provided data-driven budget allocation recommendations

🤖 Machine Learning Models Performance:
- CLV Prediction Model: R² score shows prediction accuracy
- Churn Prediction Model: Identifies customers likely to leave
- Customer Scoring System: Automated tier classification

🎯 Business Impact:
- Predict future customer value before it happens
- Identify customers at risk of leaving before they do
- Automate customer segmentation for marketing campaigns
- Optimize marketing budget based on ML predictions
- Create early warning systems for customer retention

💰 Key Insights:
- Growth potential identified through predictive modeling
- Revenue at risk quantified through churn prediction
- Customer tiers created for targeted marketing strategies
- ROI potential calculated for business investment decisions

🚀 Next Steps:
- Implement automated scoring in production systems
- Create real-time dashboards for customer monitoring
- Set up automated alerts for high-risk customers
- Deploy models for ongoing customer prediction

💡 Files Created:
- Complete customer scoring database with ML predictions
- Tier-specific customer lists for marketing campaigns
- High-priority action lists for immediate intervention
- Predictive models ready for production deployment

This completes your end-to-end customer analytics project - from raw data 
to machine learning predictions to actionable business strategy! 🎯
""")

# Final Project Summary
# Dataset sizes are taken from the frames loaded at the top of the notebook
# rather than hard-coded, so the text always agrees with the data analysed
# (the loaded data has 2,894 orders, not "3000+"). test_r2 comes from the
# CLV model cell.
print("\n🏆 PROJECT PORTFOLIO SUMMARY")
print("=" * 50)
print(f"""
Your Complete E-commerce Customer Analytics Project:

📁 NOTEBOOK 01: Data Generation
   • Created realistic e-commerce dataset ({len(customers_df):,} customers, {len(orders_df):,} orders)
   • Built order and product structure
   • Established data foundation

📁 NOTEBOOK 02: RFM Analysis & Customer Segmentation  
   • Calculated Recency, Frequency, Monetary metrics
   • Created customer segments (Champions, At Risk, etc.)
   • Generated business action plans

📁 NOTEBOOK 03: Machine Learning & Predictive Analytics
   • Built CLV prediction model (R² = {test_r2:.3f})
   • Created churn prediction system
   • Automated customer scoring and tier classification

🎯 Business Value Created:
   • Predictive customer analytics system
   • Automated marketing campaign targeting
   • Revenue optimization recommendations
   • Customer retention early warning system
""")

💾 SAVING MACHINE LEARNING RESULTS
✅ Complete customer scores saved
✅ Platinum customer list saved (16 customers)
✅ Gold customer list saved (216 customers)
✅ Silver customer list saved (312 customers)
✅ Bronze customer list saved (456 customers)
✅ High-risk valuable customers list saved (227 customers)

🎉 NOTEBOOK 03 COMPLETE!

📊 What We Accomplished:
✅ Built Customer Lifetime Value prediction model
✅ Created customer churn prediction model  
✅ Generated automated customer scoring system
✅ Created ML-powered business action plans
✅ Identified high-value and at-risk customers
✅ Provided data-driven budget allocation recommendations

🤖 Machine Learning Models Performance:
- CLV Prediction Model: R² score shows prediction accuracy
- Churn Prediction Model: Identifies customers likely to leave
- Customer Scoring System: Automated tier classification

🎯 Business Impact:
- Predict future customer value before it happens
- Identify customers at risk of leaving before they do
- Automate custom