In [1]:
# Parameters
CLUSTER_RESULT_PATH = "data/processed/customer_clusters_from_rules.csv"
RULES_INPUT_PATH = "data/processed/rules_fpgrowth_filtered.csv"
CLEANED_DATA_PATH = "data/processed/cleaned_uk_data.csv"
TOP_RULES_PER_CLUSTER = 10
TOP_PRODUCTS_PER_CLUSTER = 15
MIN_RULE_CONFIDENCE = 0.3
PROFILING_OUTPUT_PATH = "data/processed/cluster_profiles_detailed.csv"
MARKETING_RECOMMENDATIONS_PATH = "data/processed/marketing_recommendations.csv"
PLOT_RFM_COMPARISON = False
PLOT_PRODUCT_HEATMAP = False
PLOT_CLUSTER_RADAR = False


In [2]:
# Cell 1: PARAMETERS (must have "parameters" tag)
# parameters
CLUSTER_RESULT_PATH = "data/processed/customer_clusters_from_rules.csv"
RULES_INPUT_PATH = "data/processed/rules_fpgrowth_filtered.csv"
CLEANED_DATA_PATH = "data/processed/cleaned_uk_data.csv"
TOP_RULES_PER_CLUSTER = 10
TOP_PRODUCTS_PER_CLUSTER = 15
MIN_RULE_CONFIDENCE = 0.3
PROFILING_OUTPUT_PATH = "data/processed/cluster_profiles_detailed.csv"
MARKETING_RECOMMENDATIONS_PATH = "data/processed/marketing_recommendations.csv"
PLOT_RFM_COMPARISON = False
PLOT_PRODUCT_HEATMAP = False
PLOT_CLUSTER_RADAR = False

In [3]:
# Cell 2: Import libraries (KHÔNG import cluster_library)
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (12, 8)
plt.rcParams["axes.titlesize"] = 16
plt.rcParams["axes.labelsize"] = 14

print("="*80)
print("STEP 7: CLUSTER PROFILING AND INTERPRETATION")
print("="*80)

STEP 7: CLUSTER PROFILING AND INTERPRETATION


In [4]:
# Cell 3: Load data
print("\n1. LOADING DATA:")

cluster_results = pd.read_csv(CLUSTER_RESULT_PATH)
cleaned_data = pd.read_csv(CLEANED_DATA_PATH, parse_dates=["InvoiceDate"])

# Try to load rules file (may have different names)
rules_files = [
    "data/processed/rules_fpgrowth_filtered.csv",
    "data/processed/rules_apriori_filtered.csv",
    "data/processed/top_k_rules_fp.csv",
    "data/processed/top_k_rules.csv"
]

rules_df = None
for file in rules_files:
    if os.path.exists(file):
        rules_df = pd.read_csv(file)
        print(f"• Loaded rules from: {file}")
        break

if rules_df is None:
    print("⚠️ No rules file found, creating empty dataframe")
    rules_df = pd.DataFrame(columns=['antecedents_str', 'consequents_str', 'support', 'confidence', 'lift'])

print(f"• Cluster results: {cluster_results.shape[0]} customers, {cluster_results['cluster'].nunique()} clusters")
print(f"• Transaction data: {cleaned_data.shape[0]} transactions")
print(f"• Association rules: {rules_df.shape[0]} rules")


1. LOADING DATA:


• Loaded rules from: data/processed/rules_fpgrowth_filtered.csv
• Cluster results: 3921 customers, 2 clusters
• Transaction data: 485123 transactions
• Association rules: 21 rules


  cleaned_data = pd.read_csv(CLEANED_DATA_PATH, parse_dates=["InvoiceDate"])


In [5]:
# Cell 4: Define helper functions (thay vì import từ library)
def analyze_rfm_by_cluster(cluster_results, cleaned_data):
    """Calculate RFM metrics for each cluster"""
    # Merge data
    merged = pd.merge(
        cleaned_data,
        cluster_results[['CustomerID', 'cluster']],
        on='CustomerID',
        how='inner'
    )
    
    # Reference date
    reference_date = merged['InvoiceDate'].max() + pd.Timedelta(days=1)
    
    # Calculate RFM per customer
    rfm_customer = merged.groupby(['CustomerID', 'cluster']).agg({
        'InvoiceDate': lambda x: (reference_date - x.max()).days,
        'InvoiceNo': 'nunique',
        'TotalPrice': 'sum'
    }).rename(columns={
        'InvoiceDate': 'Recency',
        'InvoiceNo': 'Frequency',
        'TotalPrice': 'Monetary'
    }).reset_index()
    
    # Aggregate by cluster
    rfm_by_cluster = rfm_customer.groupby('cluster').agg({
        'CustomerID': 'count',
        'Recency': 'mean',
        'Frequency': 'mean',
        'Monetary': 'mean'
    }).rename(columns={'CustomerID': 'n_customers'}).reset_index()
    
    return rfm_by_cluster

def assign_cluster_names(rfm_by_cluster):
    """Assign meaningful names to clusters"""
    cluster_names = {}
    
    for _, row in rfm_by_cluster.iterrows():
        cluster_id = row['cluster']
        recency = row['Recency']
        frequency = row['Frequency']
        monetary = row['Monetary']
        
        # Simple classification
        if recency < 30 and frequency > 5 and monetary > 100:
            english_name = "VIP Loyal Customers"
            vietnamese_name = "Khách VIP trung thành"
            persona = "High-value frequent buyers"
        elif recency < 60 and monetary > 50:
            english_name = "Regular Value Customers"
            vietnamese_name = "Khách hàng giá trị thường xuyên"
            persona = "Regular customers with decent spending"
        elif recency > 90:
            english_name = "At-Risk Inactive Customers"
            vietnamese_name = "Khách hàng ngủ đông có nguy cơ mất"
            persona = "Inactive customers"
        elif frequency > 3:
            english_name = "Frequent Low-Spend Customers"
            vietnamese_name = "Khách hàng thường xuyên chi tiêu thấp"
            persona = "Frequent buyers with low value"
        else:
            english_name = "Occasional Shoppers"
            vietnamese_name = "Khách hàng mua sắm không thường xuyên"
            persona = "Occasional shoppers"
        
        cluster_names[cluster_id] = {
            'english_name': english_name,
            'vietnamese_name': vietnamese_name,
            'persona_description': persona
        }
    
    return cluster_names

def generate_marketing_recommendations(rfm_by_cluster, cluster_names):
    """Generate marketing recommendations"""
    recommendations = []
    
    for _, row in rfm_by_cluster.iterrows():
        cluster_id = row['cluster']
        names = cluster_names[cluster_id]
        
        # Recommendation 1: Based on recency
        if row['Recency'] > 90:
            recommendations.append({
                'cluster': cluster_id,
                'strategy_type': 'Reactivation Campaign',
                'recommendation': 'Send "We miss you" email with 20% discount',
                'rationale': f'Customers inactive for {int(row["Recency"])} days',
                'timing': 'Immediate',
                'expected_kpi': '15% reactivation rate'
            })
        
        # Recommendation 2: Based on monetary value
        if row['Monetary'] > 100:
            recommendations.append({
                'cluster': cluster_id,
                'strategy_type': 'VIP Treatment',
                'recommendation': 'Offer exclusive early access',
                'rationale': f'High-value customers (avg spend £{row["Monetary"]:.0f})',
                'timing': 'Next product launch',
                'expected_kpi': 'Increase loyalty by 25%'
            })
        
        # Recommendation 3: General recommendation
        recommendations.append({
            'cluster': cluster_id,
            'strategy_type': 'Targeted Promotion',
            'recommendation': f'Create personalized offers for {names["vietnamese_name"]}',
            'rationale': f'Segment identified through clustering analysis',
            'timing': 'Next 30 days',
            'expected_kpi': 'Increase conversion by 10%'
        })
    
    return pd.DataFrame(recommendations)

In [6]:
# Cell 5: Perform analysis
print("\n" + "="*80)
print("2. RFM ANALYSIS BY CLUSTER")
print("="*80)

rfm_by_cluster = analyze_rfm_by_cluster(cluster_results, cleaned_data)
print("\nRFM averages by cluster:")
print("-" * 60)
print(rfm_by_cluster.round(2).to_string())

if PLOT_RFM_COMPARISON:
    fig, axes = plt.subplots(1, 3, figsize=(15, 4))
    metrics = ['Recency', 'Frequency', 'Monetary']
    colors = ['red', 'green', 'blue']
    
    for idx, (metric, color) in enumerate(zip(metrics, colors)):
        axes[idx].bar(rfm_by_cluster['cluster'], rfm_by_cluster[metric], color=color, alpha=0.7)
        axes[idx].set_xlabel('Cluster')
        axes[idx].set_ylabel(metric)
        axes[idx].set_title(f'{metric} by Cluster')
        axes[idx].grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()


2. RFM ANALYSIS BY CLUSTER



RFM averages by cluster:
------------------------------------------------------------
   cluster  n_customers  Recency  Frequency  Monetary
0        0         3623    94.53       3.85   1646.20
1        1          298    63.67      13.61  10271.91


In [7]:
# Cell 6: Create cluster profiles
print("\n" + "="*80)
print("3. CREATING CLUSTER PROFILES")
print("="*80)

# Add percentage
rfm_by_cluster['customer_percentage'] = rfm_by_cluster['n_customers'] / rfm_by_cluster['n_customers'].sum()
profiles = rfm_by_cluster.copy()

print("\nCluster profiles:")
print("-" * 120)
print(profiles.round(2).to_string())


3. CREATING CLUSTER PROFILES

Cluster profiles:
------------------------------------------------------------------------------------------------------------------------
   cluster  n_customers  Recency  Frequency  Monetary  customer_percentage
0        0         3623    94.53       3.85   1646.20                 0.92
1        1          298    63.67      13.61  10271.91                 0.08


In [8]:
# Cell 7: Assign cluster names
print("\n" + "="*80)
print("4. ASSIGNING CLUSTER NAMES")
print("="*80)

cluster_names = assign_cluster_names(rfm_by_cluster)
print("\nCluster names and personas:")
print("-" * 80)

for cluster_id, names in cluster_names.items():
    cluster_data = rfm_by_cluster[rfm_by_cluster['cluster'] == cluster_id].iloc[0]
    print(f"\n• CLUSTER {cluster_id}:")
    print(f"  English name: {names['english_name']}")
    print(f"  Vietnamese name: {names['vietnamese_name']}")
    print(f"  Persona: {names['persona_description']}")
    print(f"  Customers: {cluster_data['n_customers']} ({cluster_data['customer_percentage']:.1%})")
    print(f"  RFM: R={cluster_data['Recency']:.0f} days, F={cluster_data['Frequency']:.1f}, M=£{cluster_data['Monetary']:.1f}")


4. ASSIGNING CLUSTER NAMES

Cluster names and personas:
--------------------------------------------------------------------------------

• CLUSTER 0.0:
  English name: At-Risk Inactive Customers
  Vietnamese name: Khách hàng ngủ đông có nguy cơ mất
  Persona: Inactive customers
  Customers: 3623.0 (92.4%)
  RFM: R=95 days, F=3.9, M=£1646.2

• CLUSTER 1.0:
  English name: Frequent Low-Spend Customers
  Vietnamese name: Khách hàng thường xuyên chi tiêu thấp
  Persona: Frequent buyers with low value
  Customers: 298.0 (7.6%)
  RFM: R=64 days, F=13.6, M=£10271.9


In [9]:
# Cell 8: Generate marketing recommendations
print("\n" + "="*80)
print("5. GENERATING MARKETING RECOMMENDATIONS")
print("="*80)

recommendations = generate_marketing_recommendations(rfm_by_cluster, cluster_names)
print("\nMarketing recommendations:")
print("-" * 100)

for _, rec in recommendations.iterrows():
    print(f"\n• CLUSTER {rec['cluster']} - {rec['strategy_type']}:")
    print(f"  Recommendation: {rec['recommendation']}")
    print(f"  Rationale: {rec['rationale']}")
    print(f"  Timing: {rec['timing']}")
    print(f"  Expected KPI: {rec['expected_kpi']}")


5. GENERATING MARKETING RECOMMENDATIONS

Marketing recommendations:
----------------------------------------------------------------------------------------------------

• CLUSTER 0.0 - Reactivation Campaign:
  Recommendation: Send "We miss you" email with 20% discount
  Rationale: Customers inactive for 94 days
  Timing: Immediate
  Expected KPI: 15% reactivation rate

• CLUSTER 0.0 - VIP Treatment:
  Recommendation: Offer exclusive early access
  Rationale: High-value customers (avg spend £1646)
  Timing: Next product launch
  Expected KPI: Increase loyalty by 25%

• CLUSTER 0.0 - Targeted Promotion:
  Recommendation: Create personalized offers for Khách hàng ngủ đông có nguy cơ mất
  Rationale: Segment identified through clustering analysis
  Timing: Next 30 days
  Expected KPI: Increase conversion by 10%

• CLUSTER 1.0 - VIP Treatment:
  Recommendation: Offer exclusive early access
  Rationale: High-value customers (avg spend £10272)
  Timing: Next product launch
  Expected KPI: I

In [10]:
# Cell 9: Save results
print("\n" + "="*80)
print("6. SAVING RESULTS")
print("="*80)

# Add cluster names to profiles
profiles['english_name'] = profiles['cluster'].map(lambda x: cluster_names.get(x, {}).get('english_name', ''))
profiles['vietnamese_name'] = profiles['cluster'].map(lambda x: cluster_names.get(x, {}).get('vietnamese_name', ''))
profiles['persona_description'] = profiles['cluster'].map(lambda x: cluster_names.get(x, {}).get('persona_description', ''))

# Save profiles
profiles.to_csv(PROFILING_OUTPUT_PATH, index=False)
print(f"✓ Cluster profiles saved to: {PROFILING_OUTPUT_PATH}")

# Save recommendations
recommendations.to_csv(MARKETING_RECOMMENDATIONS_PATH, index=False)
print(f"✓ Marketing recommendations saved to: {MARKETING_RECOMMENDATIONS_PATH}")

# Create legacy files for compatibility
import shutil
try:
    shutil.copy(PROFILING_OUTPUT_PATH, "data/processed/cluster_profiles.csv")
    print("✓ Created legacy file: data/processed/cluster_profiles.csv")
    
    # Copy rules file
    for file in rules_files:
        if os.path.exists(file):
            shutil.copy(file, "data/processed/top_k_rules.csv")
            print(f"✓ Created legacy file: data/processed/top_k_rules.csv")
            break
except Exception as e:
    print(f"⚠️ Could not create legacy files: {e}")

print("\n" + "="*80)
print("✅ STEP 7 COMPLETED SUCCESSFULLY!")
print("="*80)


6. SAVING RESULTS
✓ Cluster profiles saved to: data/processed/cluster_profiles_detailed.csv
✓ Marketing recommendations saved to: data/processed/marketing_recommendations.csv
✓ Created legacy file: data/processed/cluster_profiles.csv
✓ Created legacy file: data/processed/top_k_rules.csv

✅ STEP 7 COMPLETED SUCCESSFULLY!
