In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def load_data(file_path):
    """Read the Online Retail workbook into a DataFrame.

    Args:
        file_path: Location of the Excel file; passed unchanged to
            ``pandas.read_excel``.

    Returns:
        The DataFrame produced by ``pandas.read_excel``.
    """
    retail_frame = pd.read_excel(file_path)
    return retail_frame

In [None]:
# Fetch the workbook from the UCI archive URL; the bare last expression
# previews the first rows of the raw frame.
df = load_data('http://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx')
df.head()

In [None]:
# Report the (rows, columns) dimensions of the raw frame.
print(f"Shape: {df.shape}")

In [None]:
def preprocess_data(df):
    """Return a cleaned copy of the raw transactions.

    Steps, in order:
      1. Drop every row that has a missing value in any column.
      2. Parse ``InvoiceDate`` into pandas datetimes.
      3. Remove cancelled orders, i.e. rows whose ``InvoiceNo`` starts with
         the letter ``'C'``.

    The input frame is left untouched so that re-running the cell (or
    inspecting the raw frame afterwards) gives the same result every time.

    Args:
        df: Raw transactions with at least ``InvoiceNo`` and ``InvoiceDate``
            columns.

    Returns:
        A new, independent DataFrame containing only complete,
        non-cancelled rows.
    """
    # dropna() without inplace returns a new object; the explicit copy keeps
    # the column assignment below from warning about writing to a view.
    cleaned = df.dropna().copy()
    cleaned['InvoiceDate'] = pd.to_datetime(cleaned['InvoiceDate'])

    # InvoiceNo can be stored as int or str, so normalise to str first.
    # Cancellations are identified by a leading 'C', not a 'C' anywhere.
    is_cancelled = cleaned['InvoiceNo'].astype(str).str.startswith('C')

    # Return an independent frame so callers may add columns without
    # triggering SettingWithCopyWarning.
    return cleaned.loc[~is_cancelled].copy()


# Clean the raw transactions; the bare last expression previews the result.
df_cleaned = preprocess_data(df)
df_cleaned.head()

In [None]:
# Report the (rows, columns) dimensions after cleaning.
print(f"Shape: {df_cleaned.shape}")

In [None]:
def engineer_features(df):
    """Build per-customer RFM (Recency, Frequency, Monetary) features.

    * Recency   - whole days between the latest ``InvoiceDate`` in ``df`` and
                  the customer's own latest ``InvoiceDate``.
    * Frequency - number of non-null ``InvoiceNo`` rows for the customer
                  (this counts line items, not distinct invoices).
    * Monetary  - sum of ``Quantity * UnitPrice`` over the customer's rows.

    The input frame is not modified.

    Args:
        df: Transactions with ``CustomerID``, ``InvoiceNo``, ``InvoiceDate``
            (datetime), ``Quantity`` and ``UnitPrice`` columns.

    Returns:
        DataFrame indexed by ``CustomerID`` with columns
        ``['Recency', 'Frequency', 'Monetary']``.
    """
    # The reference date is the same for every customer, so compute it once
    # instead of rescanning the whole column inside a per-group lambda.
    reference_date = df['InvoiceDate'].max()

    # assign() returns a new frame, so the caller's data gains no extra column.
    with_totals = df.assign(TotalAmount=df['Quantity'] * df['UnitPrice'])

    customer_features = with_totals.groupby('CustomerID').agg({
        'InvoiceDate': 'max',
        'InvoiceNo': 'count',
        'TotalAmount': 'sum'
    })

    # Turn each customer's last purchase date into days since the reference.
    customer_features['InvoiceDate'] = (
        reference_date - customer_features['InvoiceDate']
    ).dt.days

    # Column order follows the aggregation dict above.
    customer_features.columns = ['Recency', 'Frequency', 'Monetary']
    return customer_features


# Aggregate the cleaned transactions into one RFM row per customer; the bare
# last expression previews the result.
customer_features = engineer_features(df_cleaned)
customer_features.head()

In [None]:
# Report the (customers, features) dimensions of the RFM table.
print(f"Shape: {customer_features.shape}")

In [None]:
def apply_pca(data, n_components=2):
    """Standardise the features and project them with PCA.

    Args:
        data: Feature table accepted by ``StandardScaler.fit_transform``.
        n_components: Number of principal components to keep (default 2).

    Returns:
        A ``(projected, model)`` tuple: the transformed data returned by
        ``PCA.fit_transform`` and the fitted ``PCA`` instance.
    """
    # Standardise first so every feature is on the same scale before PCA.
    standardized = StandardScaler().fit_transform(data)

    reducer = PCA(n_components=n_components)
    projected = reducer.fit_transform(standardized)
    return projected, reducer


# Project the RFM features with the default number of components (2), then
# report the projection's shape and the variance ratio explained per component.
pca_result, pca = apply_pca(customer_features)
print(f"PCA shape: {pca_result.shape}")
print(f"Explained variance ratio: {pca.explained_variance_ratio_}")

In [None]:
def find_optimal_clusters(data, max_k):
    """Plot the elbow curve (WCSS versus k) for k = 1 .. ``max_k``.

    For every candidate k a KMeans model is fitted on ``data`` and its
    ``inertia_`` (within-cluster sum of squares) is recorded; the values are
    then drawn as a line plot so the "elbow" can be read off visually.

    Args:
        data: Samples to cluster, as accepted by ``KMeans.fit``.
        max_k: Largest number of clusters to try (inclusive).

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    candidate_ks = range(1, max_k + 1)

    # n_init is pinned to 10 to match perform_kmeans: the library default for
    # n_init differs between scikit-learn releases, which would otherwise make
    # the elbow curve version-dependent and inconsistent with the final fit.
    wcss = [
        KMeans(n_clusters=k, init='k-means++', random_state=42, n_init=10)
        .fit(data)
        .inertia_
        for k in candidate_ks
    ]

    plt.figure(figsize=(10, 6))
    plt.plot(candidate_ks, wcss, marker='o')
    plt.title('Elbow Method for Optimal k')
    plt.xlabel('Number of clusters')
    plt.ylabel('WCSS')
    plt.show()


# Draw the elbow curve for k = 1..10 on the PCA-projected data.
find_optimal_clusters(pca_result, 10)

In [None]:
def perform_kmeans(data, n_clusters):
    """Cluster ``data`` with K-means and return one label per sample.

    Args:
        data: Samples to cluster, as accepted by ``KMeans.fit_predict``.
        n_clusters: Number of clusters to form.

    Returns:
        The cluster labels returned by ``KMeans.fit_predict``.
    """
    # Fixed seed and explicit n_init keep the result reproducible.
    model = KMeans(
        n_clusters=n_clusters,
        init='k-means++',
        random_state=42,
        n_init=10,
    )
    assignments = model.fit_predict(data)
    return assignments

In [None]:
# Fit the final K-means model on the PCA projection and report the size of
# the resulting label array.
n_clusters = 4  # Assuming 4 clusters from elbow method
cluster_labels = perform_kmeans(pca_result, n_clusters)
print(f"Number of clusters: {n_clusters}")
print(f"Cluster labels shape: {cluster_labels.shape}")

In [None]:
def profile_clusters(data, labels):
    """Profile each cluster by the mean of every feature.

    The labels are attached to a temporary copy of ``data``; the caller's
    frame is not modified, so it can safely be reused as model input (for
    example when re-running the scaling/PCA step) without a stray
    ``Cluster`` column leaking into the features.

    Args:
        data: Feature DataFrame with one row per sample.
        labels: Cluster label for each row of ``data``, in row order.

    Returns:
        DataFrame indexed by ``Cluster`` holding the per-cluster mean of
        each column of ``data``.
    """
    # assign() returns a new frame instead of writing into the caller's one.
    labelled = data.assign(Cluster=labels)
    return labelled.groupby('Cluster').mean()


# Summarise each cluster by the mean of its features and print the table.
cluster_profile = profile_clusters(customer_features, cluster_labels)
print(cluster_profile)

In [None]:
def visualize_clusters(data, labels):
    """Scatter-plot the first two columns of ``data`` coloured by cluster.

    Args:
        data: 2-D array; column 0 goes on the x axis and column 1 on the y axis.
        labels: One value per row of ``data``, used to colour the points.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(12, 8))
    points = ax.scatter(
        data[:, 0],
        data[:, 1],
        c=labels,
        cmap='viridis',
        alpha=0.7,
    )
    ax.set_title('Customer Segments')
    ax.set_xlabel('PCA 1')
    ax.set_ylabel('PCA 2')
    # The colour bar maps point colours back to cluster labels.
    fig.colorbar(points, ax=ax)
    plt.show()


# Plot the customers in PCA space, coloured by their assigned cluster.
visualize_clusters(pca_result, cluster_labels)

In [None]:
def evaluate_clustering(data, labels):
    """Print and return the mean silhouette score of a clustering.

    Args:
        data: Samples that were clustered, as accepted by ``silhouette_score``.
        labels: Cluster label for each sample.

    Returns:
        The value returned by ``silhouette_score(data, labels)``.
    """
    score = silhouette_score(data, labels)
    # The printed value is rounded to two decimals; the return value is not.
    print(f"The average silhouette score is: {score:.2f}")
    return score


# Score the clustering in the same PCA space it was fitted on.
silhouette_avg = evaluate_clustering(pca_result, cluster_labels)

In [None]:
def personalize_recommendations(cluster_profile):
    """Map every cluster to a marketing recommendation string.

    Each cluster's Recency / Frequency / Monetary values are compared with
    the mean and standard deviation taken across the rows of
    ``cluster_profile`` (i.e. across the clusters themselves). The first
    matching rule, checked in this order, decides the recommendation:

      1. Monetary and Frequency both above mean + std
      2. Monetary above mean + std and Recency below mean
      3. Frequency above mean and Recency below mean
      4. Recency above mean + std
      5. otherwise: the "average customers" recommendation

    Diagnostic details are printed for every cluster along the way.

    Args:
        cluster_profile: DataFrame with one row per cluster and
            ``Recency``, ``Frequency`` and ``Monetary`` columns.

    Returns:
        dict mapping each index label of ``cluster_profile`` to its
        recommendation text.
    """
    centre = cluster_profile.mean()
    spread = cluster_profile.std()

    print("\nDetailed Cluster Analysis:")
    print(f"Overall Mean: {centre}")
    print(f"Overall Std Dev: {spread}")

    recommendations = {}
    for cluster, profile in cluster_profile.iterrows():
        monetary = profile['Monetary']
        frequency = profile['Frequency']
        recency = profile['Recency']

        print(f"\nCluster {cluster}:")
        print(f"Profile: {profile}")
        print(f"Monetary vs Mean: {monetary} vs {centre['Monetary']}")
        print(f"Frequency vs Mean: {frequency} vs {centre['Frequency']}")
        print(f"Recency vs Mean: {recency} vs {centre['Recency']}")

        # Pre-compute the comparisons the rule chain is built from.
        big_spender = monetary > centre['Monetary'] + spread['Monetary']
        very_frequent = frequency > centre['Frequency'] + spread['Frequency']
        above_avg_frequency = frequency > centre['Frequency']
        recently_active = recency < centre['Recency']
        long_inactive = recency > centre['Recency'] + spread['Recency']

        if big_spender and very_frequent:
            advice = "High-value, frequent customers. Focus on retention, premium products, and exclusive offers."
        elif big_spender and recently_active:
            advice = "High-value, recent customers. Encourage continued engagement with personalized recommendations."
        elif above_avg_frequency and recently_active:
            advice = "Frequent, recent buyers. Offer loyalty programs and cross-sell opportunities."
        elif long_inactive:
            advice = "Less recent customers. Re-engagement campaign needed with special comeback offers."
        else:
            advice = "Average customers. Enhance engagement with targeted promotions and product recommendations."

        recommendations[cluster] = advice
        print(f"Recommendation: {advice}")

    return recommendations


# Print a banner, then generate one recommendation per cluster from the
# per-cluster mean profile.
print("\n--- Testing personalize_recommendations ---")
recommendations = personalize_recommendations(cluster_profile)