# E-Commerce Customer Segmentation Pipeline

This notebook implements an unsupervised learning pipeline for customer segmentation in an e-commerce scenario. The goal is to identify distinct customer groups based on behavioral and transactional data using dimensionality reduction and clustering techniques.

## Table of Contents
1. [Setup and Imports](#setup)
2. [Data Acquisition and Exploration](#data)
3. [Data Preprocessing](#preprocessing)
4. [Dimensionality Reduction](#dim_reduction)
5. [Clustering](#clustering)
6. [Evaluation and Visualization](#evaluation)
7. [Cluster Analysis and Interpretation](#analysis)
8. [Edge Cases and Robustness](#edge_cases)
9. [Conclusion](#conclusion)

<a id='setup'></a>
## 1. Setup and Imports

In [None]:
# Standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from IPython.display import display

# Import custom modules
import sys
import os

# Add the src directory to the path
sys.path.append(os.path.abspath('../'))

from src.data_preprocessing import load_data, load_online_retail_data, preprocess_data
from src.dimensionality_reduction import reduce_dimensions, compare_dimensionality_reduction_methods
from src.clustering import find_optimal_k, cluster_data, compare_clustering_methods
from src.evaluation import evaluate_clustering, compare_clustering_results, analyze_clusters, generate_cluster_labels
from src.visualization import plot_elbow_method, plot_clusters_2d, plot_clusters_3d, plot_cluster_profiles, create_interactive_scatter
from src.config import get_config

# Plot appearance: apply the matplotlib stylesheet, then the seaborn theme
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_theme(style="whitegrid")

# Pandas display: show every column, cap rows at 100, three-decimal floats
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('display.float_format', '{:.3f}'.format),
):
    pd.set_option(option_name, option_value)

# Seed NumPy's global RNG so repeated runs give the same results
np.random.seed(42)

In [None]:
# Load the pipeline configuration and echo the settings that drive it
config = get_config()
print("Configuration loaded:")

dr_settings = config['dimensionality_reduction']
print(f"- Dimensionality reduction method: {dr_settings['method']}")

clustering_settings = config['clustering']
clustering_method = clustering_settings['method']
print(f"- Clustering method: {clustering_method}")
print(f"- Number of components: {dr_settings['n_components']}")

# A cluster count is reported only for these two methods
if clustering_method in ('kmeans', 'agglomerative'):
    print(f"- Number of clusters: {clustering_settings[clustering_method]['n_clusters']}")

<a id='data'></a>
## 2. Data Acquisition and Exploration

In [None]:
# Load the working dataset.
#
# Three cases, chosen from config['data']:
#   1. use_synthetic is truthy     -> Online Retail loader
#   2. a 'path' entry is present   -> load_data(path)
#   3. neither                     -> Online Retail loader (fallback)
#
# Previously the fallback loader call ran unconditionally at the end of the
# non-synthetic branch, so a DataFrame loaded from config['data']['path'] was
# immediately overwritten. The branches are now mutually exclusive.
#
# NOTE(review): the "synthetic" messages are kept as-is, but no synthetic
# generator is imported in this notebook; both of those branches load the
# Online Retail dataset. Confirm the intended behavior and wording.
# NOTE(review): the loader is called with the string "None", not the None
# object -- confirm against load_online_retail_data's signature.
if config['data']['use_synthetic']:
    print(f"Generating synthetic data with {config['data']['n_samples']} samples...")
    df, customer_ids = load_online_retail_data("None")
elif 'path' in config['data']:
    print(f"Loading data from {config['data']['path']}...")
    df = load_data(config['data']['path'])
    # load_data returns only the frame here, so no separate ID series exists
    customer_ids = None
else:
    print("No data path specified. Generating synthetic data instead...")
    df, customer_ids = load_online_retail_data("None")

print(f"Data shape: {df.shape}")
df.head()

In [None]:
# Display basic information about the data.
# DataFrame.info() prints its summary directly instead of returning it,
# so this cell has no trailing expression to render.
print("Data information:")
df.info()

In [None]:
# Descriptive statistics; the trailing expression is rendered by the notebook
print("Summary statistics:")
summary_stats = df.describe()
summary_stats

In [None]:
# Per-column missing-value counts and their share of all rows
print("Missing values:")
missing_values = df.isna().sum()
missing_percentage = (missing_values / len(df)) * 100
missing_df = pd.DataFrame(
    {'Missing Values': missing_values, 'Percentage': missing_percentage}
)

# Show only the columns that actually have gaps
missing_df.loc[missing_df['Missing Values'] > 0]

In [None]:
# Visualize distributions of numerical features (one histogram per column)
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# customer_id is excluded from the feature plots if present
if 'customer_id' in numerical_cols:
    numerical_cols.remove('customer_id')

if numerical_cols:
    # squeeze=False makes plt.subplots always return a 2-D array of Axes.
    # Without it a single numerical column yields a bare Axes object and
    # indexing it fails; the categorical plot cell guards the same case.
    fig, axes = plt.subplots(
        len(numerical_cols), 1,
        figsize=(10, 4 * len(numerical_cols)),
        squeeze=False,
    )
    axes = axes[:, 0]

    for ax, col in zip(axes, numerical_cols):
        sns.histplot(df[col].dropna(), kde=True, ax=ax)
        ax.set_title(f'Distribution of {col}')
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()
else:
    # plt.subplots raises when asked for zero rows, so skip plotting
    print("No numerical columns to plot.")

In [None]:
# Bar charts of category frequencies, one subplot per categorical column
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

# customer_id is excluded from the feature plots if present
if 'customer_id' in categorical_cols:
    categorical_cols.remove('customer_id')

if len(categorical_cols) > 0:
    # squeeze=False keeps the Axes in an array even for a single column
    fig, axes = plt.subplots(
        len(categorical_cols), 1,
        figsize=(10, 4 * len(categorical_cols)),
        squeeze=False,
    )
    axes = axes[:, 0]

    for ax, col in zip(axes, categorical_cols):
        value_counts = df[col].value_counts()
        sns.barplot(x=value_counts.index, y=value_counts.values, ax=ax)
        ax.set_title(f'Distribution of {col}')
        ax.set_xlabel(col)
        ax.set_ylabel('Count')
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

In [None]:
# Annotated heatmap of pairwise correlations between numerical features
if len(numerical_cols) > 0:
    corr_matrix = df[numerical_cols].corr()

    heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        corr_matrix,
        annot=True,
        cmap='coolwarm',
        center=0,
        fmt='.2f',
        ax=heatmap_ax,
    )
    heatmap_ax.set_title('Correlation Matrix')
    plt.tight_layout()
    plt.show()

<a id='preprocessing'></a>
## 3. Data Preprocessing

In [None]:
# Preprocess the data: echo the configured strategies, then run the pipeline
print("Preprocessing data...")
print(f"- Missing values strategy: {config['preprocessing']['missing_values']['strategy']}")
print(f"- Outlier removal method: {config['preprocessing']['outliers']['method']}")
print(f"- Scaling method: {config['preprocessing']['scaling']['method']}")
print(f"- Encoding method: {config['preprocessing']['encoding']['method']}")

# Store customer IDs for later reference.
# The data-loading cell already binds customer_ids, so it is no longer reset
# to None here (that discarded the IDs returned by the loader). When the
# frame carries its own customer_id column, that column takes precedence.
# The `if` below previously had no body, which made this cell a syntax error.
# NOTE(review): customer_id is left in df; confirm whether preprocess_data
# drops identifier columns or whether it should be removed before this call.
if 'customer_id' in df.columns:
    customer_ids = df['customer_id'].copy()

# Apply preprocessing pipeline
df_processed = preprocess_data(df, config=config['preprocessing'])

print(f"Original data shape: {df.shape}")
print(f"Processed data shape: {df_processed.shape}")
df_processed.head()

<a id='dim_reduction'></a>
## 4. Dimensionality Reduction

In [None]:
# Run the configured dimensionality reduction on the preprocessed features
print(f"Applying {config['dimensionality_reduction']['method']} dimensionality reduction...")

# Settings for the selected method; method-specific parameters are stored
# under a key named after the method itself
dr_config = config['dimensionality_reduction']
method = dr_config['method']
n_components = dr_config['n_components']
method_params = dr_config[method]
random_state = dr_config['random_state']

# reduce_dimensions returns the embedding, the fitted model, and extra info
X_reduced, model, additional_info = reduce_dimensions(
    df_processed.values,
    method=method,
    n_components=n_components,
    random_state=random_state,
    **method_params,
)

print(f"Reduced data shape: {X_reduced.shape}")

# Extra diagnostics are printed only for the methods handled below
if method == 'pca':
    print(f"Explained variance ratio: {additional_info}")
    print(f"Total explained variance: {sum(additional_info):.4f}")
elif method == 'mds':
    print(f"Stress: {additional_info:.4f}")

In [None]:
# Scatter plot of the first two components of the reduced data
reduced_fig, reduced_ax = plt.subplots(figsize=(10, 8))
reduced_ax.scatter(X_reduced[:, 0], X_reduced[:, 1], alpha=0.7)
reduced_ax.set_title(f'Data after {method.upper()} Dimensionality Reduction')
reduced_ax.set_xlabel('Component 1')
reduced_ax.set_ylabel('Component 2')
reduced_ax.grid(True)
plt.show()

In [None]:
# Compare different dimensionality reduction methods side by side
print("Comparing different dimensionality reduction methods...")

# Define methods to compare
methods_to_compare = ['pca', 'kernel_pca', 'mds', 'umap']

# Apply each method (2 components so every result can be drawn as a scatter)
dim_reduction_results = compare_dimensionality_reduction_methods(
    df_processed.values,
    methods=methods_to_compare,
    n_components=2
)

# Keep only the reduced coordinates, keyed by the upper-cased method name;
# the other two elements of each result tuple are not needed for plotting
X_dict = {
    method_name.upper(): X_method
    for method_name, (X_method, _, _) in dim_reduction_results.items()
}

if X_dict:
    # Size the grid from the number of results instead of a fixed 2x2, so the
    # cell still works if the method list changes or a method is missing from
    # the results. Four results give the original 2x2 layout at 15x12.
    n_cols = 2
    n_rows = (len(X_dict) + n_cols - 1) // n_cols
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 6 * n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, (method_name, X_method) in zip(axes, X_dict.items()):
        ax.scatter(X_method[:, 0], X_method[:, 1], alpha=0.7)
        ax.set_title(f'{method_name} Reduction')
        ax.set_xlabel('Component 1')
        ax.set_ylabel('Component 2')
        ax.grid(True)

    # Hide any grid cells that have no result to show
    for unused_ax in axes[len(X_dict):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()
else:
    print("No dimensionality reduction results to plot.")