# Customer Segmentation Analysis

## 1. Data Preparation

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
import matplotlib.pyplot as plt

# Read the customer profile table and the transaction log from the
# working directory (one DataFrame per CSV file).
customers_df, transactions_df = map(
    pd.read_csv, ('Customers.csv', 'Transactions.csv')
)

# Create customer features for clustering
def create_clustering_features(customers_df, transactions_df):
    """Build one row of clustering features per customer that has transactions.

    Args:
        customers_df: DataFrame with ``CustomerID`` and ``Region`` columns.
        transactions_df: DataFrame with ``CustomerID``, ``TotalValue`` and
            ``Quantity`` columns.

    Returns:
        DataFrame indexed by ``CustomerID`` with flat string column names:
        ``TotalValue_sum``, ``TotalValue_mean``, ``TotalValue_count``,
        ``Quantity_sum``, ``Quantity_mean``, plus one 0/1 indicator column
        per distinct ``Region`` value.
    """
    # Calculate customer metrics
    customer_metrics = transactions_df.groupby('CustomerID').agg({
        'TotalValue': ['sum', 'mean', 'count'],
        'Quantity': ['sum', 'mean']
    })
    # The dict-of-lists aggregation yields two-level (column, stat) headers.
    # Flatten them to plain strings: joining a MultiIndex-column frame with a
    # single-level one is rejected by recent pandas, and downstream
    # estimators expect uniform string feature names.
    customer_metrics.columns = [
        '_'.join(col) for col in customer_metrics.columns
    ]

    # Add customer profile information (one-hot encoded region, as floats so
    # every feature column is numeric).
    region_dummies = pd.get_dummies(
        customers_df.set_index('CustomerID')['Region'], dtype=float
    )
    customer_metrics = customer_metrics.join(region_dummies)

    # A customer with transactions but no profile row gets NaN indicators
    # from the left join; treat that as "no known region" instead of letting
    # NaN flow into scaling/clustering.
    customer_metrics = customer_metrics.fillna(
        {col: 0.0 for col in region_dummies.columns}
    )

    return customer_metrics

# Assemble the per-customer feature table from the two loaded frames.
clustering_features = create_clustering_features(
    customers_df=customers_df,
    transactions_df=transactions_df,
)

# Standardise the feature columns: learn the scaling parameters from the
# table, then apply them to that same table.
scaler = StandardScaler()
scaled_features = scaler.fit(clustering_features).transform(clustering_features)

In [None]:
# Find optimal number of clusters
# For each candidate k, fit K-Means on the scaled features and record the
# Davies-Bouldin index of the resulting partition.
db_scores = []
K = range(2, 11)

for k in K:
    # n_init is pinned explicitly: the library default changed between
    # scikit-learn releases (10 restarts -> 'auto'), so relying on it makes
    # the scores depend on the installed version and triggers a
    # FutureWarning on some releases.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(scaled_features)
    db_scores.append(davies_bouldin_score(scaled_features, kmeans.labels_))

# Plot DB Index scores (one point per candidate k)
plt.figure(figsize=(10, 6))
plt.plot(K, db_scores, 'bx-')
plt.xlabel('k')
plt.ylabel('Davies-Bouldin Index')
plt.title('Optimal number of clusters')
plt.show()

In [None]:
# Perform final clustering
# Pick the candidate k whose Davies-Bouldin score is the smallest.
optimal_k = K[np.argmin(db_scores)]
# n_init is pinned explicitly: the library default changed between
# scikit-learn releases (10 restarts -> 'auto'), so relying on it makes the
# final partition depend on the installed version.
final_kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
cluster_labels = final_kmeans.fit_predict(scaled_features)

# Add cluster labels to original data
clustering_features['Cluster'] = cluster_labels

# Analyze clusters
print("Cluster Sizes:")
print(clustering_features['Cluster'].value_counts())

print("\nDavies-Bouldin Index:", davies_bouldin_score(scaled_features, cluster_labels))