# Task 3: Customer Segmentation / Clustering

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score, silhouette_score
import umap
import matplotlib.pyplot as plt
import seaborn as sns


# Give the catalogue price its own name before joining, so it cannot clash
# with a 'Price' column coming from the transactions table.
# NOTE(review): assumes `transactions`, `products` and `customers` were loaded
# as DataFrames earlier in the notebook - confirm.
products.rename(columns={'Price': 'ProductPrice'}, inplace=True)

# Attach product details, then customer details, to every transaction row.
# Inner joins keep only rows that match in all three tables.
merged_data = (
    transactions
    .merge(products, on='ProductID', how='inner')
    .merge(customers, on='CustomerID', how='inner')
)

# Fall back to the catalogue price when the join produced no 'Price' column
if 'Price' not in merged_data.columns:
    merged_data['Price'] = merged_data['ProductPrice']

# One row per customer: total spend, total units, mean price paid, plus the
# first Region / SignupDate value seen for that customer.
customer_profiles = (
    merged_data
    .groupby('CustomerID')
    .agg(
        TotalValue=('TotalValue', 'sum'),
        Quantity=('Quantity', 'sum'),
        Price=('Price', 'mean'),
        Region=('Region', 'first'),
        SignupDate=('SignupDate', 'first'),
    )
    .reset_index()
)

# Standardise the numeric profile features (zero mean, unit variance) so that
# no single feature dominates the Euclidean distances used by KMeans.
scaler = StandardScaler()
customer_profiles_scaled = scaler.fit_transform(customer_profiles[['TotalValue', 'Quantity', 'Price']])

# Perform KMeans clustering.
# n_init is pinned explicitly: scikit-learn's default changed from 10 to
# 'auto' in version 1.4 (and warns about it in 1.2/1.3), so leaving it unset
# makes the cluster assignments depend on the installed version. Using 10
# restarts keeps the best of several initialisations; metrics may differ
# slightly from a run that relied on the newer single-start default.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
customer_profiles['Cluster'] = kmeans.fit_predict(customer_profiles_scaled)

# Calculate clustering metrics on the same scaled features used for fitting:
# Davies-Bouldin (lower is better) and mean silhouette (higher is better).
db_index = davies_bouldin_score(customer_profiles_scaled, customer_profiles['Cluster'])
silhouette_avg = silhouette_score(customer_profiles_scaled, customer_profiles['Cluster'])

print(f"Davies-Bouldin Index: {db_index}")
print(f"Silhouette Score: {silhouette_avg}")

# Project the scaled features onto two UMAP components (seeded for
# repeatability) purely for visualisation.
umap_reducer = umap.UMAP(n_components=2, random_state=42)
umap_result = umap_reducer.fit_transform(customer_profiles_scaled)

# Keep both embedding coordinates next to each customer's profile
customer_profiles['UMAP1'], customer_profiles['UMAP2'] = umap_result[:, 0], umap_result[:, 1]

# Scatter the customers in the embedding, coloured by cluster label
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=customer_profiles,
    x='UMAP1',
    y='UMAP2',
    hue='Cluster',
    palette='viridis',
    ax=ax,
)
ax.set_title('Customer Segmentation using UMAP')
plt.show()

# Persist profiles, cluster labels and embedding coordinates (no index column)
customer_profiles.to_csv('Customer_Segmentation.csv', index=False)

# Results
* Davies-Bouldin Index: 1.12180191226693
* Silhouette Score: 0.31097951039547694

