# In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
import matplotlib.pyplot as plt
import seaborn as sns

# Read the customer and transaction tables from the working directory.
customers = pd.read_csv('Customers.csv')
transactions = pd.read_csv('Transactions.csv')

# Attach each customer's attributes to their transactions. Rows are matched
# on CustomerID using pandas' default (inner) join.
merged_data = pd.merge(left=transactions, right=customers, on='CustomerID')

# Aggregate transaction data to create one profile row per customer.
# Print the merged column names first so a missing or renamed column is easy
# to spot before the aggregation below fails on it.
print(merged_data.columns)


def _most_common(values):
    """Return the most frequent non-null value in *values*, or NaN if none.

    Series.mode() ignores missing values, so a group whose entries are all
    missing yields an empty result; indexing that with [0] would raise.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


customer_profiles = merged_data.groupby('CustomerID').agg({
    'Quantity': 'sum',         # Sum of quantities purchased
    'TotalValue': 'sum',       # Total transaction value
    'Region': _most_common,    # Most common region (NaN when unknown)
}).reset_index()

# Display the first rows of the resulting customer profiles
print(customer_profiles.head())


# One-hot encode Region so it can be used as a clustering feature; the first
# category is dropped (drop_first=True) and acts as the implicit baseline.
customer_profiles = pd.get_dummies(
    customer_profiles,
    columns=['Region'],
    drop_first=True,
)

# Every column except the identifier is a feature. Fit the scaler on the
# features, then apply it to the same data to obtain the standardized matrix.
features = customer_profiles.drop('CustomerID', axis=1)
scaler = StandardScaler().fit(features)
scaled_features = scaler.transform(features)

# Determine the optimal number of clusters using the Elbow Method.
# The candidate k values are defined once so the fitting step and the plot's
# x-axis cannot drift apart. KMeans cannot fit more clusters than samples,
# so the upper bound (10) is capped at the number of rows.
k_values = range(2, min(10, len(scaled_features)) + 1)

# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4 (and warns about it in 1.2/1.3), so relying on the
# default would make the curve depend on the installed version.
sse = [
    KMeans(n_clusters=k, n_init=10, random_state=42).fit(scaled_features).inertia_
    for k in k_values
]

# Plot the Elbow Curve (inertia vs. number of clusters)
plt.figure(figsize=(8, 5))
plt.plot(k_values, sse, marker='o')
plt.title('Elbow Method for Optimal K')
plt.xlabel('Number of Clusters')
plt.ylabel('SSE (Sum of Squared Errors)')
plt.show()

# Choose the number of clusters (e.g., 4 based on the elbow curve) and fit
# the final model. n_init is set explicitly because scikit-learn's default
# changed from 10 to 'auto' in version 1.4; pinning it keeps the cluster
# labels reproducible across versions and silences the 1.2/1.3 FutureWarning.
n_clusters = 4
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
customer_profiles['Cluster'] = kmeans.fit_predict(scaled_features)

# Calculate the Davies-Bouldin Index on the same scaled features the model
# was fitted on (lower values indicate better-separated clusters).
db_index = davies_bouldin_score(scaled_features, customer_profiles['Cluster'])
print(f'Davies-Bouldin Index: {db_index:.4f}')

# Scatter the customers by total spend vs. total quantity, colored by the
# cluster label assigned to each row.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=customer_profiles,
    x='TotalValue',
    y='Quantity',
    hue='Cluster',
    palette='viridis',
    s=100,
    ax=ax,
)
ax.set_title('Customer Segmentation Clusters')
ax.set_xlabel('Total Value of Transactions')
ax.set_ylabel('Quantity of Products Purchased')
ax.legend(title='Cluster')
plt.show()

# Write the customer profiles to disk as CSV, omitting the DataFrame index.
output_path = 'Customer_Clusters.csv'
customer_profiles.to_csv(output_path, index=False)

# Confirm completion to the user.
print("Clustering completed. Results saved to Customer_Clusters.csv.")
