In [5]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, AgglomerativeClustering
from pyclustering.cluster.kmedoids import kmedoids
from sklearn.metrics import silhouette_score, davies_bouldin_score
import numpy as np

  """!
  """!
  """!
  """!
  """!
  """!
  """!
  """!
  """!
  """!


In [6]:
# Read the dataset from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='main_df.csv')

In [7]:
# Parse `order_purchase_timestamp` into pandas datetime values
df['order_purchase_timestamp'] = df['order_purchase_timestamp'].pipe(pd.to_datetime)

In [8]:
# Per-customer totals: one row per `customer_unique_id` holding the summed
# order value, payment value and freight value
customer_df = (
    df.groupby('customer_unique_id')
    .agg(dict.fromkeys(['order_value', 'payment_value', 'freight_value'], 'sum'))
    .reset_index()
)

In [9]:
# Attach one `product_category` and one `payment_type` to every customer.
#
# Left-merging the raw rows of `df` back onto the aggregated frame repeats each
# customer once per matching row in `df`, so the result is no longer one row per
# customer and customers with many rows dominate everything computed downstream.
# Instead, keep each customer's most frequent value (ties broken alphabetically
# so the outcome is deterministic).
#
# NOTE: this cell consumes the `customer_unique_id` column of `customer_df`;
# re-run the aggregation cell before re-running this one.
customer_categoricals = {}
for column in ['product_category', 'payment_type']:
    # Number of rows per (customer, value) pair, ignoring missing values
    value_counts = (
        df.dropna(subset=[column])
        .groupby(['customer_unique_id', column], observed=True)
        .size()
        .reset_index(name='n_rows')
    )
    # After sorting, the first row of each customer is the most frequent value
    most_frequent = (
        value_counts
        .sort_values(['customer_unique_id', 'n_rows', column], ascending=[True, False, True])
        .drop_duplicates('customer_unique_id')
        .set_index('customer_unique_id')[column]
    )
    # Fill missing values with 'unknown' (customers with no recorded value)
    customer_categoricals[column] = (
        customer_df['customer_unique_id'].map(most_frequent).fillna('unknown')
    )

# Drop unnecessary columns: the identifier itself is not a feature
customer_df = customer_df.assign(**customer_categoricals).drop(columns=['customer_unique_id'])

# Guard against the row fan-out this cell is meant to avoid
assert len(customer_df) == df['customer_unique_id'].nunique(), \
    "customer_df must contain exactly one row per customer"

In [10]:
# One-hot encode the two categorical columns into indicator columns
features = pd.get_dummies(customer_df.loc[:, ['product_category', 'payment_type']])
# Keep the generated indicator column labels for later reference
features_names = features.columns

In [11]:
# Dimensionality Reduction: PCA
# Project the one-hot features onto their first two principal components.
# `random_state` is pinned to the same seed used for t-SNE and K-Means so the
# projection stays reproducible even when scikit-learn selects a randomized
# SVD solver for the input size.
pca = PCA(n_components=2, random_state=42)
pca_data = pca.fit_transform(features)

In [12]:
# Dimensionality Reduction: t-SNE
# Two-dimensional embedding of the one-hot features, seeded for repeatable output
tsne = TSNE(random_state=42, n_components=2)
tsne_data = tsne.fit_transform(X=features)

Dimensionality reduction done

In [13]:
# Clustering with PCA-reduced data
## K-Means
# Fit a seeded 4-cluster K-Means model on the PCA projection and read the
# per-point cluster assignments from the fitted estimator
kmeans_pca = KMeans(n_clusters=4, random_state=42).fit(pca_data)
labels_kmeans_pca = kmeans_pca.labels_

# Score the partition: silhouette score first, then Davies-Bouldin index
silhouette_kmeans_pca, db_index_kmeans_pca = (
    metric(pca_data, labels_kmeans_pca)
    for metric in (silhouette_score, davies_bouldin_score)
)

In [2]:
## K-Medoids using pyclustering
# NOTE: run the imports cell first. The NameError recorded for `kmedoids` comes
# from executing this cell in a kernel where that import had not been run.
n_medoids_pca = 4

# Initial medoids (indices of data points).
# Rows with the same one-hot pattern project to the same PCA point, so simply
# taking the first rows can start several medoids on identical coordinates and
# produce degenerate (empty) clusters. Pick the medoids from distinct points
# instead, using a fixed seed so the choice is reproducible.
distinct_point_indices = np.unique(np.round(pca_data, 8), axis=0, return_index=True)[1]
if len(distinct_point_indices) < n_medoids_pca:
    raise ValueError(
        f"Need at least {n_medoids_pca} distinct points to initialise K-Medoids, "
        f"found {len(distinct_point_indices)}"
    )
medoid_rng = np.random.default_rng(42)
initial_medoids_pca = sorted(
    medoid_rng.choice(distinct_point_indices, size=n_medoids_pca, replace=False).tolist()
)

# pyclustering is given plain Python lists here; convert once and reuse
pca_points = pca_data.tolist()
kmedoids_instance_pca = kmedoids(pca_points, initial_medoids_pca)
kmedoids_instance_pca.process()

# Assign every point to its closest medoid and score the resulting partition
labels_kmedoids_pca = kmedoids_instance_pca.predict(pca_points)
silhouette_kmedoids_pca = silhouette_score(pca_data, labels_kmedoids_pca)
db_index_kmedoids_pca = davies_bouldin_score(pca_data, labels_kmedoids_pca)


NameError: name 'kmedoids' is not defined