In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [3]:
# Load the dataset
df = pd.read_csv('main_df.csv')


In [4]:
# Convert `order_purchase_timestamp` to datetime
df['order_purchase_timestamp'] = pd.to_datetime(df['order_purchase_timestamp'])

In [5]:
# Create a DataFrame with unique customers and their total order value, payment value, and freight value
customer_df = df.groupby('customer_unique_id').agg({
    'order_value': 'sum',
    'payment_value': 'sum',
    'freight_value': 'sum'
}).reset_index()

In [6]:
# Merge to get `product_category` and `payment_type`
customer_df = customer_df.merge(df[['customer_unique_id', 'product_category', 'payment_type']], on='customer_unique_id', how='left')

# Drop unnecessary columns
customer_df = customer_df.drop(columns=['customer_unique_id'])

# Fill missing values with 'unknown'
customer_df['product_category'] = customer_df['product_category'].fillna('unknown')
customer_df['payment_type'] = customer_df['payment_type'].fillna('unknown')

In [7]:
# Convert categorical features to numerical representations
features = pd.get_dummies(customer_df[['product_category', 'payment_type']])
features_names = features.columns

In [8]:
# Standardize the features
scaler = StandardScaler()
features = scaler.fit_transform(features)


In [9]:
# Apply dimensionality reduction
pca = PCA(n_components=2)
pca_features = pca.fit_transform(features)

In [10]:
# Apply clustering model
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans = kmeans.fit(pca_features)

In [11]:
# Add the cluster column to `df_pca_kmeans`
df_pca_kmeans = pd.concat([customer_df, pd.DataFrame(pca_features)], axis=1)
df_pca_kmeans['cluster'] = kmeans.labels_

In [13]:
# Analyze the clusters
cluster_means = df_pca_kmeans.groupby('cluster').agg({
    'order_value': 'mean',
    'payment_value': 'mean',
    'freight_value': 'mean'
})
print('Cluster Means:')
print(cluster_means.to_markdown(numalign="left", stralign="left"))

Cluster Means:
| cluster   | order_value   | payment_value   | freight_value   |
|:----------|:--------------|:----------------|:----------------|
| 0         | 704.187       | 269.001         | 118.115         |
| 1         | 207.139       | 361.806         | 30.538          |
| 2         | 187.494       | 455.738         | 31.2517         |


In [14]:
# Get the size of each cluster
cluster_sizes = df_pca_kmeans.groupby('cluster').size()
print('Cluster Sizes:')
print(cluster_sizes.to_markdown(numalign="left", stralign="left"))

Cluster Sizes:
| cluster   | 0     |
|:----------|:------|
| 0         | 6509  |
| 1         | 86769 |
| 2         | 24323 |


In [16]:
# Get the most frequent product categories and payment types for each cluster
for cluster in range(3):
    print(f'\nCluster {cluster}:')
    for col in ['product_category', 'payment_type']:
        print(f'  Top 5 most frequent {col}s:')
        print(df_pca_kmeans[df_pca_kmeans['cluster'] == cluster][col].value_counts().head(5).to_markdown(numalign="left", stralign="left"))


Cluster 0:
  Top 5 most frequent product_categorys:
| product_category   | count   |
|:-------------------|:--------|
| bed_bath_table     | 989     |
| furniture_decor    | 530     |
| housewares         | 505     |
| sports_leisure     | 411     |
| health_beauty      | 389     |
  Top 5 most frequent payment_types:
| payment_type   | count   |
|:---------------|:--------|
| voucher        | 6274    |
| debit_card     | 175     |
| boleto         | 60      |

Cluster 1:
  Top 5 most frequent product_categorys:
| product_category      | count   |
|:----------------------|:--------|
| bed_bath_table        | 8959    |
| health_beauty         | 7566    |
| sports_leisure        | 6635    |
| furniture_decor       | 6379    |
| computers_accessories | 5436    |
  Top 5 most frequent payment_types:
| payment_type   | count   |
|:---------------|:--------|
| credit_card    | 86769   |

Cluster 2:
  Top 5 most frequent product_categorys:
| product_category      | count   |
|:--------------