In [None]:
1. ### Import, Filter, Format Dataset ###

# Import packages
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import seaborn as sns

# Read the raw order lines. The file is semicolon-delimited and uses a comma
# as the decimal separator; the four columns are renamed positionally.
data_raw = pd.read_csv("LOCATION/FILE.csv", delimiter=";", decimal=",")
data_raw.columns = ["cust_id", "date", "ordnum", "rev"]

# Keep only rows with revenue greater than 0 and parse the order date.
# `.loc[...]` followed by `.assign(...)` produces a fresh DataFrame, so the
# date conversion does not write into a filtered slice of another frame
# (assigning a column on such a slice triggers SettingWithCopyWarning).
# NOTE(review): the date format is inferred automatically; if the file stores
# day-first dates, pass dayfirst=True or an explicit format= - confirm against
# the source data.
data = data_raw.loc[data_raw["rev"] > 0].assign(
    date=lambda df: pd.to_datetime(df["date"])
)

In [None]:
2. ### Creating RFM dataset ###

# Reference point for recency: the latest order date in the whole dataset
max_date = data['date'].max()

# Recency: whole days between each customer's last order and the reference
# date (the last-order date itself is kept as the 'date' column)
recency_df = (
    data.groupby('cust_id')['date']
    .max()
    .reset_index()
    .assign(recency=lambda last_orders: (max_date - last_orders['date']).dt.days)
)

# Frequency: number of distinct order numbers per customer
frequency_df = (
    data.groupby('cust_id')['ordnum']
    .nunique()
    .rename('frequency')
    .reset_index()
)

# Monetary: total revenue per customer
monetary_df = (
    data.groupby('cust_id')['rev']
    .sum()
    .rename('monetary')
    .reset_index()
)

# Combine the three metrics into data_rfm, one row per customer
# (column order: cust_id, frequency, date, recency, monetary)
data_rfm = (
    frequency_df
    .merge(recency_df, on='cust_id')
    .merge(monetary_df, on='cust_id')
)

In [None]:
3. ### Determining the Number of Clusters ###

# Feature matrix for clustering: only the three RFM metrics, in this order.
# NOTE(review): the columns are passed to KMeans unscaled although they are
# measured in different units (days, order counts, revenue) - confirm this is
# intended or consider standardizing them first.
rfm_feature_columns = ['recency', 'frequency', 'monetary']
data_rfm_elbow = data_rfm.loc[:, rfm_feature_columns]

def calculate_wss(data, k_max, random_state=1234):
    """Return the within-cluster sum of squares (WSS) for k = 1 .. k_max.

    One KMeans model is fitted per candidate cluster count and its
    ``inertia_`` is collected.

    Parameters
    ----------
    data : object accepted by ``KMeans.fit``
        Observations to cluster.
    k_max : int
        Largest number of clusters to evaluate (inclusive).
    random_state : int, optional
        Seed passed to every KMeans instance. Defaults to 1234, the value
        that was previously hard-coded, so existing calls behave the same.

    Returns
    -------
    list
        WSS values ordered by k; entry ``i`` belongs to ``k = i + 1``.
        The list is empty when ``k_max`` is smaller than 1.
    """
    return [
        KMeans(n_clusters=k, random_state=random_state).fit(data).inertia_
        for k in range(1, k_max + 1)
    ]

# Using the WSS function to find the optimal number of clusters:
# evaluate every k from 1 up to and including k_max
k_max = 5
wss = calculate_wss(data_rfm_elbow, k_max)

# Plotting the elbow plot.
# The x-axis is derived from len(wss) so it always has exactly one k per WSS
# value; plt.plot raises a ValueError when x and y differ in length
# (range(1, 5) only yields 4 values for the 5 WSS entries).
k_values = range(1, len(wss) + 1)
plt.figure(figsize=(10, 6))
plt.plot(k_values, wss, marker='o', linestyle='-', color='b')
plt.title('Elbow Method For Optimal k')
plt.xlabel('Number of clusters')
plt.ylabel('Within groups sum of squares')
plt.show()

In [None]:
4. ### Performing RFM Analysis ###

# Fit the final KMeans model on the RFM feature matrix.
# INSERT THE NUMBER OF CLUSTERS IN N_CLUSTERS BELOW
kmeans_basic = KMeans(n_clusters=5, random_state=1234)
kmeans_basic.fit(data_rfm_elbow)
cluster_labels = kmeans_basic.labels_

# Cluster assignment of every row as a stand-alone DataFrame
kmeans_basic_df = pd.DataFrame({'Cluster': cluster_labels})

# Number of rows assigned to each cluster, ordered by numeric cluster id
cluster_sizes = pd.Series(cluster_labels).value_counts().sort_index()

# Cluster centers (columns follow the feature order recency, frequency,
# monetary) together with the cluster size and the numeric cluster id.
# Sizes are attached by position, so row i carries the size of cluster i.
center_columns = ['Center_Recency', 'Center_Frequency', 'Center_Monetary']
kmeans_basic_centers = pd.DataFrame(
    kmeans_basic.cluster_centers_, columns=center_columns
).assign(
    GroupSize=cluster_sizes.values,
    Cluster_Label=lambda centers: centers.index,
)

# Attach the cluster assignment to the per-customer RFM table
data_rfm['Cluster'] = cluster_labels

In [None]:
5. ### Outlier Cluster Analysis ###

# Shared styling for all boxplots
sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)

# One boxplot per RFM metric, grouped by cluster, to inspect outliers.
# A single fill colour is requested with `color=`; passing a one-element
# `palette` without `hue` is deprecated in seaborn 0.13 and emits warnings.
for metric in ('recency', 'frequency', 'monetary'):
    metric_label = metric.capitalize()
    sns.boxplot(x='Cluster', y=metric, data=data_rfm, color="#F99E49")
    plt.title(f'Boxplot of {metric_label} by Cluster', fontsize=18, fontweight='bold')
    plt.xlabel('Cluster', fontsize=14, fontweight='bold')
    plt.ylabel(metric_label, fontsize=14, fontweight='bold')
    plt.show()

In [None]:
6. ### Rename the Cluster Labels ###

# Text label assigned to each numeric cluster id
cluster_name_mapping = {0: 'A', 4: 'B', 3: 'C', 2: 'D', 1: 'E'}

# Swap the numeric ids for their text labels in both tables:
# 'Cluster_Label' in the centers table and 'Cluster' in the per-customer table
for frame, label_column in (
    (kmeans_basic_centers, 'Cluster_Label'),
    (data_rfm, 'Cluster'),
):
    frame[label_column] = frame[label_column].replace(cluster_name_mapping)

In [None]:
7. ### Export Datasets ###

# Distinct customer ids from the transaction data, joined with each
# customer's RFM values and cluster label
data_distinct = data[['cust_id']].drop_duplicates()
data_rfm_with_cust_id = data_distinct.merge(data_rfm, on='cust_id')

# Export the combined table to CSV and to Excel, both without the index column
exports = (
    (data_rfm_with_cust_id.to_csv, 'LOCATION/data_rfm.csv'),
    (data_rfm_with_cust_id.to_excel, 'LOCATION/data_rfm.xlsx'),
)
for write, target in exports:
    write(target, index=False)