In [1]:
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from joblib import Parallel, delayed, cpu_count
from sklearn.metrics import silhouette_score

In [2]:
#the train is already imputed
data_path = 'data/'
# Load the imputed datasets
df_train = pd.read_csv(data_path+'df_imputed_train.csv')
df_test = pd.read_csv(data_path+'df_imputed_test.csv')

In [3]:
#drop dates
df_train.drop(columns=['DELIVERY_START'], inplace=True)
df_test.drop(columns=['DELIVERY_START'], inplace=True)

In [4]:
# Dimension reduction
pca = PCA(n_components=0.99, svd_solver='full', random_state=42)
df_train_pca = pca.fit_transform(df_train)

In [5]:
# Choosing the optimal number of clusters using silhouette score
sil_scores = []
k_range = range(2, 50)

def compute_silhouette_score(k):
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(df_train_pca)
    return silhouette_score(df_train_pca, labels)

# Compute silhouette scores in parallel
try : 
    n_jobs = round(cpu_count() * 0.9, 0)  # Dynamically set the number of jobs to match the available CPUs
    print(f'Running on {n_jobs} cores.') #if your pc can't handle it, you can set n_jobs to 1 or 2
    sil_scores = Parallel(n_jobs=n_jobs)(delayed(compute_silhouette_score)(k) for k in k_range)
    print('Parallel processing completed.')
except : 
    print('Parallel processing failed. Running on a single core. It might take longer. Buy a better pc dumbass.')
    sil_scores = []
    print('Running on 1 core.')
    sil_scores = [compute_silhouette_score(k) for k in k_range]
    print('Single core processing completed.')

max_silhouette_score = max(sil_scores)
optimal_k = k_range[sil_scores.index(max_silhouette_score)]

# Plot the silhouette scores
fig = px.line(x=k_range, y=sil_scores, markers=True, labels={'x': 'Number of clusters (k)', 'y': 'Silhouette score'}, title='Silhouette Method for Optimal k')
fig.show()


Running on 14.0 cores.
Parallel processing completed.


In [13]:
# Apply KMeans clustering

#We want each cluster to represent at least 1% of the dataset, to keep a sort of "representativeness"
threshold_ratio = 0.01
optimal_k_tested = optimal_k

for i in range(optimal_k, 0, -1):
    kmeans = KMeans(n_clusters=i, random_state=42)
    kmeans.fit(df_train_pca)

    df_train_cluster = pd.DataFrame(df_train, columns=[f'feature_{x}' for x in range(df_train.shape[1])])
    df_train_cluster['cluster'] = kmeans.labels_

    too_small = False
    for y in range(i):
        cluster_size = len(df_train_cluster[df_train_cluster['cluster'] == y])
        if cluster_size < len(df_train_cluster) * threshold_ratio:
            print(f'Not keeping k={i}, because cluster {y} is too small ({cluster_size})')
            too_small = True
            break  # No need to check other clusters

    if not too_small:
        optimal_k_tested = i
        break  # Found the best k where all clusters are large enough

print(f"Final selected k: {optimal_k_tested}")
print(f"Cluster sizes for k={optimal_k_tested}:")
for y in range(optimal_k_tested):
    cluster_size = len(df_train_cluster[df_train_cluster['cluster'] == y])
    print(f'Cluster {y+1}: {cluster_size}')

# Apply KMeans with the final selected k
df_train = pd.DataFrame(df_train, columns=[f'feature_{x}' for x in range(df_train.shape[1])])
df_train['cluster'] = kmeans.labels_

# graph the clusters distribution
fig = px.histogram(df_train, x='cluster', nbins=optimal_k_tested, title='Distribution of Clusters')
fig.update_layout(
    xaxis_title='Cluster',
    yaxis_title='Frequency',
    bargap=0.2
)
fig.show()

Final selected k: 2
Cluster sizes for k=2:
Cluster 1: 5392
Cluster 2: 5195


In [16]:
df_train_cluster

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,feature_50,cluster
0,,,,,,,,,,,...,,,,,,,,,,1
1,,,,,,,,,,,...,,,,,,,,,,1
2,,,,,,,,,,,...,,,,,,,,,,1
3,,,,,,,,,,,...,,,,,,,,,,1
4,,,,,,,,,,,...,,,,,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10582,,,,,,,,,,,...,,,,,,,,,,1
10583,,,,,,,,,,,...,,,,,,,,,,1
10584,,,,,,,,,,,...,,,,,,,,,,1
10585,,,,,,,,,,,...,,,,,,,,,,1


In [19]:
len(df_train_pca[0])

21