In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder, StandardScaler
from kneed import KneeLocator

In [None]:
# Read the users table from the project's data directory and display it.
users_csv_path = '../data/pp_users.csv'
users_df = pd.read_csv(users_csv_path)
users_df

In [None]:
# Work on a separate frame so `users_df` stays untouched; the integer-coded
# country is appended as a new `country_enc` column.
# NOTE(review): integer codes give countries an arbitrary order/distance that
# K-Means will treat as meaningful — confirm this is acceptable.
label_enc = LabelEncoder()
clusters_df = users_df.assign(
    country_enc=label_enc.fit_transform(users_df['country'])
)

# Feature matrix used for clustering: age plus the encoded country
feature_columns = ['age', 'country_enc']
X = clusters_df.loc[:, feature_columns]

# Standardise the features before running K-Means
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Elbow method: fit K-Means once per candidate cluster count and record the
# within-cluster sum of squares (the model's `inertia_`) for each fit.
cluster_range = range(1, 20)
wcss = []
for n_clusters in cluster_range:
    kmeans = KMeans(
        n_clusters=n_clusters,
        init='k-means++',
        max_iter=300,
        n_init=10,
        random_state=42,
    ).fit(X_scaled)
    wcss.append(kmeans.inertia_)

In [None]:
# Locate the elbow of the (decreasing, convex) WCSS curve automatically.
# NOTE(review): `elbow` is presumably None when no elbow is detected — confirm
# against the kneed documentation before relying on `k` downstream.
kl = KneeLocator(
    x=cluster_range,
    y=wcss,
    curve="convex",
    direction="decreasing",
)
k = kl.elbow

In [None]:
# Hand-picked number of clusters. Assigning `k` here replaces whatever value
# it held before this cell ran.
MANUAL_CLUSTER_COUNT = 10
k = MANUAL_CLUSTER_COUNT

In [None]:
# Plot the WCSS curve for every candidate cluster count
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(cluster_range, wcss, marker='o', linestyle='--')
ax.set_xticks(list(cluster_range))
ax.set_title('Elbow Method')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('WCSS')
ax.grid(True)

# Look WCSS up by cluster count instead of assuming the range starts at 1
wcss_by_k = dict(zip(cluster_range, wcss))

# `k` may have been overridden manually, so it is not necessarily the elbow
# that KneeLocator detected. Mark both so the legend does not mislabel the
# chosen value as the elbow.
detected_k = None if kl.elbow is None else int(kl.elbow)
if detected_k is not None and detected_k != k:
    ax.scatter(
        detected_k,
        wcss_by_k[detected_k],
        color='orange',
        s=100,
        zorder=3,
        label=f'Detected elbow (k={detected_k})',
    )

selected_label = 'Elbow Point' if detected_k == k else f'Selected k={k}'
ax.scatter(k, wcss_by_k[k], color='red', s=100, zorder=3, label=selected_label)
ax.legend()

plt.show()

In [None]:
# Fit the final K-Means model with the selected number of clusters
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10, random_state=0)
clusters = kmeans.fit_predict(X_scaled)

# Attach the labels by index, not by position. `clusters` follows the row
# order of `X` (the frame `X_scaled` was built from), while `clusters_df` is
# re-sorted below; a positional assignment would mislabel rows if this cell
# were run a second time.
clusters_df['cluster'] = pd.Series(clusters, index=X.index)

# Rebind instead of sorting in place so the cell stays safe to re-run
clusters_df = clusters_df.sort_values(by='cluster')

clusters_df

In [None]:
# Scatter of age against encoded country, coloured by assigned cluster
fig, ax = plt.subplots(figsize=(10, 6))

sns.scatterplot(
    data=clusters_df,
    x='age',
    y='country_enc',
    hue='cluster',
    palette="deep",
    edgecolor=".6",
    ax=ax,
)

ax.set_title('User Clusters based on Age and Country')
ax.set_xlabel('Age')
ax.set_ylabel('Encoded Country')
ax.legend(title='Cluster')
ax.grid(True)

plt.show()

In [None]:
# Write the clustered users to CSV without the helper `country_enc` column
# and without the DataFrame index.
export_df = clusters_df.drop(columns=['country_enc'])
export_df.to_csv('../data/clusters.csv', index=False)