https://www.analyticsvidhya.com/blog/2021/06/kmodes-clustering-algorithm-for-categorical-data/

In [1]:
# importing necessary libraries
import pandas as pd
import numpy as np
from kmodes.kmodes import KModes
from kneed import KneeLocator
import datetime

In [2]:
# Read the data provided
data = pd.read_csv('person_data.csv')

In [3]:
# Input from the user, what to take into consideration when forming the groups
cols = ['Experience (Years)', 'Specialty', 'Major']
data2 = data[cols]

In [4]:
def find_num_clusters(min_clusters, max_clusters, data):
    # Calculate the average cost for each number of clusters
    cost = []
    lowest = 1000000
    K = range(1,10)
    for num_clusters in list(K):
        kmode = KModes(n_clusters=num_clusters, init = "random", n_init = 5, verbose=1)
        kmode.fit_predict(data)
        if kmode.cost_ < lowest:
            lowest = kmode.cost_
        cost.append(kmode.cost_)

    # Elbow curve to find optimal number of clusters
    kneedle = KneeLocator(K, cost, S=1.0, curve="convex", direction="decreasing")
    return kneedle.elbow, lowest

In [5]:
def find_initial_centroids(lowest, n_clusters, data):
    # Find the lowest cost for each group of initial centroids
    # and return the best one
    current = lowest * 1.5
    time_before = datetime.datetime.now()
    while (current > lowest * 1.15):
        kmode = KModes(n_clusters=n_clusters, init = "random", n_init = 5, verbose=1)
        kmode.fit_predict(data)
        if kmode.cost_ < current:
            current = kmode.cost_
            centroids = kmode._enc_cluster_centroids
        if (datetime.datetime.now() - time_before).seconds > 30:
            break
    return centroids

In [6]:
def make_clusters(n_clusters, centroids, data):
    # Build the model with the best number of clusters
    # and the initial centroids selected
    kmode = KModes(n_clusters=n_clusters, init = centroids, n_init = 5, verbose=1)
    clusters = kmode.fit_predict(data)
    
    return clusters

In [7]:
def group_people(data, groups):
    # Iterate through each person in the dataframe and assign them to
    # a group based on the cluster they are in
    person_index = 0
    
    while person_index < data.shape[0]:
        for i in range(0, len(groups)):
            if person_index >= data.shape[0]:
                break
            data.loc[person_index, 'Group'] = groups[i]
            person_index += 1
        i = 0

    return data

In [8]:
n_clusters, lowest_cost = find_num_clusters(1, 10, data=data2)

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 0, cost: 697.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 0, cost: 697.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 0, cost: 697.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 0, cost: 697.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 5, iteration: 1/100, moves: 0, cost: 697.0
Best run was number 1
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 29, cost: 608.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 21, cost: 617.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, 

In [9]:
best_centroids = find_initial_centroids(lowest=lowest_cost, n_clusters=n_clusters, data=data2)

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 0, cost: 534.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 87, cost: 497.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 43, cost: 519.0
Run 3, iteration: 2/100, moves: 40, cost: 505.0
Run 3, iteration: 3/100, moves: 15, cost: 505.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 45, cost: 511.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 5, iteration: 1/100, moves: 79, cost: 472.0
Run 5, iteration: 2/100, moves: 2, cost: 472.0
Best run was number 5
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 87, cost: 489.0
Init: initializing centroids
Init: initializing clusters
Starting it

In [10]:
data['Cluster'] = make_clusters(n_clusters=n_clusters, centroids=best_centroids, data=data2)

Initialization method and algorithm are deterministic. Setting n_init to 1.
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 0, cost: 458.0


In [11]:
# Save the data
data.to_csv('clustered_data.csv')

In [12]:
# Read the saved data
clustered_data = pd.read_csv('clustered_data.csv')
clustered_data.drop(columns=['Unnamed: 0'], inplace=True)

In [13]:
# User input: number of people per group
people_group = 5

# Find number of groups and make a list of those groups
rows = clustered_data.shape[0]
number_groups = int(rows/people_group)
groups = list(range(1, number_groups))

In [14]:
clustered_data = clustered_data.sort_values(by='Cluster')
clustered_data['Group'] = -1

In [15]:
grouped_data = group_people(data=clustered_data, groups=groups)

In [16]:
clustered_data.sort_values(by='Group',inplace=True)

In [19]:
clustered_data.drop(columns=['Cluster'], in)

Unnamed: 0,Name,Experience (Years),Age,Specialty,Track,Major,Minor,Group
0,Alice Chandler,2,18,Back-end,Beginner,Cognitive Science,Communication,1
290,Edward Newman,4,19,Design,Beginner,Psychology,Physics,1
174,Brenda Haley,3,21,Full-Stack,Policy,Computer Science,Communication,1
232,Cameron Fritz,2,21,AI,Beginner,Computer Science,Finance,1
58,Adam Thomas,2,22,Back-end,Policy,Mechanical Engineering,Physics,1
...,...,...,...,...,...,...,...,...
289,John Duran,4,17,Business,General,Psychology,Communication,58
115,John Wall,3,23,AI,General,Math,,58
57,Billy Levine,1,19,AI,Beginner,Computer Engineering,,58
231,Joseph Nelson,1,17,Full-Stack,Beginner,Computer Engineering,,58
