In [1]:
import pandas as pd
import tensorflow as tf

In [2]:
# Restrict TensorFlow's logger to ERROR-level messages so that lower-severity
# log output does not clutter the notebook.
tf.logging.set_verbosity(tf.logging.ERROR)

## Reading and preparing data

In [3]:
# Load the unclustered profiles (categorical columns: gender, goal, level)
# from a CSV file in the working directory and preview the first rows.
profiles = pd.read_csv('profiles_unclustered.csv')
profiles.head()

Unnamed: 0,gender,goal,level
0,m,fit,middle
1,f,lose,begin
2,f,fit,middle
3,m,muscle,begin
4,m,muscle,advance


Create functions for converting label data to numbers

In [4]:
def convert_gender(gender):
    """Map a gender label to its integer code.

    "m" -> 0 and "f" -> 1; any other value yields None.
    """
    codes = {label: code for code, label in enumerate(("m", "f"))}
    return codes.get(gender)

def convert_level(level):
    """Map a level label to its integer code.

    "begin" -> 0, "middle" -> 1, "advance" -> 2; any other value yields None.
    """
    codes = {label: code for code, label in enumerate(("begin", "middle", "advance"))}
    return codes.get(level)

def convert_goal(goal):
    """Map a goal label to its integer code.

    "fit" -> 0, "lose" -> 1, "muscle" -> 2; any other value yields None.
    """
    codes = {label: code for code, label in enumerate(("fit", "lose", "muscle"))}
    return codes.get(goal)

Create functions for converting numbers back to labels

In [5]:
def convert_gender_reverse(gender):
    """Map an integer gender code back to its label.

    0 -> "m" and 1 -> "f"; any other value yields None.
    """
    labels = dict(enumerate(("m", "f")))
    return labels.get(gender)

def convert_level_reverse(level):
    """Map an integer level code back to its label.

    0 -> "begin", 1 -> "middle", 2 -> "advance"; any other value yields None.
    """
    labels = dict(enumerate(("begin", "middle", "advance")))
    return labels.get(level)

def convert_goal_reverse(goal):
    """Map an integer goal code back to its label.

    0 -> "fit", 1 -> "lose", 2 -> "muscle"; any other value yields None.
    """
    labels = dict(enumerate(("fit", "lose", "muscle")))
    return labels.get(goal)

Convert the label data to numbers

In [6]:
# Replace each categorical column with its integer encoding, one column at a
# time, pairing every column with its own label -> code converter.
for column, encoder in (
    ('gender', convert_gender),
    ('goal', convert_goal),
    ('level', convert_level),
):
    profiles[column] = profiles[column].apply(encoder)

In [7]:
# Preview the frame after encoding: every column should now hold integer codes.
profiles.head()

Unnamed: 0,gender,goal,level
0,0,0,1
1,1,1,0
2,1,0,1
3,0,2,0
4,0,2,2


## Clustering process

In [8]:
from tensorflow.contrib.factorization import KMeansClustering

In [9]:
# 18 = 2 genders x 3 goals x 3 levels, i.e. one cluster per distinct encoded
# profile that the converters defined earlier can produce.
num_clusters = 18

# Full-batch k-means (no mini-batches) with k-means++ centre initialisation.
kmeans = KMeansClustering(
    num_clusters=num_clusters,
    initial_clusters=KMeansClustering.KMEANS_PLUS_PLUS_INIT,
    use_mini_batch=False,
)

In [10]:
def input_fn():
    """Build the input tensor consumed by the k-means estimator.

    Reads the module-level ``profiles`` frame, takes its three encoded
    feature columns and returns them as a float32 tensor that can be
    iterated for exactly one epoch.

    Returns:
        The tensor produced by ``tf.train.limit_epochs`` over the feature
        matrix (one row per profile, one column per feature).
    """
    # Select the features by name rather than feeding the whole frame:
    # `profiles` gains a `cluster` column once assignments are attached, and
    # that column must never be fed back to the estimator as a feature.
    feature_columns = ['gender', 'goal', 'level']
    # `.values` replaces `DataFrame.as_matrix()`, which was deprecated in
    # pandas 0.23 and removed in pandas 1.0.
    features = profiles[feature_columns].values
    tensor = tf.convert_to_tensor(features, dtype=tf.float32)
    return tf.train.limit_epochs(tensor, num_epochs=1)

In [11]:
# Fit the estimator on the data supplied by input_fn (a single epoch over the
# full feature tensor).
kmeans.train(input_fn=input_fn)

<tensorflow.contrib.factorization.python.ops.kmeans.KMeansClustering at 0x262a90c96d8>

After the training is completed, we are able to see the centers of the clusters

In [12]:
# Fetch the learned cluster centres and display them as the cell output.
centers = kmeans.cluster_centers()
centers

array([[1., 1., 1.],
       [0., 2., 0.],
       [0., 0., 1.],
       [0., 2., 2.],
       [1., 1., 0.],
       [0., 0., 2.],
       [1., 0., 1.],
       [1., 0., 0.],
       [0., 0., 0.],
       [1., 1., 2.],
       [0., 2., 1.],
       [0., 1., 0.],
       [1., 0., 2.],
       [0., 1., 1.],
       [1., 2., 1.],
       [1., 2., 2.],
       [1., 2., 0.],
       [0., 1., 2.]], dtype=float32)

## Assigning clusters 

In [13]:
# Materialise the predicted cluster index for the rows supplied by input_fn
# into a plain list so it can be attached to the DataFrame.
assignments = list(kmeans.predict_cluster_index(input_fn=input_fn))

In [14]:
# Attach each profile's cluster index as a `cluster` column. Plain column
# assignment appends the column after the existing ones on the first run and
# overwrites it on a re-run; `insert(..., allow_duplicates=True)` would
# silently add a second `cluster` column every time the cell is re-executed.
profiles['cluster'] = assignments

Convert number data back to labels

In [15]:
# Turn the integer codes back into their original labels, one column at a
# time, pairing every column with its own code -> label converter.
for column, decoder in (
    ('gender', convert_gender_reverse),
    ('goal', convert_goal_reverse),
    ('level', convert_level_reverse),
):
    profiles[column] = profiles[column].apply(decoder)

In [16]:
# Preview the final frame: original labels plus the assigned cluster index.
profiles.head()

Unnamed: 0,gender,goal,level,cluster
0,m,fit,middle,2
1,f,lose,begin,4
2,f,fit,middle,6
3,m,muscle,begin,1
4,m,muscle,advance,3


Save clustered dataset to separate file

In [17]:
# Write the labelled, clustered profiles to a comma-separated UTF-8 file;
# index=False keeps the DataFrame index out of the output.
profiles.to_csv("profiles_clustered.csv", sep=',', encoding='utf-8', index=False)