In [13]:
import numpy as np
import pandas as pd

## Defining a function to evaluate clustering algorithm performances

In [18]:
############################################################################################
# 
# Inputs
#
# cluster_labels --> a numpy array containing cluster labels for people
# in the order of the original file
#
# hand_label_info --> the dataframe version of 'okcupid-cleaned-dropped-10k-handlabels.csv'
#
# ==========================================================================================
# 
# Outputs (in order)
#
# sim_count --> the number of pairs (out of 50) which are correctly 
# assigned to the same cluster
# 
# dissim_count --> the number of pairs (out of 51) which are correctly
# assigned to different clusters
#
############################################################################################


def eval_performance(cluster_labels, hand_label_info):
    
    # add the row number to each row of the hand_label_info df
    hand_label_info['index'] = range(0, len(hand_label_info))
    
    # get all 202 rows that have labels
    labeled_rows = hand_label_info[~hand_label_info['hand-label'].isnull()]
    
    # get all 100 rows with similar pairs of people
    sim = labeled_rows[labeled_rows['hand-label'].str.isnumeric()]
    
    # get all 102 rows with dissimilar pairs of people
    dissim = labeled_rows[~labeled_rows['hand-label'].str.isnumeric()]
    
    # sort both so similar / dissimilar pairs are consecutive
    sim = sim.sort_values(by=['hand-label'])
    dissim = dissim.sort_values(by=['hand-label'])
    
    # count the number of similar people who ended up in the same cluster
    
    # loop through every pair of similar people
    
    i = 0
    sim_count = 0
    
    while i + 1 < len(sim):
        
        person1 = sim.iloc[i]
        person2 = sim.iloc[i+1]
        
        # get the (original) row number associated with each person
        index1 = person1['index']
        index2 = person2['index']
        
        # increment the count if the two people's cluster assignments are the same
        if cluster_labels[index1] == cluster_labels[index2]:
            sim_count += 1
        
        # move to the next pair
        i += 2
 
    
    # count the number of dissimilar people who ended up in different clusters
    
    # loop through every pair of dissimilar people
    
    j = 0
    dissim_count = 0
    
    while j + 1 < len(dissim):
        
        person1 = dissim.iloc[j]
        person2 = dissim.iloc[j+1]
        
        # get the (original) row number associated with each person
        index1 = person1['index']
        index2 = person2['index']
        
        # increment the count if the two people's cluster assignments are different
        if cluster_labels[index1] != cluster_labels[index2]:
            dissim_count += 1
        
        # move to the next pair
        j += 2
        
    # print the performance 
    print(str(sim_count) + " out of 50 or " + str(round((sim_count/50)*100, 2)) + "% of similar pairs were in the same cluster")
    print(str(dissim_count) + " out of 51 or " + str(round((dissim_count/51)*100, 2)) + "% of dissimilar pairs were in different clusters")
    
    return sim_count, dissim_count
    

## Evaluating K-means performance

In [19]:
# Using the function with Andrew's K-means output

# loading cluster labels
kmeans_cluster_labels = ((pd.read_csv('12_15_kmeans_label.csv'))['0']).to_numpy()
display(kmeans_cluster_labels)

# loading hand label info
hand_label_info = pd.read_csv('okcupid-cleaned-dropped-10k-handlabels.csv')
display(hand_label_info)

eval_clustering_performance(kmeans_cluster_labels, hand_label_info)


array([11, 42, 36, ..., 41, 49,  6])

Unnamed: 0.1,Unnamed: 0,age,sex,orientation,body_type,diet,drinks,drugs,education,ethnicity,...,portuguese,russian,spanish,turkish,has kids,wants kids,likes dogs,likes cats,new_religion,hand-label
0,30518.0,26.0,f,straight,5.0,vegan,2.0,sometimes,graduated from college/university,white,...,no,no,yes,no,no,yes,yes,yes,agnosticism,T
1,56256.0,38.0,f,straight,5.0,anything,2.0,never,working on grad,white,...,no,no,no,no,no,no,yes,no,none,
2,31846.0,38.0,m,straight,5.0,anything,3.0,never,graduated from college/university,white,...,no,no,no,no,no,no,no,no,none,
3,38659.0,38.0,f,straight,4.0,anything,0.0,never,graduated from college/university,asian,...,no,no,no,no,no,no,no,no,catholicism,
4,20309.0,24.0,f,gay,4.0,anything,2.0,sometimes,graduated from high school,white,...,no,no,no,no,no,no,no,no,other,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,32132.0,22.0,f,bisexual,5.0,vegetarian,2.0,sometimes,graduated from college/university,black,...,no,no,no,no,no,no,yes,no,none,
9997,8119.0,22.0,m,straight,5.0,anything,2.0,sometimes,working on college/university,white,...,no,no,no,no,no,no,yes,yes,agnosticism,
9998,7887.0,20.0,f,straight,2.0,other,1.0,never,working on college/university,mixed,...,no,no,no,no,no,yes,yes,no,christianity,
9999,13868.0,27.0,f,straight,2.0,anything,2.0,never,graduated from college/university,white,...,no,no,no,no,no,no,yes,no,none,


16 out of 50 or 32.0% of similar pairs were in the same cluster
51 out of 51 or 100.0% of dissimilar pairs were in different clusters


(16, 51)

## Defining a function to evaluate K-nearest neighbor algorithm performances

In [26]:
############################################################################################
# 
# Inputs
#
# neighbor_matrix --> a numpy matrix with dimensions n x n where n is the number of peeps
# neighbor_matrix[i][j] = 1 if the person at index j is a neighbor of the person at index i
#
# hand_label_info --> the dataframe version of 'okcupid-cleaned-dropped-10k-handlabels.csv'
#
# ==========================================================================================
# 
# Outputs (in order)
#
# sim_count --> the number of pairs (out of 50) which are correctly 
# assigned to the same cluster
# 
# dissim_count --> the number of pairs (out of 51) which are correctly
# assigned to different clusters
#
############################################################################################


def eval_neighbors_performance(neighbor_matrix, hand_label_info):
    
    # add the row number to each row of the hand_label_info df
    hand_label_info['index'] = range(0, len(hand_label_info))
    
    # get all 202 rows that have labels
    labeled_rows = hand_label_info[~hand_label_info['hand-label'].isnull()]
    
    # get all 100 rows with similar pairs of people
    sim = labeled_rows[labeled_rows['hand-label'].str.isnumeric()]
    
    # get all 102 rows with dissimilar pairs of people
    dissim = labeled_rows[~labeled_rows['hand-label'].str.isnumeric()]
    
    # sort both so similar / dissimilar pairs are consecutive
    sim = sim.sort_values(by=['hand-label'])
    dissim = dissim.sort_values(by=['hand-label'])
    
    # count the number of similar people who ended up in the same cluster
    
    # loop through every pair of similar people
    
    i = 0
    sim_count = 0
    
    while i + 1 < len(sim):
        
        person1 = sim.iloc[i]
        person2 = sim.iloc[i+1]
        
        # get the (original) row number associated with each person
        index1 = person1['index']
        index2 = person2['index']
        
        # increment the count if the two people's cluster assignments are the same
        if neighbor_matrix[index1][index2] == 1:
            sim_count += 1
        
        # move to the next pair
        i += 2
 
    
    # count the number of dissimilar people who ended up in different clusters
    
    # loop through every pair of dissimilar people
    
    j = 0
    dissim_count = 0
    
    while j + 1 < len(dissim):
        
        person1 = dissim.iloc[j]
        person2 = dissim.iloc[j+1]
        
        # get the (original) row number associated with each person
        index1 = person1['index']
        index2 = person2['index']
        
        # increment the count if the two people's cluster assignments are different
        if neighbor_matrix[index1,index2] != 1:
            dissim_count += 1
        
        # move to the next pair
        j += 2
        
    # print the performance 
    print(str(sim_count) + " out of 50 or " + str(round((sim_count/50)*100, 2)) + "% of similar pairs were in the same cluster")
    print(str(dissim_count) + " out of 51 or " + str(round((dissim_count/51)*100, 2)) + "% of dissimilar pairs were in different clusters")
    
    return sim_count, dissim_count
    

## Evaluating K nearest neighbors (using Gower distance) performance

In [25]:
from sklearn.neighbors import NearestNeighbors
import gower

train_df = pd.read_csv('okcupid-cleaned-dropped-10k-handlabels.csv')
train_df = train_df.drop(['sex', 'orientation', 'Unnamed: 0', 'hand-label'], axis=1)
train_df = train_df[:-1]

print(train_df.columns[train_df.isna().any()].tolist())

dist_matrix = gower.gower_matrix(train_df)

neigh = NearestNeighbors(n_neighbors=50, metric="precomputed")
neigh.fit(dist_matrix)

neighbor_matrix = (neigh.kneighbors_graph(mode="distance")).toarray()

[]


In [27]:
eval_neighbors_performance(neighbor_matrix, hand_label_info)

0 out of 50 or 0.0% of similar pairs were in the same cluster
51 out of 51 or 100.0% of dissimilar pairs were in different clusters


(0, 51)

In [56]:
indices = np.nonzero(neighbor_matrix[1917])
hand_label_info['index'] = range(0, len(hand_label_info))

print(hand_label_info.iloc[indices[0][1]])

display(hand_label_info.iloc[indices])

Unnamed: 0                                58115.0
age                                          30.0
sex                                             f
orientation                              straight
body_type                                     5.0
diet                                     anything
drinks                                        2.0
drugs                                       never
education       graduated from college/university
ethnicity                                   white
height                                       69.0
job                             medicine / health
sign                              does_not_matter
smokes                                         no
arabic                                         no
chinese                                        no
english                                       yes
german                                         no
hindi                                          no
italian                                        no


Unnamed: 0.1,Unnamed: 0,age,sex,orientation,body_type,diet,drinks,drugs,education,ethnicity,...,russian,spanish,turkish,has kids,wants kids,likes dogs,likes cats,new_religion,hand-label,index
180,1553.0,31.0,m,straight,5.0,anything,2.0,never,graduated from college/university,white,...,no,no,no,no,no,yes,no,none,,180
305,58115.0,30.0,f,straight,5.0,anything,2.0,never,graduated from college/university,white,...,no,no,no,no,no,yes,no,agnosticism,,305
417,24638.0,29.0,m,straight,3.0,anything,2.0,never,graduated from college/university,white,...,no,no,no,no,no,yes,no,none,,417
1104,12655.0,31.0,m,straight,5.0,anything,2.0,never,graduated from college/university,white,...,no,no,no,no,no,yes,no,none,,1104
1343,12300.0,48.0,m,straight,5.0,anything,2.0,never,graduated from college/university,white,...,no,no,no,no,no,yes,no,catholicism,,1343
1365,56266.0,33.0,m,straight,5.0,anything,2.0,never,graduated from college/university,white,...,no,no,no,no,no,yes,no,none,,1365
1619,1693.0,28.0,m,straight,5.0,anything,2.0,never,graduated from college/university,white,...,no,no,no,no,no,yes,no,none,,1619
1670,38692.0,30.0,m,straight,5.0,anything,2.0,never,graduated from college/university,asian,...,no,no,no,no,no,yes,no,none,,1670
1731,58374.0,25.0,m,straight,5.0,anything,2.0,never,graduated from college/university,white,...,no,no,no,no,no,yes,no,catholicism,,1731
1996,50306.0,27.0,f,straight,5.0,anything,2.0,never,graduated from college/university,white,...,no,no,no,no,no,yes,no,catholicism,,1996
