In [1]:
import pandas as pd
from sklearn.cluster import DBSCAN

from gower.gower_dist import *

# Simple example from D'Orazio

In [2]:
# Toy frame for the worked example: later cells compare the (Sex1, Age1)
# columns against the (Sex2, Age2) columns of the same row.
df = pd.DataFrame(
    {
        "Sex1": list("MMFFFMMFFF"),
        "Sex2": list("MMFFFFFMMM"),
        "Age1": 10 * [15],
        "Age2": 2 * [15, 36, 58, 78, 100],
    }
)

In [3]:
# Gower distances with weight="uniform". Only the diagonal of the returned
# matrix is kept -- presumably entry [i, i] is the distance between row i's
# (Sex1, Age1) and row i's (Sex2, Age2); TODO confirm against gower_matrix.
distances = np.diag(
    gower_matrix(
        df[["Sex1", "Age1"]].to_numpy(),
        df[["Sex2", "Age2"]].to_numpy(),
        weight="uniform",
    )
)
print(distances)
# argsort returns row *positions*, so index positionally with .iloc (the
# original .loc only worked because the index is a default RangeIndex).
# A stable sort keeps tied rows in their original order on every run.
df.iloc[np.argsort(distances, kind="stable")]

  0%|          | 0/10 [00:00<?, ?it/s]

[0.         0.12352941 0.25294118 0.37058824 0.5        0.5
 0.62352941 0.75294118 0.87058824 1.        ]


Unnamed: 0,Sex1,Sex2,Age1,Age2
0,M,M,15,15
1,M,M,15,36
2,F,F,15,58
3,F,F,15,78
4,F,F,15,100
5,M,F,15,15
6,M,F,15,36
7,F,M,15,58
8,F,M,15,78
9,F,M,15,100


In [4]:
# Same comparison as the previous cell, but without a `weight` argument, so
# gower_matrix falls back to its default weighting. Only the diagonal of the
# returned matrix is kept.
distances = np.diag(
    gower_matrix(
        df[["Sex1", "Age1"]].to_numpy(),
        df[["Sex2", "Age2"]].to_numpy(),
    )
)
print(distances)
# argsort returns row *positions*, so index positionally with .iloc (the
# original .loc only worked because the index is a default RangeIndex).
# A stable sort keeps tied rows in their original order on every run.
df.iloc[np.argsort(distances, kind="stable")]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

[0.         0.17758292 0.36362217 0.53274876 0.71878801 0.28121199
 0.45879491 0.64483416 0.81396075 1.        ]


Unnamed: 0,Sex1,Sex2,Age1,Age2
0,M,M,15,15
1,M,M,15,36
5,M,F,15,15
2,F,F,15,58
6,M,F,15,36
3,F,F,15,78
7,F,M,15,58
4,F,F,15,100
8,F,M,15,78
9,F,M,15,100


# Metrics

### We can also use the categorical weight function to measure cluster quality and thereby tune clustering algorithms such as DBSCAN.

##### I calculated Gower distances for each of the three datasets below using the new weighting scheme and uniform weighting, and then ran a grid search for each of the resulting sets of distances with DBSCAN, scoring each run with the new weighting formula for categorical variables. The new scheme yielded higher maximum performance across all three datasets.

# Dog adoption dataset

In [5]:
# Read the dog descriptions (the CSV's first column is consumed as the initial
# index) and then re-key the rows by the "id" column.
dd = pd.read_csv("dog_adoption/allDogDescriptions.csv", index_col=0).set_index("id", drop=True)


# get rid of columns with the most destructive nans

def isolate_nans(column, nan_mask=None, columns=None):
    """Count rows where `column` is NaN while the rest of the row is comparatively complete.

    A row counts when the number of NaNs in all *other* columns is strictly
    below the median of that per-row count, and `column` itself is NaN there.

    Parameters
    ----------
    column : label
        Column whose NaNs are being scored.
    nan_mask : pandas.DataFrame of bool, optional
        Missing-value mask (e.g. ``frame.isna()``). Defaults to the notebook-level
        ``nans`` so the existing ``isolate_nans(c)`` calls keep working.
    columns : iterable of labels, optional
        Columns to consider. Defaults to the notebook-level ``cols``.

    Returns
    -------
    integer
        Number of rows satisfying both conditions.
    """
    # Fall back to the notebook globals only when the caller did not pass data
    # explicitly; this keeps the old single-argument call form valid.
    if nan_mask is None:
        nan_mask = nans
    if columns is None:
        columns = cols

    other_columns = [name for name in columns if name != column]
    other_nan_counts = nan_mask[other_columns].sum(axis=1)
    mostly_complete = other_nan_counts < np.median(other_nan_counts)
    return (mostly_complete & nan_mask[column]).sum()


# Score every column with isolate_nans, then drop the columns whose score is
# above the median of the non-zero scores (each dropped name is printed).
nans = dd.isna()
cols = dd.columns
isolated_nans = {}
for name in cols:
    isolated_nans[name] = isolate_nans(name)

positive_counts = [n for n in isolated_nans.values() if n > 0]
median_count = np.median(positive_counts)

columns_to_drop = [name for name, n in isolated_nans.items() if n > median_count]
for name in columns_to_drop:
    print(name)

# Remaining rows that still contain a NaN are discarded.
# (The original left `dd.mode().iloc[0]` as a trailing note -- presumably a
# mode-imputation alternative that was considered.)
dd = dd.drop(columns=columns_to_drop).dropna()

# make bools strings
# (presumably so gower_matrix treats them as categorical values -- TODO confirm)

is_bool = dd.dtypes == bool
# Cast with astype instead of `dd.loc[:, is_bool] = ...`: assigning strings
# into bool columns through .loc is an incompatible-dtype setitem, which recent
# pandas versions deprecate. astype also handles "no bool columns" cleanly.
dd = dd.astype({name: str for name in dd.columns[is_bool]})

# Pairwise Gower distances over the feature columns. Cluster columns written by
# an earlier run of this cell are excluded, so re-running the cell does not
# feed the previous labels back into the distance computation.
dog_features = dd.drop(columns=["gower_dbscan", "label_count"], errors="ignore")
matrix = gower_matrix(dog_features.to_numpy(), chunksize=20)

# Grid search over eps = 0.01 .. 1.00 with min_samples fixed at 1. Each result
# is indexed as (params, score) below; the highest score wins.
samples = [{"eps": z / 100, "min_samples": 1} for z in range(1, 101)]
results = process_map(partial(do_it, matrix=matrix), samples, chunksize=1)
best_params = max(results, key=lambda z: z[1])
print(best_params)

# Refit with the winning parameters on the precomputed distance matrix.
dd["gower_dbscan"] = DBSCAN(metric="precomputed", **best_params[0]).fit_predict(matrix)
print(get_cat_weight(dd["gower_dbscan"]))

# Histogram of cluster sizes: (distinct sizes, number of clusters with that size).
# Indexing [1] avoids assigning to `_`, which IPython uses for the last output.
counts = np.unique(dd["gower_dbscan"], return_counts=True)[1]
print(np.unique(counts, return_counts=True))

# Cluster size per row. Use the group size directly instead of counting the
# non-null values of every column and keeping an arbitrary first one.
dd["label_count"] = dd.groupby("gower_dbscan")["gower_dbscan"].transform("size")
dd = dd.sort_values(["label_count", "gower_dbscan"], ascending=[False, True])
dd


breed_secondary
color_primary
color_secondary
color_tertiary
declawed
env_cats


  0%|          | 0/28 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/13729 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

({'eps': 0.51, 'min_samples': 1}, 0.9126212324948002)
0.9126212324948002
(array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  35,  36,  37,  38,  39,  40,
        41,  42,  43,  44,  45,  46,  47,  48,  53,  55,  58,  60,  61,
        63,  65,  69,  70,  72,  73,  82,  83,  84,  92, 102, 104, 108,
       129, 132, 141, 146, 154, 182, 252]), array([630, 305, 165, 138, 106,  77,  63,  42,  57,  33,  35,  35,  27,
        22,  17,  15,  16,  10,  13,  11,  15,   8,   8,   4,   7,   4,
         4,   5,   5,   3,   7,   3,   2,   3,   2,   1,   3,   4,   2,
         6,   1,   1,   3,   1,   1,   2,   2,   2,   1,   2,   1,   2,
         1,   1,   1,   2,   1,   1,   1,   1,   1,   1,   1,   1,   1,
         1,   1,   1,   1,   1,   1,   1]))


Unnamed: 0_level_0,org_id,url,type.x,species,breed_primary,breed_mixed,breed_unknown,age,sex,size,...,contact_city,contact_state,contact_zip,contact_country,stateQ,accessed,type.y,description,gower_dbscan,label_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
46021535,TX2394,https://www.petfinder.com/dog/brody-bear-46021...,Dog,Dog,Schnauzer,True,False,Baby,Male,Small,...,Brattleboro,VT,05301,US,CT,2019-09-20,Dog,Howdy! Caught you looking! My name is Brody! ...,229,252
46021378,TX2394,https://www.petfinder.com/dog/brody-46021378/c...,Dog,Dog,Schnauzer,True,False,Baby,Male,Small,...,Groton,CT,06340,US,CT,2019-09-20,Dog,Howdy! Caught you looking! My name is Brody! ...,229,252
46021361,TX2394,https://www.petfinder.com/dog/daisey-may-46021...,Dog,Dog,Weimaraner,True,False,Young,Female,Medium,...,Brattleboro,VT,05301,US,CT,2019-09-20,Dog,Howdy! Caught you looking! My name is Daisey M...,229,252
46021331,TX2394,https://www.petfinder.com/dog/daisey-mae-46021...,Dog,Dog,Weimaraner,True,False,Young,Female,Medium,...,Boston,MA,02128,US,CT,2019-09-20,Dog,Howdy! Caught you looking! My name is Daisey M...,229,252
46021276,TX2394,https://www.petfinder.com/dog/daisey-mae-3-460...,Dog,Dog,Weimaraner,True,False,Young,Female,Medium,...,Groton,CT,06340,US,CT,2019-09-20,Dog,Howdy! Caught you looking! My name is Daisey M...,229,252
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45070846,WV80,https://www.petfinder.com/dog/manie-45070846/w...,Dog,Dog,Terrier,True,False,Adult,Male,Small,...,Scott Depot,WV,25560,US,WV,2019-09-20,Dog,Manie's birthday is 6/21/2018. He likes attent...,1946,1
44531731,WV04,https://www.petfinder.com/dog/coco-44531731/wv...,Dog,Dog,Pit Bull Terrier,True,False,Adult,Female,Medium,...,Fairmont,WV,26554,US,WV,2019-09-20,Dog,If you'd like to learn more about one of our a...,1947,1
44175060,WV37,https://www.petfinder.com/dog/skylee-44175060/...,Dog,Dog,Pug,True,False,Senior,Female,Medium,...,Dellslow,WV,26531,US,WV,2019-09-20,Dog,"Hi Skylee !! ??\nSkylee is a senior girl, who ...",1948,1
38637037,WV193,https://www.petfinder.com/dog/rocky-38637037/w...,Dog,Dog,Beagle,False,False,Adult,Male,Small,...,Belington,WV,26250,US,WV,2019-09-20,Dog,"Meet Rocky, He is a 9 year old male Chihuahua ...",1951,1


# Pokemon dataset

In [6]:
pokemon = pd.read_csv("pokemon/pokemon_alopez247.csv")

# Fill missing values column by column: object columns get the literal string
# "none", every other column gets its NaN-ignoring median.
missing = pokemon.isna()
for name, dtype in pokemon.dtypes.items():
    if dtype != object:
        pokemon.loc[missing[name], name] = np.nanmedian(pokemon[name])
for name, dtype in pokemon.dtypes.items():
    if dtype == object:
        pokemon.loc[missing[name], name] = "none"

# Pairwise Gower distances over the feature columns. Cluster columns written by
# an earlier run of this cell are excluded, so re-running the cell does not
# feed the previous labels back into the distance computation.
pokemon_features = pokemon.drop(columns=["gower_dbscan", "label_count"], errors="ignore")
matrix2 = gower_matrix(pokemon_features.to_numpy(), chunksize=20)

# Grid search over eps with min_samples fixed at 1; results are indexed as
# (params, score) below and the highest score wins.
# NOTE(review): this grid runs eps from 0.01 up to 10.00, whereas the other
# dataset cells stop at 1.00. If these distances are bounded by 1, most of the
# grid is redundant -- confirm and shrink to range(1, 101) if so.
samples = [{"eps": z / 100, "min_samples": 1} for z in range(1, 1001)]
results2 = process_map(partial(do_it, matrix=matrix2), samples, chunksize=20)
best_params2 = max(results2, key=lambda z: z[1])
print(best_params2)

# Refit with the winning parameters on the precomputed distance matrix.
pokemon["gower_dbscan"] = DBSCAN(metric="precomputed", **best_params2[0]).fit_predict(matrix2)

# Histogram of cluster sizes: (distinct sizes, number of clusters with that size).
# Indexing [1] avoids assigning to `_`, which IPython uses for the last output.
counts2 = np.unique(pokemon["gower_dbscan"], return_counts=True)[1]
print(np.unique(counts2, return_counts=True))

# Cluster size per row. Use the group size directly instead of counting the
# non-null values of every column and keeping an arbitrary first one.
pokemon["label_count"] = pokemon.groupby("gower_dbscan")["gower_dbscan"].transform("size")
pokemon = pokemon.sort_values(["label_count", "gower_dbscan"], ascending=[False, True])
pokemon


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/721 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

({'eps': 0.14, 'min_samples': 1}, 0.43871969037222813)
(array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 12, 13, 23, 33, 62]), array([301,  64,  25,   2,   4,   1,   2,   1,   2,   1,   1,   1,   1,
         1]))


Unnamed: 0,Number,Name,Type_1,Type_2,Total,HP,Attack,Defense,Sp_Atk,Sp_Def,...,Pr_Male,Egg_Group_1,Egg_Group_2,hasMegaEvolution,Height_m,Weight_kg,Catch_Rate,Body_Style,gower_dbscan,label_count
18,19,Rattata,Normal,none,253,30,56,35,25,35,...,0.50,Field,none,False,0.30,3.5,255,quadruped,6,62
19,20,Raticate,Normal,none,413,55,81,60,50,70,...,0.50,Field,none,False,0.71,18.5,127,quadruped,6,62
24,25,Pikachu,Electric,none,320,35,55,40,50,50,...,0.50,Field,Fairy,False,0.41,6.0,190,quadruped,6,62
25,26,Raichu,Electric,none,485,60,90,55,90,80,...,0.50,Field,Fairy,False,0.79,30.0,75,bipedal_tailed,6,62
36,37,Vulpix,Fire,none,299,38,41,40,50,65,...,0.25,Field,none,False,0.61,9.9,190,quadruped,6,62
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
716,717,Yveltal,Dark,Flying,680,126,131,95,131,98,...,0.50,Undiscovered,none,False,5.79,203.0,45,two_wings,402,1
717,718,Zygarde,Dragon,Ground,600,108,100,121,81,95,...,0.50,Undiscovered,none,False,5.00,305.0,3,serpentine_body,403,1
718,719,Diancie,Rock,Fairy,600,50,100,150,100,150,...,0.50,Undiscovered,none,True,0.71,8.8,3,head_arms,404,1
719,720,Hoopa,Psychic,Ghost,600,80,110,60,150,130,...,0.50,Undiscovered,none,False,0.51,9.0,3,head_only,405,1


# Laptops dataset

In [7]:
# Load the laptop listings; the CSV's first column becomes the index and is
# therefore not part of the columns used further down.
laptops = pd.read_csv("laptops/laptops_train.csv", index_col=0)
print(laptops.shape)
# Distinct values per column, counting NaN as a value of its own.
print(laptops.nunique(dropna=False))
laptops = laptops.drop(columns="Operating System Version")

# Pairwise Gower distances over the feature columns. Cluster columns written by
# an earlier run of this cell are excluded, so re-running the cell does not
# feed the previous labels back into the distance computation.
# NOTE(review): R=(25, 75) is passed through to gower_matrix unchanged; it looks
# like a percentile pair for range scaling -- confirm against gower_matrix.
laptop_features = laptops.drop(columns=["gower_dbscan", "label_count"], errors="ignore")
matrix3 = gower_matrix(laptop_features.to_numpy(), R=(25, 75), chunksize=20)

# Grid search over eps = 0.001 .. 1.000 with min_samples fixed at 1. Each
# result is indexed as (params, score) below; the highest score wins.
samples = [{"eps": z / 1000, "min_samples": 1} for z in range(1, 1001)]
results3 = process_map(partial(do_it, matrix=matrix3), samples, chunksize=20)
best_params3 = max(results3, key=lambda z: z[1])
print(best_params3)

# Refit with the winning parameters on the precomputed distance matrix.
laptops["gower_dbscan"] = DBSCAN(metric="precomputed", **best_params3[0]).fit_predict(matrix3)

# Histogram of cluster sizes: (distinct sizes, number of clusters with that size).
# Indexing [1] avoids assigning to `_`, which IPython uses for the last output.
counts3 = np.unique(laptops["gower_dbscan"], return_counts=True)[1]
print(np.unique(counts3, return_counts=True))

# Cluster size per row. Missing values are never removed from this frame, so
# counting non-null values of the first column could under-report a cluster;
# the group size is used instead.
laptops["label_count"] = laptops.groupby("gower_dbscan")["gower_dbscan"].transform("size")
laptops = laptops.sort_values(["label_count", "gower_dbscan"], ascending=[False, True])
laptops


(977, 12)
Model Name                  488
Category                      6
Screen Size                  18
Screen                       38
CPU                         106
RAM                           8
 Storage                     36
GPU                          98
Operating System              7
Operating System Version      5
Weight                      166
Price                       639
dtype: int64


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/977 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

({'eps': 0.278, 'min_samples': 1}, 0.46388359112994826)
(array([  1,   2,   3,   4,   5,   6,   7,   8,  11,  12,  17,  25,  49,
        89, 207]), array([283,  51,  14,   4,   5,  10,   1,   4,   1,   1,   1,   1,   1,
         1,   1]))


Unnamed: 0_level_0,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Weight,Price,gower_dbscan,label_count
Manufacturer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
HP,250 G6,Notebook,"15.6""",Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,5112900.00,2,207
Acer,Aspire 3,Notebook,"15.6""",1366x768,AMD A9-Series 9420 3GHz,4GB,500GB HDD,AMD Radeon R5,Windows,2.1kg,3556800.00,2,207
HP,250 G6,Notebook,"15.6""",1366x768,Intel Core i5 7200U 2.5GHz,4GB,500GB HDD,Intel HD Graphics 620,No OS,1.86kg,3502558.80,2,207
HP,250 G6,Notebook,"15.6""",Full HD 1920x1080,Intel Core i3 6006U 2GHz,4GB,500GB HDD,Intel HD Graphics 520,No OS,1.86kg,3067651.08,2,207
Dell,Inspiron 3567,Notebook,"15.6""",Full HD 1920x1080,Intel Core i3 6006U 2GHz,4GB,256GB SSD,AMD Radeon R5 M430,Windows,2.2kg,4436218.80,2,207
...,...,...,...,...,...,...,...,...,...,...,...,...,...
Dell,XPS 13,Ultrabook,"13.3""",Quad HD+ 3200x1800,Intel Core i7 6500U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 520,Windows,1.3kg,11275056.00,374,1
Acer,Aspire F5-573G-510L,Notebook,"15.6""",Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,12GB,128GB SSD + 1TB HDD,Nvidia GeForce GTX 950M,Windows,2.4kg,8972028.00,375,1
Dell,Latitude E7470,Ultrabook,"14.0""",Touchscreen 2560x1440,Intel Core i7 6600U 2.6GHz,8GB,256GB SSD,Intel HD Graphics 520,Windows,1.5kg,17454818.16,376,1
HP,Probook 450,Notebook,"15.6""",IPS Panel Full HD 1920x1080,Intel Core i5 7200U 2.70GHz,8GB,128GB SSD + 1TB HDD,Nvidia GeForce 930MX,Windows,2.04kg,8705268.00,377,1
