# Adult Income dataset
Source: https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data

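Compare AntClust against K-means on the (mostly categorical) Adult Income dataset, using the first 1000 rows and the `income` column as the reference labelling.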

In [21]:
from sklearn.cluster import KMeans
import pandas as pd
from sklearn.metrics import jaccard_score
from sklearn.preprocessing import LabelEncoder

# Location of the raw Adult Income data on the UCI repository.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

# Names assigned to the 15 columns of the raw file (it is read with header=None).
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]

# Columns kept for the experiment; 'income' is last so it can be split off by position.
selected_cols = [
    'age', 'workclass', 'education-num', 'marital-status',
    'occupation', 'relationship', 'race', 'sex',
    'hours-per-week', 'native-country', 'income',
]

# Read the CSV and immediately narrow it to the selected columns.
adult_data = pd.read_csv(
    url,
    names=column_names,
    header=None,
    skipinitialspace=True,  # skip spaces after the delimiter
)[selected_cols]

# Number of leading rows used in the experiment.
subset = 1000
label_encoder = LabelEncoder()

first_rows = adult_data.iloc[:subset]

# apply() calls fit_transform once per column, so the shared encoder is refit
# for every feature column and, below, one last time for the income labels.
# NOTE(review): `:-2` drops both 'native-country' and 'income' from the
# features -- confirm that excluding 'native-country' is intended.
adult_encoded = first_rows.iloc[:, :-2].apply(label_encoder.fit_transform)

# Reference labels: the income column, integer-encoded.
adult_label = first_rows.iloc[:, -1]
binary_labels = label_encoder.fit_transform(adult_label)

0      <=50K
1      <=50K
2      <=50K
3      <=50K
4      <=50K
       ...  
995     >50K
996    <=50K
997     >50K
998    <=50K
999     >50K
Name: income, Length: 1000, dtype: object

# K-Means

In [22]:
# Run K-means on the encoded features for k = 2, 3 and 4, keeping each
# labelling together with its k as a (k, labels) pair.
cluster_results = []
for k in (2, 3, 4):
    # fit() returns the fitted estimator, so construction and fitting chain.
    kmeans = KMeans(n_clusters=k, random_state=42).fit(adult_encoded)
    cluster_results.append((k, kmeans.labels_))

# AntClust with Jaccard score as similarity

In [23]:
# ----------------------
#       imports
# ----------------------
# import opencv
import cv2 as cv
# matplotlib
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances

# make the AntClust directory importable
import sys
sys.path.append("../AntClust")
# import AntClust
from AntClust import AntClust
from importlib import reload
import distance_classes
reload(distance_classes)
# import the rule set
from rules import labroche_rules

# Build the AntClust input: one entry per row of the encoded frame, each a
# one-element list holding that row as a pandas Series.
# NOTE(review): presumably one list element per similarity function in
# `f_sim` -- confirm against the AntClust API.
ant = [[adult_encoded.iloc[row_pos]] for row_pos in range(len(adult_encoded))]

# A single similarity object is used, paired with the Labroche rule set.
f_sim = [distance_classes.similarity_categorical()]
ant_clust = AntClust(f_sim, labroche_rules())

# Cluster the ants and read back the resulting cluster assignment.
ant_clust.fit(ant)
clusters_found = ant_clust.get_clusters()

AntClust: phase 1 of 3 -> meeting ants
Meeting 75000 / 75000
Meeting 67500 / 75000
Meeting 60000 / 75000
Meeting 52500 / 75000
Meeting 45000 / 75000
Meeting 37500 / 75000
Meeting 30000 / 75000
Meeting 22500 / 75000
Meeting 15000 / 75000
Meeting 7500 / 75000
AntClust: phase 2 of 3 -> shrink nests
AntClust: phase 3 of 3 -> reassign ants


# Metrics

In [24]:
from sklearn import metrics
import pandas as pd
def _score_clustering(labels_true, labels_pred):
    """Score one clustering against the reference labels.

    Parameters
    ----------
    labels_true : array-like
        Reference labels, one per sample.
    labels_pred : array-like
        Cluster assignment to evaluate, one per sample.

    Returns
    -------
    dict
        Maps each metric's display name to its score, in table column order.
    """
    return {
        'Homogeneity': metrics.homogeneity_score(labels_true, labels_pred),
        'Completeness': metrics.completeness_score(labels_true, labels_pred),
        'V-measure': metrics.v_measure_score(labels_true, labels_pred),
        'Adjusted Rand-Index': metrics.adjusted_rand_score(labels_true, labels_pred),
    }


# One (row label, cluster labels) pair per clustering, in display order:
# AntClust first, then every K-means run.
labelled_runs = [("AntClust (Jaccard similarity)", clusters_found)]
labelled_runs += [(f"K-means (k={k})", k_label) for k, k_label in cluster_results]

# Build the table in one go from a list of records instead of growing it
# with pd.concat row by row.
df = pd.DataFrame(
    [_score_clustering(binary_labels, run_labels) for _, run_labels in labelled_runs],
    index=[run_name for run_name, _ in labelled_runs],
)

df


Unnamed: 0,Homogeneity,Completeness,V-measure,Adjusted Rand-Index
AntClust (Jaccard similarity),0.174465,0.039372,0.064245,0.033928
K-means (k=2),0.062004,0.048914,0.054687,0.073171
K-means (k=3),0.10354,0.053535,0.070578,0.03525
K-means (k=4),0.101109,0.043136,0.060473,0.024432
