In [80]:
import pandas as pd
import numpy as np

from scipy import sparse

In [81]:
def dissim(X, x):
    return np.sum(X != x, axis=1)


def summed_dissim(X, x):
    return np.sum(dissim(X, x))


def density(X, x):
    N, m = X.shape
    return 1 - summed_dissim(X, x) / (m * N)

In [82]:
def test_argmin_argmax(X):

    if not sparse.issparse(X):

        densities = np.array([density(X, x) for x in X])
        summed_dissims = np.array([summed_dissim(X, x) for x in X])

        idx = np.argmax(densities)
        mode = X[idx]

        if np.argmax(densities) == np.argmin(summed_dissims):

            mult_dens_diss = np.delete(dissim(X, mode), idx, 0) * np.delete(
                densities, idx, 0
            )
            div_sum_diss = np.delete(summed_dissims, idx, 0) / np.delete(
                dissim(X, mode), idx, 0
            )

            return np.argmax(mult_dens_diss) == np.argmin(div_sum_diss)

        return "Maximal density not minimal summed dissimilarity"
    return "Dataset is sparse"


def average_summed_dissim(X):
    return np.mean([summed_dissim(X, x) for x in X])

# Toy vehicle dataset

In [83]:
df = pd.read_csv("../../data/toy_car_example.csv")

X = df.values

N, m = X.shape
m, N, m * N, average_summed_dissim(X)

(6, 10, 60, 37.4)

In [84]:
test_argmin_argmax(X)

True

# Zoo animal dataset

In [85]:
df = pd.read_csv("../../data/zoo.csv")

X = df[[col for col in df.columns if col not in ["animal_name", "class_type"]]].values

N, m = X.shape
m, N, m * N, average_summed_dissim(X)

(16, 101, 1616, 650.3960396039604)

In [86]:
test_argmin_argmax(X)

True

# Small soybean dataset

In [87]:
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/soybean/soybean-small.data",
    header=None,
)
df = df.drop(35, axis=1)

X = df.values

N, m = X.shape
m, N, m * N, average_summed_dissim(X)

(35, 47, 1645, 494.0851063829787)

In [88]:
test_argmin_argmax(X)

False

# Large soybean dataset

In [89]:
df = pd.read_csv(
    "https://raw.githubusercontent.com/selva86/datasets/master/Soybean.csv"
)
df = df.drop("Class", axis=1)

X = df.values

N, m = X.shape
m, N, m * N, average_summed_dissim(X)

(35, 683, 23905, 12220.361639824305)

In [90]:
test_argmin_argmax(X)

True

# Iris dataset

In [91]:
df = pd.read_csv("../../data/Iris.csv")
df.drop(["Id", "Species"], axis=1)

X = df.values

N, m = X.shape
m, N, m * N, average_summed_dissim(X)

(6, 150, 900, 813.84)

In [92]:
test_argmin_argmax(X)

True