# Pairwise Adjusted Mutual Information

# Experiments on synthetic data

This notebook presents the experiments on synthetic data.

## Imports

In [None]:
import numpy as np
from matplotlib import pyplot as plt
from scipy import sparse
import time

In [None]:
from sklearn.metrics.cluster import contingency_matrix
from sklearn.metrics import mutual_info_score
from sklearn.metrics.cluster._expected_mutual_info_fast import expected_mutual_information

## Pairwise adjustment

In [None]:
def get_adjusted_mutual_info_pair(contingency, n_samples):
    """Compute the pairwise adjusted mutual information.

    Parameters
    ----------
    contingency: np.ndarray
        Contingency matrix
    n_samples : int
        Number of samples

    Returns
    -------
    float
        Sum of the two adjustment terms, each divided by ``n_samples ** 2``.
    """
    def scaled_log(values):
        # (v / n) * log(v / n), elementwise; callers only pass positive values.
        ratio = values / n_samples
        return ratio * np.log(ratio)

    # Row / column totals as float column / row vectors, so that they
    # broadcast against the full contingency matrix.
    row_totals = contingency.sum(axis=1)[:, np.newaxis].astype(float)
    col_totals = contingency.sum(axis=0)[np.newaxis, :].astype(float)
    counts = contingency.ravel()
    positive = counts > 0
    above_one = counts > 1

    # Term of each cell at its current count (zero for empty cells).
    current = np.zeros(len(counts))
    current[positive] = scaled_log(counts[positive])

    # First term: cell count decreased by one (zero when the count is <= 1).
    decreased = np.zeros(len(counts))
    decreased[above_one] = scaled_log(counts[above_one] - 1)
    weights_down = counts * (contingency - row_totals - col_totals + n_samples).ravel()
    first_term = np.sum(weights_down * (current - decreased)) / n_samples ** 2

    # Second term: cell count increased by one.
    increased = scaled_log(counts + 1)
    weights_up = ((row_totals - contingency) * (col_totals - contingency)).ravel()
    second_term = np.sum(weights_up * (current - increased)) / n_samples ** 2

    return first_term + second_term

## Full adjustment

In [None]:
def get_adjusted_mutual_info_exact(contingency, n_samples):
    """Return adjusted mutual information (without normalization).

    The value is the mutual information of the contingency matrix minus its
    expected mutual information.

    Parameters
    ----------
    contingency: np.ndarray
        Contingency matrix
    n_samples : int
        Number of samples

    Returns
    -------
    float
        Mutual information minus expected mutual information.
    """
    # The label arguments are ignored when a contingency matrix is supplied.
    # Pass None explicitly instead of the bare name `_`, which only exists
    # inside an interactive session and may hold an arbitrary object.
    mi = mutual_info_score(None, None, contingency=contingency)
    emi = expected_mutual_information(contingency, n_samples)
    return mi - emi

## Symmetric clustering

In [None]:
# Reference clustering: n samples split into consecutive clusters of equal size.
n = 100
cluster_size = 10
labels = np.floor_divide(np.arange(n), cluster_size)

In [None]:
# Compare the reference labels with regular clusterings of every cluster size
# from 1 to 99, recording both adjusted scores for each size.
cluster_size_range = np.arange(1, 100, dtype=int)
adjusted_mutual_info_pair = []
adjusted_mutual_info = []

for size in cluster_size_range:
    candidate_labels = np.arange(n) // size
    table = contingency_matrix(labels, candidate_labels)
    pair_score = get_adjusted_mutual_info_pair(table, len(labels))
    exact_score = get_adjusted_mutual_info_exact(table, len(labels))
    adjusted_mutual_info_pair.append(pair_score)
    adjusted_mutual_info.append(exact_score)

In [None]:
# Full adjustment as a function of the cluster size.
peak = np.max(adjusted_mutual_info)
plt.plot(cluster_size_range, adjusted_mutual_info, lw=2, c='b')
# Short vertical mark, drawn just above the curve's maximum, at the cluster
# size of the reference labels.
plt.vlines(cluster_size, ymin=1.05 * peak, ymax=1.1 * peak, lw=2)
plt.xlabel('Cluster size')
plt.ylabel('Mutual information')
plt.yticks([0, 1, 2])
plt.show()

In [None]:
# Pairwise adjustment as a function of the cluster size.
peak = np.max(adjusted_mutual_info_pair)
plt.plot(cluster_size_range, adjusted_mutual_info_pair, lw=2, c='b')
# Short vertical mark, drawn just above the curve's maximum, at the cluster
# size of the reference labels.
plt.vlines(cluster_size, ymin=1.05 * peak, ymax=1.1 * peak, lw=2)
plt.xlabel('Cluster size')
plt.ylabel('Mutual information')
plt.yticks([0, 0.01, 0.02, 0.03])
plt.show()

## Random clustering

In [None]:
def get_random_labels(n, n_clusters):
    """Draw n random labels in ``range(n_clusters)``.

    The cluster probabilities are themselves random: uniform weights
    normalized to sum to one. Uses NumPy's global random state.
    """
    weights = np.random.rand(n_clusters)
    probabilities = weights / np.sum(weights)
    return np.random.choice(n_clusters, size=n, p=probabilities)

In [None]:
# For random clusterings, measure how often the pairwise adjustment ranks two
# candidate clusterings in the same order as the full adjustment.
np.random.seed(0)
n = 100
n_clusters = 2
n_exp = 10
n_tests = 1000

results = []

for experiment in range(n_exp):
    n_agreements = 0
    for trial in range(n_tests):
        # The three draws must stay in this order to keep the seeded
        # random sequence unchanged.
        labels = get_random_labels(n, n_clusters)
        labels1 = get_random_labels(n, n_clusters)
        labels2 = get_random_labels(n, n_clusters)
        contingency1 = contingency_matrix(labels, labels1)
        contingency2 = contingency_matrix(labels, labels2)
        exact1 = get_adjusted_mutual_info_exact(contingency1, n)
        exact2 = get_adjusted_mutual_info_exact(contingency2, n)
        pair1 = get_adjusted_mutual_info_pair(contingency1, n)
        pair2 = get_adjusted_mutual_info_pair(contingency2, n)
        order = exact1 > exact2
        order_pair = pair1 > pair2
        if order == order_pair:
            n_agreements += 1
    # Fraction of tests in which both adjustments agree on the ordering.
    results.append(n_agreements / n_tests)

In [None]:
# Mean, over the experiments, of the per-experiment agreement fraction.
np.mean(results)

In [None]:
# Standard deviation, over the experiments, of the agreement fraction.
np.std(results)

## Computation times

In [None]:
# Numbers of samples used for the timing benchmark (roughly log-spaced).
n_range = [
    100,
    300,
    1_000,
    3_000,
    10_000,
    30_000,
    100_000,
    300_000,
    1_000_000,
    3_000_000,
    10_000_000,
]

In [None]:
# Time both adjustments on contingency matrices built from n samples,
# recording the mean and standard deviation of the run times for each n.
n_clusters = 10
n_runs = 100000

mean_exact = []
mean_pair = []
std_exact = []
std_pair = []

for n in n_range:
    print(n)

    times_exact = []
    times_pair = []

    # At most 5 repetitions per size, fewer once n exceeds n_runs / 4.
    for t in range(min(int(n_runs / n) + 1, 5)):
        labels = np.arange(n) % n_clusters
        labels_ = get_random_labels(n, n_clusters)
        contingency = contingency_matrix(labels, labels_)
        # perf_counter is a monotonic, high-resolution clock meant for
        # measuring short durations; time.time() is neither.
        t0 = time.perf_counter()
        get_adjusted_mutual_info_exact(contingency, n)
        t1 = time.perf_counter()
        times_exact.append(t1 - t0)
        t0 = time.perf_counter()
        get_adjusted_mutual_info_pair(contingency, n)
        t1 = time.perf_counter()
        times_pair.append(t1 - t0)

    mean_exact.append(np.mean(times_exact))
    mean_pair.append(np.mean(times_pair))
    std_exact.append(np.std(times_exact))
    std_pair.append(np.std(times_pair))

In [None]:
# Log-log plot of the mean run time of each method, with the standard
# deviation as error bars.
plt.xscale('log')
plt.yscale('log')
timing_series = [
    (mean_exact, std_exact, 'Full adjustement', 'b'),
    (mean_pair, std_pair, 'Pairwise adjustement', 'r'),
]
for means, stds, series_label, color in timing_series:
    plt.errorbar(n_range, means, yerr=stds, label=series_label, linestyle='none', marker='.', c=color, lw=3)
plt.legend()
plt.xlabel('Number of samples')
plt.ylabel('Computation time (s)')
plt.show()