In [None]:
import json
from pathlib import Path
from itertools import combinations

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

In [None]:
# Repository root: the parent of the directory the notebook kernel runs in.
PROJECT_ROOT = Path.cwd().parent

In [None]:
# Experiment selection: these three values pick which encoding directory and
# which split's files are read by the cells below.
dataset = 'compas'
split = 'train'
gamma = 1.0

# Encodings live under <PROJECT_ROOT>/encodings/<dataset>_gamma_<gamma>/.
enc_dir = PROJECT_ROOT.joinpath('encodings', f'{dataset}_gamma_{gamma}')

In [None]:
# Load the column-name mapping; the context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open(enc_dir / 'column_ids_undisc.json') as column_ids_file:
    column_ids = json.load(column_ids_file)

# The keys of the mapping (in file order) are used as the CSV column names.
column_names = list(column_ids)

# "t" encodings for the selected split: space-separated feature table (no
# header row, since explicit names are supplied), plus per-row labels and sizes.
t_features = pd.read_csv(enc_dir / f'{split}_t_features.csv', names=column_names, sep=' ')
t_labels = np.loadtxt(enc_dir / f'{split}_t_labels.csv')
t_sizes = np.loadtxt(enc_dir / f'{split}_t_sizes.csv')  # used as scatter marker sizes below

# "w" encodings for the selected split, same layout as the "t" files.
w_features = pd.read_csv(enc_dir / f'{split}_w_features.csv', names=column_names, sep=' ')
w_labels = np.loadtxt(enc_dir / f'{split}_w_labels.csv')
w_sizes = np.loadtxt(enc_dir / f'{split}_w_sizes.csv')  # used as scatter marker sizes below

In [None]:
# Number of clusters shared by both K-Means fits (also shown in the plot title).
n_clusters = 8

# Fit one independently seeded K-Means model per feature table, t first, then w.
kmeans_t_features, kmeans_w_features = (
    KMeans(n_clusters=n_clusters, random_state=0).fit(feature_table)
    for feature_table in (t_features, w_features)
)

In [None]:
# Embed both feature tables with t-SNE and show them side by side, coloured by
# K-Means cluster id and sized by the per-row sizes.
# `perplexity`, `t_features_sne` and `w_features_sne` remain defined after the
# loop and are reused by the following cell.
for perplexity in [15]:
    # Seed t-SNE (as is already done for KMeans) so the embedding, and every
    # plot derived from it, is reproducible across kernel restarts.
    t_features_sne = TSNE(perplexity=perplexity, random_state=0).fit_transform(t_features)
    w_features_sne = TSNE(perplexity=perplexity, random_state=0).fit_transform(w_features)

    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 7.5))

    # NOTE(review): both panels are coloured with the clusters fitted on
    # w_features; kmeans_t_features is fitted earlier but not used in the
    # visible cells. This may be deliberate (same colour for paired rows) or a
    # copy-paste slip — confirm before relying on the left panel's colours.
    ax[0].scatter(
        t_features_sne[:, 0], t_features_sne[:, 1], s=t_sizes, c=kmeans_w_features.labels_
    )
    ax[1].scatter(
        w_features_sne[:, 0], w_features_sne[:, 1], s=w_sizes, c=kmeans_w_features.labels_
    )
    ax[0].set_title('African-American')
    ax[1].set_title('Caucasian')

    fig.suptitle(f't-SNE (perplexity={perplexity}): {n_clusters} Clusters')
    plt.show()

In [None]:
# One panel per label value: overlay the t and w embeddings of the rows with
# that label, and join row i of t to row i of w whenever both carry the label.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 7.5))

n_points = t_features_sne.shape[0]

for label in (0, 1):
    panel = ax[label]
    t_mask = t_labels == label
    w_mask = w_labels == label

    panel.scatter(
        t_features_sne[t_mask][:, 0],
        t_features_sne[t_mask][:, 1],
        s=t_sizes[t_mask],
        c='tab:blue', label='African-American'
    )
    panel.scatter(
        w_features_sne[w_mask][:, 0],
        w_features_sne[w_mask][:, 1],
        s=w_sizes[w_mask],
        c='tab:orange', label='Caucasian'
    )
    panel.set_title(f'Label {label}')

    for idx in range(n_points):
        # Only connect pairs whose t-side and w-side labels both match this panel.
        if t_labels[idx] != label or w_labels[idx] != label:
            continue
        t_point = t_features_sne[idx]
        w_point = w_features_sne[idx]
        panel.plot(
            [t_point[0], w_point[0]],
            [t_point[1], w_point[1]],
            c='k', alpha=0.1
        )

# Collapse repeated legend entries to one handle per label text for a single
# figure-level legend.
handles, labels = plt.gca().get_legend_handles_labels()
unique_entries = {text: handle for text, handle in zip(labels, handles)}
fig.legend(unique_entries.values(), unique_entries.keys())
# `perplexity` is the value left over from the t-SNE cell's loop.
fig.suptitle(f'Perplexity {perplexity}')
plt.show()

In [None]:
# For every unordered pair of the four listed columns, scatter the t and w
# feature values against each other and join row i of t to row i of w.
axis_pairs = combinations(['age', 'diff_custody', 'diff_jail', 'priors_count'], r=2)

for x_axis, y_axis in axis_pairs:

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(7.5, 7.5))

    # Pull the two columns of each table once per figure.
    t_x, t_y = t_features[x_axis], t_features[y_axis]
    w_x, w_y = w_features[x_axis], w_features[y_axis]

    ax.scatter(t_x, t_y, s=t_sizes, c='tab:blue', label='African-American')
    ax.scatter(w_x, w_y, s=w_sizes, c='tab:orange', label='Caucasian')

    # Faint connector between the t and w versions of each row.
    for idx in range(t_features.shape[0]):
        ax.plot(
            [t_x[idx], w_x[idx]],
            [t_y[idx], w_y[idx]],
            c='k', alpha=0.1
        )

    ax.set(xlabel=x_axis, ylabel=y_axis)
    fig.legend()
    plt.show()