In [59]:
import numpy as np
import pandas as pd

In [89]:
def get_sample(arr, n_iter=None, sample_size=1000):
    """ Get random sample from arr. 
    
    Parameters
    ----------
    arr : np.array
        Array from which to take sample
    n_iter : int
        Iteration number
    sample_size : int
        Size of sample from arr
    
    Returns
    -------
    sample : np.array
        Sample from arr of length sample_size
    """
    np.random.seed(n_iter)
    m = arr.shape[1]

    start_idx = (n_iter * sample_size) % m
    if start_idx + sample_size >= m:
        np.random.shuffle(arr)

    return arr[start_idx:start_idx+sample_size]


def collect_samples(arr, sample_size, n_samples):
    """ Collect a number of samples from arr.
    
    Parameters
    ----------
    arr : np.array
        Array from which to take samples
    sample_size : int
        Size of sample from arr
    n_samples : int
        Number of samples to take from arr
    
    Returns
    -------
    samples : np.array
        Sample matrix with shape (n_samples, sample_size)
    """
    m = arr.shape[1]
    samples = np.zeros((n_samples, sample_size, m), np.int64)

    for sample_n in range(n_samples):
        sample = get_sample(arr, n_iter=sample_n,
                            sample_size=sample_size)

        samples[sample_n] = sample

    return samples


def dissim(X, x):
    
    return np.sum(X != x, axis=1)


def build_dissim_matrix(X, Y):

    dissim_matrix = np.zeros((len(X), len(Y)))
    
    for i, x in enumerate(X):
        dissim_matrix[i, :] = dissim(Y, x)
    
    return dissim_matrix

def get_adjacency_matrices(dissim_matrix, beta_max, res):
    
    adjacency_matrices = []
    for beta in np.linspace(0, beta_max, res):
        adjacency_matrix = np.where(dissim_matrix <= beta, 1, 0)
        adjacency_matrices.append(adjacency_matrix)

    return adjacency_matrices

In [90]:
df = pd.read_csv('zoo.csv')

df = df[[col for col in df.columns if col not in ['animal_name', 'class_type']]]

catsize = df[df.catsize == 1].drop('catsize', axis=1).values
not_catsize = df[df.catsize == 0].drop('catsize', axis=1).values

In [91]:
%%time

n_samples = 10000

cat_samples = collect_samples(catsize, sample_size=25, n_samples=n_samples)
notcat_samples = collect_samples(not_catsize, sample_size=25, n_samples=n_samples)

CPU times: user 1.53 s, sys: 33.1 ms, total: 1.56 s
Wall time: 1.57 s


In [101]:
%%time

beta_max = 0.33
res = 4
results = []
for cat_sample, notcat_sample in zip(cat_samples, notcat_samples):
    dissim_matrix = build_dissim_matrix(cat_sample,
                                        notcat_sample)
    adjacency_matrices = get_adjacency_matrices(dissim_matrix,
                                                beta_max, res)
    results.append(adjacency_matrices)

results

CPU times: user 2.65 s, sys: 154 ms, total: 2.8 s
Wall time: 2.8 s


In [102]:
np.mean(results), np.std(results)

(0.0237936, 0.15240559241392698)