In [1]:
import numpy as np
import pandas as pd

In [10]:
def get_sample(arr, n_iter=None, sample_size=1000):
    """ Get a sample window from arr, reshuffling arr when the window overflows.

    The window for iteration ``n_iter`` starts at
    ``(n_iter * sample_size) % len(arr)``. If that window would run past
    the end of ``arr``, ``arr`` is shuffled IN PLACE and the first
    ``sample_size`` elements of the shuffled array are returned instead.

    Side effects: reseeds NumPy's global RNG with ``n_iter`` on every call,
    and may shuffle ``arr`` in place.

    Parameters
    ----------
    arr : np.array
        Array from which to take sample (may be shuffled in place)
    n_iter : int, optional
        Iteration number; seeds the RNG and selects the window. ``None``
        (the default) selects the window of iteration 0 and lets NumPy
        choose its own seed.
    sample_size : int
        Size of sample from arr

    Returns
    -------
    sample : np.array
        Slice of arr of length sample_size (shorter when arr holds fewer
        than sample_size elements; empty when arr is empty)
    """
    # Seeding per iteration makes the shuffle for a given n_iter repeatable.
    np.random.seed(n_iter)
    m = arr.shape[0]

    # Nothing to sample from; also avoids a modulo-by-zero below.
    if m == 0:
        return arr[:0]

    # The default n_iter=None used to crash on `None * sample_size`.
    iteration = 0 if n_iter is None else n_iter

    start_idx = (iteration * sample_size) % m
    # Strict '>' so a window ending exactly at the last element is still
    # used; only a window that truly overflows triggers a reshuffle.
    if start_idx + sample_size > m:
        np.random.shuffle(arr)
        start_idx = 0

    return arr[start_idx:start_idx + sample_size]

def collect_samples(arr, sample_size, n_samples):
    """ Collect a number of samples from arr.

    Calls ``get_sample`` once per sample, passing the sample index as
    ``n_iter``. ``get_sample`` may shuffle the array it is given in place,
    so sampling is done on a private copy and the caller's ``arr`` is left
    unmodified.

    Parameters
    ----------
    arr : np.array
        Array from which to take samples
    sample_size : int
        Size of sample from arr
    n_samples : int
        Number of samples to take from arr

    Returns
    -------
    samples : np.array
        Sample matrix with shape (n_samples, sample_size) and dtype object
    """
    # np.array copies by default; the in-place shuffles performed by
    # get_sample then only ever touch this working copy.
    pool = np.array(arr)
    samples = np.empty((n_samples, sample_size), object)

    for sample_n in range(n_samples):
        samples[sample_n] = get_sample(pool, n_iter=sample_n,
                                       sample_size=sample_size)

    return samples


def build_dissim_matrix(X, Y):
    """ Build the matrix of pairwise mismatch counts between rows of X and Y.

    Parameters
    ----------
    X : np.array
        2-D array; each row is compared against every row of Y
    Y : np.array
        2-D array; rows are assumed to have the same length as rows of X

    Returns
    -------
    dissim_matrix : np.array
        Float matrix with shape (len(X), len(Y)); entry (i, j) is the
        number of positions at which X[i] and Y[j] differ
    """
    n_rows, n_cols = len(X), len(Y)
    dissim_matrix = np.zeros((n_rows, n_cols))

    for row_idx, row in enumerate(X):
        # Element-wise comparison of this row against every row of Y,
        # then count the differing positions per row of Y.
        mismatches = Y != row
        dissim_matrix[row_idx, :] = np.sum(mismatches, axis=1)

    return dissim_matrix

def get_adjacency_matrices(dissim_matrix, beta_max, res):
    """ Threshold a dissimilarity matrix at evenly spaced cut-offs.

    Parameters
    ----------
    dissim_matrix : np.array
        Matrix of dissimilarity values
    beta_max : float
        Largest threshold; thresholds run from 0 to beta_max inclusive
    res : int
        Number of evenly spaced thresholds

    Returns
    -------
    adjacency_matrices : list of np.array
        One 0/1 matrix per threshold, in increasing threshold order; an
        entry is 1 where the dissimilarity is <= the threshold, else 0
    """
    thresholds = np.linspace(0, beta_max, res)

    adjacency_matrices = []
    for beta in thresholds:
        within_threshold = dissim_matrix <= beta
        adjacency_matrices.append(np.where(within_threshold, 1, 0))

    return adjacency_matrices

In [13]:
%%time

# Population to sample from: the integers 0..29999.
arr = np.arange(30000)
n_samples = 10000
sample_size = 90

arr_samples = collect_samples(arr, sample_size, n_samples)

# Sanity check: one row per sample, one column per drawn element.
assert arr_samples.shape == (n_samples, sample_size)

CPU times: user 113 ms, sys: 22 ms, total: 135 ms
Wall time: 134 ms
