In [1]:
from collections import Counter

from scipy.spatial.distance import cosine
import numpy as np

In [2]:
# Experiment configuration (same values the cell previously hard-coded).
SEED = 0               # fixed seed so Restart & Run All reproduces the output
N_ITERATIONS = 20      # independent repetitions of the experiment
N_CLASSES = 20001      # rows in the class-vector table
N_DIMS = 100           # dimensionality of each class vector
N_SAMPLES = 2001       # observations drawn per iteration
ZIPF_EXPONENT = 1.7    # shape parameter of the Zipf class distribution
NOISE_STD = 0.01       # standard deviation of the additive Gaussian noise


def closest_rank(class_vectors, sample, X):
    """Return the frequency rank of the class vector closest to the mean of X.

    Parameters
    ----------
    class_vectors : array of shape (n_classes, n_dims)
        One vector per class; row i belongs to class i.
    sample : 1-D sequence of int
        Class index of every observation (indices into ``class_vectors``).
    X : array of shape (len(sample), n_dims)
        The (noisy) observations whose mean is compared with the classes.

    Returns
    -------
    int
        Position, in the list of observed classes sorted from most to least
        frequent, of the class whose vector has the smallest cosine distance
        to ``X.mean(axis=0)``. A result of 0 means the most common class is
        also the one closest to the mean vector.

    Raises
    ------
    ValueError
        If ``sample`` is empty (there is no class to rank).
    """
    labels = np.asarray(sample).tolist()
    if not labels:
        raise ValueError("sample must contain at least one observation")

    mean_vector = np.asarray(X).mean(axis=0)

    # Count the integer class indices instead of hashing whole float rows:
    # the index already identifies the class. most_common() sorts by count
    # and keeps first-seen order among ties.
    ranked_classes = [label for label, _ in Counter(labels).most_common()]

    # Cosine distance between each ranked class vector and the mean vector
    distances = [cosine(class_vectors[label], mean_vector)
                 for label in ranked_classes]

    # Plain int so the Counter key does not depend on numpy's scalar repr
    return int(np.argmin(distances))


# Local, seeded generator: reproducible without touching numpy's global state
rng = np.random.default_rng(SEED)
argmins = Counter()

for iteration in range(N_ITERATIONS):
    # Generate random vectors per class, uniform in [-1, 1)
    class_vectors = (rng.random((N_CLASSES, N_DIMS)) - 0.5) * 2

    # Generate a sample from the distribution; Zipf draws are >= 1 (so row 0
    # is never selected) and are clipped to the last valid row index
    sample = np.minimum(rng.zipf(ZIPF_EXPONENT, N_SAMPLES), N_CLASSES - 1)

    # Generate noise
    noise = rng.normal(0, NOISE_STD, (len(sample), N_DIMS))

    # Combine noise and data
    X_data = class_vectors[sample]
    X = X_data + noise

    # Take out the index of the minimum distance.
    # If the index is 0 that indicates that the closest pair of vectors
    # was the mean vector and the most common vector
    argmins[closest_rank(class_vectors, sample, X)] += 1
        
# Report how often each rank was the closest one, most frequent outcome first.
# Each line shows: row number, the rank that won (argmin), and its tally.
for position, (winning_rank, tally) in enumerate(argmins.most_common()):
    columns = (
        str(position).ljust(2),
        ': argmin:',
        str(winning_rank).ljust(3),
        ': freq:',
        str(tally).ljust(3),
    )
    print(*columns)

0  : argmin: 0   : freq: 20 
