Skip to content

Commit

Permalink
Cleanup clustering code (#3030)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: #3030

Added default arguments to the .h file (for some reason I forgot this file when migrating default args).
Logging a hash value in MatrixStats, useful to check if two runs really really run on the same matrix...

Reviewed By: pemazare

Differential Revision: D48834343

fbshipit-source-id: 7c1948464e66ada1f462f4486f7cf3159bbf9dfd
  • Loading branch information
mdouze authored and facebook-github-bot committed Aug 31, 2023
1 parent 3888f9b commit 5c4bd3f
Show file tree
Hide file tree
Showing 5 changed files with 72 additions and 71 deletions.
14 changes: 0 additions & 14 deletions faiss/Clustering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,20 +27,6 @@

namespace faiss {

ClusteringParameters::ClusteringParameters()
: niter(25),
nredo(1),
verbose(false),
spherical(false),
int_centroids(false),
update_index(false),
frozen_centroids(false),
min_points_per_centroid(39),
max_points_per_centroid(256),
seed(1234),
decode_block_size(32768) {}
// 39 corresponds to 10000 / 256 -> to avoid warnings on PQ tests with randu10k

// Construct for dimension d and k clusters; clustering parameters keep
// their defaults (Clustering derives from ClusteringParameters).
Clustering::Clustering(int d, int k) : d(d), k(k) {}

Clustering::Clustering(int d, int k, const ClusteringParameters& cp)
Expand Down
52 changes: 31 additions & 21 deletions faiss/Clustering.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
* LICENSE file in the root directory of this source tree.
*/

// -*- c++ -*-
/** Implementation of k-means clustering with many variants. */

#ifndef FAISS_CLUSTERING_H
#define FAISS_CLUSTERING_H
Expand All @@ -19,25 +19,35 @@ namespace faiss {
* constructor of the Clustering object.
*/
struct ClusteringParameters {
    /// number of clustering iterations
    int niter = 25;
    /// redo clustering this many times and keep the clusters with the best
    /// objective
    int nredo = 1;

    bool verbose = false;
    /// whether to normalize centroids after each iteration (useful for inner
    /// product clustering)
    bool spherical = false;
    /// round centroids coordinates to integer after each iteration?
    bool int_centroids = false;
    /// re-train index after each iteration?
    bool update_index = false;

    /// Use the subset of centroids provided as input and do not change them
    /// during iterations
    bool frozen_centroids = false;
    /// If fewer than this number of training vectors per centroid are provided,
    /// writes a warning. Note that fewer than 1 point per centroid raises an
    /// exception.
    int min_points_per_centroid = 39;
    /// to limit size of dataset, otherwise the training set is subsampled
    int max_points_per_centroid = 256;
    /// seed for the random number generator
    int seed = 1234;

    /// when the training set is encoded, batch size of the codec decoder
    size_t decode_block_size = 32768;
};

struct ClusteringIterationStats {
Expand Down Expand Up @@ -94,7 +104,7 @@ struct Clustering : ClusteringParameters {
* to decode the input vectors.
*
* @param codec codec used to decode the vectors (nullptr =
* vectors are in fact floats) *
* vectors are in fact floats)
*/
void train_encoded(
idx_t nx,
Expand Down
39 changes: 12 additions & 27 deletions faiss/MatrixStats.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <stdarg.h> /* va_list, va_start, va_arg, va_end */

#include <faiss/utils/utils.h>
#include <inttypes.h>
#include <cmath>
#include <cstdio>

Expand All @@ -21,18 +22,6 @@ namespace faiss {
* MatrixStats
*********************************************************************/

MatrixStats::PerDimStats::PerDimStats()
: n(0),
n_nan(0),
n_inf(0),
n0(0),
min(HUGE_VALF),
max(-HUGE_VALF),
sum(0),
sum2(0),
mean(NAN),
stddev(NAN) {}

void MatrixStats::PerDimStats::add(float x) {
n++;
if (std::isnan(x)) {
Expand Down Expand Up @@ -74,26 +63,22 @@ void MatrixStats::do_comment(const char* fmt, ...) {
buf += size;
}

MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
: n(n),
d(d),
n_collision(0),
n_valid(0),
n0(0),
min_norm2(HUGE_VAL),
max_norm2(0) {
MatrixStats::MatrixStats(size_t n, size_t d, const float* x) : n(n), d(d) {
std::vector<char> comment_buf(10000);
buf = comment_buf.data();
nbuf = comment_buf.size();

do_comment("analyzing %ld vectors of size %ld\n", n, d);
do_comment("analyzing %zd vectors of size %zd\n", n, d);

if (d > 1024) {
do_comment(
"indexing this many dimensions is hard, "
"please consider dimensionality reducution (with PCAMatrix)\n");
}

hash_value = hash_bytes((const uint8_t*)x, n * d * sizeof(*x));
do_comment("hash value 0x%016" PRIx64 "\n", hash_value);

size_t nbytes = sizeof(x[0]) * d;
per_dim_stats.resize(d);

Expand Down Expand Up @@ -156,7 +141,7 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x)

if (n_collision > 0) {
do_comment(
"%ld collisions in hash table, "
"%zd collisions in hash table, "
"counts may be invalid\n",
n_collision);
}
Expand All @@ -167,22 +152,22 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
max = it->second;
}
}
do_comment("vector %ld has %ld copies\n", max.first, max.count);
do_comment("vector %zd has %zd copies\n", max.first, max.count);
}

{ // norm stats
min_norm2 = sqrt(min_norm2);
max_norm2 = sqrt(max_norm2);
do_comment(
"range of L2 norms=[%g, %g] (%ld null vectors)\n",
"range of L2 norms=[%g, %g] (%zd null vectors)\n",
min_norm2,
max_norm2,
n0);

if (max_norm2 < min_norm2 * 1.0001) {
do_comment(
"vectors are normalized, inner product and "
"L2 search are equivalent\n");
"L2 search are equivalent\n");
}

if (max_norm2 > min_norm2 * 100) {
Expand Down Expand Up @@ -227,15 +212,15 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x)
do_comment("no constant dimensions\n");
} else {
do_comment(
"%ld dimensions are constant: they can be removed\n",
"%zd dimensions are constant: they can be removed\n",
n_0_range);
}

if (n_dangerous_range == 0) {
do_comment("no dimension has a too large mean\n");
} else {
do_comment(
"%ld dimensions are too large "
"%zd dimensions are too large "
"wrt. their variance, may loose precision "
"in IndexFlatL2 (use CenteringTransform)\n",
n_dangerous_range);
Expand Down
30 changes: 21 additions & 9 deletions faiss/MatrixStats.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#pragma once

#include <stdint.h>
#include <cmath>
#include <string>
#include <unordered_map>
#include <vector>
Expand All @@ -26,20 +27,31 @@ struct MatrixStats {
std::string comments;

// raw statistics
size_t n, d;
size_t n_collision, n_valid, n0;
double min_norm2, max_norm2;
size_t n = 0, d = 0;
size_t n_collision = 0;
size_t n_valid = 0;
size_t n0 = 0;
double min_norm2 = HUGE_VALF;
double max_norm2 = 0;
uint64_t hash_value = 0;

struct PerDimStats {
    /// counts of various special entries
    size_t n = 0;
    size_t n_nan = 0;
    size_t n_inf = 0;
    size_t n0 = 0;

    /// to get min/max and stddev values
    float min = HUGE_VALF;
    float max = -HUGE_VALF;
    double sum = 0;
    double sum2 = 0;

    size_t n_valid = 0;
    double mean = NAN;
    double stddev = NAN;

    void add(float x);
    void compute_mean_std();
};
Expand Down
8 changes: 8 additions & 0 deletions tests/test_build_blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,14 @@ def test_normalized(self):
print(comments)
assert 'vectors are normalized' in comments

def test_hash(self):
    # Hashing the same matrix twice must yield identical hash values:
    # re-seed the RNG so both iterations build byte-identical data.
    hashes = []
    for _ in range(2):
        rng = np.random.RandomState(123)
        data = rng.rand(40, 20).astype('float32')
        hashes.append(faiss.MatrixStats(data).hash_value)
    self.assertTrue(hashes[0] == hashes[1])


class TestScalarQuantizer(unittest.TestCase):

Expand Down

0 comments on commit 5c4bd3f

Please sign in to comment.