This notebook compares the size of serialized (pickled) sklearn `NearestNeighbors` models depending on the neighbor-search algorithm used (`brute`, `kd_tree`, or `ball_tree`).

In [1]:
import itertools
import numpy as np
import os
import pandas as pd
import pickle

from sklearn.neighbors import NearestNeighbors

In [2]:
%%time

# Experiment grid: 10^3..10^5 samples crossed with 10^1..10^3 features.
n_samples_range = np.logspace(3, 5, num=3, base=10, dtype=int)
n_features_range = np.logspace(1, 3, num=3, base=10, dtype=int)

algorithms = ['brute', 'kd_tree', 'ball_tree']
temp_path = 'temp_model.pickle'  # scratch file, overwritten for every model
results = []  # one dict per (n_samples, n_features) pair, one key per algorithm

# Local seeded generator: a fresh-kernel re-run fits the models on the same
# data without touching NumPy's global random state.
rng = np.random.RandomState(0)

try:
    for n_samples, n_features in itertools.product(n_samples_range, n_features_range):
        experiment_results = {'n_samples': n_samples, 'n_features': n_features}
        X = rng.randn(n_samples, n_features)

        for algo in algorithms:
            model = NearestNeighbors(algorithm=algo)
            model.fit(X)

            # The `with` block closes (and therefore flushes) the file before
            # it is measured, so getsize never sees a partially written pickle.
            with open(temp_path, 'wb') as model_file:
                pickle.dump(model, model_file)
            pickle_size = os.path.getsize(temp_path)

            # Bytes -> megabytes (1 MB = 1e6 bytes), rounded to two decimals.
            experiment_results[algo] = round(pickle_size / 1e6, 2)

        results.append(experiment_results)
finally:
    # Remove the scratch file even if a fit or dump failed part-way through.
    if os.path.exists(temp_path):
        os.remove(temp_path)

CPU times: user 34.8 s, sys: 2.7 s, total: 37.5 s
Wall time: 38.2 s


In [3]:
# Collect the per-experiment dicts into one table (sizes are in MB) and emit
# it as markdown source text so it can be pasted into other documents.
df = pd.DataFrame(data=results)
markdown_table = df.to_markdown()
print(markdown_table)

|    |   n_samples |   n_features |   brute |   kd_tree |   ball_tree |
|---:|------------:|-------------:|--------:|----------:|------------:|
|  0 |        1000 |           10 |    0.08 |      0.1  |        0.1  |
|  1 |        1000 |          100 |    0.8  |      0.91 |        0.86 |
|  2 |        1000 |         1000 |    8    |      9.02 |        8.52 |
|  3 |       10000 |           10 |    0.8  |      0.98 |        0.94 |
|  4 |       10000 |          100 |    8    |      8.91 |        8.51 |
|  5 |       10000 |         1000 |   80    |     88.27 |       84.19 |
|  6 |      100000 |           10 |    8    |      9.59 |        9.26 |
|  7 |      100000 |          100 |   80    |     87.48 |       84.21 |
|  8 |      100000 |         1000 |  800    |    866.45 |      833.69 |
