In [6]:
import os

import matplotlib
import matplotlib.font_manager as font_manager
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Register the fonts found in the current user's font directory with matplotlib
# and select 'Droid Serif' as the default family for every figure below.
# The directory is resolved from the user's home (macOS layout) rather than from
# a hard-coded account path, so the notebook is not tied to one machine.
# NOTE(review): presumably 'Droid Serif' is installed in this directory -- confirm.
USER_FONT_DIR = os.path.expanduser('~/Library/Fonts/')
for font_path in font_manager.findSystemFonts(USER_FONT_DIR):
    font_manager.fontManager.addfont(font_path)

matplotlib.rc('font', family='Droid Serif')

In [7]:
# Benchmark dataset identifier -> label shown as the subplot title.
# Entries are listed by increasing dimensionality (the number after the "/"),
# and that order is also the order of the panels in the figure.
# Currently disabled: 'glove-25-angular' ('GloVe/25') and
# 'mnist-784-euclidean' ('MNIST/784').
DATASETS_MAPPING = dict((
    ('nytimes-16-angular', 'NYTimes/16'),
    ('glove-50-angular', 'GloVe/50'),
    ('deep-image-96-angular', 'DEEP/96'),
    ('sift-128-euclidean', 'SIFT/128'),
    ('glove-200-angular', 'GloVe/200'),
    ('msong-420', 'MSong/420'),
    ('har-561', 'HAR/561'),
    ('contriever-768', 'Contriever/768'),
    ('instructorxl-arxiv-768', 'arXiv/768'),
    ('fashion-mnist-784-euclidean', 'F-MNIST/784'),
    ('gist-960-euclidean', 'GIST/960'),
    ('openai-1536-angular', 'OpenAI/1536'),
    ('trevi-4096', 'Trevi/4096'),
    ('stl-9216', 'STL/9216'),
))

# Name of the results directory (one per benchmarked machine) that the loading
# cell reads from. Other options seen so far:
#   'ZEN4_1GB', 'ZEN4', 'ZEN3', 'GRAVITON4', 'GRAVITON3', 'M1'
arch = 'SAPPHIRE'

In [8]:
# Load the per-algorithm benchmark results for the selected `arch` and build
# `df`, the long-format table (Algorithm, Dataset, Recall, QPS) used by the
# recall-vs-QPS figure.
#
# Each frame's `algorithm` column is overwritten with the label used in the
# figure legend. `.assign` returns a new frame, so relabelling the filtered
# `pdx_bond` rows never writes into a slice of the frame read from disk
# (which would raise SettingWithCopyWarning).
linear = pd.read_csv(f'../{arch}/IVF_BRUTEFORCE_SIMD.csv').assign(algorithm='Linear-Scan-SIMD')
ads = pd.read_csv(f'../{arch}/IVF_NARY_ADSAMPLING.csv').assign(algorithm='ADSampling')
ads_simd = pd.read_csv(f'../{arch}/IVF_NARY_ADSAMPLING_SIMD.csv').assign(algorithm='ADSampling-SIMD')
pdx_ads = pd.read_csv(f'../{arch}/IVF_PDX_ADSAMPLING.csv').assign(algorithm='ADSampling-PDX')
pdx = pd.read_csv(f'../{arch}/IVF_PDX_VERTICAL_SCAN.csv').assign(algorithm='Linear-Scan-PDX')

# Only the 'pdx-bond-sec' rows of the BOND results are kept.
pdx_bond = pd.read_csv(f'../{arch}/IVF_PDX_BOND.csv')
pdx_bond = pdx_bond[pdx_bond['algorithm'] == 'pdx-bond-sec'].assign(algorithm='PDX-BOND')

# Recall values at or below this threshold are left out of the figure.
MIN_PLOTTED_RECALL = 0.70

# Only the ADSampling variants are plotted; `linear`, `pdx` and `pdx_bond` are
# loaded above but deliberately left out of this figure.
# For every (algorithm, dataset, recall) keep the smallest `avg` and the
# smallest `ivf_nprobe`, then order the rows by nprobe within each curve.
df = (
    pd.concat([ads, ads_simd, pdx_ads], ignore_index=True)
    .groupby(['algorithm', 'dataset', 'recall'])
    .agg({'avg': 'min', 'ivf_nprobe': 'min'})
    .reset_index()
    .sort_values(['algorithm', 'dataset', 'ivf_nprobe'])
)
# NOTE(review): assumes `avg` is the mean query time in milliseconds, so that
# 1000 / avg is queries per second -- confirm against the benchmark output.
df['avg'] = 1000 / df['avg']
df = df[['algorithm', 'dataset', 'recall', 'avg']]
df.columns = ['Algorithm', 'Dataset', 'Recall', 'QPS']

df = df[df['Recall'] > MIN_PLOTTED_RECALL]

FileNotFoundError: [Errno 2] No such file or directory: '../SAPPHIRE/IVF_BRUTEFORCE_SIMD.csv'

In [None]:
import seaborn as sns

# Recall-vs-QPS figure: one panel per entry of DATASETS_MAPPING (in mapping
# order), one line per algorithm present in `df`, log-scaled QPS axis.

# Line colour per algorithm label.
# NOTE(review): 'PDX-BOND' and 'ADSampling' share the same colour; they would be
# hard to tell apart if both were ever plotted in the same figure.
colors = {
    'Linear-Scan-SIMD': '#d96a6a',  # alternative: '#e06666'
    'ADSampling-PDX': '#59b872',
    'ADSampling-SIMD': '#B8A159',
    'PDX-BOND': '#5970B8',
    'ADSampling': '#5970B8',
}

# Every algorithm uses the empty dash spec (solid line); the algorithms are
# told apart by colour and marker only.
dashes = {algorithm: '' for algorithm in colors}

# Marker styles available in matplotlib, kept for reference:
# ',', '.', 'o', 'v', '^', '<', '>', '8', 's', 'p', '*', 'h', 'H', 'D', 'd', 'P', 'X'

# Panel grid. The figure size and legend anchor below were tuned for 2 x 7.
N_ROWS, N_COLS = 2, 7
if len(DATASETS_MAPPING) > N_ROWS * N_COLS:
    raise ValueError(
        f'{len(DATASETS_MAPPING)} datasets do not fit in a '
        f'{N_ROWS}x{N_COLS} grid of panels'
    )

matplotlib.rc('font', size=8)

fig, axes = plt.subplots(N_ROWS, N_COLS, constrained_layout=False)
fig.set_size_inches(16, 2.8)
plots = list(axes.flat)  # row-major order: top row first, then bottom row

plt.subplots_adjust(hspace=0.50, wspace=0.25)

for i, (ax, (dataset, dataset_name)) in enumerate(zip(plots, DATASETS_MAPPING.items())):
    to_plot = df[df['Dataset'] == dataset]

    sns.lineplot(
        data=to_plot,
        x="Recall",
        y="QPS",
        hue="Algorithm",
        markers=['o', 'd', 'X', 'P'],
        dashes=dashes,
        palette=colors,
        style='Algorithm',
        linewidth=0.9,
        markeredgewidth=0.2,
        markersize=3.0,
        ax=ax,
    )

    # Light grid drawn underneath the lines.
    ax.xaxis.grid(linewidth=0.5, color='#ededed')
    ax.yaxis.grid(linewidth=0.5, color='#ededed')
    ax.set_axisbelow(True)

    if i == 0:
        # A single legend for the whole figure, attached to the first panel and
        # pushed outside it (the anchor is in that panel's axes coordinates).
        ax.legend(
            loc="upper left",
            prop={'size': 9},
            ncols=5,
            frameon=False,
            bbox_to_anchor=(2.08, 1.55),
            markerscale=1.5
        )
    else:
        # Drop the per-panel legend added by seaborn, if it added one.
        panel_legend = ax.get_legend()
        if panel_legend is not None:
            panel_legend.remove()

    ax.set_title(dataset_name)
    ax.set_yscale('log')

    # Axis labels only on the outer edge of the grid: 'QPS' on the first
    # column, 'Recall' on the bottom row.
    in_first_column = (i % N_COLS == 0)
    in_bottom_row = (i // N_COLS == N_ROWS - 1)
    ax.set_ylabel('QPS' if in_first_column else '', fontsize=9)
    ax.set_xlabel('Recall' if in_bottom_row else '', fontsize=9)

#plt.savefig(f'../publication_figures/ADSAMPLING_PDX_VS_SIMD_{arch}.png', format='png', dpi=600, bbox_inches='tight')

In [18]:
# Per-dataset speedup summary (ratios of QPS) for the architecture in `arch`.
#
# NOTE(review): `bsa` and `pdx_bsa` are not created by any loading cell visible
# in this notebook; they must already exist in the kernel or this cell raises
# NameError. TODO: load them explicitly next to the other result CSVs.

# Align the five result tables on (dataset, ivf_nprobe). After the first merge
# the shared columns of `linear` / `ads` carry the `_linear` / `_ads` suffixes,
# so the columns of `pdx_ads` arrive unsuffixed (`recall`, `avg`) and the later
# tables get `_bsa` / `_pdx_bsa`.
# NOTE(review): this relies on all five tables having the same column names.
merge_keys = ['dataset', 'ivf_nprobe']
merged = (
    linear
    .merge(ads, on=merge_keys, how='inner', suffixes=('_linear', '_ads'))
    .merge(pdx_ads, on=merge_keys, how='inner', suffixes=('', '_pdx_ads'))
    .merge(bsa, on=merge_keys, how='inner', suffixes=('', '_bsa'))
    .merge(pdx_bsa, on=merge_keys, how='inner', suffixes=('', '_pdx_bsa'))
)

# Discard rows whose (unsuffixed) recall is above 1.
merged = merged[merged['recall'] <= 1]

# Convert the `avg` columns to queries per second.
# NOTE(review): assumes `avg` is a time in milliseconds -- confirm.
# `.copy()` makes the selection an independent frame, so the assignment below
# does not write into a slice of `merged` (SettingWithCopyWarning).
time_columns = ['avg_linear', 'avg_ads', 'avg', 'avg_bsa', 'avg_pdx_bsa']
qps = merged[['dataset', 'recall_linear', 'ivf_nprobe'] + time_columns].copy()
qps[time_columns] = 1000 / qps[time_columns]
qps = qps.rename(columns={'avg': 'avg_pdx_ads'})

# Taking the minimum nprobe to achieve the highest possible recall.
# The second groupby keeps, per dataset, the group with the largest
# `recall_linear` (group keys are sorted by default).
# NOTE(review): `.last()` keeps the entry that comes last in the current row
# order of each group; that is the minimum nprobe only if the rows of a
# (dataset, recall) group are ordered by decreasing ivf_nprobe -- confirm.
best_per_dataset = (
    qps
    .groupby(['dataset', 'recall_linear']).last().reset_index()
    .groupby(['dataset']).last().reset_index()
)

# Keep the datasets selected in DATASETS_MAPPING and add the QPS ratios.
# `.assign` returns a new frame instead of writing into the filtered slice.
# Note that both "against Linear" ratios use the PDX variant as the numerator.
combined = best_per_dataset[
    best_per_dataset['dataset'].isin(list(DATASETS_MAPPING.keys()))
].assign(
    speedup_adsampling=lambda frame: frame['avg_pdx_ads'] / frame['avg_ads'],
    speedup_bsa=lambda frame: frame['avg_pdx_bsa'] / frame['avg_bsa'],
    speedup_linear_ads=lambda frame: frame['avg_pdx_ads'] / frame['avg_linear'],
    speedup_linear_bsa=lambda frame: frame['avg_pdx_bsa'] / frame['avg_linear'],
)

print('Results for', arch)
print('Average Speedup of PDX on BSA:', combined['speedup_bsa'].mean())
print('Average Speedup of PDX on ADSampling:', combined['speedup_adsampling'].mean())
print('Average Speedup of ADSampling against Linear:', combined['speedup_linear_ads'].mean())
print('Average Speedup of BSA against Linear:', combined['speedup_linear_bsa'].mean())
combined[['dataset', 'recall_linear', 'ivf_nprobe', 'speedup_adsampling', 'speedup_bsa', 'speedup_linear_ads', 'speedup_linear_bsa']]

Results for GRAVITON4
Average Speedup of PDX on BSA: 1.6889276163000064
Average Speedup of PDX on ADSampling: 1.995240275980785
Average Speedup of ADSampling against Linear: 8.78130603940166
Average Speedup of BSA against Linear: 7.245836026987008


Unnamed: 0,dataset,recall_linear,ivf_nprobe,speedup_adsampling,speedup_bsa,speedup_linear_ads,speedup_linear_bsa
0,contriever-768,0.998,144,1.396454,1.427716,8.88208,5.824277
1,deep-image-96-angular,0.998,96,2.694644,2.052904,5.043051,2.935811
2,fashion-mnist-784-euclidean,1.0,20,1.161265,1.20548,9.313629,6.021052
3,gist-960-euclidean,0.996,160,1.250532,1.229472,11.136725,10.366356
5,glove-200-angular,0.974,160,1.985916,2.671161,4.23203,3.045087
7,glove-50-angular,0.995,160,3.304438,2.354661,3.100528,2.177158
8,har-561,1.0,20,1.279813,1.469481,5.734803,5.53827
9,instructorxl-arxiv-768,0.997,160,1.610043,2.018759,14.542933,8.779328
11,msong-420,1.0,64,1.702979,1.7361,11.494365,6.836123
12,nytimes-16-angular,0.988,144,6.330283,2.720741,2.509495,1.695106


In [8]:
# Per-dataset speedup summary (ratios of QPS) for the architecture in `arch`,
# with 'contriever-768' excluded.
#
# NOTE(review): `bsa` and `pdx_bsa` are not created by any loading cell visible
# in this notebook; they must already exist in the kernel or this cell raises
# NameError. TODO: load them explicitly next to the other result CSVs.

# Align the five result tables on (dataset, ivf_nprobe). After the first merge
# the shared columns of `linear` / `ads` carry the `_linear` / `_ads` suffixes,
# so the columns of `pdx_ads` arrive unsuffixed (e.g. `avg`) and the later
# tables get `_bsa` / `_pdx_bsa`.
# NOTE(review): this relies on all five tables having the same column names.
merge_keys = ['dataset', 'ivf_nprobe']
merged = (
    linear
    .merge(ads, on=merge_keys, how='inner', suffixes=('_linear', '_ads'))
    .merge(pdx_ads, on=merge_keys, how='inner', suffixes=('', '_pdx_ads'))
    .merge(bsa, on=merge_keys, how='inner', suffixes=('', '_bsa'))
    .merge(pdx_bsa, on=merge_keys, how='inner', suffixes=('', '_pdx_bsa'))
)

# Convert the `avg` columns to queries per second.
# NOTE(review): assumes `avg` is a time in milliseconds -- confirm.
# `.copy()` makes the selection an independent frame, so the assignment below
# does not write into a slice of `merged` (SettingWithCopyWarning).
time_columns = ['avg_linear', 'avg_ads', 'avg', 'avg_bsa', 'avg_pdx_bsa']
qps = merged[['dataset', 'recall_linear', 'ivf_nprobe'] + time_columns].copy()
qps[time_columns] = 1000 / qps[time_columns]
qps = qps.rename(columns={'avg': 'avg_pdx_ads'})

# Taking the minimum nprobe to achieve the highest possible recall.
# The second groupby keeps, per dataset, the group with the largest
# `recall_linear` (group keys are sorted by default).
# NOTE(review): `.last()` keeps the entry that comes last in the current row
# order of each group; that is the minimum nprobe only if the rows of a
# (dataset, recall) group are ordered by decreasing ivf_nprobe -- confirm.
best_per_dataset = (
    qps
    .groupby(['dataset', 'recall_linear']).last().reset_index()
    .groupby(['dataset']).last().reset_index()
)

selected = best_per_dataset[best_per_dataset['dataset'].isin(list(DATASETS_MAPPING.keys()))]
# Removing contriever due to bad benchmarking for now
selected = selected[selected['dataset'] != 'contriever-768']

# Add the QPS ratios. `.assign` returns a new frame instead of writing into
# the filtered slice. Note that both "against Linear" ratios use the PDX
# variant as the numerator.
combined = selected.assign(
    speedup_adsampling=lambda frame: frame['avg_pdx_ads'] / frame['avg_ads'],
    speedup_bsa=lambda frame: frame['avg_pdx_bsa'] / frame['avg_bsa'],
    speedup_linear_ads=lambda frame: frame['avg_pdx_ads'] / frame['avg_linear'],
    speedup_linear_bsa=lambda frame: frame['avg_pdx_bsa'] / frame['avg_linear'],
)

print('Results for', arch)
print('Average Speedup of PDX on BSA:', combined['speedup_bsa'].mean())
print('Average Speedup of PDX on ADSampling:', combined['speedup_adsampling'].mean())
print('Average Speedup of ADSampling against Linear:', combined['speedup_linear_ads'].mean())
print('Average Speedup of BSA against Linear:', combined['speedup_linear_bsa'].mean())
combined[['dataset', 'recall_linear', 'ivf_nprobe', 'speedup_adsampling', 'speedup_bsa', 'speedup_linear_ads', 'speedup_linear_bsa']]

Results for M1
Average Speedup of PDX on BSA: 2.2263751320115364
Average Speedup of PDX on ADSampling: 3.2807706036780258
Average Speedup of ADSampling against Linear: 17.76564727429299
Average Speedup of BSA against Linear: 14.792892063272339


Unnamed: 0,dataset,recall_linear,ivf_nprobe,speedup_adsampling,speedup_bsa,speedup_linear_ads,speedup_linear_bsa
1,deep-image-96-angular,0.998,96,3.522025,2.879503,6.835933,3.162867
2,fashion-mnist-784-euclidean,1.0,20,1.217344,1.29913,18.085514,11.415558
3,gist-960-euclidean,0.997,160,2.317851,2.047816,25.190655,21.619456
5,glove-200-angular,0.976,160,3.137394,2.773946,8.012446,4.684001
6,glove-25-angular,0.995,160,7.769639,2.831501,3.19317,1.185029
7,glove-50-angular,0.995,160,5.798499,3.09245,3.63336,1.804462
8,har-561,1.0,20,1.302725,1.406227,11.497973,10.714192
9,instructorxl-arxiv-768,0.997,160,2.820817,3.740071,32.668368,16.526629
10,mnist-784-euclidean,1.0,80,1.416293,1.146125,34.129892,45.136043
11,msong-420,1.0,64,2.415492,2.695782,23.579896,11.439117
