# Prime list selection
Leverage the results of `cli/prime_search.py` to find the minimal number of primes needed to reach 0 collisions (if possible) for various embedding sizes.

In [9]:
import json
from time import time
import numpy as np
from tabulate import tabulate
from collections import defaultdict
from tqdm.auto import tqdm
import pandas as pd
import plotly.express as px
from collections import defaultdict

In [10]:
# Group the prime-search results by embedding size, then by number of primes.
# collisions[embedding][num_primes] and primes_lists[embedding][num_primes]
# are parallel lists: entry i of each comes from the same input line.
collisions = defaultdict(lambda: defaultdict(list))
primes_lists = defaultdict(lambda: defaultdict(list))
# Each line of the file is parsed as one JSON object. The context manager
# guarantees the file handle is closed once the loop is done.
with open('data/best_primes.json') as results_file:
    for line in results_file:
        d = json.loads(line)
        # Only the entries whose 'alphabet' field equals 55000 are kept.
        if d['alphabet'] != 55000:
            continue
        collisions[d['embedding']][d['num_primes']].append(d['collisions'])
        primes_lists[d['embedding']][d['num_primes']].append(d['list'])

In [21]:
# For every (embedding size, number of primes) pair, record the lowest
# collision count observed and keep every primes list that reaches it.
emb_size = sorted(collisions)
best_lists = defaultdict(lambda: defaultdict(list))
records = []
for es in emb_size:
    num_primes = sorted(collisions[es])
    for p in num_primes:
        data = collisions[es][p]
        # Computed once; used both for the record and for the filter below.
        min_cols = np.min(data)

        records.append({
            "embedding_size": es,
            "num_primes": p,
            "min_collisions": min_cols,
        })

        # Find the best lists: collisions[es][p] and primes_lists[es][p] are
        # filled in lockstep, so zipping them pairs each collision count with
        # the primes list that produced it. Ties are all kept.
        best_lists[es][p].extend(
            primes
            for val, primes in zip(data, primes_lists[es][p])
            if val == min_cols
        )

In [22]:
# Turn the list of per-(embedding_size, num_primes) record dicts into a table.
df = pd.DataFrame.from_records(records)
# Preview the first rows; as the last expression of the cell it is displayed.
df.head()

Unnamed: 0,embedding_size,num_primes,min_collisions
0,16,4,52484
1,16,5,48133
2,16,6,41030
3,16,7,34139
4,16,8,29008


In [26]:
# Line chart of the minimal collision count against the number of primes,
# coloured by embedding size, with a logarithmic y axis.
px.line(
    df,
    x='num_primes',
    y='min_collisions',
    color='embedding_size',
    title="Collisions vs num Primes",
    log_y=True,
)

## List the best prime lists

In [28]:
# Print one table per embedding size. Each row holds the number of primes,
# the minimal collision count, and every (sorted) primes list that reaches it.
emb_size = sorted(collisions)
for es in emb_size:
    rows = []
    num_primes = sorted(collisions[es])
    for p in num_primes:
        data = best_lists[es][p]
        rows.append(
            [p, np.min(collisions[es][p])] + [sorted(primes) for primes in data]
        )

    # Rows are ragged: there is one extra column per tied best list. tabulate
    # assigns a too-short header list to the LAST columns, which put the
    # 'num primes' label above a primes list. Build one header per column of
    # the widest row so every label sits over the data it describes.
    num_list_columns = max((len(row) for row in rows), default=2) - 2
    headers = ['num primes', 'min collisions'] + [
        'best list %d' % (i + 1) for i in range(num_list_columns)
    ]

    print("\n%s\n" % es)
    print(tabulate(rows, headers=headers))


16

                                                                                                     num primes
--  -----  ----------------------------------------------------------------------------------------  ---------------------
 4  52484  [19, 23, 1627, 2243]                                                                      [17, 23, 1051, 19001]
 5  48133  [17, 31, 97, 56041, 372979]
 6  41030  [19, 29, 1019, 1229, 1747, 2293]
 7  34139  [17, 73, 109, 113, 227, 1217, 1291]
 8  29008  [17, 19, 41, 419, 1427, 2267, 2311, 2417]
 9  25305  [17, 19, 73, 89, 127, 137, 661, 2239, 2383]
10  23029  [17, 19, 47, 61, 67, 73, 139, 149, 269, 1327]
11  22247  [17, 19, 31, 43, 47, 59, 61, 349, 1549, 30859, 48733]
12  22110  [17, 19, 29, 31, 43, 59, 67, 97, 15077, 58543, 59809, 158563]
13  22492  [17, 47, 59, 61, 107, 113, 127, 283, 593, 58193, 73517, 88379, 137443]
14  22817  [19, 37, 43, 53, 61, 311, 419, 1321, 2087, 47969, 53441, 82279, 88589, 329111]
15  21669  [17, 19, 23, 43, 47, 

In [None]:
# Scratch cell: three bare list literals that are not assigned to any name,
# so evaluating them does not change any variable defined in earlier cells.
# NOTE(review): presumably primes lists kept for reference - confirm or remove.
[19, 23, 29, 41, 43, 53, 83, 89, 107, 113, 131, 137, 149, 743, 11981, 380839]

[17, 31, 37, 47, 67, 79, 89, 139, 149, 1613, 2141, 114809]
[31, 43, 59, 61, 89, 199, 941, 66343]
