In [1]:
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import numpy as np
import tensorflow as tf
from collections import defaultdict
from tabulate import tabulate
import plotly.express as px
from time import time
import pandas as pd
import random

In [2]:
from retvec import RecVec
from retvec.utils import tf_cap_memory

In [3]:
tf_cap_memory()

In [14]:
#  unicode list

ALPHABETS = [i * 10000 for i in range(1, 6)]
EMBEDDINGS_SIZE = [24, 32, 40, 48]
NUM_SHUFFLES = 10

In [5]:
FIXED_LISTS = {
    "24": [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53],
    "32": [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53],
    "40": [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53],
    "48": [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53]
}

NAIVE_LISTS = {
    "24": [29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89],
    "32": [37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101],
    "40": [41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103],
    "48": [53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109]
}

In [6]:
def count_collisions(embeddings, alphabet_size):
    collisions = defaultdict(int)
    npe = embeddings[0].numpy()
    for idx in range(alphabet_size):
        v = "%s" % list(npe[idx])
        collisions[v] += 1

    collision_count = 0
    for v in collisions.values():
        collision_count += v - 1
    return collision_count

In [12]:
def embed(alphabet_size, emb_size, primes):
    ce = RecVec(max_len=alphabet_size,
                embedding_size=emb_size,
                positional_encoding=True,
                primes=primes, 
                is_eager=True)
    emb = ce(batch)
    return count_collisions(emb, alphabet_size)

In [16]:
pb = tqdm(total=len(ALPHABETS) * len(EMBEDDINGS_SIZE) * NUM_SHUFFLES)
records = defaultdict(list)
for alphabet_size in ALPHABETS:
    
    # generate alphabet
    chars = []
    for i in range(alphabet_size):
        try:
            c = chr(i)
        except:
            continue

        chars.append(c)
    
    batch = "".join(chars)
    batch = [batch]

    for emb_size in EMBEDDINGS_SIZE:
        records['alphabet_size'].append(alphabet_size)
        records['embedding_size'].append(emb_size)

        optimal_collisions = 0
        fixed_collisions = 0
        naive_collisions  = 0
        for i in range(NUM_SHUFFLES):
            random.shuffle(chars)
            batch = "".join(chars)
            batch = [batch]
            
            # optimal
            optimal_collisions += embed(alphabet_size, emb_size, None)

            # fixed
            fixed_collisions += embed(alphabet_size, emb_size, FIXED_LISTS["%s" % emb_size])


            # fixed
            naive_collisions += embed(alphabet_size, emb_size, NAIVE_LISTS["%s" % emb_size])

            pb.update()

        records['fixed_collisions'].append(naive_collisions)    
        records['naive_collisions'].append(naive_collisions)        
        records['optimal_collisions'].append(optimal_collisions)


        pb.set_postfix({"alphabet": alphabet_size, 'embedding': emb_size})
pb.close()

  4%|▍         | 8/200 [00:23<09:17,  2.91s/it]
100%|██████████| 200/200 [04:36<00:00,  1.38s/it, alphabet=5e+4, embedding=48]


In [9]:
df = pd.DataFrame.from_records(records)
df.head()

Unnamed: 0,alphabet_size,embedding_size,fixed_collisions,naive_collisions,optimal_collisions
0,10000,24,240,240,170
1,10000,32,50,50,0
2,10000,40,10,10,0
3,10000,48,50,50,0
4,20000,24,460,460,350


In [10]:
px.line(df, x='alphabet_size', y='fixed_collisions', color='embedding_size', title="Naive prime list")

In [11]:
px.line(df, x='alphabet_size', y='optimal_collisions', color='embedding_size', title="Optimal primes list")