# Benchmark the character embedding speed

In [15]:
from collections import defaultdict
from time import perf_counter
from time import time

import numpy as np
import pandas as pd
import plotly.express as px
import tensorflow as tf
from perfcounters import PerfCounters
from tabulate import tabulate
from tensorflow.keras import layers
from tqdm.auto import tqdm

from retvec import RecVec
from retvec.utils import get_random_unicode, find_primes, tf_cap_memory

In [3]:
# Run the retvec memory helper once, before any benchmark cell executes.
# NOTE(review): presumably this limits how much memory TensorFlow reserves —
# inferred from the helper's name only; confirm against retvec.utils.
tf_cap_memory()

In [38]:
# Sanity check: split the same '-'-separated string with TensorFlow's native
# string op and with the Keras text helper, then print both results side by
# side so they can be compared.
t = tf.strings.split("test-test", sep='-')

# The bare name `text_to_word_sequence` is not imported by the import cell, so
# calling it directly raises NameError on a fresh kernel. Reach it through the
# already-imported `tf` namespace instead.
r = tf.keras.preprocessing.text.text_to_word_sequence("test-test", split='-')
r = tf.constant(r)  # wrap the Python list so both results print as tensors
print(t, r)

tf.Tensor([b'test' b'test'], shape=(2,), dtype=string) tf.Tensor([b'test' b'test'], shape=(2,), dtype=string)


## number of primes

Measure the impact of using more prime numbers in the decomposition.

In [4]:
# Configuration for the "number of primes" benchmark.
MIN_PRIMES = 4  # smallest number of primes handed to RecVec
MAX_PRIMES = 20   # exclusive upper bound: the benchmark iterates range(MIN_PRIMES, MAX_PRIMES)
BATCH_SIZE = 256  # number of strings in the benchmark batch
MAX_LEN = 128  # argument passed to get_random_unicode for every string in the batch
NUM_TESTS = 10000  # timed calls per prime count
# Pool of primes; the benchmark takes the first `num_primes` entries of it.
# NOTE(review): unclear from here whether 1000 is a count or an upper bound — confirm in retvec.utils.
PRIMES = find_primes(1000)
# One fixed batch, built once and reused for every warmup and timed call.
# NOTE(review): presumably random unicode strings of length MAX_LEN — confirm in retvec.utils.
BATCH = [get_random_unicode(MAX_LEN) for _ in range(BATCH_SIZE)]

In [5]:
# Benchmark RecVec on the fixed BATCH while growing the list of primes it is
# constructed with. `speed` collects, per prime count, the total number of
# seconds spent on NUM_TESTS calls (the per-call average is derived later).
speed = []
for num_primes in tqdm(range(MIN_PRIMES, MAX_PRIMES)):
    prime_list = PRIMES[:num_primes]
    ce = RecVec(is_eager=True, primes=prime_list)

    # warmup: a few untimed calls so one-off start-up cost is not measured
    for _ in range(10):
        ce(BATCH)

    # measure: perf_counter() is monotonic and high-resolution, unlike the
    # wall clock returned by time(), which can jump if the system clock is
    # adjusted while the benchmark runs.
    start = perf_counter()
    for _ in range(NUM_TESTS):
        ce(BATCH)
    speed.append(perf_counter() - start)

100%|██████████| 16/16 [03:13<00:00, 12.07s/it]


In [7]:
# Convert each total loop time into the average time of a single call, then
# plot it against the number of primes that was used for that measurement.
y = [total / NUM_TESTS for total in speed]
px.line(
    x=list(range(MIN_PRIMES, MAX_PRIMES)),
    y=y,
    title='Embedding speed',
    labels=dict(x="Number of primes", y="Time in sec"),
)

## Input size

In [4]:
# Parameter grid for the input-size benchmark.
BATCH_SIZES= [2**i for i in range(4, 16)]  # powers of two from 16 to 32768
print(BATCH_SIZES)
EMBEDDINGS_SIZES = [24, 32, 40, 48]  # values passed as RecVec(embedding_size=...)
# NOTE: the two names below rebind constants used by the "number of primes"
# benchmark: NUM_TESTS drops from 10000 to 100 and MAX_LEN changes from an int
# to a list. Re-run that benchmark's config cell before re-running its cells.
NUM_TESTS = 100  # timed calls per (embedding size, batch size, max_len) combination
MAX_LEN = [2**i for i in range(4, 10)]  # powers of two from 16 to 512

[16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768]


In [5]:

# Benchmark RecVec over every (embedding size, batch size, max_len) combination.
# `records` is column-oriented: each key maps to a list holding one value per
# measured combination. 'time' stores the total seconds for NUM_TESTS calls.
records = defaultdict(list)
total_runs = len(BATCH_SIZES) * len(MAX_LEN) * len(EMBEDDINGS_SIZES)

# Using the progress bar as a context manager closes it even when the sweep is
# interrupted or raises part-way through.
with tqdm(total=total_runs) as pb:
    for emb_size in EMBEDDINGS_SIZES:
        # One RecVec instance per embedding size, reused for all input shapes.
        ce = RecVec(is_eager=True, embedding_size=emb_size)

        for batch_size in BATCH_SIZES:
            for max_len in MAX_LEN:
                batch = [get_random_unicode(max_len) for _ in range(batch_size)]

                # warmup: untimed calls so one-off start-up cost is not measured
                for _ in range(3):
                    ce(batch)

                # measure with the monotonic, high-resolution clock rather
                # than the adjustable wall clock returned by time()
                start = perf_counter()
                for _ in range(NUM_TESTS):
                    ce(batch)
                elapsed = perf_counter() - start

                records['batch_size'].append(batch_size)
                records['max_len'].append(max_len)
                records['embedding_size'].append(emb_size)
                records['time'].append(elapsed)
                pb.update()
                pb.set_postfix({"batch_size": batch_size, 'max_len': max_len, 'embedding': emb_size})

 20%|██        | 59/288 [00:42<11:17,  2.96s/it, batch_size=8192, max_len=256, embedding=24]

In [6]:
# Turn the column-oriented benchmark records into a table and replace the
# total loop time with the average time of a single call.
df = (
    pd.DataFrame.from_records(records)
    .assign(time=lambda frame: frame['time'] / NUM_TESTS)
)
df.head()

NameError: name 'records' is not defined

# batch_size
Time versus batch size, with max_len fixed at 128.

In [16]:
# Time per call against batch size for the rows measured at max_len == 128,
# drawn as one line per embedding size.
px.line(
    df.loc[df['max_len'].eq(128)],
    x='batch_size',
    y='time',
    color='embedding_size',
    title='Batch size vs time',
)

In [26]:
# Time per call against max_len for the rows measured at batch_size == 4096,
# drawn as one line per embedding size.
px.line(
    df.loc[df['batch_size'].eq(4096)],
    x='max_len',
    y='time',
    color='embedding_size',
    title='Max_len vs time',
)