In [1]:
import sys; sys.path.append('../'); sys.path.append('../hashing')
from hashing import DoubleHash

from pathlib import Path
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

DATA_DIR = 'casos_lhash'

np.random.seed(88)

In [2]:
sorted(Path(DATA_DIR).iterdir())

[WindowsPath('casos_lhash/ordered_blocks_creciente.npy'),
 WindowsPath('casos_lhash/ordered_blocks_intercalado.npy'),
 WindowsPath('casos_lhash/ordered_blocks_intercalado_rand.npy'),
 WindowsPath('casos_lhash/ordered_step_1.npy'),
 WindowsPath('casos_lhash/ordered_step_1_oversize.npy'),
 WindowsPath('casos_lhash/ordered_step_3.npy'),
 WindowsPath('casos_lhash/ordered_step_3_oversize.npy'),
 WindowsPath('casos_lhash/ordered_step_quarter.npy'),
 WindowsPath('casos_lhash/ordered_step_quarter_oversize.npy'),
 WindowsPath('casos_lhash/ordered_step_random.npy'),
 WindowsPath('casos_lhash/ordered_step_random_oversize.npy'),
 WindowsPath('casos_lhash/repeated_3_1.npy'),
 WindowsPath('casos_lhash/repeated_3_1_oversize.npy'),
 WindowsPath('casos_lhash/repeated_3_3.npy'),
 WindowsPath('casos_lhash/repeated_3_3_oversize.npy'),
 WindowsPath('casos_lhash/repeated_3_quarter.npy'),
 WindowsPath('casos_lhash/repeated_3_quarter_oversize.npy'),
 WindowsPath('casos_lhash/repeated_quarter_1.npy'),
 Windows

# Experimentos mejores y peores casos DoubleHash

Sea $th$, el tamaño de tabla hash en cada instancia.

In [3]:
size_max = int(1e4)
sizes = [i for i in range(10, size_max, size_max//4)]

d_hashes = [DoubleHash(size=s, update_size=True) for s in sizes]

df = pd.DataFrame(columns=['hash', 'original_size', 'final_size', 'caso',
                           'subcaso', 'caracteristica', 'oversize', 'tamano', 'tiempos', 'tiempo_total'])

## Secuencia de elementos ordenados

Se crean las siguientes variaciones de secuencias:

1. Secuencia ordenada creciente con paso de 1
2. Secuencia ordenada creciente con paso de 3
3. Secuencia ordenada creciente con paso de $\frac{th}{4}$
4. Secuencia ordenada creciente con paso de aleatorio entre 4 y $th$

5. Un cuarto creciente, otro decreciente y bis (bloques aleatorios)
6. 4 bloques crecientes independientes

Ademas para cada una de estas, existen dos tamaño: el tamaño de la tabla y 3 veces el tamaño de la tabla.

In [4]:
casos_ordenados_steps = list(Path(DATA_DIR).glob('ordered_step*'))
casos_ordenados_blocks = list(Path(DATA_DIR).glob('ordered_blocks*'))

arr_casos_ordenados_steps = [
    {caso.stem: np.load(caso, allow_pickle=True)} for caso in casos_ordenados_steps]

arr_casos_ordenados_blocks = [
    {caso.stem: np.load(caso, allow_pickle=True)} for caso in casos_ordenados_blocks]

casos_ordenados_steps, casos_ordenados_blocks

([WindowsPath('casos_lhash/ordered_step_1.npy'),
  WindowsPath('casos_lhash/ordered_step_1_oversize.npy'),
  WindowsPath('casos_lhash/ordered_step_3.npy'),
  WindowsPath('casos_lhash/ordered_step_3_oversize.npy'),
  WindowsPath('casos_lhash/ordered_step_quarter.npy'),
  WindowsPath('casos_lhash/ordered_step_quarter_oversize.npy'),
  WindowsPath('casos_lhash/ordered_step_random.npy'),
  WindowsPath('casos_lhash/ordered_step_random_oversize.npy')],
 [WindowsPath('casos_lhash/ordered_blocks_creciente.npy'),
  WindowsPath('casos_lhash/ordered_blocks_intercalado.npy'),
  WindowsPath('casos_lhash/ordered_blocks_intercalado_rand.npy')])

### Steps

In [5]:
for caso in tqdm(arr_casos_ordenados_steps):
    tipo_caso, arrs = list(caso.items())[0]
    rows = []
    
    tipo_caso = tipo_caso.split('_')
    
    type_name = 'ordered'
    
    subtype = tipo_caso[1]
    caracteristica = tipo_caso[2
                              ]
    if tipo_caso[-1] == 'oversize':
        oversize = 1
    else:
        oversize = 0
    
    for arr in tqdm(arrs):
        d_hashes = [DoubleHash(size=s, update_size=True) for s in sizes]
        for dh in tqdm(d_hashes):
            dh.run_experiment(arr)
            total_times, individual_times, original_size, final_size = dh.get_results()

            row = {
                'hash': 'double', 
                'original_size': original_size,
                'final_size' : final_size,
                'caso': type_name, 
                'subcaso' : subtype, 
                'caracteristica' : caracteristica, 
                'oversize' : oversize,
                'tamano': len(arr), 
                'tiempos': individual_times, 
                'tiempo_total': total_times
            }
            
        rows.append(row)
    df = df.append(pd.DataFrame(rows), ignore_index=True)

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))





HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))





HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))





HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))





HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))





HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))





HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))





HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))






In [6]:
df.to_csv('results_double_ordered.csv', index=False)

In [7]:
df = pd.read_csv('results_double_ordered.csv')

### Blocks

In [8]:
for caso in tqdm(arr_casos_ordenados_blocks):
    tipo_caso, arrs = list(caso.items())[0]
    rows = []
    
    tipo_caso = tipo_caso.split('_')
    
    type_name = 'ordered'
    
    subtype = tipo_caso[1]
    caracteristica = tipo_caso[2
                              ]
    if tipo_caso[-1] == 'oversize':
        oversize = 1
    else:
        oversize = 0
    
    for arr in tqdm(arrs[2:]):
        d_hashes = [DoubleHash(size=s, update_size=True) for s in sizes]
        for dh in tqdm(d_hashes):
            dh.run_experiment(arr)
            total_times, individual_times, original_size, final_size = dh.get_results()

            row = {
                'hash': 'lineal', 
                'original_size': original_size,
                'final_size' : final_size,
                'caso': type_name, 
                'subcaso' : subtype, 
                'caracteristica' : caracteristica, 
                'oversize' : oversize,
                'tamano': len(arr), 
                'tiempos': individual_times, 
                'tiempo_total': total_times
            }
            
        rows.append(row)
    df = df.append(pd.DataFrame(rows), ignore_index=True)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))





HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))





HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))






In [9]:
df.to_csv('results_double_ordered.csv', index=False)

## Secuencia de elementos repetidos

Se crean las siguientes variaciones de secuencias:

1. Secuencia de 3 elementos repetidos con step de 1
2. Secuencia de 3 elementos repetidos con step de 3
3. Secuencia de 3 elementos repetidos con step de $\frac{th}{4}$
4. Secuencia de $\frac{th}{4}$ elementos repetidos con step aleatorio entre 4 y $th$

Ademas para cada una de estas, existen dos tamaño: el tamaño de la tabla y 3 veces el tamaño de la tabla.

In [10]:
df = pd.DataFrame(columns=['hash', 'original_size', 'final_size', 'caso',
                           'repeticiones', 'step_size', 'oversize', 'tamano', 'tiempos', 'tiempo_total'])

casos_repetidos_3 = list(Path(DATA_DIR).glob('repeated_3*'))
casos_repetidos_quarter = list(Path(DATA_DIR).glob('repeated_quarter*'))

arr_casos_repetidos_3 = [{caso.stem: np.load(caso, allow_pickle=True)} for caso in casos_repetidos_3]
arr_casos_repetidos_quarter = [{caso.stem: np.load(caso, allow_pickle=True)} for caso in casos_repetidos_quarter]

casos_repetidos_3, casos_repetidos_quarter

([WindowsPath('casos_lhash/repeated_3_1.npy'),
  WindowsPath('casos_lhash/repeated_3_1_oversize.npy'),
  WindowsPath('casos_lhash/repeated_3_3.npy'),
  WindowsPath('casos_lhash/repeated_3_3_oversize.npy'),
  WindowsPath('casos_lhash/repeated_3_quarter.npy'),
  WindowsPath('casos_lhash/repeated_3_quarter_oversize.npy')],
 [WindowsPath('casos_lhash/repeated_quarter_1.npy'),
  WindowsPath('casos_lhash/repeated_quarter_1_oversize.npy'),
  WindowsPath('casos_lhash/repeated_quarter_random.npy'),
  WindowsPath('casos_lhash/repeated_quarter_random_oversize.npy')])

In [11]:
for caso in tqdm(arr_casos_repetidos_3):
    tipo_caso, arrs = list(caso.items())[0]
    rows = []
    
    tipo_caso = tipo_caso.split('_')
    
    type_name = 'repeated'
    
    num_repeats = tipo_caso[1]
    step_size = tipo_caso[2]
    
    if tipo_caso[-1] == 'oversize':
        oversize = 1
    else:
        oversize = 0
    
    for arr in tqdm(arrs):
        d_hashes = [DoubleHash(size=s, update_size=True) for s in sizes]
        for dh in tqdm(d_hashes):
            dh.run_experiment(arr)
            total_times, individual_times, original_size, final_size = dh.get_results()

            row = {
                'hash': 'lineal', 
                'original_size': original_size,
                'final_size' : final_size,
                'caso': type_name, 
                'repeticiones' : num_repeats, 
                'step_size' : step_size, 
                'oversize' : oversize,
                'tamano': len(arr), 
                'tiempos': individual_times, 
                'tiempo_total': total_times
            }
            
        rows.append(row)
    df = df.append(pd.DataFrame(rows), ignore_index=True)

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))





HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))





HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))





HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))





HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))





HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))






In [12]:
df.to_csv('results_double_repeated.csv', index=False)

In [13]:
# for caso in tqdm(arr_casos_repetidos_quarter):
#     tipo_caso, arrs = list(caso.items())[0]
#     rows = []
    
#     tipo_caso = tipo_caso.split('_')
    
#     type_name = 'repeated'
    
#     num_repeats = tipo_caso[1]
#     step_size = tipo_caso[2]
    
#     if tipo_caso[-1] == 'oversize':
#         oversize = 1
#     else:
#         oversize = 0
    
#     for arr in tqdm(arrs):
#         l_hashes = [LinearHash(size=s, update_size=True) for s in sizes]
#         for lh in tqdm(l_hashes):
#             lh.run_experiment(arr)
#             total_times, individual_times, original_size, final_size = lh.get_results()

#             row = {
#                 'hash': 'lineal', 
#                 'original_size': original_size,
#                 'final_size' : final_size,
#                 'caso': type_name, 
#                 'repeticiones' : num_repeats, 
#                 'step_size' : step_size, 
#                 'oversize' : oversize,
#                 'tamano': len(arr), 
#                 'tiempos': individual_times, 
#                 'tiempo_total': total_times
#             }
            
#         rows.append(row)
#     df = df.append(pd.DataFrame(rows), ignore_index=True)

## Secuencia de elementos aleatorias

Se crean las siguientes variaciones de secuencias:

1. Secuencias aleatorias

In [14]:
casos_rand = list(Path(DATA_DIR).glob('random*'))
arr_casos_rand = [{caso.stem: np.load(caso, allow_pickle=True)} for caso in casos_rand]
casos_rand

[]

In [15]:
# for caso in tqdm(arr_casos_rand):
#     tipo_caso, arrs = list(caso.items())[0]
#     rows = []
    
#     for arr in tqdm(arrs):
#         d_hashes = [DoubleHash(size=s, update_size=True) for s in sizes]
#         for dh in tqdm(d_hashes):
#             dh.run_experiment(arr)
#             total_times, individual_times, original_size, final_size = dh.get_results()

#             row = {
#                 'hash': 'lineal', 
#                 'original_size': original_size,
#                 'final_size' : final_size,
#                 'caso': tipo_caso, 
#                 'tamano': len(arr), 
#                 'tiempos': individual_times, 
#                 'tiempo_total': total_times
#             }
            
#         rows.append(row)
#     df = df.append(pd.DataFrame(rows), ignore_index=True)

In [16]:
df

Unnamed: 0,hash,original_size,final_size,caso,repeticiones,step_size,oversize,tamano,tiempos,tiempo_total
0,lineal,7510,7510,repeated,3,1,0,12,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[0.0]
1,lineal,7510,7510,repeated,3,1,0,2511,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[0.04587388038635254]
2,lineal,7510,7510,repeated,3,1,0,5013,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[0.21982979774475098]
3,lineal,7510,15020,repeated,3,1,0,7512,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[0.20744776725769043]
4,lineal,7510,7510,repeated,3,1,1,33,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[0.0010333061218261719]
5,lineal,7510,15020,repeated,3,1,1,7533,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.000997781753540039...",[0.19946527481079102]
6,lineal,7510,30040,repeated,3,1,1,15033,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[0.7699837684631348]
7,lineal,7510,30040,repeated,3,1,1,22533,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[2.4768500328063965]
8,lineal,7510,7510,repeated,3,3,0,12,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[0.0]
9,lineal,7510,15020,repeated,3,3,0,2511,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[0.049010515213012695]
