In [74]:
from src.linear_hashing import LinearHashing
from random import sample, randint
from math import ceil
import pandas as pd
import plotly.express as px

def generate_keys(amount, start=0, end=10**5):
    """Return `amount` distinct random integers drawn from [start, end].

    Args:
        amount: Number of keys wanted (an int). Values <= 0 yield an empty list.
        start: Smallest value a key may take (inclusive).
        end: Largest value a key may take (inclusive).

    Returns:
        A list of `amount` unique ints, in random order.

    Raises:
        ValueError: If `amount` is larger than the number of integers in
            [start, end]. A draw-until-unique loop would spin forever in
            that situation, so the impossible request is rejected up front.
    """
    if amount <= 0:
        # Nothing to draw; an empty result is the natural answer.
        return []

    # Sampling without replacement from the range guarantees uniqueness in a
    # single pass (no retry loop) and returns the keys in selection order,
    # so slices of the result are themselves random subsets.
    return sample(range(start, end + 1), amount)

def average(numbers):
    """Arithmetic mean of a sized collection of numbers.

    Args:
        numbers: A collection supporting both `sum()` and `len()`.

    Returns:
        The sum of the elements divided by how many there are.

    Raises:
        ZeroDivisionError: If `numbers` is empty.
    """
    total = sum(numbers)
    count = len(numbers)
    return total / count

def linePlot(data, x, y, y_range, color=None):
    """Draw a line chart of the mean of column `y` for each value of `x`.

    Args:
        data: DataFrame holding at least the columns named by `x`, `y`
            and, when given, `color`.
        x: Name of the column placed on the horizontal axis.
        y: Name of the column whose per-group mean is plotted.
        y_range: Range applied to the vertical axis.
        color: Optional column name; when truthy, the mean is taken per
            (x, color) pair and one line is drawn per distinct value.

    Returns:
        None. The figure is displayed via `fig.show()`.
    """
    # Group by x alone, or by (x, color) when a colour column is requested.
    group_keys = [x, color] if color else x
    plot_data = data.groupby(group_keys)[y].mean().reset_index()

    fig = px.line(
        plot_data,
        x=x,
        y=y,
        color=color,
        markers=True,
        title=f'{y} em funcao do {x}',
    )
    fig.update_yaxes(range=y_range)
    fig.show()


# Parameter grid shared by the experiments below: every value in `test_ps`
# is combined with every value in `test_alphas`, and each pair is passed to
# LinearHashing(p, alpha).
test_ps = [
    1,
    5,
    10,
    20,
    50,
]
# 0.2, 0.3, ..., 0.9 (tenths / 10 yields exactly the same floats as the literals).
test_alphas = [tenths / 10 for tenths in range(2, 10)]


# Desempenho quanto ao espaço

In [64]:
# Space experiment: for every (p, alpha) pair in the grid, fill a fresh table
# with p*1000 random keys, ten times over, and record the table's `alpha` and
# `p_star` attributes as they stand after the last insertion.
rows = []

for p in test_ps:
    for alpha in test_alphas:
        final_alphas, final_p_stars = [], []

        # Ten independent repetitions, each with its own table and key set.
        for _ in range(10):
            linear_hashing = LinearHashing(p, alpha)
            for key in generate_keys(p*1000):
                linear_hashing.insert(key)

            final_alphas.append(linear_hashing.alpha)
            final_p_stars.append(linear_hashing.p_star)

        # One summary row per configuration; the alpha handed to the
        # constructor is stored under the 'maxAlpha' column.
        rows.append([p, alpha, average(final_alphas), average(final_p_stars)])

data = pd.DataFrame(rows, columns=['p', 'maxAlpha','avgAlpha','avgP*'])
data

Unnamed: 0,p,maxAlpha,avgAlpha,avgP*
0,1,0.2,0.199992,1.668595
1,1,0.3,0.299868,1.630612
2,1,0.4,0.399936,1.530059
3,1,0.5,0.499925,1.580294
4,1,0.6,0.599808,1.578393
5,1,0.7,0.69979,1.687594
6,1,0.8,0.799936,1.88403
7,1,0.9,0.899038,2.133624
8,5,0.2,0.199988,1.172852
9,5,0.3,0.29994,1.109989


In [66]:
# Mean avgAlpha for each maxAlpha (linePlot averages over the remaining
# column, p), with the y-axis pinned to [0, 1].
linePlot(
    data,
    x="maxAlpha",
    y="avgAlpha",
    y_range=[0, 1],
)

   maxAlpha  avgAlpha
0       0.2  0.199993
1       0.3  0.299926
2       0.4  0.399987
3       0.5  0.499900
4       0.6  0.598207
5       0.7  0.686685
6       0.8  0.778141
7       0.9  0.892750


In [67]:
# Mean avgAlpha for each p (linePlot averages over every maxAlpha),
# with the y-axis pinned to [0, 1].
linePlot(
    data,
    x="p",
    y="avgAlpha",
    y_range=[0, 1],
)

    p  avgAlpha
0   1  0.549787
1   5  0.549313
2  10  0.549100
3  20  0.545619
4  50  0.528424


In [68]:
# Mean avgP* for each maxAlpha (linePlot averages over every p),
# with the y-axis pinned to [0, 4].
linePlot(
    data,
    x="maxAlpha",
    y="avgP*",
    y_range=[0, 4],
)

   maxAlpha     avgP*
0       0.2  1.177931
1       0.3  1.151159
2       0.4  1.122922
3       0.5  1.145891
4       0.6  1.250236
5       0.7  1.401296
6       0.8  1.877406
7       0.9  3.610814


In [69]:
# Mean avgP* for each p (linePlot averages over every maxAlpha),
# with the y-axis pinned to [1, 2].
linePlot(
    data,
    x="p",
    y="avgP*",
    y_range=[1, 2],
)

    p     avgP*
0   1  1.711650
1   5  1.616456
2  10  1.641907
3  20  1.626569
4  50  1.364453


# Desempenho quanto ao número médio de acessos


In [77]:

# Search-cost experiment: for every (p, alpha) pair, build a table from
# p*1000 keys and measure element [1] of `search()`'s result (treated here as
# the access count -- confirm against LinearHashing.search) for two samples:
#   "S": keys that were generated but never inserted into the table
#   "C": keys drawn at random from the ones that were inserted
rows = []

for p in test_ps:
    for alpha in test_alphas:
        S_avg_accesses, C_avg_accesses = [], []

        for _ in range(10):
            linear_hashing = LinearHashing(p, alpha)

            size = p * 1000
            sample_size = ceil(size * 0.20)

            # Generate size + sample_size keys: the first `sample_size` are
            # held out of the table, the remainder are inserted.
            random_keys = generate_keys(size + sample_size)
            S_sample, keys = random_keys[:sample_size], random_keys[sample_size:]

            for key in keys:
                linear_hashing.insert(key)

            C_sample = sample(keys, sample_size)

            # Both samples have `sample_size` entries, so walk them in
            # lockstep, alternating one held-out and one inserted lookup.
            S_sample_search_accesses = []
            C_sample_search_accesses = []
            for held_out_key, inserted_key in zip(S_sample, C_sample):
                S_sample_search_accesses.append(linear_hashing.search(held_out_key)[1])
                C_sample_search_accesses.append(linear_hashing.search(inserted_key)[1])

            S_avg_accesses.append(average(S_sample_search_accesses))
            C_avg_accesses.append(average(C_sample_search_accesses))

        # Two rows per configuration, one for each sample type.
        rows.append([p, alpha, average(S_avg_accesses), "S"])
        rows.append([p, alpha, average(C_avg_accesses), "C"])


data = pd.DataFrame(rows, columns=['p', 'maxAlpha','avg_accesses','type'])
data


Unnamed: 0,p,maxAlpha,avg_accesses,type
0,1,0.2,0.01450,S
1,1,0.2,1.27050,C
2,1,0.3,0.01950,S
3,1,0.3,1.26650,C
4,1,0.4,0.02300,S
...,...,...,...,...
75,50,0.7,22.49581,C
76,50,0.8,47.56692,S
77,50,0.8,24.50298,C
78,50,0.9,137.97664,S


In [79]:
# Mean avg_accesses for each p, one line per value of the "type" column
# ("S" / "C"), with the y-axis pinned to [0, 60].
linePlot(
    data,
    x="p",
    y="avg_accesses",
    y_range=[0, 60],
    color="type",
)

In [78]:
# Mean avg_accesses for each maxAlpha, one line per value of the "type"
# column ("S" / "C"), with the y-axis pinned to [0, 60].
linePlot(
    data,
    x="maxAlpha",
    y="avg_accesses",
    y_range=[0, 60],
    color="type",
)

# Desempenho durante a inclusão dos n registros


In [9]:
# Fixed configuration for the insertion trace; both values are handed to
# LinearHashing(p, max_alpha) for every run in the loop that follows.
p, max_alpha = 10, 0.85

# One inner list per run is appended to `alphas` and `p_stars` below.
# `l_maxes` is not written to in the visible part of this cell -- TODO
# confirm whether anything later still needs it.
alphas, p_stars, l_maxes = [], [], []


def index_average(arr):
    """Position-wise mean across a collection of sequences.

    Element i of the result is the mean of `row[i]` over every row in `arr`.
    The number of positions is taken from the first row.

    Args:
        arr: A sized collection of indexable sequences. Every row must have
            at least as many elements as the first one.

    Returns:
        A list with one mean per position of the first row, or an empty list
        when `arr` has no rows.

    Raises:
        IndexError: If some row is shorter than the first row.
    """
    n_rows = len(arr)
    if n_rows == 0:
        # No rows means no positions to average; avoid indexing arr[0].
        return []

    width = len(arr[0])
    return [sum(row[i] for row in arr) / n_rows for i in range(width)]

# Ten independent runs: each builds a fresh table from 10000 random keys and
# snapshots the table's `alpha` and `p_star` attributes after every insert.
for _ in range(10):
    linear_hashing = LinearHashing(p, max_alpha)
    keys = generate_keys(10000)

    i_alphas, i_p_stars = [], []
    for key in keys:
        linear_hashing.insert(key)
        i_alphas.append(linear_hashing.alpha)
        i_p_stars.append(linear_hashing.p_star)

    # Keep this run's full trace; position i holds the state after insert i+1.
    alphas.append(i_alphas)
    p_stars.append(i_p_stars)


# Collapse the runs into one averaged trace per metric (mean at each
# insertion position across all runs).
average_ith_alphas = index_average(alphas)
average_ith_p_stars = index_average(p_stars)

print(f"avg i alphas: {average_ith_alphas} avg p*: {average_ith_p_stars}")

avg i alphas: [0.049999999999999996, 0.09999999999999999, 0.14999999999999997, 0.19999999999999998, 0.25, 0.29999999999999993, 0.35000000000000003, 0.39999999999999997, 0.45000000000000007, 0.5, 0.5499999999999999, 0.5999999999999999, 0.6500000000000001, 0.6766666666666667, 0.725, 0.72, 0.7649999999999999, 0.5999999999999999, 0.6333333333333332, 0.6500000000000001, 0.6825000000000001, 0.6783333333333333, 0.6708333333333334, 0.6799999999999999, 0.6875, 0.6500000000000001, 0.6749999999999999, 0.7000000000000001, 0.7104999999999999, 0.735, 0.7595000000000002, 0.784, 0.792, 0.7989999999999998, 0.7000000000000001, 0.7079999999999999, 0.7276666666666668, 0.7473333333333332, 0.7280000000000002, 0.7466666666666667, 0.7653333333333334, 0.7699999999999999, 0.7166666666666667, 0.7228571428571429, 0.7285714285714285, 0.7338095238095239, 0.7077976190476192, 0.7228571428571429, 0.7262500000000001, 0.7410714285714286, 0.7467857142857144, 0.7150000000000001, 0.7003571428571427, 0.7039285714285713, 0.7