In [15]:
import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [16]:
# Select the PGF backend for every figure created after this cell.
matplotlib.use("pgf")

# Global rendering options, set one key at a time.
matplotlib.rcParams["pgf.texsystem"] = "pdflatex"  # TeX engine used by the PGF backend
matplotlib.rcParams["font.family"] = "serif"
matplotlib.rcParams["text.usetex"] = True  # labels such as \textbf{...} are passed to LaTeX
matplotlib.rcParams["pgf.rcfonts"] = False

In [17]:
def read_data(filename):
    """Load a header-less comma-separated result file into a sorted DataFrame.

    Only fields 0, 1, 2, 3, 9 and 10 of every row are kept. Field 0 carries a
    leading ``(`` and field 10 a trailing ``)`` (presumably because each row
    is a printed tuple -- confirm against the producer of the file); both
    characters are removed before the numeric conversion.

    Parameters
    ----------
    filename : str or path-like
        Location of the file, forwarded to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Columns ``partitions`` (int), ``condition`` (left as read),
        ``attrIndex1`` (int), ``attrIndex2`` (int), ``mean`` (float) and
        ``std`` (float), sorted by ``partitions``, ``attrIndex1``,
        ``attrIndex2``. The original row index is preserved.
    """
    raw = pd.read_csv(filename, header=None)

    # Work on an explicit copy: assigning into a column-subset slice of `raw`
    # triggers pandas' SettingWithCopyWarning on pandas < 3.
    theta = raw[[0, 1, 2, 3, 9, 10]].copy()

    # Strip the tuple delimiters with literal (non-regex) replacement, then
    # convert whole columns at once instead of element by element.
    theta[0] = theta[0].str.replace("(", "", regex=False).astype("int64")
    theta[10] = theta[10].str.replace(")", "", regex=False).astype("float64")
    theta = theta.astype({2: "int64", 3: "int64", 9: "float64"})

    theta = theta.sort_values(by=[0, 2, 3])
    return theta.rename(columns={
        0: "partitions",
        1: "condition",
        2: "attrIndex1",
        3: "attrIndex2",
        9: "mean",
        10: "std",
    })

In [18]:
def extract_joins(theta_dataset, attrIndex1, attrIndex2):
    """Split the rows for one attribute pair by join condition.

    Rows whose ``attrIndex1`` and ``attrIndex2`` columns equal the given
    values are selected, then divided into those with ``condition == "<"``
    and those with ``condition == ">"``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(less_rows, more_rows)``, each sorted by ``partitions``.
    """
    pair_mask = (theta_dataset["attrIndex1"] == attrIndex1) & (theta_dataset["attrIndex2"] == attrIndex2)
    selected = theta_dataset[pair_mask]

    def _rows_for(operator):
        # Rows of the selected pair with the given condition, ordered by partition count.
        matching = selected[selected["condition"] == operator]
        return matching.sort_values(by="partitions")

    return _rows_for("<"), _rows_for(">")

In [25]:
def plot_dataset(theta_less, theta_more, size_label, output_dir="plot"):
    """Plot mean execution time against partition count and save it as a PDF.

    Two error-bar series are drawn on a logarithmic x axis, one per join
    condition. The figure is written to
    ``<output_dir>/time_change<size_label>.pdf`` and then closed, so nothing
    is displayed inline.

    Parameters
    ----------
    theta_less, theta_more : pandas.DataFrame
        Data for the ``<`` and ``>`` series. Each must provide the columns
        ``partitions`` (x values), ``mean`` (y values) and ``std`` (error bar
        half-lengths).
    size_label : str
        Label inserted verbatim into the title and the output file name.
        It is embedded in a LaTeX string without escaping.
    output_dir : str, optional
        Directory receiving the PDF; created when missing. Defaults to
        ``"plot"``, the location used before this parameter existed.
    """
    fig, ax = plt.subplots(figsize=(17, 8))
    try:
        series = (
            (theta_less, "$<$ condition", "blue", "D"),
            (theta_more, "$>$ condition", "orange", "o"),
        )
        for frame, label, color, marker in series:
            _, _, bars = ax.errorbar(frame["partitions"], frame["mean"], yerr=frame["std"],
                                     label=label, color=color, marker=marker, markersize=7)
            # Make the error bars semi-transparent; markers and line stay opaque.
            for bar in bars:
                bar.set_alpha(0.5)
        ax.grid(linestyle='dotted')

        # Labels use LaTeX markup; with LaTeX text rendering deactivated the
        # raw \textbf{...} would be printed instead.
        ax.set_xlabel(r'\textbf{Partitions number}', fontsize=11)
        ax.set_ylabel(r'\textbf{Query execution time [s] }', fontsize=11)
        ax.set_title(r'\textbf{Execution time change with number of partitions - ' + size_label + ' datasets}',
                     fontsize=13.5)
        ax.legend()
        ax.set_xscale("log")

        # savefig does not create missing directories, so ensure the target exists.
        os.makedirs(output_dir, exist_ok=True)
        fig.savefig(os.path.join(output_dir, "time_change" + size_label + ".pdf"))
    finally:
        # Always release the figure, even when plotting or saving fails.
        plt.close(fig)

In [30]:
# Parse the thetajoin_count_4000 result file with read_data.
theta_4000 = read_data("../thetajoin_count_4000.txt")

In [27]:
# Keep the rows with attrIndex1 == 2 and attrIndex2 == 2, split into the "<" and ">" conditions.
theta_4000_2_2_less, theta_4000_2_2_more = extract_joins(theta_4000, 2, 2)

In [29]:
# Save the comparison figure under the "4K" label; the figure is closed after saving, so nothing renders inline.
plot_dataset(theta_4000_2_2_less, theta_4000_2_2_more, "4K")

In [31]:
# Produce the "<" / ">" comparison plot for every result file, pairing each
# numeric file-name suffix with the label used in the title and output name.
labels = ["1K", "2K", "3K", "4K"]
for i, size_label in zip(range(1000, 4001, 1000), labels):
    filename = "../thetajoin_count_" + str(i) + ".txt"
    theta = read_data(filename)
    theta_less, theta_more = extract_joins(theta, 2, 2)
    plot_dataset(theta_less, theta_more, size_label)