In [None]:
import re
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

# Global matplotlib font settings; they apply to every figure drawn below.
plt.rcParams.update({
    "font.family": "Times New Roman",
    "font.weight": "normal",
})

In [None]:
# Raw dataset file name -> short label used in the result tables and figures.
real_datasets_map = dict([
    ("az-words_truncated20.txt", "WORDS"),
    ("GeoNames_truncated20.txt", "GEO"),
    ("uk-2002_truncated20.txt", "URL"),
    ("dna-k-mer.txt", "DNA"),
])

# Short labels of the real datasets, in the order the mapping lists them.
real_datasets = list(real_datasets_map.values())

In [None]:
# Load the FST (trie) measurements and the learned-model measurements.
df_trie = pd.read_csv("FST server results.csv")

# Lines starting with "#" in the learned-results file are treated as comments;
# rows in which every column is empty are dropped.
df_learned = pd.read_csv("learned results.csv", comment="#")
df_learned.dropna(how="all", inplace=True)

# Normalise the "dataset" column of both frames so they can be matched by name.
for frame in (df_trie, df_learned):
    names = frame["dataset"]
    # Real datasets: swap the file name for its short label. The file names
    # are literal text, so regex=False is passed explicitly -- the default of
    # `regex` differs between pandas versions and the names contain ".".
    for file_name, label in real_datasets_map.items():
        names = names.str.replace(file_name, label, regex=False)
    # Synthetic datasets: strip the "synthetic" marker and the ".txt"
    # extension. The dot is escaped so that only a literal ".txt" is removed.
    frame["dataset"] = names.str.replace(r"synthetic|\.txt", "", regex=True)

In [None]:
def get_model_marker(model):
    """Return the matplotlib marker used to draw ``model``.

    Names starting with one of the baseline prefixes (MLP, LSTM-multi, LSTM,
    BiLSTM, CNN) get a ``$X$`` marker built from the first character of the
    name. The four known ``SMLP.[..., 50]`` configurations each get a fixed
    symbol. Any other name falls back to ``"2"``.
    """
    baseline_prefixes = ("MLP", "LSTM-multi", "LSTM", "BiLSTM", "CNN")
    if model.startswith(baseline_prefixes):
        return "$" + model[0] + "$"

    # One fixed symbol per known SMLP configuration.
    smlp_symbols = {
        "SMLP.[100, 50]": "o",
        "SMLP.[50, 50]": "p",
        "SMLP.[30, 50]": "^",
        "SMLP.[20, 50]": "X",
    }
    for prefix, symbol in smlp_symbols.items():
        if model.startswith(prefix):
            return symbol

    # Fallback marker for names that match nothing above.
    return "2"


def format_model_name(model):
    """Return the legend label for a raw model name.

    * ``SMLP.[a, b].step.c`` is rewritten to ``SMLP$_{a, c, b}$`` (the step
      value is placed between the two bracketed numbers); any text following
      the matched part is kept as is.
    * A name starting with ``SMLP.`` that does not follow that pattern is
      returned unchanged instead of raising.
    * A name starting with MLP, LSTM-multi, LSTM, BiLSTM or CNN is shortened
      to that prefix.
    * Anything else is returned unchanged.
    """
    if model.startswith("SMLP."):
        # Transform e.g. SMLP.[100, 50].step.10 to SMLP$_{100, 10, 50}$
        m = re.match(r"SMLP\.\[(\d+), (\d+)\]\.step\.(\d+)", model)
        if m is None:
            # Unrecognised SMLP spelling: keep the raw name rather than crash
            # on ``None.groups()``.
            return model
        b, h, d = m.groups()
        new_name = r"SMLP$_{%s, %s, %s}$" % (b, d, h)
        return model.replace(m.group(0), new_name)
    for x in ["MLP", "LSTM-multi", "LSTM", "BiLSTM", "CNN"]:
        if model.startswith(x):
            return x
    return model
        

def get_model_color(model):
    """Return the matplotlib colour used to draw ``model``.

    Checks are applied in order and the first match wins:

    1. ``"Expanded"`` in the name -> ``"C4"``; ``"no enrich"`` -> ``"C0"``.
    2. ``"bin input"`` in the name -> ``"C7"``.
    3. ``step.2`` / ``step.3`` / ``step.5`` / ``step.10`` in the name -> the
       RGBA value at position 0 / 1 / 2 / 3 of the ``Reds`` colormap
       normalised over [-1, 3].
    4. Name starts with MLP, LSTM-multi, LSTM, BiLSTM or CNN -> ``"C2"`` to
       ``"C6"`` respectively.
    5. Otherwise ``"C3"``.
    """
    if "Expanded" in model:
        return "C4"
    if "no enrich" in model:
        return "C0"

    # Colour scale shared by all step-based variants.
    reds = mpl.cm.ScalarMappable(
        norm=mpl.colors.Normalize(vmin=-1, vmax=3),
        cmap=mpl.cm.Reds,
    )
    reds.set_array([])

    if "bin input" in model:
        return "C7"

    # Substring tag -> position on the colour scale, tested in this order.
    step_positions = (("step.2", 0), ("step.3", 1), ("step.5", 2), ("step.10", 3))
    for tag, position in step_positions:
        if tag in model:
            return reds.to_rgba(position)

    # Baseline families take consecutive entries of the colour cycle from C2.
    baselines = ("MLP", "LSTM-multi", "LSTM", "BiLSTM", "CNN")
    for cycle_index, prefix in enumerate(baselines, start=2):
        if model.startswith(prefix):
            return "C%d" % cycle_index
    return "C3"

# Real data

In [None]:
# One panel per real dataset on a 2x2 grid: the FST curve plus one marker
# series per learned model.
fig, axs_2d = plt.subplots(2, 2, figsize=(6.99, 6))
axs = axs_2d.flatten()

plotx = "bits/string"  # column of df_learned used for the x axis
ploty = "mean error"   # column of df_learned used for the y axis

n_cols = axs_2d.shape[1]
last_row = axs_2d.shape[0] - 1

for idx, (title, ax) in enumerate(zip(real_datasets, axs)):
    # `axs` is the row-major flattening of the grid, so the running index
    # gives the panel position directly.
    row, col = divmod(idx, n_cols)

    # FST baseline: only rows with error >= 2 are drawn; size is converted
    # from bytes for the whole trie to bits per string.
    trie_rows = df_trie[(df_trie["dataset"] == title) & (df_trie["error"] >= 2)]
    ax.plot(trie_rows["trie bytes"] * 8 / trie_rows["n"], trie_rows["error"], "-",
            marker=None, markersize=2, label="FST")

    # Learned models: one unconnected marker series per distinct model name.
    learned_rows = df_learned[df_learned["dataset"] == title]
    for name in learned_rows["model"].unique():
        model_rows = learned_rows[learned_rows["model"] == name]
        ax.plot(model_rows[plotx], model_rows[ploty], linestyle="None",
                marker=get_model_marker(name),
                label=format_model_name(name),
                c=get_model_color(name))

    # Axis limits follow the learned models with 8% headroom. Series.max()
    # skips NaN, whereas the builtin max() gives an order-dependent result
    # when a NaN is present.
    ax.set_ylim(0, learned_rows[ploty].max() * 1.08)
    ax.set_xlim(0, learned_rows[plotx].max() * 1.08)

    ax.set_title("%s (%d strings)" % (title, trie_rows["n"].iloc[0]))
    ax.minorticks_on()
    ax.grid(which="both", linestyle=":", linewidth=0.5, color="#DEDEDE")
    ax.legend(ncol=2, prop={"size": 5.3})

    # Label only the outer edges of the grid: x on the bottom row, y on the
    # left column.
    if row == last_row:
        ax.set_xlabel("Size (bits/string)")
    if col == 0:
        ax.set_ylabel(ploty.capitalize())

fig.tight_layout()
fig.savefig(f"Real {ploty}.pdf")
plt.show()

# Synthetic data

In [None]:
import re

# One panel per synthetic dataset on a 4x3 grid (one row per group of three
# datasets): the FST curve plus one marker series per learned model.
plotx = "bits/string"
ploty = ("mean error", 800)  # (column of df_learned to plot, upper y limit)

# One heading per grid row; consumed in order by the middle panel of each row.
groups = iter(["Increasing density", "Increasing alphabet size", "Increasing string length", "Fixing dataset size, and varying string length or alphabet size"])

datasets = ["L12_P6_D0.01_A4", "L12_P6_D0.1_A4", "L12_P6_D0.5_A4",    # Varying density
            "L8_P4_D0.01_A12", "L8_P4_D0.01_A14", "L8_P4_D0.01_A16",  # Varying alphabet size
            "L14_P7_D0.01_A4", "L15_P7_D0.01_A4", "L16_P8_D0.01_A4",  # Varying length
            "L8_P4_D0.005394798103178236_A14", "L8_P4_D0.001847386360168457_A16", "L12_P6_D8.298315997062881e-09_A14"] # Keeping size fixed

fig, axs_2d = plt.subplots(4, 3, figsize=(6.99, 8.99), sharex=False, sharey=False)
axs = axs_2d.flatten()

n_cols = axs_2d.shape[1]

for idx, (title, ax) in enumerate(zip(datasets, axs)):
    # `axs` is the row-major flattening of the grid, so the running index
    # gives the panel position without searching the axes array.
    row, col = divmod(idx, n_cols)

    # FST baseline: only rows with error >= 2 are drawn; size is converted
    # from bytes for the whole trie to bits per string.
    d = df_trie[(df_trie["dataset"] == title) & (df_trie["error"] >= 2)]
    trie_bit_per_string = d["trie bytes"] * 8 / d["n"]
    ax.plot(trie_bit_per_string, d["error"], "-",
            marker=None, markersize=2, label="FST")

    # Learned models: one unconnected marker series per distinct model name.
    d2 = df_learned[(df_learned["dataset"] == title)]
    for name in d2["model"].unique():
        d3 = d2[d2["model"] == name]
        ax.plot(d3[plotx], d3[ploty[0]], linestyle="None",
                marker=get_model_marker(name),
                label=format_model_name(name),
                c=get_model_color(name))

    # Build the panel title from the dataset id, e.g.
    # "L12_P6_D0.01_A4" -> "\n$_{L=12,D=0.01,\sigma=4}$".
    if row == 3:
        # The last row omits the D component from its titles.
        title = re.sub(r"D.+?_", "", title)

    title = re.sub(r"P\d+_", "", title)
    title = re.sub(r"([LPDA])", r"\1=", title)
    # Raw string: "\s" is not a valid escape sequence in a normal literal.
    title = title.replace("_", ",").replace("A", r"\sigma")
    title = "\n$_{%s}$" % title
    if col == 1:
        # The middle panel of each row carries the row's group heading.
        title = next(groups) + title

    ax.set_title(title)
    ax.set_xlabel("Size (bits/string)")
    ax.minorticks_on()
    ax.grid(which="both", linestyle=":", linewidth=0.5, color="#DEDEDE")
    ax.legend(ncol=1, prop={"size": 5.5})
    # x range: 10% beyond the largest learned model, capped at the largest
    # FST size that was plotted.
    ax.set_xlim(0, min(1.1 * d2[plotx].max(), trie_bit_per_string.max()))
    ax.set_ylim(-10, ploty[1])
    if col == 0:
        ax.set_ylabel(ploty[0].capitalize())

fig.tight_layout()
fig.savefig(f"Synthetic {ploty[0]}.pdf")
plt.show()