In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# sns.set(rc = {'figure.figsize':(8,8)})

# DataFrames

In [None]:
# CSV inputs, keyed by the short table names used as keys of `dfs` below.
paths = {
    "dijbytes": "dataset_sizes_bytes.csv",  # dataset sizes in bytes (per the file name)
    "Wij": "../SAMPLING_WEIGHTS/SAMPLING_WEIGHTS_real.csv",  # sampling weights
    "dijtokens": "dataset_sizes_tokens.csv",  # dataset sizes in tokens (per the file name)
}

In [None]:
# Load every input table; the first CSV column becomes the row index.
dfs = {name: pd.read_csv(location, index_col=0) for name, location in paths.items()}

# Weighted sizes: element-wise product of a size table with the sampling weights.
# The product is taken on the raw arrays, i.e. by position rather than by label,
# so "Wij" must list its rows and columns in the same order as the size table.
for weighted_key, size_key in (("dijbytes*Wij", "dijbytes"), ("dijtokens*Wij", "dijtokens")):
    size_table = dfs[size_key]
    dfs[weighted_key] = pd.DataFrame(
        size_table.values * dfs["Wij"].values,
        columns=size_table.columns,
        index=size_table.index,
    )

# Shares in percent: each cell divided by the sum over its whole table, times 100.
for share_key, source_key in (
    ("fijbytes", "dijbytes"),
    ("Fijbytes", "dijbytes*Wij"),
    ("fijtokens", "dijtokens"),
    ("Fijtokens", "dijtokens*Wij"),
):
    source_table = dfs[source_key]
    dfs[share_key] = source_table.div(source_table.to_numpy().sum()).multiply(100)

# Append a "total" column (row sums) and afterwards a "total" row (column sums,
# which therefore include the new column). The weights table gets zeros instead.
for key, table in dfs.items():
    if key == "Wij":
        total_of = lambda values: 0
    else:
        total_of = lambda values: sum(values)
    table["total"] = table.apply(total_of, axis=1)
    table.loc["total"] = table.apply(total_of, axis=0)

In [None]:
def drop_total(_df):
    """Remove the "total" row and then the "total" column from `_df`, in place."""
    for axis in (0, 1):
        _df.drop("total", axis=axis, inplace=True)
    
def add_total(_df):
    """Append a "total" column and a "total" row to `_df`, in place.

    The column of row sums is added first, so the row of column sums that
    follows also covers it and the bottom-right cell holds the grand total.
    """
    def _python_sum(values):
        # Built-in sum on purpose: unlike DataFrame.sum it does not skip NaN.
        return sum(values)

    per_row = _df.apply(_python_sum, axis=1)
    _df["total"] = per_row
    per_column = _df.apply(_python_sum, axis=0)
    _df.loc["total"] = per_column
    
def totex(_df, _name, header, tail="\\end{tabular}}"):
    r"""Render `_df` as a LaTeX table and write it to ``tables/{_name}.tex``.

    The table body is derived from ``_df.to_csv()``: commas become ``&``
    column separators, every line is terminated with ``\\``, underscores are
    escaped and the labels "commoncrawl" / "conversational" are shortened to
    "cc" / "conv". An ``\hline`` is placed after the first line (the column
    names) and before the last line; the last line itself has no terminator.
    The result is wrapped in ``\scalebox{\tabscale}{`` ... `tail`.

    Parameters
    ----------
    _df : pandas.DataFrame
        Table to export; cells are written as ``to_csv`` renders them.
    _name : str
        File name (without extension) inside the ``tables`` directory.
    header : str
        Opening LaTeX line, e.g. ``\begin{tabular}{...}``.
    tail : str
        Closing LaTeX text; the default also closes the ``\scalebox`` brace.

    The target directory is created if it does not exist yet.
    """
    body = (
        _df.to_csv()
        .replace(",", " & ")
        .replace("commoncrawl", "cc")
        .replace("conversational", "conv")
        .replace("\n", " \\\\ \n")
        .replace("_", "\\_")
        # Rule below the column names: only the first row terminator gets it.
        .replace("\\\\", "\\\\ \\hline", 1)
    )
    t = "\\scalebox{\\tabscale}{" + header + " \n" + body

    # to_csv ends with a newline, which left a terminator after the last row.
    if t.endswith(" \\\\ \n"):
        t = t[:-len(" \\\\ \n")]
        t += " \n"
    # Rule above the last row: extend the last remaining "\\" with "\hline".
    t = "\\\\ \\hline".join(t.rsplit("\\\\", 1))
    t += tail

    path = f"tables/{_name}.tex"
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w") as f:
        f.write(t)

### Step 0: Plain dataset sizes

In [None]:
# Export the byte sizes with one decimal place; the cell output shows the unformatted table.
totex(
    dfs["dijbytes"].applymap("{:.1f}".format),
    "dijbytes",
    header="\\begin{tabular}{c||c|c|c|c|c|c||c}",
)
dfs["dijbytes"]

In [None]:
# Export the byte shares (percent) with two decimals; display the unformatted table.
totex(
    dfs["fijbytes"].applymap("{:.2f}".format),
    "fijbytes",
    header="\\begin{tabular}{c||c|c|c|c|c|c||c}",
)
dfs["fijbytes"]

### Step 1: Tokenizer Training

In [None]:
# Sampling weights, including the all-zero "total" row/column added while loading.
dfs["Wij"]

In [None]:
# Export the sampling weights exactly as stored (no number formatting).
totex(
    dfs["Wij"],
    "Wij",
    header="\\begin{tabular}{c||c|c|c|c|c|c||c}",
)

In [None]:
# Byte sizes multiplied element-wise by the sampling weights.
dfs["dijbytes*Wij"]

In [None]:
# Export the weighted byte shares (percent) with two decimals; display the unformatted table.
totex(
    dfs["Fijbytes"].applymap("{:.2f}".format),
    "FFijbytes",
    header="\\begin{tabular}{c||c|c|c|c|c|c||c}",
)
dfs["Fijbytes"]

### Step 2: Tokenizer Application

In [None]:
# Export the token counts exactly as stored, then display them.
totex(
    dfs["dijtokens"],
    "dijtokens",
    header="\\begin{tabular}{c||c|c|c|c|c|c||c}",
)
dfs["dijtokens"]

In [None]:
# Export the token shares (percent) with two decimals; display the unformatted table.
totex(
    dfs["fijtokens"].applymap("{:.2f}".format),
    "fijtokens",
    header="\\begin{tabular}{c||c|c|c|c|c|c||c}",
)
dfs["fijtokens"]

In [None]:
# Grand total of the token table: the cell where the "total" row and column meet.
T = dfs["dijtokens"].loc["total", "total"]
T

In [None]:
# Tokens divided by bytes, cell by cell, and then by a further 10**9.
# NOTE(review): the 10**9 presumably reconciles the units of the two tables — confirm.
dfs["rij"] = dfs["dijtokens"].div(dfs["dijbytes"]).div(10**9)
dfs["rij"]

In [None]:
# Smallest and largest entry of rij. NaN entries (e.g. 0/0 for a cell that has
# neither bytes nor tokens) are ignored: the built-in min()/max() return
# order-dependent results on such data because every comparison with NaN is False.
m1, m2 = np.nanmin(dfs["rij"].to_numpy()), np.nanmax(dfs["rij"].to_numpy())
# Reciprocals of the two extremes.
1/m1, 1/m2

### Step 3: Model Training

In [None]:
# Export the weighted token shares (percent) with two decimals; display the unformatted table.
totex(
    dfs["Fijtokens"].applymap("{:.2f}".format),
    "FFijtokens",
    header="\\begin{tabular}{c||c|c|c|c|c|c||c}",
)
dfs["Fijtokens"]

In [None]:
# Eij = T * Fijtokens / 100 / dijtokens; cells where this is undefined (NaN) become 0.
dfs["Eij"] = (T*dfs["Fijtokens"]/100/dfs["dijtokens"]).fillna(0)
totex(
    dfs["Eij"].applymap("{:.2f}".format),
    "Eij",
    header="\\begin{tabular}{c||c|c|c|c|c|c||c}",
)
dfs["Eij"]

In [None]:
# Eij rounded up to whole numbers, without the "total" row/column, NaN replaced by 0.
dfs["Eij_rounded"] = (
    dfs["Eij"]
    .applymap(np.ceil)
    .drop(index="total", columns="total")
    .fillna(0)
)
# totex(dfs["Eij_rounded"].applymap(lambda x: f"{x:.0f}"), "Eij_rounded", header="\\begin{tabular}{c||c|c|c|c|c|c||c}")
dfs["Eij_rounded"]

In [None]:
# Tijmax = Eij_rounded * dijtokens / (Fijtokens / 100).
# "Eij_rounded" has no "total" row/column while the other two tables do, so
# pandas aligns the operands by label; the aligned result is not guaranteed to
# keep the original row/column order. Reindex explicitly so that the rows follow
# dfs["dijtokens"] (the "total" row stays, filled with NaN) and the columns
# follow dfs["Eij_rounded"] (no "total" column). Cells that are read by
# position further down then line up with the source tables.
dfs["Tijmax"] = (
    dfs["Eij_rounded"]*dfs["dijtokens"]/(dfs["Fijtokens"]/100)
).reindex(index=dfs["dijtokens"].index, columns=dfs["Eij_rounded"].columns)
# dfs["Tijmax"] = dfs["Tijmax"].fillna(0)
dfs["Tijmax"]

In [None]:
# Keep only the strictly positive Tijmax cells (NaN fails the comparison and is skipped).
_t = [cell for cell in dfs["Tijmax"].to_numpy().ravel() if cell > 0]

# The smallest of them.
Tmax = min(_t)
Tmax

### Plot

In [None]:
# Row labels (I) and column labels (J) of Eij, leaving out the "total" entries.
I = [label for label in dfs["Eij"].index.tolist() if label != "total"]
J = [label for label in dfs["Eij"].columns.tolist() if label != "total"]
I, J

In [None]:
# Token count drawn as the dotted "T_thr" line in the plot.
# NOTE(review): the origin of the value 98.8 * 10^9 is not documented here — confirm.
Tthr = 98.8*10**9

In [None]:
verbose = 0  # set to a truthy value to print the per-band numbers
xlim = 500   # right edge of the x-axis, in 10^9 tokens

fig, ax = plt.subplots(1, 1, figsize=(8, 6))
# plt.subplots(1, 1) returns a single Axes; wrap it so it is addressed as ax[0].
if not isinstance(ax, list):
    ax = [ax, None]

ax[0].set_xlim([0, xlim])
ax[0].set_ylim([0, 1])
ax[0].set_xlabel("t [10^9 tokens]", fontsize=14)

# One horizontal band per (category, language), stacked bottom-up. The band
# height is the Fijtokens share as a fraction; its length is Tijmax in 10^9 tokens.
y = 0
for i, category in enumerate(I):
    for j, language in enumerate(J):
        clr = "r" if (2*i+j)%2 == 0 else "green"
        # Cells are looked up by label, not by position: dfs["Tijmax"] results
        # from arithmetic between differently-labelled frames, so its row and
        # column order need not match the order of I and J.
        dijtokens = dfs["dijtokens"].loc[category, language]/10**9
        Fijtokens = dfs["Fijtokens"].loc[category, language]/100
        Tijmax = dfs["Tijmax"].loc[category, language]/10**9

        # No positive Tijmax (zero or NaN): let the band span the whole axis.
        _xlim = Tijmax if Tijmax > 0 else xlim

        x = np.linspace(0, _xlim, 2)
        y1 = [y]*len(x)
        y2 = [y + Fijtokens]*len(x)
        if verbose:
            print(category, language, f"{Fijtokens:.2f}", clr)
        _ = ax[0].fill_between(x, y1, y2, color=clr, alpha=0.5)

        if Fijtokens > 0:
            # Vertical marker at dijtokens / Fijtokens (in 10^9 tokens).
            E_1 = dijtokens / Fijtokens
            if verbose:
                print(E_1)
            _ = ax[0].plot([E_1, E_1],
                           [y1[0], y2[0]],
                           linestyle="-",
                           color="k",
                           label="E_ij" if i == 0 and j == 0 else None)

            # Only bands taller than 5% are annotated. Nested here because the
            # annotation needs E_1, which exists only when Fijtokens > 0.
            if Fijtokens > 0.05:
                _ = ax[0].text(10, y + 0.02, f"{category}, {language}")
                _ = ax[0].text(T/10**9 + 10, y + 0.02, f"{T/E_1/10**9:.2f}")

        y += Fijtokens

ax[0].plot([Tthr/10**9, Tthr/10**9], [0, 1], linestyle=":", color="k", label="T_thr")
ax[0].plot([T/10**9, T/10**9], [0, 1], linestyle="--", color="k", label="T")
_ = ax[0].legend(loc="upper right")

# savefig does not create missing directories.
os.makedirs("./figs", exist_ok=True)
plt.savefig("./figs/data_overview.png", facecolor='w')

# OLD

### Minimum Hypothesis

# HEATMAPS

In [None]:
# Annotated heatmap of the byte-size table, drawn on the current axes.
ax = sns.heatmap(data=dfs["dijbytes"], annot=True)
ax.set_title("dataset sizes [bytes]")
plt.show()

In [None]:
# Annotated heatmap of the sampling weights on a freshly created axes.
ax = plt.axes()
sns.heatmap(data=dfs["Wij"], annot=True, ax=ax)
ax.set_title("weights")
plt.show()

In [None]:
# Annotated heatmap of the byte shares (percent) on a freshly created axes.
ax = plt.axes()
sns.heatmap(data=dfs["fijbytes"], annot=True, ax=ax)
ax.set_title("dataset_size [%]")
plt.show()

In [None]:
# Annotated heatmap of the weighted byte shares (percent) on a freshly created axes.
ax = plt.axes()
sns.heatmap(data=dfs["Fijbytes"], annot=True, ax=ax)
ax.set_title("dataset_size weighted [%]")
plt.show()