In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# sns.set(rc = {'figure.figsize':(8,8)})

# DFs

In [None]:
# CSV inputs, keyed by the short names that are later used as keys of `dfs`.
# Paths are relative to the notebook's working directory.
paths = {
    "dijbytes": "dataset_sizes_bytes.csv",  # dataset sizes in bytes (per the file name)
    "Wij": "../SAMPLING_WEIGHTS/SAMPLING_WEIGHTS_real.csv",  # sampling weights (per the file name)
    "dijtokens": "dataset_sizes_tokens.csv",  # dataset sizes in tokens (per the file name)
}

In [None]:
# Read every CSV listed in `paths`; the first CSV column becomes the row index.
dfs = {name: pd.read_csv(csv_path, index_col=0) for name, csv_path in paths.items()}

# Weighted sizes: positional (not label-aligned) element-wise product of the
# size table and the weight table, re-labelled like the size table.
for unit in ("bytes", "tokens"):
    sizes = dfs[f"dij{unit}"]
    dfs[f"dij{unit}*Wij"] = pd.DataFrame(
        sizes.values * dfs["Wij"].values,
        columns=sizes.columns,
        index=sizes.index,
    )

# Percentages: every cell divided by the sum over all cells, times 100.
# "fij*" comes from the raw sizes, "Fij*" from the weighted sizes.
for unit in ("bytes", "tokens"):
    for target, source in (
        (f"fij{unit}", f"dij{unit}"),
        (f"Fij{unit}", f"dij{unit}*Wij"),
    ):
        table = dfs[source]
        dfs[target] = table.div(table.to_numpy().sum()).multiply(100)

# Append a "total" column and then a "total" row to every table, in place.
# The weight table gets zeros there; all other tables get plain Python sums
# (the row is computed after the column, so it also sums the new column).
for name, table in dfs.items():
    reducer = (lambda _values: 0) if name == "Wij" else (lambda values: sum(values))
    table["total"] = table.apply(reducer, axis=1)
    table.loc["total"] = table.apply(reducer, axis=0)

In [None]:
def drop_total(_df):
    """Remove the "total" row and the "total" column from ``_df`` in place.

    Nothing is returned; the frame that was passed in is modified.
    """
    # axis 0 drops the row labelled "total", axis 1 the column of that name.
    for axis in (0, 1):
        _df.drop("total", axis=axis, inplace=True)
    
def add_total(_df):
    """Append a "total" column and then a "total" row to ``_df`` in place.

    The column holds the sum of each row. The row is computed afterwards, so
    it also sums the freshly added "total" column and its last cell is the
    grand total. Nothing is returned.
    """
    def _py_sum(values):
        # Builtin sum: adds the values one after another, starting from 0.
        return sum(values)

    row_sums = _df.apply(_py_sum, axis=1)
    _df["total"] = row_sums
    column_sums = _df.apply(_py_sum, axis=0)
    _df.loc["total"] = column_sums
    
def totex(_df, _name, header, tail="\\end{tabular}}"):
    """Write ``_df`` as a LaTeX tabular to ``tables/<_name>.tex``.

    The table body is derived from ``_df.to_csv()``: commas become `` & ``,
    "commoncrawl" / "conversational" are shortened to "cc" / "conv", every
    line gets a LaTeX row terminator, underscores are escaped, and an
    ``\\hline`` is placed after the first (column-name) row. The terminator of
    the last row is removed again.

    Parameters
    ----------
    _df : object with a ``to_csv()`` method returning a string
        Table to export (cells are written as ``to_csv`` renders them).
    _name : str
        File stem of the output file inside ``tables/``.
    header : str
        Opening LaTeX line, e.g. the ``\\begin{tabular}{...}`` statement.
    tail : str
        Closing LaTeX text; the default closes the tabular and the scalebox.

    Returns
    -------
    None. The only effect is the written file; ``tables/`` is created when it
    does not exist yet.
    """
    import os  # local import so the function does not rely on the notebook's import cell

    row_end = " \\\\ \n"  # LaTeX row terminator followed by a newline

    body = (
        _df.to_csv()
        .replace(",", " & ")
        .replace("commoncrawl", "cc")
        .replace("conversational", "conv")
        .replace("\n", row_end)
        .replace("_", "\\_")
        # only the first row terminator (after the column names) gets a rule
        .replace("\\\\", "\\\\ \\hline", 1)
    )

    t = "\\scalebox{\\tabscale}{" + header + " \\\\\n" + body
    if t.endswith(row_end):
        # no row terminator after the last table row
        t = t[:-len(row_end)] + " \n"
    t += tail

    path = f"tables/{_name}.tex"
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w") as f:
        f.write(t)

### Step 0: Plain dataset sizes

In [None]:
# Export the byte sizes with one decimal place, then display the numeric frame.
totex(
    dfs["dijbytes"].applymap("{:.1f}".format),
    "dijbytes",
    header="\\begin{tabular}{c||c|c|c|c|c|c||c}",
)
dfs["dijbytes"]

In [None]:
# Export the byte percentages with two decimal places, then display the numeric frame.
totex(
    dfs["fijbytes"].applymap("{:.2f}".format),
    "fijbytes",
    header="\\begin{tabular}{c||c|c|c|c|c|c||c}",
)
dfs["fijbytes"]

### Step 1: Tokenizer Training

In [None]:
# Display the weight table (its "total" row and column were filled with zeros).
dfs["Wij"]

In [None]:
# Export the weight table as-is (no number formatting is applied here).
totex(
    dfs["Wij"],
    "Wij",
    header="\\begin{tabular}{c||c|c|c|c|c|c||c}",
)

In [None]:
# Display the element-wise product of byte sizes and weights (including totals).
dfs["dijbytes*Wij"] 

In [None]:
# Export the weighted byte percentages with two decimal places.
# NOTE(review): the file stem is "FFijbytes", not "Fijbytes" - presumably to
# avoid clashing with "fijbytes" on case-insensitive file systems; confirm.
totex(
    dfs["Fijbytes"].applymap("{:.2f}".format),
    "FFijbytes",
    header="\\begin{tabular}{c||c|c|c|c|c|c||c}",
)
dfs["Fijbytes"]

### Step 2: Tokenizer Application

In [None]:
# Export the token counts unformatted, then display the frame.
totex(
    dfs["dijtokens"],
    "dijtokens",
    header="\\begin{tabular}{c||c|c|c|c|c|c||c}",
)
dfs["dijtokens"]

In [None]:
# Export the token percentages with two decimal places, then display the numeric frame.
totex(
    dfs["fijtokens"].applymap("{:.2f}".format),
    "fijtokens",
    header="\\begin{tabular}{c||c|c|c|c|c|c||c}",
)
dfs["fijtokens"]

In [None]:
# Grand total of the token table: the cell where the "total" row and column meet.
T = dfs["dijtokens"].loc["total", "total"]
T

In [None]:
# Cell-wise ratio of tokens to bytes, scaled down by 10**9.
# NOTE(review): the 10**9 presumably converts between the units of the two
# CSVs (e.g. GB vs. bytes) - confirm against the input files.
dfs["rij"] = dfs["dijtokens"].div(dfs["dijbytes"]).div(10**9)
dfs["rij"]

In [None]:
# Smallest and largest ratio over all cells of `rij`, shown as reciprocals.
# Cells where 0 was divided by 0 are NaN; the builtin min()/max() return
# order-dependent results when NaN is present, so the NaN-aware NumPy
# reductions are used instead.
rij_values = dfs["rij"].to_numpy(dtype=float)
m1, m2 = np.nanmin(rij_values), np.nanmax(rij_values)
1/m1, 1/m2

### Step 3: Model Training

In [None]:
# Export the weighted token percentages with two decimal places
# (file stem "FFijtokens"), then display the numeric frame.
totex(
    dfs["Fijtokens"].applymap("{:.2f}".format),
    "FFijtokens",
    header="\\begin{tabular}{c||c|c|c|c|c|c||c}",
)
dfs["Fijtokens"]

### Step 3b: Model Training

In [None]:
# Eij = T * (Fijtokens / 100) / dijtokens, cell by cell; undefined cells
# (NaN, e.g. from 0/0) are set to 0.
dfs["Eij"] = (T * dfs["Fijtokens"] / 100 / dfs["dijtokens"]).fillna(0)
totex(
    dfs["Eij"].applymap("{:.2f}".format),
    "Eij",
    header="\\begin{tabular}{c||c|c|c|c|c|c||c}",
)
dfs["Eij"]

In [None]:
# Round every Eij cell up to the next integer, remove the "total" row and
# column, and replace any remaining NaN by 0.
eij_ceiled = dfs["Eij"].applymap(np.ceil)
drop_total(eij_ceiled)
dfs["Eij_rounded"] = eij_ceiled.fillna(0)

# NOTE(review): the header still declares the trailing "||c" column although
# the "total" column was dropped above - confirm this is intended.
totex(
    dfs["Eij_rounded"].applymap("{:.0f}".format),
    "Eij_rounded",
    header="\\begin{tabular}{c||c|c|c|c|c|c||c}",
)
dfs["Eij_rounded"]

In [None]:
# Token counts without the "total" row/column, so the shape matches Eij_rounded.
_dfs_dijtokens = dfs["dijtokens"].drop(index="total", columns="total")

# Percentages implied by the rounded-up Eij: Eij_rounded * dijtokens / T * 100
# (positional product), NaN replaced by 0, totals appended afterwards.
eij_rounded = dfs["Eij_rounded"]
fij_rounded = pd.DataFrame(
    eij_rounded.values * _dfs_dijtokens.values / T * 100,
    columns=eij_rounded.columns,
    index=eij_rounded.index,
).fillna(0)
add_total(fij_rounded)
dfs["Fijtokens_rounded"] = fij_rounded

totex(
    dfs["Fijtokens_rounded"].applymap("{:.2f}".format),
    "Fijtokens_rounded",
    header="\\begin{tabular}{c||c|c|c|c|c|c||c}",
)
dfs["Fijtokens_rounded"]

In [None]:
# The rounded percentages no longer add up to 100; `factor` is their grand
# total expressed as a multiple of 100.
factor = dfs["Fijtokens_rounded"].loc["total", "total"] / 100
factor

# Divide every cell by `factor` so the grand total is 100 again.
dfs["Fijtokens_rounded_normalized"] = dfs["Fijtokens_rounded"].div(factor)

totex(
    dfs["Fijtokens_rounded_normalized"].applymap("{:.2f}".format),
    "Fijtokens_rounded_normalized",
    header="\\begin{tabular}{c||c|c|c|c|c|c||c}",
)
dfs["Fijtokens_rounded_normalized"]

In [None]:
# Display the weighted token percentages again for comparison with the
# normalized rounded table.
dfs["Fijtokens"]

In [None]:
# Cell-wise ratio of the normalized rounded percentages to the original
# weighted percentages; undefined cells (NaN, e.g. 0/0) become 0.
dfs["Fijtokens_rounded_normalized_ratio"] = (
    dfs["Fijtokens_rounded_normalized"] / dfs["Fijtokens"]
).fillna(0)
totex(
    dfs["Fijtokens_rounded_normalized_ratio"].applymap("{:.2f}".format),
    "Fijtokens_rounded_normalized_ratio",
    header="\\begin{tabular}{c||c|c|c|c|c|c||c}",
)
dfs["Fijtokens_rounded_normalized_ratio"]

### Minimum Hypothesis

# HEATMAPS

In [None]:
# Heatmap of the byte sizes. The axes are created in this cell so that the
# title does not depend on an `ax` left over from some other cell.
ax = plt.axes()
sns.heatmap(dfs["dijbytes"], annot=True, ax=ax)
ax.set_title('dataset sizes [bytes]')
plt.show()

In [None]:
# Heatmap of the weight table, drawn on an explicitly created axes.
ax = plt.axes()
sns.heatmap(data=dfs["Wij"], annot=True, ax=ax)
ax.set_title("weights")
plt.show()

In [None]:
# Heatmap of the byte percentages, drawn on an explicitly created axes.
ax = plt.axes()
sns.heatmap(data=dfs["fijbytes"], annot=True, ax=ax)
ax.set_title("dataset_size [%]")
plt.show()

In [None]:
# Heatmap of the weighted byte percentages, drawn on an explicitly created axes.
ax = plt.axes()
sns.heatmap(data=dfs["Fijbytes"], annot=True, ax=ax)
ax.set_title("dataset_size weighted [%]")
plt.show()