In [None]:
import json
import glob
from tqdm import tqdm
import os
import pandas as pd
from collections import Counter
import logging

# Set the logging level to WARNING to suppress DEBUG messages
logging.basicConfig(level=logging.WARNING)

# Additionally, pin the urllib3 logger to WARNING explicitly
logging.getLogger("urllib3").setLevel(logging.WARNING)

from utils import renyi_efficiency

from sentencepiece import SentencePieceProcessor
from transformers import AutoTokenizer, PreTrainedTokenizerFast

tqdm.pandas()


# Short tokenizer name -> Hugging Face Hub repository id.
# `load_tokenizer` resolves any name found here with
# `AutoTokenizer.from_pretrained(..., trust_remote_code=True)`.
core_models = {
    "gpt2": "gpt2",
    "mpt": "mosaicml/mpt-7b-instruct",
    "bloom": "bigscience/bloom-7b1",
    "gpt-neox": "EleutherAI/gpt-neox-20b",
    "falcon": "tiiuae/falcon-40b",
    "pythia": "EleutherAI/pythia-12b",
    "codet5": "Salesforce/codet5-small",
    "incoder": "facebook/incoder-1B",
    "starcoder": "bigcode/starcoder",
    "replit": "replit/replit-code-v1_5-3b",
    "codegen": "Salesforce/codegen-350M-mono",
    "byt5": "google/byt5-small",
    "deepseek-coder": "deepseek-ai/deepseek-coder-1.3b-instruct",
    "Yi-6B": "01-ai/Yi-6B",
    "mistral": "mistralai/Mistral-7B-v0.1",
    "santacoder": "bigcode/santacoder",
    # "llama" is also the baseline used for the "Compression" ratio below.
    # NOTE(review): confirm this repo id loads with AutoTokenizer; the
    # transformers-format checkpoints may live under a different id
    # (e.g. with an "-hf" suffix).
    "llama": "meta-llama/Llama-2-7b",
}


# Hard-coded vocabulary sizes keyed by tokenizer name. For these names
# `name_to_params` returns {"vs": size} instead of parsing the name; "vs" is
# later used only as a fallback where `load_tokenizer` reported 0.
# NOTE(review): 50_257 is the GPT-2 vocabulary size — confirm it is the
# intended value for "gpt-neox".
sizes = {
    "gpt4": 100_256,
    "gpt-neox": 50_257,
}


def load_tokenizer(model_path: str):
    """Return the vocabulary size of the tokenizer identified by ``model_path``.

    Despite its name, this returns ``len(tokenizer)`` rather than the
    tokenizer object. Lookup order:

    1. a key of ``core_models`` -> loaded from the Hub via ``AutoTokenizer``
       (with ``trust_remote_code=True``);
    2. ``tokenizers/<model_path>.json`` -> ``PreTrainedTokenizerFast``;
    3. ``tokenizers/<model_path>.model`` -> ``SentencePieceProcessor``.

    Returns:
        The tokenizer length, or ``0`` when none of the three sources exists.
    """
    if model_path in core_models:
        hub_tokenizer = AutoTokenizer.from_pretrained(
            core_models[model_path], trust_remote_code=True
        )
        return len(hub_tokenizer)

    json_file = f"tokenizers/{model_path}.json"
    if os.path.exists(json_file):
        return len(PreTrainedTokenizerFast(tokenizer_file=json_file))

    sentencepiece_file = f"tokenizers/{model_path}.model"
    if os.path.exists(sentencepiece_file):
        return len(SentencePieceProcessor(model_file=sentencepiece_file))

    # Sentinel meaning "unknown"; callers replace it with the "vs" parameter.
    return 0


def column_explosion(col: str):
    """Build a ``Series.apply`` callback that flattens one dataset's metrics.

    Args:
        col: Dataset (column) name; it prefixes every produced label as
            ``"<col> <metric>"``.

    Returns:
        A function mapping one cell of that column to a ``pd.Series`` with
        one entry per metric listed in ``keys``. Any other entries of the
        cell's dict (e.g. raw counters) are dropped.

        A cell that is not a dict (``DataFrame.from_dict`` leaves NaN where a
        tokenizer has no result for a dataset) or that lacks a metric yields
        NaN for the affected entries instead of raising, so one incomplete
        eval does not abort the whole table.
    """
    keys = (
        "Compression",
        "Renyi",
        "Token Count",
        "Bytes per Token",
        "Chars per Token",
        "Tokens per Byte",
        "Tokens per Char",
        "Gini",
    )
    missing = float("nan")

    def explode_dict_to_series(in_dict):
        # Treat absent results as an empty metrics dict -> all-NaN row.
        metrics = in_dict if isinstance(in_dict, dict) else {}
        out_dict = {f"{col} {key}": metrics.get(key, missing) for key in keys}
        return pd.Series(out_dict)

    return explode_dict_to_series


def name_to_params(name: str) -> dict:
    """Decode the training parameters embedded in a tokenizer name.

    Names listed in ``sizes`` short-circuit to ``{"vs": sizes[name]}``.
    Otherwise the name is split on ``"_"`` and scanned token by token:

    * ``rsc`` / ``u64`` / ``sp`` are stand-alone flags (default 0 / 0 / False);
    * ``cp``, ``mp``, ``nc``, ``msl``, ``mt``, ``nrn``, ``ncf``, ``vs`` take
      their value from the token that follows them.

    Returns:
        Dict that always contains the three flags plus whichever valued
        parameters appeared in the name.

    Raises:
        IndexError: a valued key is the last token of the name.
        ValueError: a numeric value cannot be parsed.
    """
    if name in sizes:
        return {"vs": sizes[name]}

    def _raw(text):
        return text

    def _billions(text):
        # number of characters in Billions
        return int(int(text) / 1_000_000_000)

    flag_values = {"rsc": 1, "u64": 1, "sp": True}
    value_parsers = {
        "cp": float,
        "mp": float,
        "nc": _billions,
        "msl": _raw,
        "mt": _raw,
        "nrn": _raw,
        "ncf": int,
        "vs": int,
    }

    params = {"rsc": 0, "u64": 0, "sp": False}
    tokens = name.split("_")
    for position, token in enumerate(tokens):
        if token in flag_values:
            params[token] = flag_values[token]
        elif token in value_parsers:
            params[token] = value_parsers[token](tokens[position + 1])
    return params

# Get dataset statistics

In [None]:
# Total character / byte counts of every test set, keyed by dataset name
# (file name without ".jsonl"). Sorted so that, if the same file name exists
# under several groups, the one that wins does not depend on filesystem order.
datasets = sorted(glob.glob("data/*/test/*.jsonl"))
dataset_stats = {}
for dataset in datasets:
    # basename() instead of split("/") so Windows-style paths also work.
    dataset_name = os.path.basename(dataset).replace(".jsonl", "")
    if dataset_name in dataset_stats:
        continue

    total_chars = 0
    total_bytes = 0
    # Stream the file: only two integer fields per record are needed.
    with open(dataset, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            record = json.loads(line)
            total_chars += record["char_size"]
            total_bytes += record["byte_size"]

    # Stored only after a successful read, so no half-filled entries remain.
    dataset_stats[dataset_name] = {
        "total_chars": total_chars,
        "total_bytes": total_bytes,
    }

# Read Evals

In [None]:
# Load every eval result into `evals`, keyed by tokenizer name
# (file name with the ".eval.json" suffix removed).
eval_files = glob.glob("evals/*.json")
evals = {}
for eval_file in tqdm(eval_files):
    # basename() instead of split("/") so Windows-style paths also work.
    eval_name = os.path.basename(eval_file).replace(".eval.json", "")
    if eval_name in evals:
        continue
    # JSON is UTF-8; do not depend on the platform's default encoding.
    with open(eval_file, "r", encoding="utf-8") as f:
        evals[eval_name] = json.load(f)

### Calculate eval statistics

In [None]:
import numpy as np


# Tokenizer whose token counts are the denominator of "Compression".
norm = "llama"
# Checked once, up front: inside the loop this was skipped entirely whenever
# `evals` was empty.
assert norm in evals, f"baseline tokenizer {norm!r} has no eval results"

# Baseline token count per dataset, computed once instead of once per
# (tokenizer, dataset) pair.
norm_token_counts = {
    dataset: sum(stats["lengths"]) for dataset, stats in evals[norm].items()
}

# Derive the per-dataset metrics in place on each tokenizer's result dict.
for eval_name in tqdm(evals):
    for dataset, stats in evals[eval_name].items():
        totals = dataset_stats[dataset]

        stats["Renyi"] = renyi_efficiency(
            Counter(stats["vocab_counter"]), power=2.5
        )
        token_count = sum(stats["lengths"])
        stats["Token Count"] = token_count
        stats["Bytes per Token"] = totals["total_bytes"] / token_count
        stats["Chars per Token"] = totals["total_chars"] / token_count
        stats["Tokens per Char"] = token_count / totals["total_chars"]
        stats["Tokens per Byte"] = token_count / totals["total_bytes"]
        # Token count relative to the baseline tokenizer on the same dataset.
        stats["Compression"] = token_count / norm_token_counts[dataset]

In [None]:
print("to evals df")
# One row per tokenizer, one column per dataset; each cell holds that
# dataset's metrics dict.
df = pd.DataFrame.from_dict(evals, orient="index")

# Swap every dataset column for its flattened "<dataset> <metric>" columns.
for col in tqdm(df.columns):
    flattened = df[col].apply(column_explosion(col))
    df = df.join(flattened).drop(columns=[col])

# Move the tokenizer name from the index into a regular column.
df["Tokenizer"] = df.index
df = df.reset_index(drop=True)

# Parameters decoded from the tokenizer name become extra columns.
name_params = df["Tokenizer"].apply(name_to_params).apply(pd.Series)
df = df.join(name_params)

# `progress_apply` is made available by the `tqdm.pandas()` call at the top.
df["Vocabulary Size"] = df["Tokenizer"].progress_apply(load_tokenizer)

In [None]:
# fill in missing values: `load_tokenizer` reports 0 when it cannot find the
# tokenizer, so fall back to the "vs" value decoded from the tokenizer name.
size_unknown = df["Vocabulary Size"] == 0
df.loc[size_unknown, "Vocabulary Size"] = df.loc[size_unknown, "vs"]
df["Vocabulary Size"] = df["Vocabulary Size"].fillna(df["vs"])

In [None]:
import glob

# Names of the code test sets. Only "*.jsonl" files are considered, matching
# the discovery used for the dataset statistics, so stray files in the
# directory are not mistaken for datasets.
code_datasets = [
    os.path.basename(p).replace(".jsonl", "")
    for p in glob.glob("data/code/test/*.jsonl")
]

", ".join(sorted(code_datasets))


### Average over code/english/multilingual

In [None]:
def _test_dataset_names(group: str) -> list:
    """Return the names of the ``.jsonl`` test sets under ``data/<group>/test``.

    Only ``*.jsonl`` files are listed (same rule as the dataset-statistics
    cell); any other file in the directory would otherwise turn into a column
    name that does not exist in ``df`` and raise ``KeyError`` below.
    """
    return [
        os.path.basename(p).replace(".jsonl", "")
        for p in glob.glob(f"data/{group}/test/*.jsonl")
    ]


code_datasets = _test_dataset_names("code")
english_datasets = _test_dataset_names("english")
multilingual_datasets = _test_dataset_names("multilingual")

compression_metrics = [
    "Compression",
    "Bytes per Token",
    "Chars per Token",
    "Tokens per Byte",
    "Tokens per Char",
    "Renyi",
    "Token Count",
    "Gini",
]

# Label used in the output column name -> datasets belonging to that group.
dataset_groups = {
    "Code": code_datasets,
    "English": english_datasets,
    "Multilingual": multilingual_datasets,
}

out_cols = []

for compression_metric in tqdm(compression_metrics):
    average_cols = []
    for group_label, group_datasets in dataset_groups.items():
        metric_cols = [
            f"{dataset} {compression_metric}" for dataset in group_datasets
        ]
        average_col = f"Average {group_label} {compression_metric}"
        # Row-wise mean over the group's datasets (NaNs are skipped).
        df[average_col] = df[metric_cols].mean(axis=1)
        average_cols.append(average_col)

    # Overall average = unweighted mean of the three group averages, so each
    # group counts equally regardless of how many datasets it contains.
    df["Average " + compression_metric] = df[average_cols].mean(axis=1)

    out_cols += ["Average " + compression_metric] + average_cols

In [None]:
# Leading columns: identity, vocabulary size, then all averaged metrics.
first_cols = ["Tokenizer", "Vocabulary Size", *out_cols]

# Columns decoded from the tokenizer name that are left out of the export.
# NOTE(review): "ncf" is also produced by `name_to_params` but is not listed
# here, so it stays in the output — confirm that is intended.
name_param_cols = {"rsc", "u64", "sp", "cp", "msl", "mt", "mp", "nrn", "nc", "vs"}

# Everything else (the per-dataset metric columns), alphabetically.
second_cols = sorted(set(df.columns) - set(first_cols) - name_param_cols)

df = df[first_cols + second_cols]

In [None]:
# Write the final table to the working directory; the positional index is
# omitted because the tokenizer name is already a regular column.
df.to_csv("eval_results.csv", index=False)