In [None]:
def tokenize_eval(modelfile, subfolder, evaldata, outdir):
    """Tokenize every stimulus in ``evaldata`` and save per-item split statistics.

    Args:
        modelfile: Identifier or path handed to ``AutoTokenizer.from_pretrained``.
        subfolder: Tokenizer subfolder inside ``modelfile``; also used as the
            prefix of the output file name.
        evaldata: Path to a tab-separated file with the columns ``spelling``,
            ``lexicality``, ``rt`` and ``accuracy``.
        outdir: Directory that receives ``<subfolder>_output.csv``; created if
            it does not exist.

    Returns:
        None. The results are written to disk and the location is printed.
    """
    from transformers import AutoTokenizer
    import pandas as pd
    import os

    tokenizer = AutoTokenizer.from_pretrained(modelfile, subfolder=subfolder)

    # Hand the path to pandas so that it opens *and closes* the file itself;
    # wrapping it in a bare open() left one file handle dangling per call.
    lexdata = pd.read_csv(evaldata, delimiter="\t")

    # NOTE(review): pandas parses spellings such as "null" or "NA" as missing
    # values by default, so str() turns them into "nan" -- confirm whether the
    # item list contains such entries.
    tokens = [str(x) for x in lexdata["spelling"]]
    subtokens = [tokenizer.tokenize(token) for token in tokens]

    results = pd.DataFrame({
        "Stimulus": tokens,
        "Lexicality": list(lexdata["lexicality"]),
        "Length": [len(token) for token in tokens],
        "Subtokens": subtokens,
        # A stimulus kept as one piece has zero splits.
        "Num_Splits": [len(pieces) - 1 for pieces in subtokens],
        # 1 - (number of subtokens / number of characters).
        "Chunkability": [
            1 - (len(pieces) / len(token))
            for token, pieces in zip(tokens, subtokens)
        ],
        "Reading_Time": list(lexdata["rt"]),
        "Accuracy": list(lexdata["accuracy"]),
    })

    os.makedirs(outdir, exist_ok=True)
    outfile = os.path.join(outdir, f"{subfolder}_output.csv")
    results.to_csv(outfile, index=False)
    print(f"Saved results for {subfolder} to {outfile}")



# --- TOKENIZER SUBFOLDERS ---
# The subfolder names follow a regular "<prefix>_<variant>_<size>" pattern, so
# the list is assembled from its components instead of being spelled out.
# Sizes are kept as strings in string-sorted order so that the resulting list
# order stays exactly as before.
_SIZES = ["128000", "16000", "256000", "32000", "64000", "8064"]
_PREFIXES = [
    "fw57M_Entropy",
    "fw57M_Surprisal",
    "ngram_Entropy",
    "ngram_Space Probability",
    "ngram_Surprisal",
]
_VARIANTS = ["frequency-mean-post-merge", "min-mean-post-merge"]

subfolders = (
    ["bytelevel"]
    + [f"frequency_{size}" for size in _SIZES]
    + [
        f"{prefix}_{variant}_{size}"
        for prefix in _PREFIXES
        for variant in _VARIANTS
        for size in _SIZES
    ]
)


In [None]:
# --- SETTINGS ---
# Passed to AutoTokenizer.from_pretrained together with each subfolder name.
modelfile = "InfoTokenizers/tokenizers"

# Local path to blp-items.txt. Download the file from
# https://github.com/codebyzeb/infotokenization/blob/main/eval/cog-plausibility/en/blp-items.txt
# and change the path below if you store it somewhere else.
evaldata = "blp-items.txt"

# Directory that receives the per-tokenizer CSVs and the summary tables,
# e.g. "/Tokenizers/chunk". Change it to wherever the results should go.
outdir = "Tokenizers/chunk"

In [None]:

# --- RUN ---
# Evaluate each tokenizer in turn; every call writes "<subfolder>_output.csv"
# into outdir.
for subfolder in subfolders:
    tokenize_eval(
        modelfile=modelfile,
        subfolder=subfolder,
        evaldata=evaldata,
        outdir=outdir,
    )


In [None]:
import pandas as pd
import numpy as np
import os
from ast import literal_eval
from scipy.stats import pearsonr

# Summarise chunkability and number of splits per tokenizer, separately for
# words and non-words, from the per-tokenizer files stored in `outdir`.
chunk_stats = []

# List the output directory here so that this cell does not rely on a `files`
# variable created by a later cell (which fails on a fresh kernel). Sorting
# keeps the row order of the summary table stable between runs.
files = sorted(
    f for f in os.listdir(outdir) if f.endswith(".csv") or f.endswith(".tsv")
)

for file in files:
    # Only include relevant tokenizer outputs
    if not file.endswith("_output.csv") and not file.endswith("_output.tsv"):
        continue

    filepath = os.path.join(outdir, file)
    try:
        if file.endswith(".tsv"):
            df = pd.read_csv(filepath, sep="\t")
        else:
            df = pd.read_csv(filepath)
    except Exception as e:
        # Report unreadable files and carry on with the remaining tokenizers.
        print(f"Error reading {file}: {e}")
        continue

    # Check that all required columns exist
    required_columns = {"Lexicality", "Chunkability", "Num_Splits"}
    if not required_columns.issubset(df.columns):
        print(f"Skipping {file}: missing columns {required_columns - set(df.columns)}")
        continue

    tokenizer_name = file.replace(".csv", "").replace(".tsv", "")

    for category_label, category_df in df.groupby("Lexicality"):
        # Every Lexicality value other than "W" is reported as "Non-Words".
        label = "Words" if category_label == "W" else "Non-Words"

        chunkability_vals = category_df["Chunkability"].dropna()
        num_splits_vals = category_df["Num_Splits"].dropna()

        chunk_stats.append({
            "Tokenizer": tokenizer_name,
            "Category": label,
            "Chunkability_Mean": np.mean(chunkability_vals),
            "Chunkability_Stdev": np.std(chunkability_vals),
            "NumSplits_Mean": np.mean(num_splits_vals),
            "NumSplits_Stdev": np.std(num_splits_vals)
        })

# Save results
chunk_stats_df = pd.DataFrame(chunk_stats)
chunk_path = os.path.join(outdir, "chunkability_stats.tsv")
chunk_stats_df.to_csv(chunk_path, sep="\t", index=False)
print(f"\nSaved chunkability stats to {chunk_path}")

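To get the correlation table, run either of the next two cells; both compute the same four correlations per tokenizer and category. The first saves them in long format together with their p-values, the second in wide format rounded to two decimals.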

In [None]:
import pandas as pd
import numpy as np
import os
from ast import literal_eval
from scipy.stats import pearsonr

# Significance threshold for the "non-significant" report printed below.
ALPHA = 0.01

# Collect the per-tokenizer output files only, so that the reported count does
# not include summary tables stored in the same directory.
files = sorted(
    f for f in os.listdir(outdir)
    if f.endswith("_output.csv") or f.endswith("_output.tsv")
)
print(f"Found {len(files)} tokenizer output files.\n")

results = []

print(f"Checking for non-significant correlations at p >= {ALPHA}\n")

for file in files:
    filepath = os.path.join(outdir, file)
    print(f"Tokenizer: {file}")

    # Load tokenizer output
    if file.endswith(".tsv"):
        df = pd.read_csv(filepath, sep="\t")
    else:
        df = pd.read_csv(filepath)

    # Drop incomplete rows, exactly as the companion cell that builds the
    # wide-format table does; otherwise the two cells disagree and a single
    # missing value turns a whole correlation into NaN.
    df = df.dropna()

    tokenizer_name = file.replace(".csv", "").replace(".tsv", "")

    # Split into words and nonwords
    datasets = {
        "words": df[df["Lexicality"] == "W"],
        "nonwords": df[df["Lexicality"] == "N"],
    }

    # Iterate through categories
    for category, dataset in datasets.items():
        # pearsonr needs at least two observations.
        if len(dataset) < 2:
            print(f"  Skipping {category}: fewer than two complete rows")
            continue

        predictors = {
            "NumSplits": list(dataset["Num_Splits"]),
            "Wordiness": list(dataset["Chunkability"]),
        }
        outcomes = {
            "RT": list(dataset["Reading_Time"]),
            "Acc": list(dataset["Accuracy"]),
        }

        # Yields, in order: NumSplits vs RT, NumSplits vs Acc,
        # Wordiness vs RT, Wordiness vs Acc.
        for predictor_name, predictor_vals in predictors.items():
            for outcome_name, outcome_vals in outcomes.items():
                corr, p_value = pearsonr(predictor_vals, outcome_vals)
                measure = f"{predictor_name} vs {outcome_name}"

                # This is the check announced in the header line above.
                if p_value >= ALPHA:
                    print(f"  Not significant ({category}, {measure}): r={corr:.2f}, p={p_value:.3g}")

                results.append({
                    "tokenizer": tokenizer_name,
                    "category": category,
                    "measure": measure,
                    "r": corr,
                    "p": p_value
                })

    print("-" * 40)

# Save all results to a tab-separated file
results_df = pd.DataFrame(results)
save_path = os.path.join(outdir, "correlation_results.tsv")
results_df.to_csv(save_path, sep="\t", index=False)

print(f"\nSaved all correlation results to {save_path}")

In [None]:
import pandas as pd
from scipy.stats import pearsonr
from ast import literal_eval
import os
# Destination file for the wide-format correlation table written by
# evaluate_correlation_results(); it lives in the shared output directory.
save_path = os.path.join(outdir, "final_correlation_results.csv")

# Function to evaluate the correlation results from the CSV files
def evaluate_correlation_results(resultpath=None, outpath=None):
    """Correlate split statistics with reading time and accuracy per tokenizer.

    For every tokenizer CSV in ``resultpath`` the number of splits and the
    wordiness (1 - subtokens / characters) are recomputed from the
    ``Subtokens`` column and correlated (Pearson) with ``Reading_Time`` and
    ``Accuracy``, separately for words ("W") and nonwords ("N"). One line per
    tokenizer and category is written, rounded to two decimals.

    Args:
        resultpath: Directory holding the tokenizer CSVs. Defaults to the
            notebook-level ``outdir``.
        outpath: File to write. Defaults to the notebook-level ``save_path``.

    Returns:
        None. The table is written to ``outpath`` and the location is printed.
    """
    # The tokenizer CSVs live in the shared output folder; the previously used
    # name `outdir_tokenizer` is not defined anywhere in the notebook.
    if resultpath is None:
        resultpath = outdir
    if outpath is None:
        outpath = save_path

    # Summary tables stored next to the tokenizer outputs must not be treated
    # as tokenizer outputs themselves.
    summary_prefixes = ("correlation_results", "chunkability_stats", "final_correlation_results")

    all_results = []

    # Sorted so that the row order of the table is stable between runs.
    for file in sorted(os.listdir(resultpath)):
        if not file.endswith(".csv") or file.startswith(summary_prefixes):
            continue

        filepath = os.path.join(resultpath, file)
        print(f"Tokenizer: {file}")

        # Load the CSV and drop rows with missing values
        complete_rows = pd.read_csv(filepath).dropna()

        # Split the data into words and nonwords
        datasets = {
            "words": complete_rows[complete_rows["Lexicality"] == "W"],
            "nonwords": complete_rows[complete_rows["Lexicality"] == "N"],
        }

        print(f"Processing file: {file}")

        # Extract filename without extension
        filename_no_extension = os.path.splitext(file)[0]

        # Iterate through categories (words, nonwords)
        for category, dataset in datasets.items():
            print(f"Processing category: {category}")

            # pearsonr needs at least two observations.
            if len(dataset) < 2:
                print(f"Skipping category {category}: fewer than two complete rows")
                continue

            tokens = list(dataset["Stimulus"])
            rts = list(dataset["Reading_Time"])
            accs = list(dataset["Accuracy"])

            # 'Subtokens' holds the string form of a Python list; parse it back.
            splits = list(dataset["Subtokens"].apply(literal_eval))
            num_splits = [len(pieces) - 1 for pieces in splits]
            wordiness = [
                1 - (len(pieces) / len(str(token)))
                for token, pieces in zip(tokens, splits)
            ]

            # Compute correlations; the p-values are not part of this table
            corr1, _ = pearsonr(num_splits, rts)   # Num_Splits with RT
            corr2, _ = pearsonr(num_splits, accs)  # Num_Splits with Accuracy
            corr3, _ = pearsonr(wordiness, rts)    # Wordiness with RT
            corr4, _ = pearsonr(wordiness, accs)   # Wordiness with Accuracy

            all_results.append([
                filename_no_extension,
                category,
                "{:.2f}".format(corr1),
                "{:.2f}".format(corr2),
                "{:.2f}".format(corr3),
                "{:.2f}".format(corr4),
            ])

    # Save the results to a CSV file
    with open(outpath, "w", encoding="utf-8") as outfile:
        # Write the header
        outfile.write("Tokenizer, Category, NumSplits_RT, NumSplits_Acc, Chunkability_RT, Chunkability_Acc\n")

        # Write one line per tokenizer and category
        for filename, category, result1, result2, result3, result4 in all_results:
            outfile.write(f"{filename}, {category}, {result1}, {result2}, {result3}, {result4}\n")

    print(f"Correlation results saved to {outpath}")

# Run the evaluation: processes the tokenizer CSVs and writes the correlation
# table to save_path.
evaluate_correlation_results()