In [1]:
import os
import pandas as pd

def compile_stats(data_path):
    """Count the lines of every ``.tsv`` file in each task folder of ``data_path``.

    Every non-hidden sub-directory of ``data_path`` is treated as one task.
    For each ``.tsv`` file inside it, a column is created whose name is the
    last two underscore-separated parts of the file name (with ``.tsv``
    removed) and whose value is the number of lines in that file.

    Entries of ``data_path`` that are not directories are skipped, so a stray
    top-level file does not abort the scan.

    Args:
        data_path: Directory containing one sub-directory per task.

    Returns:
        pandas.DataFrame with one row per task: a ``task_name`` column plus
        one line-count column per distinct ``.tsv`` column name. A task that
        lacks a given file gets NaN in that column.
    """
    records = []
    for task_name in sorted(os.listdir(data_path)):
        task_dir = os.path.join(data_path, task_name)
        # Ignore hidden entries and anything that is not a task directory;
        # calling os.listdir on a plain file would raise NotADirectoryError.
        if task_name.startswith(".") or not os.path.isdir(task_dir):
            continue
        record = {"task_name": task_name}
        for filename in sorted(os.listdir(task_dir)):
            if not filename.endswith(".tsv"):
                continue
            col_name = "_".join(filename.split("_")[-2:]).replace(".tsv", "")
            with open(os.path.join(task_dir, filename)) as fin:
                # Stream the file: only the count is needed, not the content.
                record[col_name] = sum(1 for _ in fin)
        records.append(record)
    return pd.DataFrame(records)

# Per-task line counts for every .tsv file found under data/crossfit; the
# example-counting helpers in this notebook read this frame as a global.
df = compile_stats(data_path="data/crossfit")

In [2]:
import json

def get_num_test_examples(task_names):
    """Sum the ``100_test`` column of ``df`` over the given tasks.

    Args:
        task_names: Collection of task names; rows of the notebook-level
            ``df`` whose ``task_name`` is in it are selected.

    Returns:
        The total of ``100_test`` over the selected rows.
    """
    selected = df["task_name"].isin(task_names)
    return df.loc[selected, "100_test"].sum()

def get_num_train_examples(task_names, include_test=False,
                           seeds=("100", "13", "21", "42", "87")):
    """Sum the per-seed ``*_train`` columns of ``df`` over the given tasks.

    Args:
        task_names: Collection of task names; rows of the notebook-level
            ``df`` whose ``task_name`` is in it are selected.
        include_test: When true, the ``100_test`` column is added to the
            total as well.
        seeds: Prefixes of the ``<seed>_train`` columns to add up. The
            default is the five columns this function has always summed.

    Returns:
        The grand total over the selected rows and columns; 0 when no row
        matches. A task with a missing (NaN) value in any summed column
        contributes nothing to the total.
    """
    columns = ["{}_train".format(seed) for seed in seeds]
    if include_test:
        columns.append("100_test")

    selected = df["task_name"].isin(task_names)
    # Total each task first and let NaN propagate within a row; the final
    # sum then skips such rows. An empty selection simply sums to 0.
    per_task_total = df.loc[selected, columns].sum(axis=1, skipna=False)
    return per_task_total.sum()

# Task-split definition files to summarise. The flag paired with each path is
# forwarded as `include_test` when counting that split's training examples.
TASKS_SPLITS = [
    ("data/custom_tasks_splits/random.json", False),
    ("data/custom_tasks_splits/train_clf_test_clf.json", False),
    ("data/custom_tasks_splits/train_non_nli_test_nli.json", False),
    ("data/custom_tasks_splits/train_non_para_test_para.json", False),
    ("data/custom_tasks_splits/train_non_mrc_qa_test_mrc.json", False),
    ("data/custom_tasks_splits/train_non_mc_qa_test_mc.json", False),
]

# One summary row per split file, in the same order as TASKS_SPLITS.
data = []
for task_split, include_test in TASKS_SPLITS:
    with open(task_split) as fin:
        split_dict = json.load(fin)

    has_dev = len(split_dict["dev"]) != 0

    # Number of tasks in each partition of the split.
    n_train_task = len(split_dict["train"])
    n_val_task = len(split_dict["dev"]) if has_dev else 0
    n_test_task = len(split_dict["test"])

    # Number of examples in each partition. Dev tasks are counted with the
    # train-example helper, and only when the dev list is non-empty.
    n_train_ex = get_num_train_examples(split_dict["train"], include_test)
    if has_dev:
        n_val_ex = get_num_train_examples(split_dict["dev"])
    else:
        n_val_ex = 0
    n_test_ex = get_num_test_examples(split_dict["test"])

    # The split is labelled by its file name without directory or extension.
    data.append([
        task_split.rsplit("/", 1)[-1].replace(".json", ""),
        n_train_task,
        n_val_task,
        n_test_task,
        n_train_ex,
        n_val_ex,
        n_test_ex,
    ])

# Bare last expression so the notebook renders the summary table.
pd.DataFrame(
    data,
    columns=[
        "split",
        "n_train_task", "n_val_task", "n_test_task",
        "n_train_ex", "n_val_ex", "n_test_ex",
    ],
)

Unnamed: 0,split,n_train_task,n_val_task,n_test_task,n_train_ex,n_val_ex,n_test_ex
0,random,104,20,15,23105,18080,35780
1,train_clf_test_clf,41,9,8,25665,2080,13425
2,train_non_nli_test_nli,49,0,9,28625,0,18758
3,train_non_para_test_para,54,0,4,29745,0,49448
4,train_non_mrc_qa_test_mrc,39,0,6,6240,0,33749
5,train_non_mc_qa_test_mc,25,0,20,4000,0,56851
