In [42]:
import json
import os

import pandas as pd



In [43]:
# Maps the short dataset key used throughout this notebook to the base file
# name of that dataset. The helpers below append ".tsv" (original data) or
# ".json" (converted records) to these values.
name_mapping = {
    'stsbenchmark': 'stsbenchmark-test-sts',
    'ms_mrpc': 'ms-mrpc',
    'onestop_all': 'onestop_parallel_all_pairs',
    'simple_amr': 'amr_true_paraphrases',
    'fb_anli_pre_hyp': 'fb-anli-pre-hyp',
    'fb_anli_hyp_pre': 'fb-anli-hyp-pre',
    'sickr_sts': 'sickr-sts',
    'pawsx_test': 'paws-x-test',
    'stanfordnlp_snli_pre_hyp': 'stannlp-snli-pre-hyp',
    'fb_xnli_pre_hyp': 'fb-xnli-pre-hyp',
    'fb_xnli_hyp_pre': 'fb-xnli-hyp-pre',
    'stanfordnlp_snli_hyp_pre': 'stannlp-snli-hyp-pre'
}


In [44]:
import pandas as pd
from collections import defaultdict


# Sentinel: the dataset has no id column; the record id is built from the
# dataset key and the row index.
NO_ID_KEY = "NOID"
# Sentinel: the record id is built from the dataset key, the value of the
# dataset's language column (see lang_key_mapping) and the row index.
MULTI_LANG_KEY = "MULTILANG"
# Name of the id column for datasets that carry one under the name "id".
ID_KEY = "id"
# Per dataset: either one of the sentinels above or the name of the column
# holding the record id. Note that id_for_record() checks the substrings
# "sts" and "sick" in the dataset key before it looks at this value.
id_mapping = {
    'stsbenchmark': ID_KEY,
    'ms_mrpc': NO_ID_KEY,
    'onestop_all': 'OriginalID',
    'simple_amr': NO_ID_KEY,
    'fb_anli_pre_hyp': ID_KEY,
    'fb_anli_hyp_pre': ID_KEY,
    'sickr_sts': ID_KEY,
    'pawsx_test': MULTI_LANG_KEY,
    'stanfordnlp_snli_pre_hyp': ID_KEY,
    'fb_xnli_pre_hyp': MULTI_LANG_KEY,
    'fb_xnli_hyp_pre': MULTI_LANG_KEY,
    'stanfordnlp_snli_hyp_pre': ID_KEY,
}
# For every MULTI_LANG_KEY dataset: the column whose value is embedded in the
# record id as the language part.
lang_key_mapping = {
    'pawsx_test': 'language',
    'fb_xnli_pre_hyp': 'lang',
    'fb_xnli_hyp_pre': 'lang',
}

def get_original_data(key: str)->pd.DataFrame:
    """Load the original tab-separated file of one dataset.

    Args:
        key: Dataset key; must be present in ``name_mapping``.

    Returns:
        The parsed contents of
        ``original_reproduction_code/datasets/<name_mapping[key]>.tsv``.
    """
    tsv_name = name_mapping[key] + ".tsv"
    return pd.read_csv(
        os.path.join("original_reproduction_code/datasets", tsv_name),
        sep='\t',
    )


In [53]:
def save_json_dict(key: str, dict, path: str) -> None:
    """Write ``dict`` as indented JSON to ``<path>/<name_mapping[key]>.json``.

    Args:
        key: Dataset key; must be present in ``name_mapping``.
        dict: JSON-serialisable mapping to store. The parameter name shadows
            the built-in; it is kept so existing callers keep working.
        path: Target directory. It is created if it does not exist yet.

    Raises:
        KeyError: If ``key`` is not in ``name_mapping``.
    """
    target = os.path.join(path, name_mapping[key] + ".json")
    if path:
        # A fresh checkout may not contain the output directory yet.
        os.makedirs(path, exist_ok=True)
    print(f"saving {target}")
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # encoding is pinned to UTF-8 instead of the platform default.
    with open(target, 'w', encoding='utf-8') as f:
        json.dump(dict, f, indent=2, ensure_ascii=False)

def id_for_record(dataset_key: str, i: int, element: pd.Series):
    """Return the identifier under which one dataset row is stored.

    Args:
        dataset_key: Dataset key; must be present in ``id_mapping``.
        i: Row index of the record within its dataset.
        element: The row itself; only read when the id comes from a column
            or embeds the language column.

    Returns:
        ``sick-r-<i>`` / ``stsbenchmark-<i>`` for the two special-cased
        datasets, ``<dataset_key>/<i>`` for datasets without an id column,
        ``<dataset_key>/<lang>-<i>`` for multi-language datasets, otherwise
        the value of the dataset's id column.

    Raises:
        KeyError: If ``dataset_key`` is not configured in ``id_mapping``.

    Note:
        "sick" is tested before "sts" because the key ``'sickr_sts'``
        contains both substrings. With the previous order the "sick" branch
        could never be reached and those records were keyed
        ``stsbenchmark-<i>``; JSON files written that way have to be
        regenerated.
    """
    id_field = id_mapping[dataset_key]

    # special cases
    if "sick" in dataset_key:
        return f"sick-r-{i}"
    if "sts" in dataset_key:
        return f"stsbenchmark-{i}"

    if id_field == NO_ID_KEY:
        return f"{dataset_key}/{i}"
    if id_field == MULTI_LANG_KEY:
        lang = element[lang_key_mapping[dataset_key]]
        return f"{dataset_key}/{lang}-{i}"
    return element[id_field]

def convert_datasets_to_json_templates():
    """Convert every dataset in ``name_mapping`` into a JSON record template.

    For each dataset the original TSV is loaded and a mapping
    ``record id -> fields`` is built in three passes over the rows:

    1. ``sentence1`` / ``sentence2`` for every row;
    2. ``label`` - taken from the human consensus column when the file has
       one (and then only for rows with ``score >= 4``), otherwise from a
       ``label`` column if present;
    3. ``score`` for every row, if the file has that column.

    The result is written to the ``datasets_no_results`` directory.
    """
    consensus_col = 'Human Annotation - Consensus'
    first_col = 'sentence' + '1'
    second_col = 'sentence' + '2'

    for dataset_key in name_mapping:
        templates = {}
        frame = get_original_data(dataset_key)

        # Pass 1: the sentence pair of every row.
        for idx, row in frame.iterrows():
            first, second = row[first_col], row[second_col]
            entry = templates.setdefault(id_for_record(dataset_key, idx, row), {})
            entry[first_col] = first
            entry[second_col] = second

        # Pass 2: the boolean label, from whichever source the file offers.
        if consensus_col in frame.columns:
            for idx, row in frame.iterrows():
                score = row['score']
                is_positive = (row[consensus_col] == 1.0)
                # only consider this label if score>=4, meaning if this is part of the sts-h dataset.
                if score >= 4:
                    templates[id_for_record(dataset_key, idx, row)]['label'] = is_positive
        elif 'label' in frame.columns:
            for idx, row in frame.iterrows():
                is_positive = (row['label'] == 1)
                templates[id_for_record(dataset_key, idx, row)]['label'] = is_positive

        # Pass 3: the raw score, when available.
        if 'score' in frame.columns:
            for idx, row in frame.iterrows():
                score = row['score']
                templates[id_for_record(dataset_key, idx, row)]['score'] = score

        save_json_dict(dataset_key, templates, "datasets_no_results")


# Write one result-free JSON template per dataset listed in name_mapping.
convert_datasets_to_json_templates()

saving datasets_no_results/stsbenchmark-test-sts.json
saving datasets_no_results/ms-mrpc.json
saving datasets_no_results/onestop_parallel_all_pairs.json
saving datasets_no_results/amr_true_paraphrases.json
saving datasets_no_results/fb-anli-pre-hyp.json
saving datasets_no_results/fb-anli-hyp-pre.json
saving datasets_no_results/sickr-sts.json
saving datasets_no_results/paws-x-test.json
saving datasets_no_results/stannlp-snli-pre-hyp.json
saving datasets_no_results/fb-xnli-pre-hyp.json
saving datasets_no_results/fb-xnli-hyp-pre.json
saving datasets_no_results/stannlp-snli-hyp-pre.json


In [54]:
# Sanity check: within each dataset, every record id must occur exactly once.
# The ids are produced by id_for_record() - the same function that creates the
# JSON keys - so the check covers the ids that are actually used. Any id that
# occurs more than once is printed together with its count.
for dataset_key in name_mapping:
    # Resolved up front so that a dataset without a configured id scheme
    # fails here, before its file is read.
    id_field = id_mapping[dataset_key]
    df = get_original_data(dataset_key)

    ids = defaultdict(int)
    for i, element in df.iterrows():
        ids[id_for_record(dataset_key, i, element)] += 1

    for record_id, count in ids.items():
        if count > 1:
            print(f"{dataset_key}/{record_id}: {count}")

In [55]:
def find_unique_column_names(directory_path):
    """
    Collect the column names that occur in any TSV file of a directory.

    Only entries whose name ends in ``.tsv`` are looked at. A file that
    cannot be read is reported on stdout and skipped.

    Args:
        directory_path (str): Directory to scan (sub-directories are not
            descended into).

    Returns:
        set: Union of the column names of all readable TSV files.
    """
    tsv_paths = [
        os.path.join(directory_path, entry)
        for entry in os.listdir(directory_path)
        if entry.endswith('.tsv')
    ]

    seen = set()
    for file_path in tsv_paths:
        try:
            # A single data row is requested; only the parsed header is used.
            header_frame = pd.read_csv(file_path, sep='\t', nrows=1)
            seen |= set(header_frame.columns)
        except Exception as e:
            print(f"Error reading {file_path}: {e}")

    return seen

# Collect every column name used by the original TSV files.
directory_path = "original_reproduction_code/datasets"  # the directory get_original_data() reads from
unique_columns = find_unique_column_names(directory_path)

# Column-name prefixes that mark a column as "of interest".
# Matching is case-insensitive.
prefixes = [
    # "CLEAN-",
    "XLM-RoBERTa-EN-",
    "LLama3 zero-shot",
    "LLama3 ICL_4"
]
def is_column_of_interest(column_name):
    """Tell whether a column name starts with one of ``prefixes``.

    Args:
        column_name (str): Column name to test.

    Returns:
        bool: ``True`` if the name starts (ignoring case) with any entry of
        ``prefixes``, ``False`` otherwise. The explicit ``False`` replaces
        the implicit ``None`` that was previously returned for a non-match.
    """
    lowered = column_name.lower()
    return any(lowered.startswith(prefix.lower()) for prefix in prefixes)
# Keep only the columns matching one of the prefixes and list them alphabetically.
cols_of_interest = list(filter(is_column_of_interest, unique_columns))

print("\n".join(sorted(cols_of_interest)))

LLama3 ICL_4 (Ex. Same Content)
LLama3 ICL_4 (Paraph)
LLama3 ICL_4 (Sem Equiv)
LLama3 zero-shot (Ex. Same Content)
LLama3 zero-shot (Paraph)
LLama3 zero-shot (Sem Equiv)
XLM-RoBERTa-EN-ALLTHREE-V2-1
XLM-RoBERTa-EN-ALLTHREE-V2-2
XLM-RoBERTa-EN-ALLTHREE-V2-3
XLM-RoBERTa-EN-ALLTHREE-V3-1
XLM-RoBERTa-EN-ALLTHREE-V3-2
XLM-RoBERTa-EN-ALLTHREE-V3-3
XLM-RoBERTa-EN-ALLTHREE-V3-4
XLM-RoBERTa-EN-ALLTHREE-V4-1
XLM-RoBERTa-EN-ALLTHREE-V4-2
XLM-RoBERTa-EN-ALLTHREE-V4-3
XLM-RoBERTa-EN-EASYNEG-25-V2-1
XLM-RoBERTa-EN-EASYNEG-25-V2-2
XLM-RoBERTa-EN-EASYNEG-25-V2-3
XLM-RoBERTa-EN-EASYNEG-25-V3-1
XLM-RoBERTa-EN-EASYNEG-25-V3-2
XLM-RoBERTa-EN-EASYNEG-25-V3-3
XLM-RoBERTa-EN-EASYNEG-25-V4-1
XLM-RoBERTa-EN-EASYNEG-25-V4-2
XLM-RoBERTa-EN-EASYNEG-25-V4-3
XLM-RoBERTa-EN-EASYNEG-50-V2-1
XLM-RoBERTa-EN-EASYNEG-50-V2-2
XLM-RoBERTa-EN-EASYNEG-50-V2-3
XLM-RoBERTa-EN-EASYNEG-50-V3-1
XLM-RoBERTa-EN-EASYNEG-50-V3-2
XLM-RoBERTa-EN-EASYNEG-50-V3-3
XLM-RoBERTa-EN-EASYNEG-50-V4-1
XLM-RoBERTa-EN-EASYNEG-50-V4-2
XLM-RoBERTa-E

In [57]:
import json


def get_paper_json_dict(key: str)->dict:
    """Load the JSON records of one dataset from ``benches/paper``.

    Args:
        key: Dataset key; must be present in ``name_mapping``.

    Returns:
        The parsed contents of ``benches/paper/<name_mapping[key]>.json``.

    Raises:
        KeyError: If ``key`` is not in ``name_mapping``.
        OSError: If the file cannot be opened.
    """
    fname = name_mapping[key]+".json"
    path = os.path.join("benches/paper", fname)
    print(f"reading {path}")
    # The files hold non-ASCII text (they are written with ensure_ascii=False),
    # so decode as UTF-8 explicitly instead of using the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)



def add_data_record(dataset_key:str, i:int, paper_element: pd.Series, json_dict):
    """Copy the result columns of one TSV row into its JSON record.

    The row's record id is computed with ``id_for_record``. Every column of
    the row that is listed in ``cols_of_interest`` is stored in
    ``json_dict[<record id>]`` as a boolean: ``1`` becomes ``True`` and ``0``
    becomes ``False``. Any other value is reported on stdout and not stored.

    Args:
        dataset_key: Dataset key of the row.
        i: Row index of the record within its dataset.
        paper_element: The TSV row.
        json_dict: Mapping ``record id -> record``; modified in place.

    Raises:
        KeyError: If the record id is not a key of ``json_dict``, neither as
            returned by ``id_for_record`` nor in its string form.
    """
    record_id = id_for_record(dataset_key, i, paper_element)
    if record_id not in json_dict:
        # Retry with the string form, for ids that are not strings already.
        record_id = f"{record_id}"
        if record_id not in json_dict:
            # Look the id scheme up here instead of relying on a global left
            # behind by another cell, and raise instead of calling exit():
            # an exception aborts the run with a traceback at this point.
            id_field = id_mapping[dataset_key]
            raise KeyError(
                f"[id field is {id_field}]{record_id} is not found in the dataset {dataset_key}. "
                f"Dict has {len(json_dict)} keys, e.g. {list(json_dict)[:5]}"
            )

    record = json_dict[record_id]
    for col, paper_value in paper_element.items():
        if col not in cols_of_interest:
            continue
        if paper_value == 1:
            record[col] = True
        elif paper_value == 0:
            record[col] = False
        else:
            print(f"Unexpected value {paper_value} for {col}: {paper_element}")

# For every dataset: load its benches/paper JSON file, copy the result columns
# of each row of the original TSV into the matching record, and write the file
# back to the same location.
for dataset_key in name_mapping.keys():
    df = get_original_data(dataset_key)
    json_dict = get_paper_json_dict(dataset_key)
    for i, element in df.iterrows():
        add_data_record(dataset_key, i, element, json_dict)
    save_json_dict(dataset_key, json_dict, "benches/paper")



reading benches/paper/stsbenchmark-test-sts.json
saving benches/paper/stsbenchmark-test-sts.json
reading benches/paper/ms-mrpc.json
saving benches/paper/ms-mrpc.json
reading benches/paper/onestop_parallel_all_pairs.json
saving benches/paper/onestop_parallel_all_pairs.json
reading benches/paper/amr_true_paraphrases.json
saving benches/paper/amr_true_paraphrases.json
reading benches/paper/fb-anli-pre-hyp.json
saving benches/paper/fb-anli-pre-hyp.json
reading benches/paper/fb-anli-hyp-pre.json
saving benches/paper/fb-anli-hyp-pre.json
reading benches/paper/sickr-sts.json
saving benches/paper/sickr-sts.json
reading benches/paper/paws-x-test.json
saving benches/paper/paws-x-test.json
reading benches/paper/stannlp-snli-pre-hyp.json
saving benches/paper/stannlp-snli-pre-hyp.json
reading benches/paper/fb-xnli-pre-hyp.json
saving benches/paper/fb-xnli-pre-hyp.json
reading benches/paper/fb-xnli-hyp-pre.json
saving benches/paper/fb-xnli-hyp-pre.json
reading benches/paper/stannlp-snli-hyp-pre.json