In [2]:
import os
import pandas as pd

def find_unique_column_names(directory_path):
    """
    Finds unique column names across all TSV files in a given directory.

    Only files whose name ends with '.tsv' are considered; sub-directories are
    not traversed. A file that cannot be parsed is reported on stdout and
    skipped, so one bad file does not abort the whole scan.

    Args:
        directory_path (str): Path to the directory containing TSV files.

    Returns:
        set: A set of unique column names from all TSV files.
    """
    unique_columns = set()

    # Iterate through all entries in the directory
    for filename in os.listdir(directory_path):
        # Only TSV files are of interest
        if not filename.endswith('.tsv'):
            continue
        file_path = os.path.join(directory_path, filename)
        try:
            # nrows=0 parses the header line only: the column labels are all
            # that is needed, so no data row has to be read or type-inferred.
            header = pd.read_csv(file_path, sep='\t', nrows=0)
            unique_columns.update(header.columns)
        except Exception as e:
            # Deliberately best-effort: report the file and keep scanning.
            print(f"Error reading {file_path}: {e}")

    return unique_columns

# Example usage
# `directory_path` is the folder scanned for '.tsv' files; `unique_columns`
# holds the set of every column name found in them and is filtered further
# down in this cell to build `cols_of_interest`.
directory_path = "original_reproduction_code/datasets"  # Replace with the path to your directory
unique_columns = find_unique_column_names(directory_path)

# Column-name prefixes to keep. Matching is case-insensitive and anchored at
# the start of the column name (see is_column_of_interest).
prefixes = [
    # "CLEAN-",
    "XLM-RoBERTa-EN-",
    "LLama3 zero-shot",
    "LLama3 ICL_4"
]
def is_column_of_interest(column_name):
    """
    Tell whether a column name starts with one of the configured prefixes.

    The comparison ignores case. `prefixes` is read at call time, so edits to
    the list take effect without redefining this function.

    Args:
        column_name (str): Column name to test.

    Returns:
        bool: True if the name starts with any entry of `prefixes`, otherwise
        False (always a real bool, never None).
    """
    # Lower-case the candidate once instead of once per prefix.
    lowered = column_name.lower()
    return any(lowered.startswith(prefix.lower()) for prefix in prefixes)
# Keep only the columns whose name matches one of the configured prefixes.
cols_of_interest = list(filter(is_column_of_interest, unique_columns))

# Show the selection in alphabetical order, one column name per line.
print("\n".join(sorted(cols_of_interest)))

LLama3 ICL_4 (Ex. Same Content)
LLama3 ICL_4 (Paraph)
LLama3 ICL_4 (Sem Equiv)
LLama3 zero-shot (Ex. Same Content)
LLama3 zero-shot (Paraph)
LLama3 zero-shot (Sem Equiv)
XLM-RoBERTa-EN-ALLTHREE-V2-1
XLM-RoBERTa-EN-ALLTHREE-V2-2
XLM-RoBERTa-EN-ALLTHREE-V2-3
XLM-RoBERTa-EN-ALLTHREE-V3-1
XLM-RoBERTa-EN-ALLTHREE-V3-2
XLM-RoBERTa-EN-ALLTHREE-V3-3
XLM-RoBERTa-EN-ALLTHREE-V3-4
XLM-RoBERTa-EN-ALLTHREE-V4-1
XLM-RoBERTa-EN-ALLTHREE-V4-2
XLM-RoBERTa-EN-ALLTHREE-V4-3
XLM-RoBERTa-EN-EASYNEG-25-V2-1
XLM-RoBERTa-EN-EASYNEG-25-V2-2
XLM-RoBERTa-EN-EASYNEG-25-V2-3
XLM-RoBERTa-EN-EASYNEG-25-V3-1
XLM-RoBERTa-EN-EASYNEG-25-V3-2
XLM-RoBERTa-EN-EASYNEG-25-V3-3
XLM-RoBERTa-EN-EASYNEG-25-V4-1
XLM-RoBERTa-EN-EASYNEG-25-V4-2
XLM-RoBERTa-EN-EASYNEG-25-V4-3
XLM-RoBERTa-EN-EASYNEG-50-V2-1
XLM-RoBERTa-EN-EASYNEG-50-V2-2
XLM-RoBERTa-EN-EASYNEG-50-V2-3
XLM-RoBERTa-EN-EASYNEG-50-V3-1
XLM-RoBERTa-EN-EASYNEG-50-V3-2
XLM-RoBERTa-EN-EASYNEG-50-V3-3
XLM-RoBERTa-EN-EASYNEG-50-V4-1
XLM-RoBERTa-EN-EASYNEG-50-V4-2
XLM-RoBERTa-E

In [3]:
# Maps each dataset key to the base file name (without extension) used for
# that dataset: get_original_data appends ".tsv" to it and the paper-JSON
# helpers append ".json".
name_mapping = {
    'stsbenchmark': 'stsbenchmark-test-sts',
    'ms_mrpc': 'ms-mrpc',
    'onestop_all': 'onestop_parallel_all_pairs',
    'simple_amr': 'amr_true_paraphrases',
    'fb_anli_pre_hyp': 'fb-anli-pre-hyp',
    'fb_anli_hyp_pre': 'fb-anli-hyp-pre',
    'sickr_sts': 'sickr-sts',
    'pawsx_test': 'paws-x-test',
    'stanfordnlp_snli_pre_hyp': 'stannlp-snli-pre-hyp',
    'fb_xnli_pre_hyp': 'fb-xnli-pre-hyp',
    'fb_xnli_hyp_pre': 'fb-xnli-hyp-pre',
    'stanfordnlp_snli_hyp_pre': 'stannlp-snli-hyp-pre'
}

# Sentinel stored in id_mapping for datasets without an id column; for those,
# add_data_record builds the record id as "<dataset_key>/<row index>".
NO_ID_KEY = "NOID"
# Name of the id column used by most datasets.
ID_KEY = "id"
# Maps each dataset key to the TSV column that holds the record id (or to
# NO_ID_KEY). Uses the same keys as name_mapping.
id_mapping = {
    'stsbenchmark': ID_KEY,
    'ms_mrpc': NO_ID_KEY,
    'onestop_all': 'OriginalID',
    'simple_amr': NO_ID_KEY,
    'fb_anli_pre_hyp': ID_KEY,
    'fb_anli_hyp_pre': ID_KEY,
    'sickr_sts': ID_KEY,
    'pawsx_test': ID_KEY,
    'stanfordnlp_snli_pre_hyp': ID_KEY,
    'fb_xnli_pre_hyp': ID_KEY,
    'fb_xnli_hyp_pre': ID_KEY,
    'stanfordnlp_snli_hyp_pre': ID_KEY,
}

In [29]:
import json
def get_original_data(key: str)->pd.DataFrame:
    """
    Load the original TSV table for one dataset.

    Args:
        key (str): Dataset key; must be present in `name_mapping`.

    Returns:
        pd.DataFrame: Contents of
        "original_reproduction_code/datasets/<mapped name>.tsv".
    """
    tsv_path = os.path.join(
        "original_reproduction_code/datasets",
        f"{name_mapping[key]}.tsv",
    )
    return pd.read_csv(tsv_path, sep='\t')
def save_paper_json_dict(key: str, dict):
    """
    Write a dataset's paper-JSON dictionary to "benches/paper/<mapped name>.json".

    The target file is overwritten. The path is announced on stdout before
    writing.

    Args:
        key (str): Dataset key; must be present in `name_mapping`.
        dict: JSON-serialisable object to store. The parameter name shadows
            the builtin `dict` inside this function; it is kept unchanged so
            existing keyword callers continue to work.
    """
    fname = name_mapping[key]+".json"
    path = os.path.join("benches/paper", fname)
    # This function writes, so the log line must not claim to be "reading".
    print(f"writing {path}")
    with open(path, 'w') as f:
        json.dump(dict, f, indent=2)
def get_paper_json_dict(key: str)->dict:
    """
    Load a dataset's paper-JSON file from "benches/paper/<mapped name>.json".

    The path is announced on stdout before the file is opened.

    Args:
        key (str): Dataset key; must be present in `name_mapping`.

    Returns:
        dict: The parsed JSON content.
    """
    json_path = os.path.join("benches/paper", f"{name_mapping[key]}.json")
    print(f"reading {json_path}")
    with open(json_path, 'r') as handle:
        loaded = json.load(handle)
    return loaded

def add_data_record(dataset_key:str, i:int, paper_element: pd.Series, json_dict):
    """
    Copy the columns of interest from one TSV row into its JSON record.

    The record is located in `json_dict` by id. The id comes from the column
    named in `id_mapping[dataset_key]`, or is built as "<dataset_key>/<i>"
    when that entry is NO_ID_KEY. The raw id is tried first, then its string
    form. For every column listed in `cols_of_interest`, a value equal to 1
    is stored as True and a value equal to 0 as False; any other value is
    reported on stdout and not stored. `json_dict` is modified in place.

    Args:
        dataset_key (str): Dataset key; must be present in `id_mapping`.
        i (int): Row index, used only to build the id when the dataset has
            no id column.
        paper_element (pd.Series): One row of the original TSV.
        json_dict: Mapping from record id to a per-record dict.

    Raises:
        KeyError: If neither the raw id nor its string form is a key of
            `json_dict`.
    """
    id_field = id_mapping[dataset_key]
    if id_field == NO_ID_KEY:
        record_id = f"{dataset_key}/{i}"
    else:
        record_id = paper_element[id_field]

    if record_id not in json_dict:
        # Retry with the string form of the id before giving up.
        record_id = f"{record_id}"
        if record_id not in json_dict:
            # Raise instead of calling exit(1): the failure then surfaces as
            # an ordinary traceback in the notebook instead of asking the
            # interpreter to shut down. The message reports the key count
            # rather than dumping every key.
            raise KeyError(
                f"[id field is {id_field}] {record_id} is not found in the "
                f"dataset {dataset_key} ({len(json_dict)} keys in the JSON dict)"
            )

    record = json_dict[record_id]
    for col, paper_value in paper_element.items():
        if col not in cols_of_interest:
            continue
        if paper_value == 1:
            record[col] = True
        elif paper_value == 0:
            record[col] = False
        else:
            print(f"Unexpected value {paper_value} for {col}: {paper_element}")



# For every dataset: load the original TSV and the existing paper JSON, copy
# the columns of interest row by row into the JSON records, then write the
# JSON file back.
for dataset_key in name_mapping:
    df = get_original_data(dataset_key)
    json_dict = get_paper_json_dict(dataset_key)

    for i, element in df.iterrows():
        add_data_record(dataset_key, i, element, json_dict)

    save_paper_json_dict(dataset_key, json_dict)




reading benches/paper/stsbenchmark-test-sts.json
reading benches/paper/stsbenchmark-test-sts.json
reading benches/paper/ms-mrpc.json
reading benches/paper/ms-mrpc.json
reading benches/paper/onestop_parallel_all_pairs.json
reading benches/paper/onestop_parallel_all_pairs.json
reading benches/paper/amr_true_paraphrases.json
reading benches/paper/amr_true_paraphrases.json
reading benches/paper/fb-anli-pre-hyp.json
reading benches/paper/fb-anli-pre-hyp.json
reading benches/paper/fb-anli-hyp-pre.json
reading benches/paper/fb-anli-hyp-pre.json
reading benches/paper/sickr-sts.json
reading benches/paper/sickr-sts.json
reading benches/paper/paws-x-test.json
reading benches/paper/paws-x-test.json
reading benches/paper/stannlp-snli-pre-hyp.json
reading benches/paper/stannlp-snli-pre-hyp.json
reading benches/paper/fb-xnli-pre-hyp.json
reading benches/paper/fb-xnli-pre-hyp.json
reading benches/paper/fb-xnli-hyp-pre.json
reading benches/paper/fb-xnli-hyp-pre.json
reading benches/paper/stannlp-snli-h