In [None]:
import pandas as pd
from glob import glob
import os
import os.path as osp
import re
import json
from nltk.corpus import wordnet as wn


def lemma2synset(lemma):
    """Return the name of the first WordNet noun synset listed for ``lemma``.

    Args:
        lemma: Word to look up in WordNet.

    Returns:
        The ``name()`` of the first noun synset WordNet returns for ``lemma``.

    Raises:
        IndexError: If WordNet lists no noun synset for ``lemma``. The message
            names the offending lemma so that a bad label can be traced back
            to the prediction it came from.
    """
    # Query nouns directly rather than fetching every part of speech and
    # filtering afterwards.
    noun_synsets = wn.synsets(lemma, pos=wn.NOUN)
    if not noun_synsets:
        # Same exception type as the bare `[0]` lookup used to raise, but with
        # a message that says which lemma failed.
        raise IndexError(f"no WordNet noun synset found for lemma {lemma!r}")
    return noun_synsets[0].name()


def map_list_to_wn(lemma_list):
    """Translate each lemma in ``lemma_list`` into a synset name.

    Args:
        lemma_list: Iterable of lemmas; each one is passed to ``lemma2synset``.

    Returns:
        A new list with one synset name per lemma, in the input order.
    """
    synset_names = []
    for lemma in lemma_list:
        synset_names.append(lemma2synset(lemma))
    return synset_names


def process_file(file):
    """Load one raw prediction file into a DataFrame tagged with model metadata.

    The file must hold a JSON object with an ``"args"`` mapping (providing at
    least ``model_id``, ``quant`` and ``model_type``) and a ``"data"`` entry
    that ``pandas.DataFrame`` can consume.

    Args:
        file: Path of the JSON file to read.

    Returns:
        A tuple ``(args, predictions_df)``. ``args`` is the file's argument
        mapping extended with a ``model_size`` entry; ``predictions_df`` holds
        the data plus constant ``model_id``, ``model_size``, ``quant`` and
        ``model_type`` columns.

    Raises:
        KeyError: If ``"args"``, ``"data"`` or one of the required argument
            keys is missing.
        ValueError: If no size token (e.g. ``7b``) can be found in
            ``args["model_id"]``.
    """
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default text encoding.
    with open(file, "r", encoding="utf-8") as f:
        content = json.load(f)

    args = content["args"]
    data = content["data"]

    # The size token is "<digits>b" right after a dash, followed either by
    # another dash or by the end of the id (so "...-13b" is accepted as well
    # as "...-7b-hf").
    size_match = re.search(r'-(\d+b)(?:-|$)', args['model_id'])
    if size_match is None:
        raise ValueError(
            f"cannot extract a model size from model_id {args['model_id']!r} (file: {file})"
        )
    args["model_size"] = size_match.group(1)

    predictions_df = pd.DataFrame(data)

    # add metadata
    predictions_df["model_id"] = args["model_id"]
    predictions_df["model_size"] = args["model_size"]
    predictions_df["quant"] = args["quant"]
    predictions_df["model_type"] = args["model_type"]

    return args, predictions_df


def process_files(files):
    """Merge the predictions of several models into one wide DataFrame.

    Every file is loaded with ``process_file``. The item-level columns are
    taken from the first file; each file then contributes its own
    ``response_*``, ``label_*``, ``synset_*``, ``location_response_*`` and
    ``location_label_*`` columns, suffixed with ``<model_type>-<model_size>``
    and joined on ``item_id``. The join uses ``pd.merge``'s default (inner)
    mode, so an item missing from any file is dropped from the result.

    Args:
        files: Iterable of paths to raw prediction JSON files.

    Returns:
        The merged DataFrame, or ``None`` when ``files`` is empty.

    Raises:
        KeyError: If a file's data lacks the ``response`` or ``label`` column.
    """
    processed = [process_file(file) for file in files]

    out_df = None

    base_cols = [
        "item_id",
        "tangram",
        "scene",
        "tangram_id",
        "tangram_pos",
        "image_name",
    ]
    for args, df in processed:
        if out_df is None:
            # The first file provides the item-level columns of the result.
            out_df = df[base_cols].copy()

        model_identifyer = f'{args["model_type"]}-{args["model_size"]}'
        response_col = f'response_{model_identifyer}'
        label_col = f'label_{model_identifyer}'
        synset_col = f'synset_{model_identifyer}'
        location_response_col = f'location_response_{model_identifyer}'
        location_label_col = f'location_label_{model_identifyer}'
        df = df.rename(columns={"response": response_col, "label": label_col, "location_response": location_response_col, "location_label": location_label_col})

        # Responses and labels are used right below, so their absence has to
        # be reported before the first access rather than papered over.
        missing_required = [col for col in (response_col, label_col) if col not in df.columns]
        if missing_required:
            raise KeyError(
                f'predictions for {model_identifyer} lack required column(s): {missing_required}'
            )

        df[synset_col] = df[label_col].map(map_list_to_wn)

        cols = ["item_id", response_col, label_col, synset_col, location_response_col, location_label_col]
        # Remaining columns that are absent from this file (presumably the
        # location_* ones for runs that do not produce them - TODO confirm)
        # are added as all-None so the merged frame has a uniform layout.
        for col in cols:
            if col not in df.columns:
                df[col] = None

        # handle missing values
        n_entries = df[response_col].map(len)
        # Series.max() instead of builtin max() so an empty table does not
        # raise; int() keeps the list-repeat arithmetic below in plain ints.
        max_entries = int(n_entries.max()) if len(n_entries) > 0 else 0
        too_few = n_entries < max_entries
        too_few_selection = df[too_few]

        if len(too_few_selection) > 0:
            print(f'padding missing values in {model_identifyer} ({len(too_few_selection)} rows affected)')
            print('item_ids:', too_few_selection.item_id.to_list())

        # Placeholder appended to each list column of a short row.
        fillers = {response_col: 'None', label_col: 'None', synset_col: 'none.n.01'}
        for i in too_few_selection.index:
            for col, filler in fillers.items():
                entries = df.at[i, col]
                # Pad every list up to max_entries on its own length, so the
                # three lists of a row end up aligned even if they started
                # out unequal. A new list is stored instead of extending the
                # existing one in place.
                df.at[i, col] = list(entries) + [filler] * (max_entries - len(entries))

        out_df = pd.merge(
            out_df,
            df[cols],
            left_on="item_id",
            right_on="item_id",
        )

    return out_df

In [None]:
# Raw prediction files are read from input_dir; the processed table is written
# to output_dir. Both paths are resolved against the current working directory.
input_dir = osp.abspath('../predicted_data/raw')
output_dir = osp.abspath('../predicted_data/processed')

# exist_ok=True replaces the separate isdir() check and removes the window
# between checking for the directory and creating it.
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Choose which run to process: a file is selected when the presence of the
# "twostep" / "fewshot" tag in its name matches the corresponding flag.
twostep_predictions = True
fewshot_predictions = False

# glob() returns paths in arbitrary, platform-dependent order; sorting keeps
# the file order (and with it the column order of the merged table) stable.
prediction_files = sorted(glob(osp.join(input_dir, '*.json')))

# Test the tags against the file name only. input_dir is an absolute path, so
# matching on the whole path would let a parent directory whose name contains
# "twostep" or "fewshot" select (or exclude) every file.
prediction_files = [
    p for p in prediction_files
    if ("twostep" in osp.basename(p)) == twostep_predictions
    and ("fewshot" in osp.basename(p)) == fewshot_predictions
]

print(*prediction_files, sep='\n')

In [None]:
predictions_df = process_files(prediction_files)
if predictions_df is None:
    # process_files returns None when it is given no files; stop here with a
    # message that points at the cause instead of failing on the next line.
    raise ValueError('no prediction files selected - check input_dir and the twostep/fewshot flags')

# Per-model list columns that are unpacked into one row per list entry.
# (location_* columns do not start with these prefixes and stay untouched.)
pred_cols = sorted(
    c for c in predictions_df.columns
    if c.startswith(('response', 'label', 'synset'))
)

# Position of every entry within its row's lists, derived from the actual list
# length rather than a fixed range(10). explode() still requires all list
# columns of a row to have the same length and raises otherwise.
predictions_df['set_idx'] = [
    list(range(len(entries))) for entries in predictions_df[pred_cols[0]]
]
pred_cols.append('set_idx')

predictions_df = predictions_df.explode(pred_cols)

# Identifier of a single prediction: tangram id, scene and position in the set.
predictions_df['item_identifyer'] = [
    f'{tangram_id}-{scene}-{set_idx}'
    for tangram_id, scene, set_idx in zip(
        predictions_df['tangram_id'],
        predictions_df['scene'],
        predictions_df['set_idx'],
    )
]

predictions_df.head()

In [None]:
# The output file name records which run (twostep / fewshot) was processed.
twostep_suffix = "_twostep" if twostep_predictions else ""
fewshot_suffix = "_fewshot" if fewshot_predictions else ""
out_path = osp.join(
    output_dir,
    f'processed_predictions{twostep_suffix}{fewshot_suffix}.csv',
)

print(f'write results to {out_path}')

# Written with the DataFrame index as the first CSV column (to_csv default).
predictions_df.to_csv(out_path)