In [1]:
import os
import subprocess
import pandas as pd
from dotenv import load_dotenv

In [19]:
# Project root: three directory levels above the current working directory.
PROJ_DIR = os.path.abspath(os.path.join(os.pardir, os.pardir, os.pardir))

In [21]:
# Load environment variables from the `.env` file located in PROJ_DIR.
# The call is the last expression of the cell, so its return value is displayed.
env_path = os.path.join(PROJ_DIR, '.env')
load_dotenv(env_path)

True

In [22]:
# Import the project-local helpers. If they cannot be imported from the current
# sys.path, append the project's `src` directory and try again.
try:
    from learner_datasets import get_dataset_info
    from evaluate import format_sentence_info
except ImportError:
    # if the module is not found, add the src directory to the path
    import sys
    sys.path.append(os.path.join(PROJ_DIR, 'src'))

    from learner_datasets import get_dataset_info
    # NOTE(review): the first attempt imports from `evaluate`, this fallback from
    # `evaluate.evaluate` — confirm both resolve to the same function.
    from evaluate.evaluate import format_sentence_info

In [23]:
# Load the best dev-set results and keep only the rows of the 'wibea-dev' split.
df = pd.read_csv('../best_dev_set_results.csv')
df = df.loc[df['split'] == 'wibea-dev']

# Output-folder naming schemes; they differ only in which decoding parameters
# are part of the folder name.
open_source_model_template = 'wibea_dataset_dev_{model}_{prompt_type}_{prompt_index}_beams=1_temp=0.1_topk=50_topp=1.0'
gpt_model_template = 'wibea_dataset_dev_{model}_{prompt_type}_{prompt_index}_temp=0.1_topp=1.0'
cohere_model_template = 'wibea_dataset_dev_{model}_{prompt_type}_{prompt_index}_temp=0.1_topk=50_topp=1.0'

few_shot_base_path = os.path.join(PROJ_DIR, 'paper_output', 'output_few_shot_dev')
zero_shot_base_path = os.path.join(PROJ_DIR, 'paper_output', 'output_zero_shot_dev')

# Build the "run_1" output directory for every selected row.
model_paths = []
for _, row in df.iterrows():
    # keep only the part after the last '/', e.g. "org/name" -> "name"
    model = row['model'].split('/')[-1]

    # pick the folder naming scheme from the model name
    if 'gpt' in model:
        template = gpt_model_template
    elif 'command' in model:
        template = cohere_model_template
    else:
        template = open_source_model_template

    # "<prompt type>_<index>", where the prompt type uses '-' instead of '_'
    prompt_type, prompt_index = row['prompt_type_index'].split('_')
    prompt_type = prompt_type.replace('-', '_')

    # zero-shot runs live in their own output tree under a different name
    if prompt_type == '0_shot':
        prompt_type = 'zero_shot'
        base_path = zero_shot_base_path
    else:
        base_path = few_shot_base_path

    # prompt indices are shifted down by one in the folder names
    run_folder = template.format(
        model=model,
        prompt_type=prompt_type,
        prompt_index=int(prompt_index) - 1,
    )
    model_paths.append(os.path.join(base_path, run_folder, "run_1"))

In [None]:
# Sanity check: every run directory and each of its hyp_post_{A,B,C,N}.m2 files
# must exist. The assertion messages name the exact path that is missing.
for m in model_paths:
    print(m)
    assert os.path.exists(m), f"Missing run directory: {m}"

    for cefr in ['A', 'B', 'C', 'N']:
        hyp_file = os.path.join(m, f'hyp_post_{cefr}.m2')
        assert os.path.exists(hyp_file), f"Missing hypothesis file: {hyp_file}"

In [25]:
def evaluate_hyp_cefr_level(sub_dir, cefr_level):
    """Score one hypothesis file with ``errant_compare`` and return the overall results.

    Runs ``errant_compare`` on ``<sub_dir>/hyp_post_<cefr_level>.m2`` against
    ``$CORPORA/wi+locness/m2/<cefr_level>.dev.auto.m2`` and parses the last
    results table printed by the tool.

    Args:
        sub_dir: Directory that contains ``hyp_post_<cefr_level>.m2``.
        cefr_level: Level identifier used in both file names (e.g. ``'A'``).

    Returns:
        dict mapping the tab-separated header fields of the last results table
        to the values on the line below it. All values are strings.

    Raises:
        RuntimeError: if the ``CORPORA`` environment variable is unset or empty.
        subprocess.CalledProcessError: if ``errant_compare`` exits non-zero.
        ValueError: if the output has no header/value lines after the last
            "Span-Based Correction" title.
    """
    corpora_path = os.environ.get("CORPORA")
    if not corpora_path:
        # Fail with a clear message instead of a TypeError from os.path.join(None, ...).
        raise RuntimeError(
            "The CORPORA environment variable is not set; it must point to the "
            "directory that contains 'wi+locness/m2'."
        )

    model_hyp_file = os.path.join(sub_dir, f"hyp_post_{cefr_level}.m2")
    ref_file = os.path.join(corpora_path, f'wi+locness/m2/{cefr_level}.dev.auto.m2')

    # NOTE(review): "-cat 3" and "-v" request extra detail that is not parsed
    # below; they are kept so the command itself is unchanged — consider dropping.
    errant_compare_args = [
        "errant_compare",
        "-hyp",
        model_hyp_file,
        "-ref",
        ref_file,
        "-cat",
        "3",
        "-v"
    ]
    stdoutput = subprocess.check_output(errant_compare_args).decode("utf-8")

    # Everything after the last title is the overall table: an empty remainder
    # of the title line, then the header line, then the value line.
    evaluation_components = stdoutput.split(
        "=========== Span-Based Correction ============"
    )
    # splitlines() (rather than split("\n")) so CRLF output does not leave a
    # trailing "\r" on the last header key and value.
    overall_results = evaluation_components[-1].splitlines()
    if len(overall_results) < 3:
        raise ValueError(
            f"Unexpected errant_compare output for {model_hyp_file}: "
            "no overall results table found"
        )

    header = overall_results[1].split("\t")
    values = overall_results[2].split("\t")
    return dict(zip(header, values))

In [None]:
cefr_levels = ['A', 'B', 'C', 'N']

columns = ['model', 'cefr', 'precision', 'recall', 'f05']

# Overall results keyed by run directory, then by level. Keying by level alone
# would let every run overwrite the previous one.
all_results = {}

all_rows = []

for sub_dir in model_paths:
    print(sub_dir)
    # The run folder is named "wibea_dataset_dev_{model}_...", so the fourth
    # '_'-separated field is the model name.
    # NOTE(review): a model name containing '_' would be truncated — confirm none do.
    model = os.path.basename(os.path.dirname(sub_dir)).split('_')[3]
    for cefr_level in cefr_levels:
        try:
            results = evaluate_hyp_cefr_level(sub_dir, cefr_level)
            all_results.setdefault(sub_dir, {})[cefr_level] = results
            all_rows.append([
                model,
                cefr_level,
                results['Prec'],
                results['Rec'],
                results['F0.5'],
            ])
        except Exception as e:
            # Best effort: report the failure and skip the remaining levels of
            # this run; the other runs are still evaluated.
            print(e)
            print(f"Failed for {sub_dir} and {cefr_level}")
            break

df = pd.DataFrame(all_rows, columns=columns)
df


In [27]:
# Convert all three metric columns to float, not just f05, so that precision
# and recall also sort and aggregate numerically instead of as text.
df = df.astype({'precision': float, 'recall': float, 'f05': float})

In [28]:
# Write the results table to ../cefr_results.csv without the index column.
output_csv = os.path.join('..', 'cefr_results.csv')
df.to_csv(output_csv, index=False)