In [1]:
import pandas as pd
import numpy as np
import os
import re
from copy import deepcopy 

from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Directory that holds the per-model metric files read by the cells below.
# The path is relative, so it is resolved against the working directory that is
# current at the time it is used. Swap the two lines to switch experiments.
base_dir = 'data/english_only/prompting_results_clean/with_metrics/' #experiment 1
#base_dir = 'data/english_only/100k_results/with_metrics' #experiment 2

In [3]:
from aggregate_metrics_helper import *

In [4]:
# Change the working directory; relative paths used by later cells (e.g. base_dir)
# are resolved against this location.
os.chdir('/shared/0/projects/research-jam-summer-2024/')
# Enable pandas copy-on-write mode for the rest of the session.
pd.options.mode.copy_on_write = True

In [5]:
# Print the primary result files in base_dir: names with the expected prefix,
# excluding the derived artefacts identified by the suffixes below.
for f in os.listdir(base_dir):
    if not f.startswith('wildchat_subset_en_2k_prompting'):
        continue
    if f.endswith(('_end.jsonl', '_embeddings.npz', 'lexical.jsonl', 'MERGED.jsonl', '_POS_DEP.jsonl')):
        continue
    print(f)

wildchat_subset_en_2k_prompting_Qwen2-72B-Instruct.jsonl
wildchat_subset_en_2k_prompting_c4ai-command-r-v01.jsonl
wildchat_subset_en_2k_prompting_Mixtral-8x7B-Instruct-v0.1.jsonl
wildchat_subset_en_2k_prompting_Meta-Llama-3-70B-Instruct.jsonl
wildchat_subset_en_2k_prompting_Mistral-Large-Instruct.jsonl
wildchat_subset_en_2k_prompting_Phi-3-medium-4k-instruct.jsonl
wildchat_subset_en_2k_prompting_Mistral-7B-Instruct-v0.3.jsonl
wildchat_subset_en_2k_prompting_Meta-Llama-3.1-8B-Instruct.jsonl
wildchat_subset_en_2k_prompting_Meta-Llama-3.1-70B-Instruct.jsonl


In [6]:
# READ DATA
# Build the `metrics` frame for one results file via the helper module.
# `f` reuses the name of the loop variable from the file-listing cell above.
# NOTE(review): the saved output of this cell ends in
# KeyError: "['human_pos', 'llm_pos'] not in index", so the `metrics` object used by
# later cells may be left over from an earlier run — confirm on a fresh kernel.
f = 'wildchat_subset_en_2k_prompting_Qwen2-72B-Instruct.jsonl'
metrics = make_human_vs_llm_df(f, base_dir)

wildchat_subset_en_2k_prompting_Qwen2-72B-Instruct.jsonl
read metrics


KeyError: "['human_pos', 'llm_pos'] not in index"

In [None]:
# Preview the first rows of the loaded frame.
metrics.head()

In [None]:
# TODO: turn llm_ cols into metrics from the random baseline (this cell currently contains no code)


In [8]:
# log scale heavy-tailed count metrics
# All required columns are checked before anything is modified: failing part-way
# through the loop would leave the earlier columns already transformed, and re-running
# the cell after fixing the input would then log-scale those columns a second time.
# NOTE: the transform overwrites the columns, so run this cell once per freshly
# loaded `metrics`.
_log_scaled_metrics = ['word_count', 'word_length', 'perplexity', 'dep_dpth', 'dep_brth', 'dep_dep_dist']
_missing_log_cols = [
    prefix + name
    for name in _log_scaled_metrics
    for prefix in ('human_', 'llm_')
    if prefix + name not in metrics.columns
]
if _missing_log_cols:
    raise KeyError(f"cannot log-scale metrics; missing columns: {_missing_log_cols}")
for k in _log_scaled_metrics:
    # log1p(x) is log(x + 1), evaluated accurately for small x
    metrics['human_'+k] = np.log1p(metrics['human_'+k])
    metrics['llm_'+k] = np.log1p(metrics['llm_'+k])

KeyError: 'human_dep_dpth'

In [8]:
# Sanity check on the columns of `metrics`: every metric in all_metrics should be
# present with both a human_ and an llm_ prefix, every merge key should be present,
# and no column should fall outside those two groups.
# Each printed Counter is expected to contain only True.
for _prefix in ('human_', 'llm_'):
    print(Counter(_prefix + name in metrics.columns for name in all_metrics))
print(Counter(key in metrics.columns for key in merge_keys))
print(Counter(
    col in merge_keys or re.sub('human_|llm_','',col) in all_metrics
    for col in metrics.columns
))


Counter({True: 20})
Counter({True: 20})
Counter({True: 4})
Counter({True: 44})


In [9]:
# Map each metric name to the name of the group it is listed in. Groups are visited
# in the order lexical, syntactic, semantic, style, so a name listed in more than one
# group ends up with the last of them.
metric_category = {
    name: label
    for label, names in (
        ('lexical', lexical),
        ('syntactic', syntactic),
        ('semantic', semantic),
        ('style', style),
    )
    for name in names
}

In [12]:
# CREATE COLUMN AGGREGATES WITH CORRELATION
def corr_metric(corr_method):
    """Correlate the human and LLM values of every metric, separately per model.

    Reads the notebook-level ``metrics`` frame and the ``all_metrics`` and
    ``metric_category`` mappings, and delegates each computation to
    ``col_diff_correlate``. Prints each model value as it is processed.

    Parameters
    ----------
    corr_method
        Passed through to ``col_diff_correlate`` and stored in the ``corr_method``
        column (this notebook calls it with 'pearson', 'spearman' and 'kendall').

    Returns
    -------
    pandas.DataFrame
        One row per (model, metric) pair with the columns ``model``, ``metric``,
        ``cor``, ``corr_method`` and ``category``. A metric with no entry in
        ``metric_category`` keeps its own name in ``category``.
    """
    records = []
    # unique() yields the models in order of first appearance, so the row order of the
    # result is the same on every run. Iterating a set of strings is not reproducible
    # because string hashing is randomized per process.
    for mod in metrics['model'].unique():
        print(mod)
        sub = metrics[metrics['model'] == mod]
        for k in all_metrics:
            records.append({
                'model': mod,
                'metric': k,
                'cor': col_diff_correlate(sub['human_'+k], sub['llm_'+k], all_metrics[k], corr_method),
                'corr_method': corr_method,
            })
    # Explicit columns keep the frame well-formed even when there are no records.
    col_corr = pd.DataFrame(records, columns=['model', 'metric', 'cor', 'corr_method'])
    col_corr['category'] = col_corr['metric'].replace(metric_category)
    return col_corr

In [13]:
# Build one correlation table per method, keyed by the method name.
col_corr = {m: corr_metric(m) for m in ['pearson','spearman','kendall']}

wildchat_subset_en_2k_prompting_Meta-Llama-3-70B-Instruct
wildchat_subset_en_2k_prompting_c4ai-command-r-v01
wildchat_subset_en_2k_prompting_Meta-Llama-3.1-8B-Instruct
wildchat_subset_en_2k_prompting_Mistral-7B-Instruct-v0.3
wildchat_subset_en_2k_prompting_Phi-3-medium-4k-instruct
wildchat_subset_en_2k_prompting_Mixtral-8x7B-Instruct-v0.1
wildchat_subset_en_2k_prompting_Qwen2-72B-Instruct
wildchat_subset_en_2k_prompting_Mistral-Large-Instruct
wildchat_subset_en_2k_prompting_Meta-Llama-3.1-70B-Instruct
wildchat_subset_en_2k_prompting_Meta-Llama-3-70B-Instruct
wildchat_subset_en_2k_prompting_c4ai-command-r-v01
wildchat_subset_en_2k_prompting_Meta-Llama-3.1-8B-Instruct
wildchat_subset_en_2k_prompting_Mistral-7B-Instruct-v0.3
wildchat_subset_en_2k_prompting_Phi-3-medium-4k-instruct
wildchat_subset_en_2k_prompting_Mixtral-8x7B-Instruct-v0.1
wildchat_subset_en_2k_prompting_Qwen2-72B-Instruct
wildchat_subset_en_2k_prompting_Mistral-Large-Instruct
wildchat_subset_en_2k_prompting_Meta-Llama-3.1

In [None]:
# Save only the Pearson table. index=False (the documented boolean form) keeps the
# row index out of the CSV.
col_corr['pearson'].to_csv('data/agg_metrics/cor_metrics_random_baseline.csv', index=False)