In [1]:
from pathlib import Path

import pandas as pd
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# load datasets

# Columns kept from each Hub dataset; everything else is dropped.
COLUMNS = ['common_id', 'template_id', 'topic_id', 'topic_polarity', 'call_idx', 'prompt_text', 'response_text', 'eval_text']
MODELS = ["rpp_step3_qwen25-14b"] #["rpp_step3_llama-31-70b", "rpp_step3_llama-31-8b", "rpp_step3_qwen25-72b", "rpp_step3_qwen25-14b", "rpp_step3_qwen25-7b", "rpp_step3_gpt4o", "rpp_step3_olmo2-7b", "rpp_step3_olmo2-13b"]

# Maps the short model name (text after the last "_") to its DataFrame.
df_dict = {}

for model in MODELS:

    print('Loading', model)

    ds = load_dataset(f"musashihinck/{model}")["train"]

    # sample 100 from ds for debugging
    #ds = ds.select(range(100))

    # Derive the dictionary key once, e.g. "rpp_step3_qwen25-14b" -> "qwen25-14b".
    model_key = model.split("_")[-1]

    # Convert to pandas and keep only the relevant columns. The filtered frame
    # is the only thing ever stored in df_dict, so a failing column selection
    # cannot leave an unfiltered frame behind for later cells.
    df_dict[model_key] = ds.to_pandas()[COLUMNS]


Loading rpp_step3_qwen25-14b


Downloading data: 100%|██████████| 23/23 [02:59<00:00,  7.81s/files]
Generating train split: 100%|██████████| 3180000/3180000 [00:15<00:00, 202345.44 examples/s]


In [3]:
# parse the eval_text column

def parse_eval_text(eval_text):
    """Map a raw evaluator output string to a label.

    The text is scanned left to right and the first digit between 1 and 5
    is returned as an ``int``. If no such digit occurs, ``"refusal"`` is
    returned when the text contains "refusal" (case-insensitive); otherwise
    ``"PARSE ERROR"`` is returned.

    A digit takes precedence over the word "refusal": a text containing both
    is labelled with the digit.

    Args:
        eval_text: Raw evaluator output. Any non-string value (for example
            ``None`` or ``NaN``) is treated as unparseable.

    Returns:
        An ``int`` in 1..5, the string ``"refusal"``, or the string
        ``"PARSE ERROR"``.
    """
    # None / NaN are not iterable and have no .lower(); report them with the
    # existing sentinel instead of raising TypeError in the middle of .apply().
    if not isinstance(eval_text, str):
        return "PARSE ERROR"

    for char in eval_text:
        # Single-character membership test; equivalent to comparing the
        # character against "1".."5" one by one.
        if char in "12345":
            return int(char)

    if "refusal" in eval_text.lower():
        return "refusal"

    return "PARSE ERROR"


# Label every model's evaluator outputs and report the label distribution.
for model_name, model_df in df_dict.items():

    # Adds the column in place on the frame stored in df_dict.
    model_df['eval_label'] = model_df['eval_text'].apply(parse_eval_text)

    # count values
    print(f'{model_name}: total of {len(model_df)} samples')
    print(model_df['eval_label'].value_counts())
    print()

    # Debugging aid (disabled): print a small sample of PARSE ERROR rows.
    #parse_errors = model_df[model_df['eval_label'] == 'PARSE ERROR']
    #if len(parse_errors) > 0:
    #    print("#" * 80)
    #    print(model_name)
    #    for _, row in parse_errors.sample(3, random_state=42).iterrows():
    #        print(row['eval_text'])
    #        print()

qwen25-14b: total of 3180000 samples
eval_label
2              689689
3              625786
5              575531
4              569618
1              527827
refusal        189663
PARSE ERROR      1886
Name: count, dtype: int64



In [4]:
def sanity_checks(df):
    """Print basic integrity statistics for one model's DataFrame.

    Prints the row count, asserts that every ``call_idx`` value and every
    ``topic_id`` value occurs the same number of times, then prints the
    number of missing ``response_text`` / ``eval_text`` values and the number
    of rows whose ``eval_label`` equals ``'PARSE ERROR'``.

    Args:
        df: DataFrame with the columns ``call_idx``, ``topic_id``,
            ``response_text``, ``eval_text`` and ``eval_label``.

    Raises:
        AssertionError: If the rows are not evenly distributed over
            ``call_idx`` or over ``topic_id``.
    """
    # total number of rows
    row_total = len(df.index)
    print(f"Total number of rows: {row_total}")

    # Balanced design check: all groups of a column must have the same size,
    # i.e. the per-value counts collapse to a single distinct number.
    for balanced_column in ("call_idx", "topic_id"):
        group_sizes = df[balanced_column].value_counts()
        assert group_sizes.nunique() == 1

    # missing values in the two free-text columns
    missing_responses = df["response_text"].isna().sum()
    print(f"Missing values in response_text: {missing_responses}")

    missing_evals = df["eval_text"].isna().sum()
    print(f"Missing values in eval_text: {missing_evals}")

    # rows whose evaluator output could not be parsed
    parse_error_total = (df["eval_label"] == 'PARSE ERROR').sum()
    print(f"PARSE ERROR in eval_label: {parse_error_total}")

# Run the integrity checks once per loaded model.
for model_name, model_df in df_dict.items():
    print(f"Model: {model_name}")
    sanity_checks(model_df)
    # "\n" plus print's own newline leaves two blank lines between models.
    print("\n")

Model: qwen25-14b
Total number of rows: 3180000
Missing values in response_text: 0
Missing values in eval_text: 0
PARSE ERROR in eval_label: 1886




In [5]:
# store as csv

# Create the target directory first: to_csv does not create missing parent
# directories, so the export would otherwise fail on a fresh checkout.
output_dir = Path("../data/model_responses")
output_dir.mkdir(parents=True, exist_ok=True)

# One CSV per model, named after the short model key, without the index column.
for model, model_df in df_dict.items():
    model_df.to_csv(output_dir / f"{model}.csv", index=False)
    print(f"Saved {model}.csv")

Saved qwen25-14b.csv
