In [19]:
import os 
import pandas as pd 
import numpy as np 
exp_name = "exp_6_2_debising"
reasonings = [
    "reasoning",
    "logical_reasoning",
    "moral_reasoning",
    "third_person"
]
new_dfs = []
new_validity_dfs = []
for reasoning in reasonings:
    dir_path = f"results/{exp_name}/{reasoning}"
    models = [
        'openai--gpt-4o-mini', 
        'llama3_1_instruct--70b', 
        'llama3_1--8b', 
        'gemma2--9b', 
        # 'exaone--8b', 
        'qwen2--7b'
        ]
    result_dfs = []
    for model in models:
        result_df = pd.read_csv(os.path.join(dir_path, f"{model}.csv"))
        result_df['model'] = model
        result_dfs.append(result_df)

    def mv_func(x):
        values = ['must', 'should', 'ought']
        for v in values:
            if v in x:
                return v 
        if "has to" in x or "have to" in x:
            return "have to"
        return "none"

    def process_model_output(x):
        x = str(x).strip()
        # return either 1 or 0 first appearance
        # for i in range(len(x)):
        #     if x[i] == "1" or x[i] == "0":
        #         return int(x[i])
        # return np.nan #   -1000000  # np.nan #  -100000 # np.nan
        
        if x.count("Answer:") > 1:
            # drop when the second answer appears
            x = x[:x.find("Answer:")+len("Answer:")]
        elif "Answer:" in x:
            x = x.split("Answer:")[1][:3]
        if "1" in x and "0" in x:
            # print("Both 1 and 0 appear")
            return np.nan
        elif "1" in x:
            return 1
        elif "0" in x:
            return 0
        else:
            # print("No 1 or 0 appear")
            return np.nan # np.nan
        
    def process_validity(x):
        x = str(x).strip()
        # # return either 1 or 0 first appearance
        # for i in range(len(x)):
        #     if x[i] == "1" or x[i] == "0":
        #         return int(x[i])
        # return np.nan #   -1000000  # np.nan #  -100000 # np.nan
        
        if x.count("Answer:") > 1:
            # drop when the second answer appears
            x = x[:x.find("Answer:")+len("Answer:")]
        elif "Answer:" in x:
            x = x.split("Answer:")[1][:3]
        if "1" in x and "0" in x:
            # print("Both 1 and 0 appear")
            return 0 # False
        elif "1" in x:
            return 1 # True
        elif "0" in x:
            return 1
        else:
            # print("No 1 or 0 appear")
            return 0# np.nan


    result_df = pd.concat(result_dfs)
    result_df['mv'] = result_df['input'].apply(mv_func)
    result_df['model_output'] =  result_df['model_output_raw'].apply(process_model_output)
    result_df['validity'] =  result_df['model_output_raw'].apply(process_validity)
    result_df.head()

    targets = ['input_type', 'model', 'model_output']
    groupby = targets[:-1]
    result_averaged = result_df[targets].groupby(groupby).mean().reset_index()
    result_averaged_validity = result_df[['input_type', 'model', 'validity']].groupby(groupby).mean().reset_index()
    new_df = pd.DataFrame(columns=['type'] + models)
    validity_df = pd.DataFrame(columns=['type'] + models)
    for input_type in ['strong',]:
        new_row = [input_type]
        validity_row = [input_type]
        for model in models:
            temp = result_averaged[result_averaged['input_type'] == input_type]
            temp = temp[temp['model'] == model]
            new_row.append(np.round(temp['model_output'].item(), 2))
            temp2 = result_averaged_validity[result_averaged_validity['input_type'] == input_type]
            temp2 = temp2[temp2['model'] == model]
            validity_row.append(np.round(temp2['validity'].item(), 2))
        new_df.loc[len(new_df)] = new_row
        validity_df.loc[len(validity_df)] = validity_row
    new_df['reasoning'] = reasoning
    validity_df['reasoning'] = reasoning
    
    new_dfs.append(new_df)
    new_validity_dfs.append(validity_df)
new_dfs = pd.concat(new_dfs)
new_validity_dfs = pd.concat(new_validity_dfs)
new_dfs.to_csv(f'trainging_free_reasoning_{exp_name}.csv', index=False)


In [20]:
new_dfs

Unnamed: 0,type,openai--gpt-4o-mini,llama3_1_instruct--70b,llama3_1--8b,gemma2--9b,qwen2--7b,reasoning
0,strong,0.94,1.0,0.87,0.77,0.93,reasoning
0,strong,0.99,0.0,0.93,0.74,0.95,logical_reasoning
0,strong,0.99,0.31,0.9,0.8,1.0,moral_reasoning
0,strong,0.96,0.86,0.54,0.89,0.88,third_person


In [21]:
new_validity_dfs

Unnamed: 0,type,openai--gpt-4o-mini,llama3_1_instruct--70b,llama3_1--8b,gemma2--9b,qwen2--7b,reasoning
0,strong,1.0,0.0,0.2,0.56,0.36,reasoning
0,strong,1.0,0.01,0.16,0.41,0.29,logical_reasoning
0,strong,1.0,0.99,0.15,0.35,0.45,moral_reasoning
0,strong,1.0,1.0,1.0,1.0,1.0,third_person
