In [None]:
# 240708

import json
import pandas as pd
import os
import numpy as np

# CSV that lists the model outputs to evaluate; it is loaded with pd.read_csv(path_list)
# and its 'path', 'name' and 'split' columns are read by the driver cell.
path_list = r"path_list.csv"
# Answer key. The first CSV column becomes the index; the frame is compared
# element-wise against each model's response table.
answ_df = pd.read_csv(r"../0_data/answer.csv", index_col=0)

# Root folder for per-model outputs (one sub-folder per model name).
out_path = '.subjects'
# Folder for the cross-model summary files (mean*.csv, warn_list.json).
anl_path = '.out'

In [2]:
# Circled-digit answer markers and the option number each one stands for.
_SYMBOL_TO_NUMBER = {'①': 1, '②': 2, '③': 3, '④': 4, '⑤': 5}


def _answer_text(response, split_by):
    """Return the part of ``response`` that should be scanned for an answer.

    ``split_by`` is honoured only when it is a real delimiter string. ``None``,
    NaN (an empty CSV cell), ``''`` and the sentinel ``'@'`` all mean "scan the
    whole response". If the delimiter does not occur in the text, the whole
    response is scanned instead of raising ``IndexError``.
    """
    text = str(response)
    if not isinstance(split_by, str) or split_by in ('', '@'):
        return text
    pieces = text.split(split_by)
    return pieces[1] if len(pieces) > 1 else text


def _pick_choice(desc):
    """Extract one option number from ``desc``.

    Returns ``(number_or_None, needs_review)``.

    * If circled digits are present, the unique most frequent one wins; on a
      tie the first circled digit in the text wins. The result is flagged for
      review whenever the first circled digit is not the unique most frequent
      one.
    * Otherwise the first plain character in '12345' is used (not flagged).
    * If nothing is found, ``(None, True)`` is returned.
    """
    hits = [ch for ch in desc if ch in _SYMBOL_TO_NUMBER]
    if hits:
        first = hits[0]
        counts = {sym: hits.count(sym) for sym in _SYMBOL_TO_NUMBER}
        top = max(counts.values())
        leaders = [sym for sym, cnt in counts.items() if cnt == top]
        mode = leaders[0] if len(leaders) == 1 else None
        chosen = first if mode is None else mode
        return _SYMBOL_TO_NUMBER[chosen], mode != first

    for ch in desc:
        if ch in '12345':
            return int(ch), False
    return None, True


def run_resp(resp_path, save_path, split_by):
    """Parse one model's raw JSON responses into a question-by-session table.

    Parameters
    ----------
    resp_path : str
        Path to a JSON file holding a list of records with the keys
        'year', 'subject_id', 'question_number' and 'response'.
    save_path : str
        Accepted for interface compatibility; not used by this function.
    split_by : str or None
        Delimiter after which the answer is expected. ``None``, NaN, ``''`` or
        ``'@'`` mean "use the whole response". Previously this argument was
        overwritten with ``'@'`` inside the function, so the value supplied by
        the caller never had any effect.

    Returns
    -------
    resp_df : pandas.DataFrame
        Index 1..140 (question number), one column per ``'{year}_{session}'``,
        holding the extracted option number (or None).
    year_dict : dict
        ``{basename(resp_path): {record position (1-based): scanned text}}``
        for every record whose answer was ambiguous or could not be found.
    """
    # json.load accepts a binary handle and detects the encoding itself.
    with open(resp_path, 'rb') as f:
        exam = json.load(f)
    filename = os.path.basename(resp_path)

    resp_df = pd.DataFrame(index=range(1, 141))
    alert_dict = {}

    for position, record in enumerate(exam, start=1):
        subject_id = record['subject_id']
        session = subject_id[0]  # first character of the subject id
        ques_num = record['question_number']
        # Renumber selected subjects before writing into the session column.
        # NOTE(review): presumably this maps per-subject numbering onto the
        # session-wide sheet; confirm the 77 / 63 offsets against the answer key.
        if subject_id in ('3-2', '3-3', '3-4'):
            ques_num += 77
        elif subject_id == '4-1':
            ques_num -= 63
        col_name = f"{record['year']}_{session}"

        desc = _answer_text(record['response'], split_by)
        resp_numb, needs_review = _pick_choice(desc)
        if needs_review:
            alert_dict[position] = desc

        if col_name not in resp_df.columns:
            resp_df[col_name] = None
        resp_df.at[ques_num, col_name] = resp_numb

    year_dict = {filename: alert_dict}
    return resp_df, year_dict

In [3]:
def run_anal(answ_df, resp_df, save_path, n_sessions=4):
    """Score a response table against the answer key and write the reports.

    Parameters
    ----------
    answ_df : pandas.DataFrame
        Answer key; must carry exactly the same index and column labels as
        ``resp_df`` (``==`` between frames raises otherwise).
    resp_df : pandas.DataFrame
        Extracted answers, columns named ``'{year}_{session}'`` (the first
        four characters of each column name are parsed as the year).
    save_path : str
        Existing directory that receives ``response.csv``,
        ``analysis_40.csv`` and ``analysis_60.csv``.
    n_sessions : int, default 4
        Number of sessions per year that must each reach the 40 % cut-off
        for the year to count as passed. This was previously hard-coded as 4.

    Returns
    -------
    pandas.DataFrame
        One row per year with the columns score, solved, total, 40_pass,
        40_pass*, rate, rate*, 60_pass, 60_pass*, pass. Starred columns use
        the number of answered questions as denominator instead of the number
        of questions in the key.
    """
    score_df = (answ_df == resp_df).astype(int)
    resp_df.to_csv(f'{save_path}/response.csv')

    correct = score_df.sum()
    answered = resp_df.count()
    available = answ_df.count()

    # Per-session report.
    df = pd.DataFrame()
    df['score'] = correct
    df['solved'] = answered
    df['total'] = available
    df['rate'] = correct / available * 100
    df['rate*'] = correct / answered * 100
    df['40_pass'] = df['rate'] >= 40
    df['40_pass*'] = df['rate*'] >= 40
    df.to_csv(f'{save_path}/analysis_40.csv')

    # Per-year report: summing turns the boolean 40_pass flags into the number
    # of sessions that cleared the cut-off; summed rates are meaningless, so
    # they are dropped and recomputed from the summed counts.
    by_session = df.copy()
    by_session['year'] = by_session.index.str[:4].astype(int)
    df_yearly = (
        by_session.groupby('year').sum()
        .drop(columns=['rate', 'rate*'], errors='ignore')
    )

    df_yearly['rate'] = df_yearly['score'] / df_yearly['total'] * 100
    df_yearly['rate*'] = df_yearly['score'] / df_yearly['solved'] * 100
    df_yearly['60_pass'] = df_yearly['rate'] >= 60
    df_yearly['60_pass*'] = df_yearly['rate*'] >= 60
    # Passed = overall rate >= 60 % AND every one of the n_sessions sessions >= 40 %.
    df_yearly['pass'] = df_yearly['60_pass'] & (df_yearly['40_pass'] == n_sessions)
    df_yearly.to_csv(f'{save_path}/analysis_60.csv')
    return df_yearly

In [4]:
# One row per model run: where its responses live, its display name, and the
# delimiter handed to run_resp.
data_df = pd.read_csv(path_list)
mean_df = pd.DataFrame()
conv_dict = {}

os.makedirs(anl_path, exist_ok=True)

for resp_path, resp_name, split_by in zip(data_df['path'], data_df['name'], data_df['split']):
    save_path = f'{out_path}/{resp_name}'
    os.makedirs(save_path, exist_ok=True)

    # Parse the raw responses, then score them against the answer key.
    resp_df, year_dict = run_resp(resp_path, save_path, split_by)
    anal_df = run_anal(answ_df, resp_df, save_path)

    # Cross-year summary for this model: mean rate / solved count, number of
    # passed years, and the standard deviation of the yearly rate.
    scr = anal_df[['rate', 'solved']].mean()
    pss = anal_df[['pass']].sum()
    std = anal_df[['rate']].std()

    if mean_df.empty:
        mean_df = pd.DataFrame(columns=scr.index)
    mean_df.loc[resp_name] = scr
    mean_df.at[resp_name, 'pass'] = pss['pass']
    mean_df.at[resp_name, 'SD'] = std['rate']

    conv_dict[resp_name] = year_dict

mean_df.to_csv(f'{anl_path}/mean.csv')
# Responses flagged during parsing, grouped by model name and source file.
with open(f'{anl_path}/warn_list.json', 'w', encoding='utf-8') as f:
    json.dump(conv_dict, f, ensure_ascii=False, indent=4)

In [5]:
# Rows whose name carries the ' [kor]' tag; the tag is stripped from the index.
kor_target = r' \[kor\]'
kor_rows = mean_df.index.str.contains(kor_target)
df_kor = mean_df.loc[kor_rows].copy()
df_kor.index = df_kor.index.str.split(kor_target).str[0]
df_kor.to_csv(f'{anl_path}/mean_kor.csv')

# Same treatment for the ' [eng]' rows.
eng_target = r' \[eng\]'
eng_rows = mean_df.index.str.contains(eng_target)
df_eng = mean_df.loc[eng_rows].copy()
df_eng.index = df_eng.index.str.split(eng_target).str[0]
df_eng.to_csv(f'{anl_path}/mean_eng.csv')

In [6]:
# Merge the English and Korean summaries side by side (aligned on model name).
eng_rate = df_eng['rate']
kor_rate = df_kor['rate']
eng_pass = df_eng['pass'].astype(int)
kor_pass = df_kor['pass'].astype(int)

merged_columns = {
    'eng-rate': eng_rate,
    'kor-rate': kor_rate,
    'eng-pass': eng_pass,
    'kor-pass': kor_pass,
}

# Build the frame and label its index.
df_merged = pd.DataFrame(merged_columns).rename_axis('name')

df_merged.to_csv(f'{anl_path}/mean_merged.csv')
df_merged

Unnamed: 0_level_0,eng-rate,kor-rate,eng-pass,kor-pass
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
c4ai-command-r-plus,67.333333,64.190476,5,5
c4ai-command-r-v01,59.666667,56.904762,1,0
chat-bison@001,63.428571,60.619048,6,4
chat-bison@002,54.190476,49.380952,0,0
claude-3-5-sonnet@20240620,84.380952,84.380952,6,6
claude-3-opus@20240229,79.142857,80.047619,6,6
claude-3-sonnet@20240229,71.428571,65.857143,5,5
gemini-1.0-pro-001,65.52381,62.666667,3,4
gemini-1.0-pro-002,56.952381,47.333333,1,0
gemini-1.5-pro-001,74.571429,71.095238,6,6
