In [11]:
import pandas as pd
import numpy as np
from scipy import stats

import itertools
from pathlib import Path

# formatting
import re
def add_comma(match):
    """Substitution callback for ``re.sub``: the matched text followed by a comma."""
    matched_text = match.group()  # group() with no argument is the whole match
    return matched_text + ','

from scipy.stats import pearsonr

# warning messages
# Silence warnings whose text starts with the message below. The `message`
# argument is a regular expression matched case-insensitively against the
# beginning of the warning text.
# NOTE(review): presumably this is the warning scipy's pearsonr emits when one
# input has zero variance — confirm the wording against the installed scipy.
import warnings
warnings.filterwarnings("ignore", message="An input array is constant; the correlation coefficient is not defined.")

# determine accuracy rates of human judges
def topRaters(targets_data, human_data, df_folds, n_folds=10):
    """Correlate each human judge's ratings with the target's own answers.

    For every target in ``targets_data`` and every fold ``1..n_folds``, the
    first row of ``human_data`` matching that (target, fold) pair is compared
    with the target's answers on the fold's test items using Pearson's r.

    Parameters
    ----------
    targets_data : pandas.DataFrame
        Has a ``target_nr`` column and item columns named ``q<k>``.
    human_data : pandas.DataFrame
        Judges' responses with ``target``, ``fold``, ``id`` and ``q<k>``
        columns. It is only read, never modified.
    df_folds : pandas.DataFrame
        Has ``fold_nr`` and ``test_items`` columns; each ``test_items`` cell
        is an iterable of item numbers ``k`` (mapped to column ``q<k>``).
    n_folds : int, default 10
        Number of folds to evaluate; folds are numbered from 1.

    Returns
    -------
    pandas.DataFrame
        Columns ``accuracy`` (Pearson r) and ``id`` (judge id), one row per
        (target, fold) pair in iteration order. Pairs without a judge get NaN
        in both columns; pairs with fewer than two jointly non-missing items
        keep the judge id but get a NaN accuracy.

    Raises
    ------
    IndexError
        If a fold in ``1..n_folds`` has no row in ``df_folds``.
    """
    # Test-item column names depend only on the fold, so resolve them once
    # instead of once per target.
    items_by_fold = {}
    for fold_nr in range(1, n_folds + 1):
        test_items_idx = df_folds.loc[df_folds.fold_nr == fold_nr, "test_items"].iloc[0]
        items_by_fold[fold_nr] = ["q" + str(x) for x in test_items_idx]

    targets_h = []
    for target in targets_data.target_nr:  # iterate over targets
        is_target_row = targets_data.target_nr == target
        rated_target = human_data.target == target
        for fold_nr, test_items_names in items_by_fold.items():  # iterate over folds
            judge_rows = human_data.loc[rated_target & (human_data.fold == fold_nr)]
            if judge_rows.empty:
                # nobody rated this target on this fold
                targets_h.append([np.nan, np.nan])
                continue

            human_id = judge_rows["id"].iloc[0]
            true_x = np.asarray(
                targets_data.loc[is_target_row, test_items_names].iloc[0], dtype=float
            )
            human_x = np.asarray(judge_rows[test_items_names].iloc[0], dtype=float)

            # keep only the items answered by both the target and the judge
            valid = ~(np.isnan(true_x) | np.isnan(human_x))
            if valid.sum() < 2:
                # pearsonr needs at least two observations
                corr_human = np.nan
            else:
                corr_human, _ = pearsonr(true_x[valid], human_x[valid])
            targets_h.append([corr_human, human_id])

    # Build the frame once, after the loops; this also covers empty input.
    return pd.DataFrame(targets_h, columns=["accuracy", "id"])

## Choose questionnaire

In [12]:
# Questionnaire to analyse; one of BIG5, 16PF, RIASEC, HSQ (any letter case).
d = "riasec"
# Common prefix of this questionnaire's files: upper-case directory name
# followed by the lower-case file stem.
h_path = f"../human_studies/{d.upper()}/{d.lower()}"

### Import & Process Human Response Data


In [13]:
# number of best-scoring ratings whose judge ids are kept
N_TOP_RATERS = 60


def _parse_item_list(text):
    """Turn one stringified item list from the folds file into a NumPy array.

    A comma is appended after every bracketed group of numbers and after every
    number, which makes the text evaluable as a Python literal; the first
    element of the evaluated result is returned as an array.
    Presumably the file stores arrays in NumPy's whitespace-separated print
    format (e.g. "[1 5 9]") — TODO confirm against the CSV.
    """
    text = re.sub(r'\[[0-9\.\s]+\]', add_comma, text)
    text = re.sub(r'([0-9\.]+)', add_comma, text)
    # NOTE(review): eval() executes whatever the file contains. Acceptable for
    # a trusted local CSV only; consider ast.literal_eval if the file can come
    # from elsewhere.
    return np.array(eval(text)[0])


# load human judges responses
raters_data = pd.read_csv(h_path + "_qualtrics_cleaned.csv", index_col=0)

# load target responses (original data)
targets_data = pd.read_csv(h_path + "_targets_data.csv", index_col=0)
# rename questions by order (e.g., "q1"); the first four columns keep their names
item_cols = targets_data.columns[4:].tolist()
new_cols_names = ["q" + str(x) for x in range(1, len(item_cols) + 1)]
rename_cols_dict = dict(zip(item_cols, new_cols_names))
targets_data = targets_data.rename(columns=rename_cols_dict)

# load auxiliary data (which items belong to which fold)
df_folds = pd.read_csv(h_path + "_question_folds.csv")
df_folds["test_items"] = df_folds["test_items"].apply(_parse_item_list)
df_folds["train_items"] = df_folds["train_items"].apply(_parse_item_list)

# determine top raters
df = topRaters(targets_data, raters_data, df_folds)
# Drop (target, fold) pairs without a usable accuracy first, so that "nan"
# ids cannot end up in the list when fewer than N_TOP_RATERS ratings exist.
top_raters = (
    df.dropna(subset=["accuracy"])
    .sort_values("accuracy", ascending=False)
    .head(N_TOP_RATERS)
    .id.tolist()
)

In [14]:
# write the ids of the top raters to a text file, one id per line
top_raters_file = h_path + "_top_raters.txt"
with open(top_raters_file, "w") as out_file:
    out_file.writelines(f"{rater_id}\n" for rater_id in top_raters)