In [3]:
import pandas as pd
import numpy as np
from scipy import stats

import itertools
from pathlib import Path

## Choose questionnaire

In [4]:
# Questionnaire to analyse; available options: BIG5, 16PF, RIASEC, HSQ
d = "big5"
# Common path prefix of this questionnaire's human-study files:
# upper-case folder name followed by the lower-case file-name stem
h_path = f"../human_studies/{d.upper()}/{d.lower()}"

### Import & Process Human Response Data


In [5]:
# Load the raw Qualtrics export(s) for the selected questionnaire, keep the
# relevant columns, convert the answers to numbers, and save a cleaned copy.

# `h_path` already accepts `d` in any case (it uses d.upper()/d.lower()), so the
# questionnaire key is lower-cased here as well; otherwise d = "16PF" would skip
# the merge branch and fail the `nr_questions` lookup below.
questionnaire = d.lower()

# load data
if questionnaire == "16pf":  # merge multiple 16pf studies
    df_raw1 = pd.read_csv(h_path + "_qualtrics_raw1.csv")
    df_raw2 = pd.read_csv(h_path + "_qualtrics_raw2.csv")
    # The first two rows of the second file are skipped; the first two rows of the
    # merged frame are dropped further below (presumably the extra Qualtrics
    # header rows -- confirm against the export).
    df_raw = pd.concat([df_raw1, df_raw2.iloc[2:]], ignore_index=True)
else:  # directly load file for other questionnaires
    df_raw = pd.read_csv(h_path + "_qualtrics_raw.csv")

# filter columns and rename
cols = ["Q6.2", "Q27.1", "Q28.2", "Q28.3", "Q8.2", "target", "fold"]
nr_questions = {"big5": 100, "16pf": 163, "riasec": 48, "hsq": 32}
max_n = nr_questions[questionnaire] + 1  # exclusive upper bound for the 1-based item numbers
cols_response = [str(x) + "_Q589_4" for x in range(1, max_n)]  # raw item column names
question_cols = ["q" + str(x) for x in range(1, max_n)]  # item column names after renaming
cols.extend(cols_response)
df_raw_pre = df_raw.loc[:, cols]
rename_dict = dict(zip(cols_response, question_cols))
rename_dict = rename_dict | {'Q6.2': 'attention_check', 'Q27.1': 'impressions', "Q28.2": "age", "Q28.3": "gender", "Q8.2": "id"}
df_raw_pre = df_raw_pre.rename(columns=rename_dict)
df_raw_pre = df_raw_pre.drop([0, 1], axis=0)  # drop the rows labelled 0 and 1

# convert scale responses to numeric
replacer = {'1': 1, '2': 2, '3': 3, '4': 4, '5': 5}
df_raw_pre[question_cols] = df_raw_pre[question_cols].replace(replacer)
# Plain column assignment (instead of a full-slice `.loc[:, cols] = ...`), so the
# columns are replaced and really take the float dtype; the `.loc` form writes
# into the existing columns and can leave them with their previous object dtype.
numeric_cols = ["age", "target", "fold"]
df_raw_pre[numeric_cols] = df_raw_pre[numeric_cols].astype(float)

# keep only rows whose attention check equals "1"
df = (
    df_raw_pre[df_raw_pre.attention_check == "1"]
    .reset_index(drop=True)
    .drop(["attention_check"], axis=1)
)
# drop rows without an age and keep the first row per (target, fold) pair
df_final = df.dropna(subset=["age"]).drop_duplicates(["target", "fold"])
df_final.to_csv(h_path + "_qualtrics_cleaned.csv")  # save preprocessed human data

Check how many participants finished the study

In [None]:
# Print the (rows, columns) shape at each preprocessing stage, one per line:
#   1. df_raw_pre - all rows kept after column selection (to check if the data is most current in Qualtrics)
#   2. df         - rows whose attention check equals "1"
#   3. df_final   - rows with a non-missing age, one per (target, fold) pair
print(df_raw_pre.shape, df.shape, df_final.shape, sep="\n")

Print Summary Stats (mean age, gender, sample size)

In [80]:
# Summary statistics of the cleaned sample, in order:
# mean age, share of rows with gender == "2", sample size
[
    df_final["age"].mean(),
    (df_final["gender"] == "2").mean(),
    len(df_final),
]

[40.60166666666667, 0.5083333333333333, 600]

Check for missing conditions [Target, Fold]

In [81]:
# Observed [target, fold] pairs, sorted by target and then by fold
folds = (
    df_final[["target", "fold"]]
    .sort_values(["target", "fold"])
    .to_numpy()
    .tolist()
)
# Every expected pair: targets 2..61 crossed with folds 1..10
all_folds = [[target, fold] for target in range(2, 62) for fold in range(1, 11)]
# Expected pairs that do not occur in the cleaned data
missing = [pair for pair in all_folds if pair not in folds]

print(len(missing))  # nr of conditions missing
print(missing)  # print missing conditions

0
[]
