In [6]:
import pandas as pd
import numpy as np
from scipy import stats

import itertools
from pathlib import Path

## Choose questionnaire

In [7]:
# Questionnaire to analyse; one of: BIG5, 16PF, RIASEC, HSQ
d = "big5"
# Common path prefix for this questionnaire's files: upper-case folder name, lower-case file stem.
h_path = f"../human_studies/{d.upper()}/{d.lower()}"

### Import & Process Human Response Data


In [8]:
# Load the raw Qualtrics export for the questionnaire selected above.
df_raw = pd.read_csv(h_path + "_qualtrics_raw2.csv")

#filter columns and rename
cols = ["Q6.2", "Q27.1", "Q28.2", "Q28.3", "Q8.2", "target", "fold"]
cols_response = [str(x) + "_Q589_4" for x in range(1,101)]  # raw names of the 100 item columns
cols_items = ["q" + str(x) for x in range(1,101)]  # short names q1..q100 for the same items
cols.extend(cols_response)
df_raw_pre = df_raw.loc[:, cols]
rename_dict = dict(zip(cols_response, cols_items))
rename_dict = rename_dict | {'Q6.2': 'attention_check', 'Q27.1': 'impressions', "Q28.2": "age", "Q28.3": "gender", "Q8.2": "id"}
df_raw_pre = df_raw_pre.rename(columns=rename_dict)
# Drop the rows labelled 0 and 1 -- presumably the extra Qualtrics header rows; TODO confirm against the raw file.
df_raw_pre = df_raw_pre.drop([0,1], axis=0)

# convert scale responses to numeric
replacer = {'1': 1, '2': 2, '3': 3, '4': 4, '5': 5}
df_raw_pre[cols_items] = df_raw_pre[cols_items].replace(replacer)
# Assign whole columns rather than through `.loc[:, cols]`: since pandas 2.0 the `.loc` form
# writes the values into the existing columns in place, so object-dtype columns would stay
# `object` and the float cast would silently not take effect.
cols_numeric = ["age", "target", "fold"]
df_raw_pre[cols_numeric] = df_raw_pre[cols_numeric].astype(float)

# drop failed attention check (keep only rows whose attention_check answer is the string "1")
df = df_raw_pre[df_raw_pre.attention_check == "1"].reset_index(drop=True).drop(["attention_check"], axis = 1)
df.to_csv(h_path + "_qualtrics_cleaned.csv") #save preprocessed human data

Print summary stats (mean age, proportion of participants with gender coded "2", sample size)

In [4]:
# mean age, proportion of rows with gender == "2", sample size
[df["age"].mean(), (df["gender"] == "2").mean(), len(df)]

[38.84126984126984, 0.4603174603174603, 630]

Check for duplicate data

In [6]:
# Shape of the distinct (target, fold) pairs; fewer pairs than rows in df
# means some conditions appear more than once (duplicate data).
df[["target", "fold"]].drop_duplicates().shape

(598, 2)

Find missing conditions [Target, Fold]

In [7]:
# Conditions present in the data, as [target, fold] pairs sorted by target, then fold.
folds = df[["target", "fold"]].sort_values(by=["target", "fold"]).to_numpy().tolist()
# Full grid of expected conditions: targets 2..61 crossed with folds 1..10.
all_folds = [[target, fold] for target, fold in itertools.product(range(2,62), range(1,11))]
# Expected conditions that have no matching row in the data.
missing = [condition for condition in all_folds if condition not in folds]

print(len(missing)) # nr of conditions missing
print(missing) #print missing conditions

3
[[6, 4], [11, 6], [48, 7]]
