In [1]:
import itertools
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import stats

#Preprocessing
from scipy.stats import pearsonr, mode 
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold

#Models
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifier, Ridge
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.cluster import KMeans

#plots
import seaborn as sns
from matplotlib import pyplot as plt

# Random state: a single shared seed constant for the notebook.
# NOTE(review): presumably passed as `random_state` to the sklearn
# estimators / splitters imported above — confirm against later cells.
randState = 0

## Choose data set

In [2]:
# Study identifier; its lower-cased form doubles as the file-name stem.
data = "BIG5"
# String prefix (not a Path) because file suffixes are appended with `+`.
h_path = f"human_studies/{data}/{data.lower()}"

### Import data


In [9]:
# Load the raw Qualtrics export for the selected study.
df_raw = pd.read_csv(h_path + "_qualtrics_raw.csv")

# Columns to keep: metadata / bookkeeping columns plus the 100 item responses.
cols = ["Q6.2", "Q27.1", "Q28.2", "Q28.3", "Q8.2", "target", "fold"]
cols_response = [f"{i}_Q589_4" for i in range(1, 101)]  # raw item column names
q_cols = [f"q{i}" for i in range(1, 101)]               # short names q1..q100
numeric_cols = ["age", "target", "fold"]
cols.extend(cols_response)

# Map raw Qualtrics column names to readable ones.
rename_dict = dict(zip(cols_response, q_cols))
rename_dict.update(
    {
        "Q6.2": "attention_check",
        "Q27.1": "impressions",
        "Q28.2": "age",
        "Q28.3": "gender",
        "Q8.2": "id",
    }
)

# Select, rename, and drop the first two rows.
# NOTE(review): rows 0 and 1 are presumably the extra Qualtrics header rows
# (question text / import ids) rather than participants — confirm.
df_raw_pre = (
    df_raw.loc[:, cols]
    .rename(columns=rename_dict)
    .drop(index=[0, 1])
)

# Convert scale responses to numeric: the strings '1'..'5' become integers,
# any other value (including missing) is left unchanged.
replacer = {'1': 1, '2': 2, '3': 3, '4': 4, '5': 5}
df_raw_pre[q_cols] = df_raw_pre[q_cols].replace(replacer)

# Assign the converted columns directly instead of through `.loc[:, cols] = ...`:
# since pandas 2.0 the `.loc` form writes the values into the existing
# object-dtype columns, so the columns would silently stay `object`.
df_raw_pre[numeric_cols] = df_raw_pre[numeric_cols].astype(float)

# Drop participants who failed the attention check (only answer "1" passes).
df = (
    df_raw_pre[df_raw_pre["attention_check"] == "1"]
    .reset_index(drop=True)
    .drop(columns=["attention_check"])
)
# df.to_csv(h_path + "_qualtrics_cleaned.csv")
[df.age.mean(), np.mean(df.gender == "2"), df.shape[0]]  # mean age, share with gender == "2", sample size

[38.975083056478404, 0.45348837209302323, 602]

In [10]:
# Sanity check: the distinct targets, in ascending order and with the largest
# one removed, should be exactly 2..61.
# The largest value is dropped because of a data-entry error noted by the
# author: the repeat of target 54 / fold 3 was recorded with target 545, so
# that participant did not see the questions.
# Otherwise all folds are included.
(
    df["target"].drop_duplicates().sort_values().tolist()[:-1]
    == list(range(2, 62))
)

True

In [11]:
# Number of distinct (target, fold) conditions. If this is smaller than the
# number of rows in `df`, some conditions were answered more than once,
# i.e. there is duplicate data.
df[["target", "fold"]].drop_duplicates().shape

(573, 2)

In [13]:
# Observed (target, fold) pairs, sorted, as a list of [target, fold] lists.
folds = df.loc[:, ["target", "fold"]].sort_values(["target", "fold"]).values.tolist()

# Full design grid: targets 2..61 crossed with folds 1..10.
all_folds = [list(pair) for pair in itertools.product(range(2, 62), range(1, 11))]

# Set of tuples gives constant-time membership tests; integer grid entries
# match float-valued observations because equal numbers hash equally.
observed_folds = {tuple(pair) for pair in folds}
missing = [pair for pair in all_folds if tuple(pair) not in observed_folds]

print(len(missing))  # number of design conditions with no response
missing

28


[[2, 10],
 [6, 1],
 [6, 4],
 [10, 10],
 [11, 3],
 [11, 6],
 [11, 9],
 [12, 4],
 [13, 9],
 [20, 4],
 [27, 1],
 [27, 6],
 [29, 6],
 [32, 3],
 [36, 5],
 [38, 5],
 [39, 4],
 [42, 10],
 [47, 8],
 [48, 7],
 [50, 2],
 [50, 8],
 [53, 7],
 [54, 3],
 [54, 6],
 [55, 10],
 [57, 2],
 [59, 6]]