In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from pathlib import Path

#Preprocessing
from scipy.stats import pearsonr, mode 
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold

#Models
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifier, Ridge
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.cluster import KMeans

#plots
import seaborn as sns
from matplotlib import pyplot as plt

#random state
randState = 0

## Choose data set

In [2]:
data = "BIG5"
h_path = "human_studies/" + data + "/" + data.lower() 

### Import data


In [3]:
df_raw = pd.read_csv(h_path + "_qualtrics_raw.csv")

#filter columns and rename
cols = ["Q6.2", "Q27.1", "Q28.2", "Q28.3", "Q8.2", "target", "fold"]
cols_response = [str(x) + "_Q589_4" for x in range(1,101)]
cols.extend(cols_response)
df_raw_pre = df_raw.loc[:, cols]
rename_dict = {k:v for k,v in zip(cols_response, ["q" + str(x) for x in range(1,101)])}
rename_dict = rename_dict | {'Q6.2': 'attention_check', 'Q27.1': 'impressions', "Q28.2": "age", "Q28.3": "gender", "Q8.2": "id"}
df_raw_pre = df_raw_pre.rename(columns=rename_dict)
df_raw_pre = df_raw_pre.drop([0,1], axis=0)

# convert scale responses to never
replacer = {'1': 1, '2': 2, '3': 3, '4': 4, '5': 5}
df_raw_pre[["q" + str(x) for x in range(1,101)]] = df_raw_pre[["q" + str(x) for x in range(1,101)]].replace(replacer)
df_raw_pre.loc[:, ["age", "target", "fold"]] = df_raw_pre.loc[:, ["age", "target", "fold"]].astype(float)

# drop failed attention check
df = df_raw_pre[df_raw_pre.attention_check == "1"].reset_index(drop=True).drop(["attention_check"], axis = 1)
# df.to_csv(h_path + "_qualtrics_cleaned.csv")
[df.age.mean(), np.mean(df.gender=="Female"), df.shape[0]] #mean age, gender, sample size

[38.975083056478404, 0.0, 602]

In [5]:
df

Unnamed: 0,impressions,age,gender,id,target,fold,q1,q2,q3,q4,...,q91,q92,q93,q94,q95,q96,q97,q98,q99,q100
0,"Likes art, but not abstact ideas. Liberal. Fri...",39.0,1,5f4912fc3c25512e73761c48,13.0,3.0,,,,1.0,...,,,,,,,,,,2.0
1,"A normal, somewhat introverted person.",23.0,1,62ed7ba7a109290a0e53dc95,23.0,9.0,,,,,...,,,,,,,,,,
2,Life of the party with conservative leaning vi...,35.0,2,5ea9a252ef7ece0cd4993841,38.0,6.0,3.0,,,,...,,2.0,,,,,,,,
3,people pleasing and anxious,26.0,2,60fcfc901e6ff71aa60cc193,26.0,6.0,5.0,,,,...,,2.0,,,,,,,,
4,This person strikes me as a middle of the road...,32.0,1,615dcb204ee13e6299a50595,9.0,1.0,,,2.0,,...,,,,1.0,,4.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
597,I get the impression that they are a younger p...,66.0,1,62a0294e78bac60d6f9c49df,60.0,3.0,,,,2.0,...,,,,,,,,,,1.0
598,They seem a bit undecided about different things.,55.0,2,588e8b95670d6600012bd8dc,27.0,7.0,,,,,...,,,,,,,,,,
599,"Extroverted, happy, empathetic, creative",26.0,2,62865954398c6b852e1b4c22,61.0,5.0,,1.0,,,...,,,,,,,,,,
600,My first impressions are that this person thin...,23.0,2,61081e56eef3ebeb3f9e684c,57.0,8.0,,,,,...,,,,,5.0,,,1.0,2.0,


In [138]:
df.target.sort_values().drop_duplicates().tolist()[:-1] == list(range(2,62))
# repeat target 54, fold 3 (error: target variable was 545 -> did not see the questions)
# otherwise all folds included

True

In [139]:
df.loc[:,["target", "fold"]].drop_duplicates().shape
# duplicate data!

(573, 2)

In [140]:
import itertools
folds = df.loc[:,["target", "fold"]].sort_values(["target", "fold"]).values.tolist()
all_folds = [list(tup) for tup in itertools.product(range(2,62), range(1,11))]
missing = [x for x in all_folds if x not in folds]
len(missing) # 28 conditions missing
missing

[[2, 10],
 [6, 1],
 [6, 4],
 [10, 10],
 [11, 3],
 [11, 6],
 [11, 9],
 [12, 4],
 [13, 9],
 [20, 4],
 [27, 1],
 [27, 6],
 [29, 6],
 [32, 3],
 [36, 5],
 [38, 5],
 [39, 4],
 [42, 10],
 [47, 8],
 [48, 7],
 [50, 2],
 [50, 8],
 [53, 7],
 [54, 3],
 [54, 6],
 [55, 10],
 [57, 2],
 [59, 6]]