In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from pathlib import Path

#Preprocessing
from scipy.stats import pearsonr, mode 
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold

#Models
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifier, Ridge
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.cluster import KMeans

#plots
import seaborn as sns
from matplotlib import pyplot as plt

# Random seed constant (0); presumably passed as `random_state` to the
# models/splitters imported above - TODO confirm where it is consumed.
randState = 0

## Choose data set

In [2]:
# Name of the study to analyse; it selects the sub-folder and the file prefix.
data = "BIG5"
# Common path prefix for this study's files: human_studies/<DATA>/<data>
h_path = f"human_studies/{data}/{data.lower()}"

### Import data


In [123]:
# Load the raw Qualtrics export for the selected study.
df_raw = pd.read_csv(h_path + "_qualtrics_raw.csv")

# Keep the metadata columns plus the 100 item-response columns, then rename them.
cols = ["Q6.2", "Q27.1", "Q28.2", "Q28.3", "Q8.2", "target", "fold"]
cols_response = [str(x) + "_Q589_4" for x in range(1, 101)]
q_cols = ["q" + str(x) for x in range(1, 101)]  # short item names q1..q100
cols.extend(cols_response)
df_raw_pre = df_raw.loc[:, cols]
rename_dict = dict(zip(cols_response, q_cols))
rename_dict = rename_dict | {'Q6.2': 'attention_check', 'Q27.1': 'impressions', "Q28.2": "age", "Q28.3": "gender", "Q8.2": "id"}
df_raw_pre = df_raw_pre.rename(columns=rename_dict)
# Rows 0 and 1 are removed - presumably the extra Qualtrics header rows; TODO confirm.
df_raw_pre = df_raw_pre.drop([0, 1], axis=0)

# Convert scale responses from strings to integers.
replacer = {'1': 1, '2': 2, '3': 3, '4': 4, '5': 5}
df_raw_pre[q_cols] = df_raw_pre[q_cols].replace(replacer)
# Assign whole columns rather than `.loc[:, ...]`: on recent pandas the latter
# writes the floats in place and leaves the columns with their original dtype.
numeric_cols = ["age", "target", "fold"]
df_raw_pre[numeric_cols] = df_raw_pre[numeric_cols].astype(float)

# Drop participants who failed the attention check.
# NOTE(review): recorded outputs show attention_check / gender coded both as
# numbers and as labels ("Never", "Male"/"Female") in different runs - confirm
# that the "1" and "Female" literals below match the export actually loaded.
df = df_raw_pre[df_raw_pre.attention_check == "1"].reset_index(drop=True).drop(["attention_check"], axis=1)
df.to_csv(h_path + "_qualtrics_cleaned.csv")

# Summary shown as the cell output: mean age, share of gender == "Female", sample size.
[df.age.mean(), np.mean(df.gender == "Female"), df.shape[0]]

In [124]:
# Show the cleaned frame; the rendered repr is truncated to the first and last rows.
df

Unnamed: 0,impressions,age,gender,id,target,fold,q1,q2,q3,q4,...,q91,q92,q93,q94,q95,q96,q97,q98,q99,q100
0,"Likes art, but not abstact ideas. Liberal. Fri...",39.0,1,5f4912fc3c25512e73761c48,13.0,3.0,,,,1,...,,,,,,,,,,2
1,"A normal, somewhat introverted person.",23.0,1,62ed7ba7a109290a0e53dc95,23.0,9.0,,,,,...,,,,,,,,,,
2,Life of the party with conservative leaning vi...,35.0,2,5ea9a252ef7ece0cd4993841,38.0,6.0,3,,,,...,,2,,,,,,,,
3,people pleasing and anxious,26.0,2,60fcfc901e6ff71aa60cc193,26.0,6.0,5,,,,...,,2,,,,,,,,
4,This person strikes me as a middle of the road...,32.0,1,615dcb204ee13e6299a50595,9.0,1.0,,,2,,...,,,,1,,4,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
597,I get the impression that they are a younger p...,66.0,1,62a0294e78bac60d6f9c49df,60.0,3.0,,,,2,...,,,,,,,,,,1
598,They seem a bit undecided about different things.,55.0,2,588e8b95670d6600012bd8dc,27.0,7.0,,,,,...,,,,,,,,,,
599,"Extroverted, happy, empathetic, creative",26.0,2,62865954398c6b852e1b4c22,61.0,5.0,,1,,,...,,,,,,,,,,
600,My first impressions are that this person thin...,23.0,2,61081e56eef3ebeb3f9e684c,57.0,8.0,,,,,...,,,,,5,,,1,2,


In [82]:
# One response carries target 545 instead of 54 (original note: repeat of
# target 54, fold 3; the target variable was 545 -> did not see the questions).
# The row is kept and relabelled (original note: "include in test").
df.loc[df.target == 545, "target"] = 54.0
df.to_csv(h_path + "_qualtrics_cleaned.csv")

# Sanity check, displayed as the cell output: after the correction the distinct
# targets should be exactly 2..61. Checking the full list (instead of slicing
# off the largest value) keeps the cell correct when it is re-run.
df.target.sort_values().drop_duplicates().tolist() == list(range(2, 62))

In [87]:
# Shape of the distinct (target, fold) combinations; fewer rows here than in
# `df` means some conditions occur more than once (duplicate data!).
target_fold_pairs = df[["target", "fold"]]
target_fold_pairs.drop_duplicates().shape

(573, 2)

In [85]:
import itertools

# (target, fold) pairs present in the cleaned responses, sorted for readability.
folds = df.loc[:, ["target", "fold"]].sort_values(["target", "fold"]).values.tolist()
# Full design: every target 2..61 crossed with every fold 1..10.
all_folds = [list(tup) for tup in itertools.product(range(2, 62), range(1, 11))]
# Set lookup instead of scanning the list for each condition; float and int
# values that compare equal also hash equal, so (2.0, 1.0) matches (2, 1).
observed_pairs = {tuple(pair) for pair in folds}
missing = [cond for cond in all_folds if tuple(cond) not in observed_pairs]
len(missing)  # number of design conditions with no response (recorded run: 27)

27

In [111]:
# Inspect the raw answer to item 94 for the target "2" / fold "1" row
# (both columns are compared as strings in the raw export).
is_target2_fold1 = (df_raw["target"] == "2") & (df_raw["fold"] == "1")
df_raw.loc[is_target2_fold1, "94_Q589_4"]

326     
Name: 94_Q589_4, dtype: object

In [117]:
# All pre-filter rows (attention check not yet applied) whose target equals 2.
df_raw_pre[df_raw_pre["target"] == 2]

Unnamed: 0,attention_check,impressions,age,gender,id,target,fold,q1,q2,q3,...,q91,q92,q93,q94,q95,q96,q97,q98,q99,q100
60,Never,Confused perhaps a bit shy but able to keep up...,33.0,Male,5ecc1798e3ca777cfb37b0f5,2.0,5.0,,3.0,,...,,,,,,,,,,
117,Never,"Mixed impressions, pretty emotional, sometimes...",31.0,Male,5aeb7cda72e5160001c476db,2.0,9.0,,,,...,,,,,,,,,,
137,Never,I feel like this person is hard to get to know...,55.0,Female,60859f820e5009a67adb3921,2.0,2.0,,,,...,,,5.0,,,,,,,
284,Never,"Introverted, conservative, distrustful, doesn'...",43.0,Male,590f89c75cad260001603033,2.0,3.0,,,,...,,,,,,,,,,3.0
326,Never,"Disorganized, imaginative, doesn't trust peopl...",28.0,Female,5d092069fb1a540001dd5028,2.0,1.0,,,5.0,...,,,,,,5.0,,,,
327,Never,I think they are wishy washy if that was all t...,62.0,Female,5d321a7348bf98001646aeb2,2.0,4.0,,,,...,,,,,,,,,,
423,Never,"artistic, outgoing but introverted",29.0,Male,628feaaa86e683cdbebf98a0,2.0,7.0,,,,...,,,,,,,,,,
564,Never,They seem a bit self involved and anxious. The...,46.0,Male,611830fe665c7b68723910b3,2.0,6.0,5.0,,,...,,1.0,,,,,,,,
591,Never,This person does not seem like a fun person to...,22.0,Male,5ff17f71b9b6090e4fa1841c,2.0,8.0,,,,...,,,,,5.0,,,3.0,1.0,
