In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from pathlib import Path

#Preprocessing
from scipy.stats import pearsonr, mode 
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold

#Models
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifier, Ridge
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.cluster import KMeans

#plots
import seaborn as sns
from matplotlib import pyplot as plt

# Global random seed for the notebook.
# NOTE(review): presumably passed as `random_state` to the scikit-learn splitters/estimators
# imported above so results are reproducible -- usage is not visible in this cell; confirm.
randState = 0

## Choose data set

In [54]:
# Name of the study to analyse. It selects the sub-folder under `human_studies/`
# and, lower-cased, the prefix shared by that study's files.
data = "BIG5"
h_path = f"human_studies/{data}/{data.lower()}"

### Import data


In [58]:
# Load the raw Qualtrics export of the selected study.
df_raw = pd.read_csv(h_path + "_qualtrics_export.csv")

# Keep only the metadata columns plus the 100 item responses and give them readable names.
q_cols = [f"q{i}" for i in range(1, 101)]  # cleaned item column names q1..q100
cols = ["Q6.2", "Q27.1", "Q28.2", "Q28.3", "PROLIFIC_PID", "target", "fold"]
cols_response = [f"{i}_Q589_1" for i in range(1, 101)]
cols.extend(cols_response)
rename_dict = dict(zip(cols_response, q_cols))
rename_dict = rename_dict | {
    "Q6.2": "attention_check",
    "Q27.1": "impressions",
    "Q28.2": "age",
    "Q28.3": "gender",
    "PROLIFIC_PID": "id",
}
df_raw_pre = df_raw.loc[:, cols].rename(columns=rename_dict)
# Drop index rows 0 and 1.
# NOTE(review): presumably the extra header rows Qualtrics adds below the column names -- confirm.
df_raw_pre = df_raw_pre.drop(index=[0, 1])

# Convert the scale labels to their numeric codes. `replace` on object columns no longer
# downcasts silently in newer pandas, so the numeric dtype is inferred explicitly; columns
# that still contain unmapped strings are left as object rather than coerced or rejected.
replacer = {'Disagree': 1, '2': 2, 'Neutral': 3, '4': 4, 'Agree': 5}
df_raw_pre[q_cols] = df_raw_pre[q_cols].replace(replacer).infer_objects()

# Cast the numeric metadata columns. Assign with `df[cols] = ...` rather than
# `df.loc[:, cols] = ...`: since pandas 2.0 the `.loc` form writes into the existing
# object-dtype columns in place, so the float dtype would be lost.
numeric_cols = ["age", "target", "fold"]
df_raw_pre[numeric_cols] = df_raw_pre[numeric_cols].astype(float)

# Keep only participants whose attention-check answer is "Never" (the passing answer),
# then drop the helper column.
df = (
    df_raw_pre[df_raw_pre.attention_check == "Never"]
    .reset_index(drop=True)
    .drop(["attention_check"], axis=1)
)
df.to_csv(h_path + "_qualtrics_cleaned.csv")

# Sample summary: mean age, share of "Female" responses, sample size.
# Kept as the last expression of the cell so the notebook actually displays it.
[df.age.mean(), np.mean(df.gender == "Female"), df.shape[0]]