In [1]:
# data frames and calculations
import pandas as pd
import numpy as np
from scipy import stats

# file structure
from pathlib import Path

# cross-validation
from sklearn.model_selection import KFold

#custom functions
from functions import getResponses, chooseData, chooseEmb, getEmbeddings, getData, predModel, corrUserBased #general auxiliary function

#plots
import seaborn as sns
from matplotlib import pyplot as plt

# warning handling
import warnings

# Silence only the "constant input array" correlation warning; all other
# warnings stay visible. `message` is matched against the start of the text.
warnings.filterwarnings(
    action="ignore",
    message="An input array is constant; the correlation coefficient is not defined.",
)

# seed passed to every stochastic step (e.g. the shuffled K-fold split)
randState = 0

## Choose data set

In [7]:
d = "hsq"  # data set key; options noted by the author: BIG5, 16PF, RIASEC, HSQ

# Folder that receives the human-rater files (inputs for creating the survey).
Path(f"../human_studies/{d.upper()}").mkdir(parents=True, exist_ok=True)

# File-name prefix inside that folder; later cells append suffixes such as
# "_targets_data.csv" to it, so it must stay a plain string.
h_path = f"../human_studies/{d.upper()}/{d.lower()}"

Load Data

In [8]:
# --- configuration ----------------------------------------------------------
# response coding -- 1: reverse-coded, 2: non-reverse-coded
R = 2

# best model (KNN regression, k=5, no reverse-coding)
# model id -- 0: Ridge, 1: RidgeClass, 2: KNN, 3: Kernel SVM (RBF), 4: KNN regression
m = 4
par = 5
e = 'sentencebert'
model, modelName = predModel(m, par)

# --- paths, responses and embeddings ----------------------------------------
# data sets: BIG5, IPIP (all items), IPIP2 (only assigned items), RIASEC, HEXACO, 16PF
folder, data = chooseData(d)
# embeddings: USE, BERT, SENTENCEBERT
embeddings, save = chooseEmb(e)
responses, savePath, items, _ = getResponses(folder, data, R)
# observed responses as floats
responses = responses.astype(float)
X, X_stand, X_pca_stand = getEmbeddings(folder, data, embeddings, responses)

# short embedding name: third "_"-separated token of the file name, extension stripped
embName = embeddings.split("_")[2].partition(".")[0]

# required data and labels
data_q, constructs_list, list_par, constrAssigned = getData(1, responses, X_pca_stand, folder, data)

# --- stored predictions of the chosen model ---------------------------------
predictions_file = f"{savePath}{modelName}_{par}_{embName}_responses.csv"
total_preds = (
    pd.read_csv(predictions_file, index_col=0)
    .rename(index=str)   # index labels as strings
    .astype(float)
)

# per-participant performance of the predictions against the observed responses
corr, means = corrUserBased(total_preds, responses)

Find Target Participants (0th-100th percentiles in 60 steps)

In [9]:
error = 0.0001  # tolerance so floating-point rounding cannot make a rank miss its percentile

# Rank participants by the model's predictive performance (ascending correlation).
# The frame is named `corr_sorted` so the builtin `sorted()` is not shadowed.
corr_sorted = corr.sort_values("Correlation")
S = corr_sorted.Correlation
P = corr_sorted['p-value']
percentage_rank = S.rank(method="max", pct=True)  # percentile rank; ties share the highest rank
corr_sorted["percentile"] = percentage_rank

# number of targets and their equidistant predictive-accuracy percentiles (0th ... 100th)
nr_targets = 60
percentiles = np.linspace(0, 1, nr_targets)

# print details (yes/no)
verbose = False

ids = []
# For each percentile take the first (lowest-ranked) participant whose
# percentile rank reaches it.
for q in percentiles:
    idx = S.index[percentage_rank >= q - error]
    ids.append(idx[0])

    # print predictive accuracy for each target
    if verbose:
        print("Rank: " + str(round(percentage_rank[idx[0]], 3)))
        print('Target ID: ' + idx[0])
        print("Correlation: " + str(S[idx[0]]))
        print("p-value: " + str(P[idx[0]]))
        # NOTE(review): `r_confidence_interval` is neither imported nor defined in the
        # visible cells, so this branch raises NameError unless it is provided elsewhere.
        CI = r_confidence_interval(S[idx[0]], 0.95, responses.shape[1])
        print("CI: [" + str(CI[0]) + ", " + str(CI[1]) + "]")
        print("\n")

# Two percentiles resolving to the same participant would duplicate rows in the
# index merge below and fail later with an opaque length-mismatch error; fail
# here with a clear message instead.
if len(set(ids)) != len(ids):
    raise ValueError(
        "Target selection produced repeated participant ids; "
        "reduce nr_targets or use a data set with more participants."
    )

# Targets are numbered from 2 upwards; derived from nr_targets so the labels
# always match the number of selected targets.
target_numbers = list(range(2, nr_targets + 2))

# Save target information (performance) in dataframe
targets_ranked = corr_sorted.loc[ids]
targets_ranked.insert(0, "target_nr", target_numbers)
targets_responses = responses.loc[ids]
targets_ranked_nr = ["Field " + str(i) for i in target_numbers]

# Merge target performance information and item responses
targets_data = pd.merge(targets_ranked, targets_responses, left_index=True, right_index=True).drop(labels = ["L1 Loss"], axis = 1)
# format target responses (rows -- questions, columns -- targets)
targets_data_processed = targets_data.drop(["target_nr", "Correlation", "p-value", "percentile"], axis=1).T
targets_data_processed.columns = list(targets_ranked_nr)
targets_data_processed.index.name = "question-id"

# Save target information
targets_data.to_csv(h_path + "_targets_data.csv") # save to files

### in case something breaks, load backUp:
# NOTE(review): the path below ends in "_full_targets_questions.csv", the file name
# used for `target_data_full` elsewhere in this notebook -- confirm it is the intended backup.
# targets_data = pd.read_csv("../human_studies/BackUps" + "/" + d.lower() + "_full_targets_questions.csv", index_col = 0) # load backUp to replicate study

Extract train/test-folds

In [12]:
# Split the rows of `data_q` into 10 shuffled, seeded folds.
kf = KFold(n_splits=10, random_state=randState, shuffle=True)
# KFold yields 0-based row positions; shift them to 1-based item numbers.
questions = [(train_pos + 1, test_pos + 1) for train_pos, test_pos in kf.split(data_q)]
question_texts = pd.read_csv("../embeddings/" + d.upper() + "/" + d.lower() + "_questions_text.csv", index_col=0) #get question texts

# BIG5 uses a different column for the item wording than the other data sets.
# The spelling "grammartical_item" presumably mirrors the header in the BIG5 file -- verify before changing.
colname = "grammartical_item" if d.upper() == "BIG5" else "item"

# merge response data (sorted targets) with question texts for qualtrics import
target_data_full = pd.merge(question_texts[colname], targets_data_processed, on="question-id")
target_data_full = target_data_full.rename(columns={colname: 'full_item'})
target_data_full["full_item"] = target_data_full["full_item"].str.capitalize()
target_data_full.to_csv(h_path + "_full_targets_questions.csv")

### in case something breaks, load a backup:
# target_data_full = pd.read_csv("../human_studies/BackUps" + "/" + d.lower() + "_full_targets_questions.csv", index_col = 0) 

# Write one train file and one test file per fold and collect the fold membership.
folds = []
for fold_nr, (train, test) in enumerate(questions, start=1):
    folds.append([fold_nr, train, test])
    # item numbers are 1-based, `.iloc` positions are 0-based
    target_data_fold_train = target_data_full.iloc[train - 1]
    target_data_fold_test = target_data_full.iloc[test - 1]
    target_data_fold_train.to_csv(h_path + "_train_fold_" + str(fold_nr) + ".csv")
    target_data_fold_test.to_csv(h_path + "_test_fold_" + str(fold_nr) + ".csv")

df_folds = pd.DataFrame(folds, columns=["fold_nr", "train_items", "test_items"])
df_folds.to_csv(h_path + "_question_folds.csv", index=False) # save to files

Show the test folds (for survey creation in Qualtrics)

In [6]:
# Build [item number, fold number] pairs: for each 1-based item number, take the
# first fold whose test set contains that item.
fold_overview = []
for item_nr in range(1, question_texts.shape[0] + 1):
    in_test_set = df_folds.test_items.apply(lambda members: item_nr in members)
    fold_overview.append([item_nr, df_folds.fold_nr[in_test_set].iloc[0]])

fold_overview

[[1, 7],
 [2, 7],
 [3, 4],
 [4, 6],
 [5, 8],
 [6, 8],
 [7, 6],
 [8, 1],
 [9, 2],
 [10, 10],
 [11, 4],
 [12, 7],
 [13, 7],
 [14, 5],
 [15, 7],
 [16, 6],
 [17, 3],
 [18, 8],
 [19, 4],
 [20, 3],
 [21, 6],
 [22, 10],
 [23, 3],
 [24, 6],
 [25, 2],
 [26, 9],
 [27, 1],
 [28, 3],
 [29, 9],
 [30, 9],
 [31, 4],
 [32, 9],
 [33, 9],
 [34, 2],
 [35, 8],
 [36, 7],
 [37, 10],
 [38, 1],
 [39, 8],
 [40, 10],
 [41, 3],
 [42, 7],
 [43, 8],
 [44, 4],
 [45, 1],
 [46, 2],
 [47, 7],
 [48, 10],
 [49, 5],
 [50, 5],
 [51, 5],
 [52, 2],
 [53, 6],
 [54, 8],
 [55, 1],
 [56, 2],
 [57, 3],
 [58, 7],
 [59, 10],
 [60, 4],
 [61, 3],
 [62, 4],
 [63, 1],
 [64, 2],
 [65, 5],
 [66, 7],
 [67, 3],
 [68, 10],
 [69, 6],
 [70, 5],
 [71, 10],
 [72, 4],
 [73, 9],
 [74, 1],
 [75, 4],
 [76, 6],
 [77, 6],
 [78, 9],
 [79, 6],
 [80, 9],
 [81, 1],
 [82, 10],
 [83, 9],
 [84, 5],
 [85, 6],
 [86, 4],
 [87, 4],
 [88, 10],
 [89, 10],
 [90, 2],
 [91, 1],
 [92, 7],
 [93, 2],
 [94, 2],
 [95, 1],
 [96, 6],
 [97, 4],
 [98, 4],
 [99, 5],
 [100, 9

In [20]:
d = "hsq"  # data set key; options noted by the author: BIG5, 16PF, RIASEC, HSQ

# Folder that receives the human-rater files (inputs for creating the survey).
Path(f"../human_studies/{d.upper()}").mkdir(parents=True, exist_ok=True)

# File-name prefix inside that folder; kept as a plain string because suffixes
# are appended to it by string concatenation.
h_path = f"../human_studies/{d.upper()}/{d.lower()}"

# --- configuration ----------------------------------------------------------
# response coding -- 1: reverse-coded, 2: non-reverse-coded
R = 2

# best model (KNN regression, k=5, no reverse-coding)
# model id -- 0: Ridge, 1: RidgeClass, 2: KNN, 3: Kernel SVM (RBF), 4: KNN regression
m = 4
par = 5
e = 'sentencebert'
model, modelName = predModel(m, par)

# --- paths, responses and embeddings ----------------------------------------
# data sets: BIG5, IPIP (all items), IPIP2 (only assigned items), RIASEC, HEXACO, 16PF
folder, data = chooseData(d)
# embeddings: USE, BERT, SENTENCEBERT
embeddings, save = chooseEmb(e)
responses, savePath, items, _ = getResponses(folder, data, R)
# observed responses as floats
responses = responses.astype(float)
X, X_stand, X_pca_stand = getEmbeddings(folder, data, embeddings, responses)

# short embedding name: third "_"-separated token of the file name, extension stripped
embName = embeddings.split("_")[2].partition(".")[0]

# required data and labels
data_q, constructs_list, list_par, constrAssigned = getData(1, responses, X_pca_stand, folder, data)

# --- stored predictions of the chosen model ---------------------------------
predictions_file = f"{savePath}{modelName}_{par}_{embName}_responses.csv"
total_preds = (
    pd.read_csv(predictions_file, index_col=0)
    .rename(index=str)   # index labels as strings
    .astype(float)
)

# per-participant performance of the predictions against the observed responses
corr, means = corrUserBased(total_preds, responses)

In [21]:
error = 0.0001  # tolerance so floating-point rounding cannot make a rank miss its percentile

# Rank participants by the model's predictive performance (ascending correlation).
# The frame is named `corr_sorted` so the builtin `sorted()` is not shadowed.
corr_sorted = corr.sort_values("Correlation")
S = corr_sorted.Correlation
P = corr_sorted['p-value']
percentage_rank = S.rank(method="max", pct=True)  # percentile rank; ties share the highest rank
corr_sorted["percentile"] = percentage_rank

# number of targets and their equidistant predictive-accuracy percentiles (0th ... 100th)
nr_targets = 60
percentiles = np.linspace(0, 1, nr_targets)

# print details (yes/no)
verbose = False

ids = []
# For each percentile take the first (lowest-ranked) participant whose
# percentile rank reaches it.
for q in percentiles:
    idx = S.index[percentage_rank >= q - error]
    ids.append(idx[0])

    # print predictive accuracy for each target
    if verbose:
        print("Rank: " + str(round(percentage_rank[idx[0]], 3)))
        print('Target ID: ' + idx[0])
        print("Correlation: " + str(S[idx[0]]))
        print("p-value: " + str(P[idx[0]]))
        # NOTE(review): `r_confidence_interval` is neither imported nor defined in the
        # visible cells, so this branch raises NameError unless it is provided elsewhere.
        CI = r_confidence_interval(S[idx[0]], 0.95, responses.shape[1])
        print("CI: [" + str(CI[0]) + ", " + str(CI[1]) + "]")
        print("\n")

# Two percentiles resolving to the same participant would duplicate rows in the
# index merge below and fail later with an opaque length-mismatch error; fail
# here with a clear message instead.
if len(set(ids)) != len(ids):
    raise ValueError(
        "Target selection produced repeated participant ids; "
        "reduce nr_targets or use a data set with more participants."
    )

# Targets are numbered from 2 upwards; derived from nr_targets so the labels
# always match the number of selected targets.
target_numbers = list(range(2, nr_targets + 2))

# Save target information (performance) in dataframe
targets_ranked = corr_sorted.loc[ids]
targets_ranked.insert(0, "target_nr", target_numbers)
targets_responses = responses.loc[ids]
targets_ranked_nr = ["Field " + str(i) for i in target_numbers]

# Merge target performance information and item responses
targets_data = pd.merge(targets_ranked, targets_responses, left_index=True, right_index=True).drop(labels = ["L1 Loss"], axis = 1)
# format target responses (rows -- questions, columns -- targets)
targets_data_processed = targets_data.drop(["target_nr", "Correlation", "p-value", "percentile"], axis=1).T
targets_data_processed.columns = list(targets_ranked_nr)
targets_data_processed.index.name = "question-id"

# Save target information (writing is disabled in this cell)
# targets_data.to_csv(h_path + "_targets_data.csv") # save to files

### in case something breaks, load backUp:
# NOTE(review): the path below ends in "_full_targets_questions.csv", the file name
# used for `target_data_full` elsewhere in this notebook -- confirm it is the intended backup.
# targets_data = pd.read_csv("../human_studies/BackUps" + "/" + d.lower() + "_full_targets_questions.csv", index_col = 0) # load backUp to replicate study

In [22]:
# Split the rows of `data_q` into 10 shuffled, seeded folds.
kf = KFold(n_splits=10, random_state=randState, shuffle=True)
# KFold yields 0-based row positions; shift them to 1-based item numbers.
questions = [(train_pos + 1, test_pos + 1) for train_pos, test_pos in kf.split(data_q)]
question_texts = pd.read_csv("../embeddings/" + d.upper() + "/" + d.lower() + "_questions_text.csv", index_col=0) #get question texts

# BIG5 uses a different column for the item wording than the other data sets.
# The spelling "grammartical_item" presumably mirrors the header in the BIG5 file -- verify before changing.
colname = "grammartical_item" if d.upper() == "BIG5" else "item"

# merge response data (sorted targets) with question texts for qualtrics import
target_data_full = pd.merge(question_texts[colname], targets_data_processed, on="question-id")
target_data_full = target_data_full.rename(columns={colname: 'full_item'})
target_data_full["full_item"] = target_data_full["full_item"].str.capitalize()
# target_data_full.to_csv(h_path + "_full_targets_questions.csv")

### in case something breaks, load a backup:
# target_data_full = pd.read_csv("../human_studies/BackUps" + "/" + d.lower() + "_full_targets_questions.csv", index_col = 0) 

# Collect the fold membership; writing the per-fold files is disabled in this cell.
folds = []
for fold_nr, (train, test) in enumerate(questions, start=1):
    folds.append([fold_nr, train, test])
    # item numbers are 1-based, `.iloc` positions are 0-based
    target_data_fold_train = target_data_full.iloc[train - 1]
    target_data_fold_test = target_data_full.iloc[test - 1]
    # target_data_fold_train.to_csv(h_path + "_train_fold_" + str(fold_nr) + ".csv")
    # target_data_fold_test.to_csv(h_path + "_test_fold_" + str(fold_nr) + ".csv")

df_folds = pd.DataFrame(folds, columns=["fold_nr", "train_items", "test_items"])
# df_folds.to_csv(h_path + "_question_folds.csv", index=False) # save to files

In [23]:
# Build [item number, fold number] pairs: for each 1-based item number, take the
# first fold whose test set contains that item.
fold_overview = []
for item_nr in range(1, question_texts.shape[0] + 1):
    in_test_set = df_folds.test_items.apply(lambda members: item_nr in members)
    fold_overview.append([item_nr, df_folds.fold_nr[in_test_set].iloc[0]])

fold_overview

[[1, 9],
 [2, 6],
 [3, 1],
 [4, 9],
 [5, 7],
 [6, 4],
 [7, 6],
 [8, 8],
 [9, 4],
 [10, 8],
 [11, 1],
 [12, 1],
 [13, 10],
 [14, 3],
 [15, 2],
 [16, 10],
 [17, 2],
 [18, 4],
 [19, 7],
 [20, 8],
 [21, 3],
 [22, 10],
 [23, 1],
 [24, 5],
 [25, 3],
 [26, 5],
 [27, 2],
 [28, 9],
 [29, 2],
 [30, 7],
 [31, 5],
 [32, 6]]