In this notebook, we compute score functions that assess how participants sorted the shapes into the slots.


-------------------


# import and settings

In [2]:
import pandas as pd
import numpy as np
import os
import seaborn as sns

# Import clean csv

In [3]:
# Base directory prepended to every data path (an empty string keeps the paths
# relative to the current working directory).
path = ""
# Read the four cleaned tables in one pass; the names on the left are bound in
# the same order as the files listed below.
slotsDf, participantsDf, focusTimesDf, expDf = (
  pd.read_csv(os.path.join(path, csvName))
  for csvName in (
    "clean-data/slots.csv",
    "clean-data/participants.csv",
    "clean-data/focusTimes.csv",
    "clean-data/experiments.csv",
  )
)

# Number of true answers

# True answer proportion

Here we compute the proportion of true answers, regardless of their position on the slots.

For example, someone that did

> true 1 ; true 4 ; false 4 ; true 2 ; false 5 ; false 7 ; false 3

will have a trueProp of $\frac{3}{7}$

In [59]:
# Share of true answers per (participant, device, dataset).
# `isTrue` is selected before aggregating so that only this column is averaged,
# instead of averaging every column of slotsDf and discarding the rest.
expDf =(expDf
  .set_index(["participantId","device","datasetId"])
  .join
  (slotsDf
    .groupby(['participantId', 'device', 'datasetId'])
    ['isTrue']
    .mean()
    .rename("trueProp"))
  .reset_index())

We also save the absolute number of true answers.

In [58]:
# Number of true answers per (participant, device, dataset).
# `isTrue` is selected before aggregating so that only this column is summed,
# instead of summing every column of slotsDf and discarding the rest.
expDf = (expDf
  .set_index(["participantId","device","datasetId"])
  .join
  (slotsDf
    .groupby(['participantId', 'device', 'datasetId'])
    ['isTrue']
    .sum()
    .rename("nbTrue"))
  .reset_index())

# Max Streak

The max streak function computes the length of the longest ordered run of "trues" or "falses" (consecutive answers whose ids follow one another) that can be found in one set of submitted answers.

In [61]:
def compute_max_streak(dataFrame, condition = True, direction="forward") -> int:
  """Return the length of the longest ordered streak of matching answers.

  A streak is a run of consecutive rows (in the row order of ``dataFrame``)
  whose ``isTrue`` value equals ``condition`` and whose ``answerId`` values
  follow one another by a step allowed by ``direction``.

  Parameters
  ----------
  dataFrame : pandas.DataFrame
    Must provide the columns ``isTrue`` and ``answerId``.
  condition : bool, default True
    Value of ``isTrue`` a row must have to take part in a streak.
  direction : {"forward", "backward", "both"}, default "forward"
    "forward" accepts a step of +1 between neighbouring ``answerId`` values,
    "backward" a step of -1, and "both" either of them.

  Returns
  -------
  int
    Number of rows in the longest streak; 0 when no row matches
    ``condition`` (including an empty frame), 1 for an isolated match.

  Raises
  ------
  ValueError
    If ``direction`` is not one of the three supported values.
  """
  allowedSteps = {"forward": (1,), "backward": (-1,), "both": (1, -1)}
  if direction not in allowedSteps:
    raise ValueError(
      "direction must be 'forward', 'backward' or 'both', got %r" % (direction,))
  validSteps = allowedSteps[direction]

  matches = dataFrame["isTrue"].values == condition
  answerIds = dataFrame["answerId"].values

  maxStreak = currentStreak = 0
  for i in range(len(dataFrame)):
    if not matches[i]:
      # A non-matching row always ends the running streak.
      currentStreak = 0
    elif currentStreak > 0 and (answerIds[i] - answerIds[i-1]) in validSteps:
      # currentStreak > 0 guarantees the previous row matched as well.
      currentStreak += 1
    else:
      # A matching row that does not extend a streak starts a new one, and
      # that first row counts as part of it.
      currentStreak = 1
    maxStreak = max(maxStreak, currentStreak)
  return int(maxStreak)

In [62]:
# Longest forward-ordered streak of true answers per experiment.
# Applying on the two columns compute_max_streak reads yields a Series indexed
# by the grouping keys, which is renamed and joined directly (no lookup of an
# unnamed result column is needed).
expDf = (expDf
  .set_index(["participantId","device","datasetId"])
  .join
    (slotsDf
      .groupby(['participantId', 'device', 'datasetId'])
      [['isTrue', 'answerId']]
      .apply(compute_max_streak, True,"forward")
      .rename("maxTrueStreak"))
  .reset_index())

In [63]:
# Longest streak of true answers per experiment, accepting ascending or
# descending steps between neighbouring answer ids.
# Applying on the two columns compute_max_streak reads yields a Series indexed
# by the grouping keys, which is renamed and joined directly (no lookup of an
# unnamed result column is needed).
expDf = (expDf
  .set_index(["participantId","device","datasetId"])
  .join
    (slotsDf
      .groupby(['participantId', 'device', 'datasetId'])
      [['isTrue', 'answerId']]
      .apply(compute_max_streak, True,"both")
      .rename("maxTrueStreakBoth"))
  .reset_index())

In [64]:
# Longest streak of false answers per experiment, accepting ascending or
# descending steps between neighbouring answer ids.
# Applying on the two columns compute_max_streak reads yields a Series indexed
# by the grouping keys, which is renamed and joined directly (no lookup of an
# unnamed result column is needed).
expDf = (expDf
  .set_index(["participantId","device","datasetId"])
  .join
    (slotsDf
      .groupby(['participantId', 'device', 'datasetId'])
      [['isTrue', 'answerId']]
      .apply(compute_max_streak, False,"both")
      .rename("maxFalseStreakBoth"))
  .reset_index())

# Veracity scoring
Grades the distance to the perfect answer by looking at the trues' distance to their ideal position and considering that a false is equivalent to having a true at maximal distance from its assigned slot (here 7).

Let $X$ be the sequence of shapes as sorted by the participant.

$$\text{VeracityScore}( X ) = \frac{7^2 - \underset{x \in X}{\sum} f(x)}{7^2}$$

With
$$f(x) = \left\{ \begin{array}{ll}
\left| \text{pos}(x) - \text{target}(x) \right| & \text{if } x \text{ is true} \\
7 & \text{otherwise}
\end{array}\right.$$

In [65]:
def veracity_scoring(dataFrame):
  """Score how close a set of answers is to the perfect placement.

  Each true answer contributes the absolute difference between its ``slotId``
  and its ``answerId``; each false answer contributes the maximal penalty,
  ``len(dataFrame)``. The summed penalty is normalised by the worst possible
  total, ``len(dataFrame) ** 2``, and inverted, so the result is 1.0 when every
  answer is true with ``slotId == answerId`` and 0.0 when every answer is false.

  Parameters
  ----------
  dataFrame : pandas.DataFrame
    Must provide the columns ``isTrue``, ``slotId`` and ``answerId``.
    NOTE(review): assumes ``slotId`` and ``answerId`` share the same numbering
    base - TODO confirm against the data.

  Returns
  -------
  float
    ``(worst - total) / worst``; NaN for an empty frame, where the score is
    undefined (the normalisation would divide by zero).
  """
  answerLen = len(dataFrame)
  if answerLen == 0:
    return float("nan")
  worstScore = answerLen * answerLen

  isTrueMask = dataFrame["isTrue"].values.astype(bool)
  placementError = np.abs(dataFrame["slotId"].values - dataFrame["answerId"].values)
  # False answers are charged the maximal distance instead of their placement error.
  distances = np.where(isTrueMask, placementError, answerLen)
  return float(worstScore - distances.sum()) / worstScore


In [66]:
# Veracity score per experiment.
# Applying on the three columns veracity_scoring reads yields a Series indexed
# by the grouping keys, which is renamed and joined directly (no lookup of an
# unnamed result column is needed).
expDf = (expDf
  .set_index(["participantId","device","datasetId"])
  .join
    (slotsDf
      .groupby(['participantId', 'device', 'datasetId'])
      [['isTrue', 'slotId', 'answerId']]
      .apply(veracity_scoring)
      .rename("veracityScore"))
  .reset_index())

# Levenshtein distance

Here we want to compute the Levenshtein distance between the sequence sorted by the participant and the perfect sequence (only true answers, each correctly placed).

In [67]:
def levenDistIter(seq1,seq2,distScore):
  """Compute the weighted edit distance that turns ``seq1`` into ``seq2``.

  Iterative dynamic programming that keeps only the previous and the current
  row of the distance matrix.

  Parameters
  ----------
  seq1, seq2 : sequences
    Items are compared with ``==``.
  distScore : mapping
    Costs under the keys ``'del'`` (drop an item of ``seq1``), ``'ins'``
    (add an item of ``seq2``) and ``'sub'`` (replace one item by another).
    Matching items cost nothing.

  Returns
  -------
  number
    Total cost of the cheapest edit script; an ``int`` when all costs are ints.
  """
  delCost, insCost, subCost = distScore['del'], distScore['ins'], distScore['sub']

  # Row 0: building seq2[:j] from an empty prefix takes j insertions.
  previousRow = [j * insCost for j in range(len(seq2) + 1)]
  for i, item1 in enumerate(seq1, start=1):
    # Column 0: reducing seq1[:i] to an empty prefix takes i deletions.
    currentRow = [i * delCost]
    for j, item2 in enumerate(seq2, start=1):
      if item1 == item2: # identity
        currentRow.append(previousRow[j-1])
      else: # mismatch
        currentRow.append(min(previousRow[j] + delCost,     # deletion
                              currentRow[j-1] + insCost,    # insertion
                              previousRow[j-1] + subCost))  # substitution
    previousRow = currentRow

  return previousRow[-1]

# Unit edit costs: every substitution, deletion and insertion counts as one edit.
classicDistScore = {'sub': 1, 'del': 1, 'ins': 1}

In [68]:
def compLevenDist(myExp : pd.DataFrame, nbAnswers : int = 7) -> int :
  """Edit distance between one experiment's answers and the perfect sequence.

  The participant sequence is the list of ``(isTrue, answerId)`` pairs taken in
  the row order of ``myExp``. The perfect sequence is ``(True, 1)`` through
  ``(True, nbAnswers)``. The distance uses the unit costs of
  ``classicDistScore``.

  Parameters
  ----------
  myExp : pandas.DataFrame
    Rows of a single experiment; must provide ``isTrue`` and ``answerId``.
    NOTE(review): assumes the rows are already in slot order - TODO confirm.
  nbAnswers : int, default 7
    Length of the perfect sequence.

  Returns
  -------
  int
    Distance computed by ``levenDistIter``.
  """
  participantSequence = list(zip(myExp["isTrue"], myExp["answerId"]))
  perfectSequence = [(True, answerId) for answerId in range(1, nbAnswers + 1)]
  return levenDistIter(participantSequence,perfectSequence,classicDistScore)

In [69]:
# Levenshtein distance to the perfect sequence, per experiment.
# Only the two columns compLevenDist reads are handed to apply, so the function
# is not run on the grouping columns themselves.
expDf = (expDf
  .set_index(["participantId","device","datasetId"])
  .join(
    slotsDf
    .groupby(["participantId","device","datasetId"])
    [["isTrue", "answerId"]]
    .apply(compLevenDist)
    .rename("levenDist"))
  .reset_index())

In [70]:
# Score table: expDf without its timing and ordering columns.
scoresDf = expDf.drop(
  columns=["pauseTime","sortingTime","time","experimentOrder"],
)

In [71]:
# Display the score table as the cell output.
scoresDf

Unnamed: 0,participantId,device,datasetId,nbTrue,trueProp,maxTrueStreak,maxTrueStreakBoth,maxFalseStreakBoth,veracityScore,levenDist
0,16,VR,1,6,0.857143,1,1,1,0.653061,5
1,16,VR,2,4,0.571429,1,1,1,0.244898,7
2,13,VR,2,3,0.428571,1,1,1,0.387755,5
3,13,VR,1,5,0.714286,1,1,1,0.530612,5
4,13,Comp,1,3,0.428571,1,1,3,0.224490,7
...,...,...,...,...,...,...,...,...,...,...
85,15,Comp,2,4,0.571429,1,1,1,0.326531,7
86,15,Comp,1,4,0.571429,1,1,1,0.346939,7
87,16,Comp,1,5,0.714286,1,2,1,0.591837,6
88,16,Comp,2,2,0.285714,1,1,1,0.204082,7


In [72]:
# Write the score table without the row index.
scoresDf.to_csv(
  path_or_buf=os.path.join(path, "computed-data/scores.csv"),
  index=False,
)

In [73]:
# Write the full experiment table (original columns plus scores) without the row index.
expDf.to_csv(
  path_or_buf=os.path.join(path, "computed-data/experiments+scores.csv"),
  index=False,
)