```
Copyright 2023 by LMU Munich Media Informatics Group. All rights reserved.
Created by [Changkun Ou](https://changkun.de)

Use of this source code is governed by a GPLv3 license that
can be found in the LICENSE file.
```

This notebook reproduces Section 4.1.1, "Inferred Expertise", of the paper "The Impact of Expertise in the Loop for Exploring Machine Rationality".

The statistical method used in this notebook is quantile-based discretization.

In [1]:
import datetime
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import warnings
# Silence every warning category for the remainder of the notebook.
warnings.filterwarnings("ignore")

# Values of the `domain` column that the analysis below splits on.
domains = ['text', 'image', 'mesh']

# Load the processed per-participant expertise records.
expertise_csv = '../data/processed/expertise.csv'
df_expertise = pd.read_csv(expertise_csv)

# Grouping on workExperience and then describing workExperience makes every
# group constant, so `count` is the informative column: it shows how many
# rows in each domain share each workExperience value.
work_experience_groups = df_expertise.groupby(['domain', 'workExperience'])
work_experience_groups['workExperience'].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
domain,workExperience,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
image,0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
image,1,2.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
image,12,4.0,12.0,0.0,12.0,12.0,12.0,12.0,12.0
image,15,1.0,15.0,,15.0,15.0,15.0,15.0,15.0
image,24,1.0,24.0,,24.0,24.0,24.0,24.0,24.0
image,84,1.0,84.0,,84.0,84.0,84.0,84.0,84.0
image,120,1.0,120.0,,120.0,120.0,120.0,120.0,120.0
mesh,0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mesh,1,1.0,1.0,,1.0,1.0,1.0,1.0,1.0
mesh,2,2.0,2.0,0.0,2.0,2.0,2.0,2.0,2.0


In [2]:
def infer_expertise_levels(df_domain):
    """Score and bin the rows of a single domain by inferred expertise.

    The three columns `expertise`, `workExperience` and `lastExperience` are
    each min-max scaled to [0, 1] using only the rows passed in, so scaling is
    relative to the other rows of the same domain. Their unweighted mean is
    stored as `inferredExpertise`, which is then cut into three quantile bins
    labelled 'N', 'I', 'E' from lowest to highest (presumably novice /
    intermediate / expert) and stored as `expertiseLevel`.

    Parameters
    ----------
    df_domain : pd.DataFrame
        Rows of one domain; must contain the three columns named above.

    Returns
    -------
    pd.DataFrame
        A new frame with the three columns rescaled and the two derived
        columns appended. The input frame is left untouched.
    """
    # Work on an explicit copy: assigning columns onto a filtered slice of
    # df_expertise is the chained-assignment pattern pandas warns about.
    scored = df_domain.copy()
    for feature in ('expertise', 'workExperience', 'lastExperience'):
        # MinMaxScaler expects a 2-D (n_samples, 1) array, hence the reshape.
        scored[feature] = MinMaxScaler().fit_transform(np.array(scored[feature]).reshape(-1,1))
    scored['inferredExpertise'] = (scored['expertise'] + scored['workExperience'] + scored['lastExperience']) / 3
    scored['expertiseLevel'] = pd.qcut(scored['inferredExpertise'], 3, labels=['N', 'I', 'E'])
    return scored


# Scale and bin each domain independently of the others.
df_expertise_text = infer_expertise_levels(df_expertise[df_expertise.domain == 'text'])
df_expertise_image = infer_expertise_levels(df_expertise[df_expertise.domain == 'image'])
df_expertise_mesh = infer_expertise_levels(df_expertise[df_expertise.domain == 'mesh'])

# Stack the domains back together. reset_index() keeps the fresh 0..n-1 row
# number as an explicit `index` column, which appears in the outputs below.
# NOTE: this rebinds df_expertise, so re-run the loading cell before re-running
# this one; otherwise reset_index() adds a further `level_0` column.
df_expertise = pd.concat([df_expertise_text, df_expertise_image, df_expertise_mesh], ignore_index=True).reset_index()
df_expertise

Unnamed: 0,index,userID,age,domain,expertise,workExperience,lastExperience,inferredExpertise,expertiseLevel
0,0,6116ee8fcd5414630d3527e8,30,text,1.0,1.0,0.333333,0.777778,E
1,1,611f5fca1c852d6d56e5bf8d,31,text,0.666667,0.694215,1.0,0.786961,E
2,2,61687f084dd0608f43a02d0e,27,text,0.333333,0.049587,0.166667,0.183196,N
3,3,5d8c9182bc22d700191aef2b,39,text,1.0,0.272727,1.0,0.757576,E
4,4,611e182dfe4a100e7d271c7e,27,text,0.666667,0.0,0.166667,0.277778,N
5,5,5eceef5fa487421604c337ba,25,text,1.0,0.0,0.833333,0.611111,E
6,6,61227cb134d2ee7a4a125ef7,25,text,0.333333,0.066116,0.5,0.299816,N
7,7,5aad8904e1546900019aef8e,39,text,0.333333,0.0,0.666667,0.333333,N
8,8,602fd43a4bcf02d4aac8ef3c,23,text,0.666667,0.049587,0.333333,0.349862,I
9,9,5fa507fa0776431b370bc0c7,41,text,1.0,1.0,0.333333,0.777778,E


In [3]:
# Count the non-null entries of every column for each (domain, expertiseLevel)
# pair, i.e. how many rows landed in each quantile bin within each domain.
level_counts = df_expertise.groupby(['domain', 'expertiseLevel']).count()
level_counts.reset_index()

Unnamed: 0,domain,expertiseLevel,index,userID,age,expertise,workExperience,lastExperience,inferredExpertise
0,image,N,7,7,7,7,7,7,7
1,image,I,7,7,7,7,7,7,7
2,image,E,6,6,6,6,6,6,6
3,mesh,N,7,7,7,7,7,7,7
4,mesh,I,7,7,7,7,7,7,7
5,mesh,E,6,6,6,6,6,6,6
6,text,N,7,7,7,7,7,7,7
7,text,I,6,6,6,6,6,6,6
8,text,E,7,7,7,7,7,7,7
