# Labeling pilot

Analyze agreement on the 100 labels across 2-3 annotators.

## Setup

In [1]:
import json
from pyprojroot import here
from pathlib import Path
here()

PosixPath('/home/ck37/projects/clinical-sentiment-keywords')

## Import data

In [2]:
import pandas as pd

data_raw = here() / "data-raw"

# Exported from Label Studio
tsv_files = json_files = list(data_raw.glob("labels100-*.tsv"))

tsvs = {tsv_file.stem: pd.read_csv(tsv_file, sep = '\t') for tsv_file in tsv_files}
tsvs.keys()


dict_keys(['labels100-8082-mac', 'labels100-8080-mary', 'labels100-8081-dan'])

In [3]:
tsvs['labels100-8080-mary']['rater'] = 'mary'
tsvs['labels100-8081-dan']['rater'] = 'daniel'
tsvs['labels100-8082-mac']['rater'] = 'macgregor'

In [4]:
print(tsvs['labels100-8081-dan'].shape)
#svs['labels100-8081-dan']['overall-sentiment'].value_counts()
# There are two annotations for this excerpt for some reason.
tsvs['labels100-8081-dan'] = tsvs['labels100-8081-dan'][tsvs['labels100-8081-dan']['annotation_id'] != 20]
#tsvs['labels100-8081-dan'].drop_duplicates(subset = ['row_id'])
print(tsvs['labels100-8081-dan'].shape)

(101, 20)
(100, 20)


In [5]:
# Combine tsvs into a single df.
df = pd.concat(tsvs)

#print(df.describe())

print(f'Original annotator code counts:\n{df.annotator.value_counts()}\n')

# Drop the 2 annotations from #4 for now.
df = df.loc[df.annotator.values != 4]

print(f'Updated annotator code counts:\n{df.annotator.value_counts()}\n')
print(f'Rater counts:\n{df.rater.value_counts()}')

Original annotator code counts:
2    200
3    100
4      2
Name: annotator, dtype: int64

Updated annotator code counts:
2    200
3    100
Name: annotator, dtype: int64

Rater counts:
macgregor    100
mary         100
daniel       100
Name: rater, dtype: int64


In [6]:
df.sort_values(by = ['row_id'], inplace = True)
df.head(10)

Unnamed: 0,Unnamed: 1,row_id,sent_num,text,chars,words,keywords,keyword_count,id,aspect,uncertainty,overall-sentiment,aspect2-sentiment,aspect1-sentiment,annotator,annotation_id,created_at,updated_at,lead_time,aspect3-sentiment,rater
labels100-8082-mac,90,624,51,The patient's heart rate was improved and the ...,85,17,['improved'],1,10,"[{""start"": 4, ""end"": 24, ""text"": ""patient's he...",No,Positive,Positive,Positive,2,9,2022-01-26T21:24:36.827544Z,2022-01-26T21:24:36.827553Z,,,macgregor
labels100-8081-dan,91,624,51,The patient's heart rate was improved and the ...,85,17,['improved'],1,10,"[{""start"": 14, ""end"": 24, ""text"": ""heart rate""...",No,Positive,Positive,Positive,2,35,2022-02-16T04:30:18.479798Z,2022-02-16T04:30:18.479834Z,117.852,Positive,daniel
labels100-8080-mary,12,624,51,The patient's heart rate was improved and the ...,85,17,['improved'],1,93,"[{""start"": 14, ""end"": 24, ""text"": ""heart rate""...",No,Very Positive,Positive,Positive,3,159,2022-01-31T07:11:31.964366Z,2022-01-31T07:11:31.964409Z,24.881,,mary
labels100-8082-mac,59,6028,6,In the early afternoon she was so lethargic sh...,74,16,['unable'],1,41,"[{""start"": 31, ""end"": 73, ""text"": ""so lethargi...",Yes,Negative,,Negative,2,42,2022-01-31T00:47:14.031546Z,2022-01-31T00:47:14.031585Z,25.847,,macgregor
labels100-8080-mary,41,6028,6,In the early afternoon she was so lethargic sh...,74,16,['unable'],1,62,"[{""start"": 34, ""end"": 43, ""text"": ""lethargic"",...",Can't tell,Very Negative,Negative,Negative,3,128,2022-01-31T06:48:25.631980Z,2022-01-31T06:48:25.632020Z,32.69,,mary
labels100-8081-dan,61,6028,6,In the early afternoon she was so lethargic sh...,74,16,['unable'],1,41,"[{""start"": 34, ""end"": 43, ""text"": ""lethargic"",...",No,Very Negative,,Negative,2,13,2022-01-26T23:50:48.658822Z,2022-01-26T23:50:48.658831Z,,,daniel
labels100-8082-mac,27,6870,10,"On the floor, she reports some improvement and...",79,16,['improvement'],1,74,"[{""start"": 26, ""end"": 78, ""text"": ""some improv...",No,Positive,,Positive,2,75,2022-02-04T19:55:10.248898Z,2022-02-04T19:55:10.248970Z,129.56,,macgregor
labels100-8081-dan,27,6870,10,"On the floor, she reports some improvement and...",79,16,['improvement'],1,74,"[{""start"": 31, ""end"": 42, ""text"": ""improvement...",No,Positive,,Positive,2,76,2022-02-20T07:22:03.306734Z,2022-02-20T07:22:03.306778Z,277.387,,daniel
labels100-8080-mary,75,6870,10,"On the floor, she reports some improvement and...",79,16,['improvement'],1,28,"[{""start"": 31, ""end"": 42, ""text"": ""improvement...",No,Very Positive,Positive,Positive,3,43,2022-01-13T13:44:02.832653Z,2022-01-13T13:44:02.832706Z,34212.299,,mary
labels100-8082-mac,63,7995,169,This was re-positioned by interventional pulmo...,86,14,['resolution'],1,37,"[{""start"": 9, ""end"": 22, ""text"": ""re-positione...",No,Positive,Positive,Neutral,2,26,2022-01-26T21:24:36.828729Z,2022-01-26T21:24:36.828738Z,,,macgregor


## Overall sentiment

In [55]:
# We only need these two columns for sentiment analysis.
df2 = df[['row_id', 'text', 'rater', 'overall-sentiment']]
df3 = df2.pivot(index = ['row_id', 'text'], columns = ['rater'], values = 'overall-sentiment')

df3.reset_index(inplace = True)
#df3.drop(columns = ['annotator'], inplace = True)

df3.head()

In [58]:
df3.columns.name = ''
df3.columns

Index(['row_id', 'text', 'daniel', 'macgregor', 'mary'], dtype='object', name='')

In [59]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   row_id     100 non-null    int64 
 1   text       100 non-null    object
 2   daniel     100 non-null    object
 3   macgregor  100 non-null    object
 4   mary       100 non-null    object
dtypes: int64(1), object(4)
memory usage: 4.0+ KB


In [62]:
df3.to_excel("data/labeling-pilot.xlsx", index = False)

In [63]:
raters = ['daniel', 'macgregor', 'mary']
kdat = df3[raters].to_numpy().T

In [64]:
from krippendorff import alpha
# 72.1% reliability.
result = alpha(reliability_data = kdat.tolist(),
            level_of_measurement = "ordinal",
            value_domain = ['Very Negative', 'Negative', 'Neutral', 'Positive', 'Very Positive'])
print(round(result, 3))

0.721


### Collapsed coding (3)

In [65]:
df4 = df3.copy()
# Collapse to 3-value rating scale.
for rater in raters:
    df4[rater] = df4[rater].str.replace('Very Negative', 'Negative').replace('Very Positive', 'Positive')
    print(df4[rater].value_counts())

Positive    40
Negative    40
Neutral     20
Name: daniel, dtype: int64
Neutral     37
Negative    32
Positive    31
Name: macgregor, dtype: int64
Positive    51
Negative    33
Neutral     16
Name: mary, dtype: int64


In [66]:
# 75.4% reliability.
result = alpha(reliability_data = df4[raters].to_numpy().T.tolist(),
            level_of_measurement = "ordinal",
            value_domain = ['Negative', 'Neutral', 'Positive'])
print(round(result, 3))

0.754


In [67]:
#df4[['rater2', 'rater3']].to_feather("data/labeling-pilot.feather")
df4[raters].to_excel("data/labeling-pilot-collapsed.xlsx", index = False)

## Uncertainty

In [21]:
# We only need these two columns for sentiment analysis.
df2 = df[['row_id', 'text', 'rater', 'uncertainty']]
print(f"Rating distribution:\n{df2.uncertainty.value_counts()}")
df3 = df2.pivot(index = ['row_id', 'text'], columns = ['rater'], values = 'uncertainty')

df3.reset_index(inplace = True)
#df3.drop(columns = ['annotator'], inplace = True)

df3.head()

Rating distribution:
No            211
Yes            69
Can't tell     20
Name: uncertainty, dtype: int64


rater,row_id,text,daniel,macgregor,mary
0,624,The patient's heart rate was improved and the ...,No,No,No
1,6028,In the early afternoon she was so lethargic sh...,No,Yes,Can't tell
2,6870,"On the floor, she reports some improvement and...",No,No,No
3,7995,This was re-positioned by interventional pulmo...,No,No,No
4,8773,PHYSICAL EXAMINATION: The patient was afebrile...,No,No,Can't tell


In [22]:
df3.columns.name = ''
df3.columns

Index(['row_id', 'text', 'daniel', 'macgregor', 'mary'], dtype='object', name='')

In [23]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   row_id     100 non-null    int64 
 1   text       100 non-null    object
 2   daniel     100 non-null    object
 3   macgregor  100 non-null    object
 4   mary       100 non-null    object
dtypes: int64(1), object(4)
memory usage: 4.0+ KB


In [24]:
df3.to_excel("data/labeling-pilot-uncertainty.xlsx", index = False)

In [25]:
raters = ['daniel', 'macgregor', 'mary']
kdat = df3[raters].to_numpy().T

In [27]:
from krippendorff import alpha
# 38.5%
result = alpha(reliability_data = kdat.tolist(),
            level_of_measurement = "ordinal",
            value_domain = ['No', 'Can\'t tell', 'Yes'])
print(round(result, 3))

0.385


### Collapsed coding (2)

In [28]:
df4 = df3.copy()
# Collapse to 3-value rating scale.
for rater in raters:
    df4[rater] = df4[rater].str.replace('Can\'t tell', 'No')
    print(df4[rater].value_counts())

No     78
Yes    22
Name: daniel, dtype: int64
No     68
Yes    32
Name: macgregor, dtype: int64
No     85
Yes    15
Name: mary, dtype: int64


In [30]:
# 40 reliability.
result = alpha(reliability_data = df4[raters].to_numpy().T.tolist(),
            level_of_measurement = "ordinal",
            value_domain = ['No', 'Yes'])
print(round(result, 3))

0.4
