# PAP DATASET

### List all existing files in "dataset"

In [2]:
import os, ast

import pandas as pd 
import numpy as np


In [3]:
base_directory = "/mount/studenten/semantic-plausibility/datasets/pap"

# Recursively visit base_directory and all of its sub-directories; for every
# file found, print the containing directory and then the file name, each on
# its own line.
for root, dirs, files in os.walk(base_directory):
    for file in files:
        print(root, file, sep="\n")

/mount/studenten/semantic-plausibility/datasets/pap/additional-resources/eichel-ssiw-2023
Amendment.pdf
/mount/studenten/semantic-plausibility/datasets/pap/raw-annotations
README.md
/mount/studenten/semantic-plausibility/datasets/pap/raw-annotations
dataset.tsv
/mount/studenten/semantic-plausibility/datasets/pap/raw-annotations/mace_aggregations
aggregated_predictions.tsv
/mount/studenten/semantic-plausibility/datasets/pap/raw-annotations/mace_aggregations
2-class-dist.prediction
/mount/studenten/semantic-plausibility/datasets/pap/raw-annotations/mace_aggregations
4-class-dist.prediction
/mount/studenten/semantic-plausibility/datasets/pap/train-dev-test-split
README.md
/mount/studenten/semantic-plausibility/datasets/pap/train-dev-test-split/multiclass
train.csv
/mount/studenten/semantic-plausibility/datasets/pap/train-dev-test-split/multiclass
dev.csv
/mount/studenten/semantic-plausibility/datasets/pap/train-dev-test-split/multiclass
test.csv
/mount/studenten/semantic-plausibility/data

### Raw annotation

In [4]:
# Path of the raw annotation table, derived from base_directory so the dataset
# location only has to be changed in one place.
raw_data_fp = os.path.join(base_directory, "raw-annotations", "dataset.tsv")

In [5]:
# Load the tab-separated raw annotation file, then display the dtype pandas
# inferred for each column.
raw_df = pd.read_table(raw_data_fp)
raw_df.dtypes


event                       object
original_label              object
abstractness_combination    object
rating                      object
majority_multiclass         object
distribution_multiclass     object
majority_binary             object
distribution_binary         object
dtype: object

In [6]:
# Parse the stringified Python lists (e.g. "[5, 4, 4]") stored in these columns
# into real list objects.
# Values that are not strings (for example lists produced by a previous run of
# this cell) are passed through unchanged, so re-running the cell does not
# raise on `.strip()`.
lists = ['rating', 'distribution_multiclass', 'distribution_binary']
raw_df[lists] = raw_df[lists].map(
    lambda x: ast.literal_eval(x.strip()) if isinstance(x, str) else x
)
raw_df.describe()


Unnamed: 0,event,original_label,abstractness_combination,rating,majority_multiclass,distribution_multiclass,majority_binary,distribution_binary
count,1733,1733,1733,1733,1733,1733,1733,1733
unique,1733,2,27,1708,2,409,3,33
top,ability means mobility,implausible,m-a-a,"[5, 5, 5, 4, 4, 5, 4, 5, 5]",unsure,"[0.0, 0.0, 50.0, 50.0]",1,"[100.0, 0.0]"
freq,1,871,72,3,1667,23,931,159


#### Aspect 1: Basic statistics
* Number of events: 1733 (no missing data)
* Number of original labels: 2 (871 implausible)
* Number of unique abstractness combinations: 27
* For multiclass, a very large share of events have `unsure` as their majority label (1667/1733)
* For the binary setting, 931 of 1733 events have plausible (`1`) as their majority label
* Number of events where all annotators agree that the event is plausible (binary distribution `[100.0, 0.0]`, the most frequent one): 159


In [7]:
# Preview the first ten rows of the raw annotations.
raw_df.iloc[:10]

Unnamed: 0,event,original_label,abstractness_combination,rating,majority_multiclass,distribution_multiclass,majority_binary,distribution_binary
0,ability means mobility,plausible,a-m-a,"[2, 5, 4, 5, 5, 2, 5, 5, 5, 5]",5,"[0.0, 20.0, 10.0, 70.0]",1,"[80.0, 20.0]"
1,ability permits multiplication,plausible,a-c-m,"[5, 5, 5, 5, 4, 5, 5, 4]",5,"[0.0, 0.0, 25.0, 75.0]",1,"[100.0, 0.0]"
2,access diminishes power,plausible,a-m-a,"[4, 2, 5, 5, 5, 4, 4, 2, 4]",unsure,"[0.0, 22.22222222222222, 44.44444444444444, 33...",1,"[77.77777777777779, 22.22222222222222]"
3,achievement reaches community,plausible,a-c-m,"[1, 4, 1, 4, 5, 4, 5, 4, 5, 4]",unsure,"[20.0, 0.0, 50.0, 30.0]",1,"[80.0, 20.0]"
4,acquisition casts glance,plausible,a-c-m,"[4, 5, 4, 4, 4, 4, 1, 2]",unsure,"[12.5, 12.5, 62.5, 12.5]",1,"[75.0, 25.0]"
5,action catches interest,plausible,a-c-a,"[4, 4, 4, 4, 5, 5, 5, 4]",unsure,"[0.0, 0.0, 62.5, 37.5]",1,"[100.0, 0.0]"
6,advance decreases time,plausible,a-m-m,"[1, 5, 4, 5, 5, 2, 5, 4, 4]",unsure,"[11.11111111111111, 11.11111111111111, 33.3333...",1,"[77.77777777777779, 22.22222222222222]"
7,advance guarantees freedom,plausible,a-a-a,"[4, 4, 2, 4, 4, 2, 4, 2, 4]",unsure,"[0.0, 33.33333333333333, 66.66666666666666, 0.0]",unsure,"[66.66666666666666, 33.33333333333333]"
8,advantage leads state,plausible,a-c-m,"[5, 4, 4, 4, 4, 4, 4, 1]",unsure,"[12.5, 0.0, 75.0, 12.5]",1,"[87.5, 12.5]"
9,advantage outweighs risk,plausible,a-m-a,"[4, 4, 5, 5, 2, 4, 4, 4, 1]",unsure,"[11.11111111111111, 11.11111111111111, 55.5555...",1,"[77.77777777777779, 22.22222222222222]"


On average, there are 8.98 ratings per event; every event has at least 8 ratings, and some events have as many as 12.

In [8]:
# Count how many individual ratings each event received (length of its
# rating list), then summarise the distribution of those counts.
raw_df['number_ratings'] = raw_df['rating'].map(len)
raw_df['number_ratings'].describe()

count    1733.000000
mean        8.984997
std         0.811867
min         8.000000
25%         8.000000
50%         9.000000
75%        10.000000
max        12.000000
Name: number_ratings, dtype: float64

In [9]:
# Per-event central rating. NOTE: despite the column name, this is the MEDIAN
# of each event's rating list, not the arithmetic mean; the name is kept so
# that later cells referring to 'average_ratings' keep working.
# np.median is applied to each list directly instead of first expanding every
# list into its own pd.Series (a wide, NaN-padded frame), which yields the same
# float values with far less overhead.
raw_df['average_ratings'] = raw_df['rating'].apply(np.median)
raw_df['average_ratings'].describe()

count    1733.000000
mean        3.786786
std         0.957367
min         1.000000
25%         4.000000
50%         4.000000
75%         4.000000
max         5.000000
Name: average_ratings, dtype: float64