In [1]:
import numpy as np
import pandas as pd
import pingouin as pg
from scipy.stats import zscore, spearmanr
from tqdm import tqdm

# Read Human Ratings

### Simplicity-DA

In [2]:
# Load the Simplicity-DA ratings from the per-annotator ratings folder
# (presumably one row per individual rating — TODO confirm against the CSV).
df_ratings_simp = pd.read_csv("../data/ratings_per_annotator/simplicity_DA_ratings.csv")

In [3]:
def standardise_ratings(df, rater_id, aspect):
    """Z-score the ``aspect`` column separately within each rater's ratings.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding at least the ``rater_id`` and ``aspect`` columns.
    rater_id : str
        Name of the column used to group rows by rater.
    aspect : str
        Name of the column whose values are standardised.

    Returns
    -------
    pandas.Series
        The standardised values, indexed like ``df``.
    """
    ratings_by_rater = df.groupby(by=rater_id)[aspect]

    def _zscore_group(group_ratings):
        # Each rater's ratings are standardised with that rater's own mean/std.
        return zscore(group_ratings)

    return ratings_by_rater.transform(_zscore_group)

# Store a per-rater standardised copy of the raw "simplicity" scores next to the originals.
df_ratings_simp["simplicity_zscore"] = standardise_ratings(df_ratings_simp, rater_id="rater_id", aspect="simplicity")

### Simplicity Gain

In [7]:
# Load the Simplicity Gain ratings from the per-annotator ratings folder
# (presumably one row per individual rating — TODO confirm against the CSV).
df_ratings_simpgain = pd.read_csv("../data/ratings_per_annotator/simplicity_gain_ratings.csv")

### Structural Simplicity

In [6]:
# Load the Structural Simplicity ratings from the per-annotator ratings folder
# (presumably one row per individual rating — TODO confirm against the CSV).
df_ratings_struct = pd.read_csv("../data/ratings_per_annotator/structural_simplicity_ratings.csv")

# Compute ICC

### Simplicity-DA

In [8]:
# Identify each rated segment by concatenating its sentence id and system name.
df_ratings_simp['segment_id'] = (
    df_ratings_simp['sent_id'].astype(str) + df_ratings_simp['sys_name']
)
# Number the ratings of every segment 0, 1, 2, ... in row order; this running
# index (not the real rater id) is what is passed to the ICC as the "rater".
df_ratings_simp["rater_num"] = df_ratings_simp.groupby("segment_id").cumcount()

# Intraclass Correlation Coefficient (ICC) on the per-rater z-scores, rounded to 3 decimals
icc = pg.intraclass_corr(
    data=df_ratings_simp,
    targets='segment_id',
    raters='rater_num',
    ratings='simplicity_zscore',
).round(3)

icc

Unnamed: 0,Type,Description,ICC,F,df1,df2,pval,CI95%
0,ICC1,Single raters absolute,0.386,10.436,599,8400,0.0,"[0.36, 0.42]"
1,ICC2,Single random raters,0.386,10.438,599,8386,0.0,"[0.36, 0.42]"
2,ICC3,Single fixed raters,0.386,10.438,599,8386,0.0,"[0.36, 0.42]"
3,ICC1k,Average raters absolute,0.904,10.436,599,8400,0.0,"[0.89, 0.92]"
4,ICC2k,Average random raters,0.904,10.438,599,8386,0.0,"[0.89, 0.92]"
5,ICC3k,Average fixed raters,0.904,10.438,599,8386,0.0,"[0.89, 0.92]"


### Simplicity Gain

In [9]:
# Identify each rated segment by concatenating its sentence id and system name.
df_ratings_simpgain['segment_id'] = (
    df_ratings_simpgain['sent_id'].astype(str) + df_ratings_simpgain['sys_name']
)
# Number the ratings of every segment 0, 1, 2, ... in row order; this running
# index (not the real rater id) is what is passed to the ICC as the "rater".
df_ratings_simpgain["rater_num"] = df_ratings_simpgain.groupby("segment_id").cumcount()

# Intraclass Correlation Coefficient (ICC) on the raw simplicity-gain scores, rounded to 3 decimals
icc = pg.intraclass_corr(
    data=df_ratings_simpgain,
    targets='segment_id',
    raters='rater_num',
    ratings='simplicity_gain',
).round(3)

icc

Unnamed: 0,Type,Description,ICC,F,df1,df2,pval,CI95%
0,ICC1,Single raters absolute,0.176,2.067,371,1488,0.0,"[0.13, 0.22]"
1,ICC2,Single random raters,0.179,2.111,371,1484,0.0,"[0.14, 0.23]"
2,ICC3,Single fixed raters,0.182,2.111,371,1484,0.0,"[0.14, 0.23]"
3,ICC1k,Average raters absolute,0.516,2.067,371,1488,0.0,"[0.43, 0.59]"
4,ICC2k,Average random raters,0.521,2.111,371,1484,0.0,"[0.44, 0.59]"
5,ICC3k,Average fixed raters,0.526,2.111,371,1484,0.0,"[0.45, 0.6]"


### Structural Simplicity

In [10]:
# Identify each rated segment by concatenating its sentence id and system name.
df_ratings_struct['segment_id'] = (
    df_ratings_struct['sent_id'].astype(str) + df_ratings_struct['sys_name']
)

# Intraclass Correlation Coefficient (ICC), rounded to 3 decimals.
# Here the rater_id column itself is used as the rater identifier.
icc = pg.intraclass_corr(
    data=df_ratings_struct,
    targets='segment_id',
    raters='rater_id',
    ratings='structural_simplicity',
).round(3)

icc

Unnamed: 0,Type,Description,ICC,F,df1,df2,pval,CI95%
0,ICC1,Single raters absolute,0.465,3.605,1749,3500,0.0,"[0.44, 0.49]"
1,ICC2,Single random raters,0.476,4.073,1749,3498,0.0,"[0.41, 0.53]"
2,ICC3,Single fixed raters,0.506,4.073,1749,3498,0.0,"[0.48, 0.53]"
3,ICC1k,Average raters absolute,0.723,3.605,1749,3500,0.0,"[0.7, 0.74]"
4,ICC2k,Average random raters,0.731,4.073,1749,3498,0.0,"[0.68, 0.77]"
5,ICC3k,Average fixed raters,0.754,4.073,1749,3498,0.0,"[0.73, 0.77]"


# Compute Correlation

In [11]:
def simulate_two_annotators(ratings, num_ratings_annotatorA=1, rng=None):
    """Randomly split one segment's ratings between two simulated annotators.

    The ratings are shuffled; the first ``num_ratings_annotatorA`` shuffled
    values are averaged into annotator A's score and all remaining values are
    averaged into annotator B's score.

    Parameters
    ----------
    ratings : array-like
        The individual ratings to split (1-D).
    num_ratings_annotatorA : int, default 1
        How many of the shuffled ratings are assigned to annotator A.
    rng : numpy.random.Generator, optional
        Random generator used for the shuffle. When omitted, the global
        ``np.random`` state is used, so ``np.random.seed`` still controls the
        result exactly as before.

    Returns
    -------
    list
        ``[ratingA, ratingB]``, the two mean scores.

    Raises
    ------
    ValueError
        If the split would leave either annotator without any rating. The mean
        of an empty slice would otherwise be NaN and silently turn every
        correlation computed from it into NaN.
    """
    values = np.asarray(ratings)
    n_ratings = len(values)
    if not 0 < num_ratings_annotatorA < n_ratings:
        raise ValueError(
            f"Cannot give {num_ratings_annotatorA} of {n_ratings} rating(s) to annotator A: "
            "both simulated annotators need at least one rating."
        )

    permute = np.random.permutation if rng is None else rng.permutation
    ratings_shuffled = permute(values)
    ratingA = np.mean(ratings_shuffled[:num_ratings_annotatorA])
    ratingB = np.mean(ratings_shuffled[num_ratings_annotatorA:])
    return [ratingA, ratingB]

In [12]:
def compute_correlation(df_ratings, segment_id, aspects, n_simulations=1000, seed=None):
    """Estimate agreement between two simulated annotators via Spearman correlation.

    For every aspect, each simulation splits the ratings of every segment into
    two simulated annotators (``simulate_two_annotators``) and computes the
    Spearman correlation between the two resulting score lists across
    segments.

    Parameters
    ----------
    df_ratings : pandas.DataFrame
        Frame containing the ``segment_id`` column and one column per aspect.
    segment_id : str
        Name of the column that groups ratings belonging to the same segment.
    aspects : iterable of str
        Names of the rating columns to analyse.
    n_simulations : int, default 1000
        Number of random splits per aspect.
    seed : int, optional
        When given, the global NumPy RNG is seeded once before the simulations
        so that the run is repeatable. When omitted, the current global RNG
        state is used unchanged.

    Returns
    -------
    dict
        Maps each aspect to ``(mean, std)`` of the correlation over simulations.
    """
    if seed is not None:
        # The random splits draw from the global NumPy RNG, so seeding it here
        # is what makes the whole run reproducible.
        np.random.seed(seed)

    corr_per_aspect = {}
    for aspect in aspects:
        # The grouping does not change between simulations: build it once
        # instead of re-grouping the frame on every iteration.
        ratings_by_segment = df_ratings[[segment_id, aspect]].groupby(segment_id)[aspect]
        corr_values = []
        for _ in tqdm(range(n_simulations)):
            ratings_simulation = ratings_by_segment.apply(simulate_two_annotators).to_list()
            raterA, raterB = zip(*ratings_simulation)
            # spearmanr returns (correlation, p-value); only the coefficient is kept.
            corr_values.append(spearmanr(raterA, raterB)[0])
        corr_per_aspect[aspect] = (np.mean(corr_values), np.std(corr_values))
    return corr_per_aspect

### Simplicity-DA

In [13]:
# Mean and std of the Spearman correlation between two simulated annotators,
# over the default 1000 random splits, on the per-rater z-scored ratings.
compute_correlation(df_ratings=df_ratings_simp, segment_id="segment_id", aspects=["simplicity_zscore"])

100%|██████████| 1000/1000 [00:18<00:00, 54.88it/s]


{'simplicity_zscore': (0.6062359392691102, 0.024112537162681632)}

### Simplicity Gain

In [14]:
# Mean and std of the Spearman correlation between two simulated annotators,
# over the default 1000 random splits, on the raw simplicity-gain ratings.
compute_correlation(df_ratings=df_ratings_simpgain, segment_id="segment_id", aspects=["simplicity_gain"])

100%|██████████| 1000/1000 [00:11<00:00, 85.72it/s]


{'simplicity_gain': (0.3016893780647777, 0.037100423922126737)}

### Structural Simplicity

In [15]:
# Mean and std of the Spearman correlation between two simulated annotators,
# over the default 1000 random splits, on the raw structural-simplicity ratings.
compute_correlation(df_ratings=df_ratings_struct, segment_id="segment_id", aspects=["structural_simplicity"])

100%|██████████| 1000/1000 [00:53<00:00, 18.67it/s]


{'structural_simplicity': (0.5081216794461466, 0.012956320126838313)}