<a href="https://colab.research.google.com/github/dml2611/Chinese-Idioms/blob/main/krippendorff.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install Krippendorff
!pip install krippendorff

Collecting krippendorff
  Downloading krippendorff-0.6.1-py3-none-any.whl (18 kB)
Installing collected packages: krippendorff
Successfully installed krippendorff-0.6.1


In [3]:
# Import libraries
import pandas as pd
import numpy as np
import krippendorff
from sklearn.preprocessing import LabelEncoder

In [4]:
# Initialise the data path
data_path = ""     # Enter your data path

In [5]:
# Enter all the human evaluation results for different annotators, in this case 3
eval_1 = pd.read_excel(data_path + "")
eval_2 = pd.read_excel(data_path + "")
eval_3 = pd.read_excel(data_path + "")

In [22]:
# Encode all categorical values in numeric format
enc_a = LabelEncoder()
enc_i = LabelEncoder()
enc_f = LabelEncoder()

Accuracy = ['inaccurate', 'poor', 'moderate', 'good', 'high']
acc = enc_a.fit(Accuracy)

Intelligibility = ['unintelligible', 'little',  'much', 'most', 'all']
intel = enc_i.fit(Intelligibility)

Fluency = ['incomprehensible', 'disfluent', 'correct', 'good', 'elegant']
flue = enc_f.fit(Fluency)

In [23]:
# Krippendorff Alpha
def calculate_krippendorff_alpha(data):
    return krippendorff.alpha(reliability_data=data, level_of_measurement='ordinal')

def krippendorff_calc(num):
  # Accuracy
  a1 = acc.transform(eval_1[f'A{num}'])
  a2 = acc.transform(eval_2[f'A{num}'])
  a3 = acc.transform(eval_3[f'A{num}'])

  # Intelligibility
  i1 = intel.transform(eval_1[f'I{num}'])
  i2 = intel.transform(eval_2[f'I{num}'])
  i3 = intel.transform(eval_3[f'I{num}'])

  # Fluency
  f1 = flue.transform(eval_1[f'F{num}'])
  f2 = flue.transform(eval_2[f'F{num}'])
  f3 = flue.transform(eval_3[f'F{num}'])

  accuracy_data = np.array([a1, a2, a3])

  intelligibility_data = np.array([i1, i2, i3])

  fluency_data = np.array([f1, f2, f3])

  alpha_accuracy = calculate_krippendorff_alpha(accuracy_data)
  alpha_intelligibility = calculate_krippendorff_alpha(intelligibility_data)
  alpha_fluency = calculate_krippendorff_alpha(fluency_data)

  print(f"------------------------------| Model {num} |----------------------------------")
  print(f"Krippendorff's Alpha for Accuracy: {alpha_accuracy}")
  print(f"Krippendorff's Alpha for Intelligibility: {alpha_intelligibility}")
  print(f"Krippendorff's Alpha for Fluency: {alpha_fluency}")

  return alpha_accuracy, alpha_intelligibility, alpha_fluency

In [25]:
# Calculate Krippendorff's Alpha for all models, in this case 9 (GoogleTrans, Microsoft, DeepL, ChatGPT, Llama, GLM, VolcanoTrans, NiuTrans, Baidu)
for j in range(1, 10):
  exec(f'mt_{j}_alpha_accuracy, mt_{j}_alpha_intelligibility, mt_{j}_alpha_fluency = krippendorff_calc({j})')
  print("\n")

------------------------------| Model 1 |----------------------------------
Krippendorff's Alpha for Accuracy: 0.8145224977043158
Krippendorff's Alpha for Intelligibility: 0.8954307734654555
Krippendorff's Alpha for Fluency: 0.06594222222222224


------------------------------| Model 2 |----------------------------------
Krippendorff's Alpha for Accuracy: 0.30499637067505436
Krippendorff's Alpha for Intelligibility: 0.8686107044086546
Krippendorff's Alpha for Fluency: 0.10127476313522821


------------------------------| Model 3 |----------------------------------
Krippendorff's Alpha for Accuracy: 0.4457305657305657
Krippendorff's Alpha for Intelligibility: 0.49317957572767634
Krippendorff's Alpha for Fluency: -0.029203216374268814


------------------------------| Model 4 |----------------------------------
Krippendorff's Alpha for Accuracy: 0.29194144268957234
Krippendorff's Alpha for Intelligibility: 0.16252826775214824
Krippendorff's Alpha for Fluency: 0.5054621848739496


-------