# Data Helper

In [None]:
# Mount Google Drive at /content/drive so the project files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Project directory (relative to the Drive mount point) and the data sub-folder inside it.
workfolder = "MyDrive/HumanLLMTextAnswerAgg/ChatGPT/"
folder = "data/"

import os
# Change into the project directory so that paths built from `folder` resolve relative to it.
os.chdir('/content/drive')
os.chdir(workfolder)
# List the directory as a quick visual check that the chdir landed in the expected place.
!ls

In [None]:
import pandas as pd
import numpy as np

def loaddata(labelfilename,gtfilename):
  """Read the worker-answer file and the ground-truth file of one dataset.

  Both files are tab-separated UTF-8 text with a header row. The ground-truth
  file must provide the columns 'sentence' and 'trueanswer'; the label file
  must provide 'sentence', 'worker' and 'workeranswer'.

  Args:
    labelfilename: path of the worker-answer TSV file.
    gtfilename: path of the ground-truth TSV file.

  Returns:
    A tuple (workers, sentences, swlabels, truelabels):
      workers    - worker ids in order of first appearance in the label file.
      sentences  - sentences in ground-truth file order.
      swlabels   - one (sentenceid, workerid, label) tuple per label row, where
                   the ids are positions in `sentences` / `workers` and the
                   label is the answer with surrounding whitespace stripped.
      truelabels - dict mapping each sentence to its true answer (the last row
                   wins if a sentence is repeated in the ground-truth file).

  Raises:
    ValueError: if a label row refers to a sentence that is not in the
      ground-truth file.
  """
  gtdf = pd.read_csv(gtfilename,sep='\t',encoding='utf-8')

  sentences = gtdf['sentence'].values.tolist()
  truelabels = dict(zip(sentences, gtdf['trueanswer'].values.tolist()))

  # Position of the first occurrence of every sentence. This gives the same id
  # that list.index() would, but with a constant-time lookup per label row.
  sentenceids = {}
  for position, sentence in enumerate(sentences):
    sentenceids.setdefault(sentence, position)

  labeldf = pd.read_csv(labelfilename,sep='\t',encoding='utf-8')

  workers = []
  workerids = {}
  swlabels = []

  rows = zip(labeldf['worker'].tolist(), labeldf['sentence'].tolist(), labeldf['workeranswer'].tolist())
  for worker, sentence, answer in rows:
    workerid = workerids.get(worker)
    if workerid is None:
      # First time this worker is seen: it gets the next free position.
      workerid = len(workers)
      workerids[worker] = workerid
      workers.append(worker)
    if sentence not in sentenceids:
      raise ValueError('sentence %r from %s is not in the ground truth file %s' % (sentence, labelfilename, gtfilename))
    # NOTE: assumes every workeranswer cell is a string; an empty cell has no .strip().
    swlabels.append((sentenceids[sentence], workerid, answer.strip()))

  return (workers,sentences,swlabels,truelabels)

def labelformatconversion(workers,sentences,swlabels):
  """Turn the flat (sentenceid, workerid, label) triples into lookup tables.

  Every triple gets a running label id, which is its position in `swlabels`.

  Returns:
    A tuple (labellist, slabelidlists, sworkeridlists, wlabelidlists, wsentenceidlists):
      labellist        - the label text of every triple, indexed by label id.
      slabelidlists    - sentence -> label ids of the answers given to it.
      sworkeridlists   - sentence -> worker ids of those answers (same order).
      wlabelidlists    - worker -> label ids of the answers the worker gave.
      wsentenceidlists - worker -> sentence ids of those answers (same order).
  """
  # Start every worker / sentence with an empty list so that entries without
  # any answer are still present in the result.
  wlabelidlists = {w: [] for w in workers}
  wsentenceidlists = {w: [] for w in workers}
  slabelidlists = {s: [] for s in sentences}
  sworkeridlists = {s: [] for s in sentences}

  labellist = []
  for position, (sid, wid, answer) in enumerate(swlabels):
    labellist.append(answer)

    question = sentences[sid]
    slabelidlists[question].append(position)
    sworkeridlists[question].append(wid)

    annotator = workers[wid]
    wlabelidlists[annotator].append(position)
    wsentenceidlists[annotator].append(sid)

  return (labellist,slabelidlists,sworkeridlists,wlabelidlists,wsentenceidlists)

def truelabelformatonversion(sentences,truelabels):
  """Return the true labels as a list aligned with `sentences`.

  Entry i of the result is truelabels[sentences[i]].
  """
  return [truelabels[sentence] for sentence in sentences]

# Universal Sentence Encoder

In [None]:
# Install TensorFlow (any release >= 1.7; the version is not pinned).
!pip3 install --quiet "tensorflow>=1.7"
# Install TF-Hub.
!pip3 install --quiet tensorflow-hub
# Install seaborn.
!pip3 install --quiet seaborn

In [None]:
#import tensorflow as tf
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import tensorflow_hub as hub
import os
import re

Instructions for updating:
non-resource variables are not supported in the long term


In [None]:
#module_url = "https://tfhub.dev/google/universal-sentence-encoder/2" #@param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"]

In [None]:
# TF-Hub handle of the Universal Sentence Encoder (version 4); the Dataset*Embedding helpers pass it to hub.load().
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"

# Evaluation Method

In [None]:
DEFAULT_SIM_TYPE = 'COSINE'
def similarity(embed1, embed2, simtype = DEFAULT_SIM_TYPE):
  """Similarity between two embedding vectors.

  Args:
    embed1: first vector (NumPy array).
    embed2: second vector (NumPy array).
    simtype: name of the measure; only 'COSINE' is implemented.

  Returns:
    The cosine similarity: inner product divided by the product of the two
    Euclidean norms.

  Raises:
    ValueError: if `simtype` is not a supported measure. Previously this case
      fell through to the return statement with no value computed.
  """
  if simtype != 'COSINE':
    raise ValueError("unsupported simtype %r (only 'COSINE' is implemented)" % (simtype,))

  l1 = np.sqrt(np.sum(embed1**2))
  l2 = np.sqrt(np.sum(embed2**2))
  return np.inner(embed1,embed2) / (l1*l2)

## Embedding

In [None]:
def evaluationbyEmbedding(sentences, elabels, label_embeddings, truelabels, truelabel_embeddings):
  """Average embedding similarity between the estimated and the true answers.

  Args:
    sentences: sentence list; a sentence's position selects its row in
      `truelabel_embeddings`.
    elabels: sentence -> index into `label_embeddings` of the estimated answer.
    label_embeddings: embeddings of the candidate answers.
    truelabels: not used by this function.
    truelabel_embeddings: embeddings of the true answers.

  Returns:
    The mean of similarity(estimated, true) over the sentences in `elabels`.
  """
  pair_similarities = (
    similarity(label_embeddings[elabels[sentence]],
               truelabel_embeddings[sentences.index(sentence)])
    for sentence in elabels
  )
  return sum(pair_similarities)/len(elabels)


## GLEU

In [None]:
import nltk
import nltk.translate.gleu_score as gleu

# Download the 'punkt' tokenizer data only when NLTK cannot find it locally.
try:
  nltk.data.find('tokenizers/punkt')
except LookupError:
  nltk.download('punkt')

def evaluationbyGLEU(sentences, labels, elabelidxs, truelabels):
  """Average sentence-level GLEU of the estimated answers against the true answers.

  Args:
    sentences: the sentences to evaluate.
    labels: candidate answers (strings).
    elabelidxs: sentence -> index into `labels` of the estimated answer.
    truelabels: sentence -> true answer (string).

  Returns:
    The mean GLEU over `sentences`; both texts are tokenized by whitespace.
  """
  total = 0
  for sentence in sentences:
    estimated = labels[elabelidxs[sentence]]
    reference = truelabels[sentence]
    total += gleu.sentence_gleu([reference.split()], estimated.split())
  return total/len(sentences)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## METEOR

In [None]:
import nltk
import nltk.translate.meteor_score as meteor

# Download the 'punkt' tokenizer data only when NLTK cannot find it locally.
try:
  nltk.data.find('tokenizers/punkt')
except LookupError:
  nltk.download('punkt')

# WordNet is installed under the 'corpora/' category, so it has to be looked up
# by that path. The bare name 'wordnet' is never found, which made this guard
# trigger a download on every run.
try:
  nltk.data.find('corpora/wordnet')
except LookupError:
  nltk.download('wordnet')

def evaluationbyMETEOR(sentences, labels, elabelidxs, truelabels):
  """Average METEOR score of the estimated answers against the true answers.

  Args:
    sentences: the sentences to evaluate.
    labels: candidate answers (strings).
    elabelidxs: sentence -> index into `labels` of the estimated answer.
    truelabels: sentence -> true answer (string).

  Returns:
    The mean METEOR score over `sentences`; both texts are tokenized by whitespace.
  """
  total = 0
  for sentence in sentences:
    estimated = labels[elabelidxs[sentence]]
    reference = truelabels[sentence]
    total += meteor.meteor_score([reference.split()], estimated.split())
  return total/len(sentences)

[nltk_data] Downloading package wordnet to /root/nltk_data...


# Figures in Exploration Study

## Embedding Functions

### Worker Reliability

In [None]:
def WorkerReliabilityEmbedding(workers, sentences, wlabelidlists, wsentenceidlists, label_embeddings, truelabel_embeddings):
  """Mean embedding similarity of each worker's answers to the true answers.

  Args:
    workers: worker identifiers; result position j belongs to workers[j].
    sentences: not used; kept so existing calls keep working.
    wlabelidlists: worker -> row indices into `label_embeddings` of the
      worker's answers.
    wsentenceidlists: worker -> row indices into `truelabel_embeddings`,
      aligned entry by entry with wlabelidlists[worker].
    label_embeddings: embeddings of all worker answers.
    truelabel_embeddings: embeddings of the true answers.

  Returns:
    1-D float array of length len(workers).

  Raises:
    ZeroDivisionError: if a worker has no answers.
  """
  reliability = np.zeros(len(workers))

  for j, worker in enumerate(workers):
    labelids = wlabelidlists[worker]
    sentenceids = wsentenceidlists[worker]

    # Walk the aligned id lists directly; no per-worker copies of the
    # embedding rows are needed.
    totalsim = 0
    for labelid, sentenceid in zip(labelids, sentenceids):
      totalsim += similarity(label_embeddings[labelid], truelabel_embeddings[sentenceid])
    reliability[j] = totalsim / len(labelids)

  return reliability

def DatasetWorkerReliabilityEmbedding(labelfilename,gtfilename):
  """Load one dataset, embed its answers and return the per-worker reliability.

  Args:
    labelfilename: path of the worker-answer file, passed to loaddata().
    gtfilename: path of the ground-truth file, passed to loaddata().

  Returns:
    The result of WorkerReliabilityEmbedding() for this dataset.
  """
  (workers,sentences,swlabels,truelabels) = loaddata(labelfilename,gtfilename)
  (labellist,slabelidlists,sworkeridlists,wlabelidlists,wsentenceidlists) = labelformatconversion(workers,sentences,swlabels)
  (truelabellist) = truelabelformatonversion(sentences,truelabels)

  #embed = hub.Module(module_url)
  # The encoder is loaded again on every call.
  embed = hub.load(module_url)
  tf.logging.set_verbosity(tf.logging.ERROR)
  # Worker answers and true answers are embedded in two separate sessions;
  # each session runs the variable and table initializers before evaluating.
  with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    label_embeddings = session.run(embed(labellist))
  with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    truelabel_embeddings = session.run(embed(truelabellist))

  workerreliability = WorkerReliabilityEmbedding(workers, sentences, wlabelidlists, wsentenceidlists, label_embeddings, truelabel_embeddings)

  return workerreliability


### Question Reliability

In [None]:
def QuestionReliabilityEmbedding(sentences, slabelidlists, label_embeddings, truelabel_embeddings):
  """Mean embedding similarity between each question's answers and its true answer.

  Args:
    sentences: the questions; result position i belongs to sentences[i].
    slabelidlists: sentence -> row indices into `label_embeddings`.
    label_embeddings: embeddings of all worker answers (NumPy array).
    truelabel_embeddings: embeddings of the true answers, indexed by the
      position of the sentence in `sentences`.

  Returns:
    1-D float array of length len(sentences).
  """
  n_questions = len(sentences)
  reliability = np.zeros(n_questions)
  for pos, sentence in enumerate(sentences):
    reference = truelabel_embeddings[sentences.index(sentence)]
    answers = label_embeddings[slabelidlists[sentence]]
    total = 0
    for answer in answers:
      total += similarity(answer,reference)
    reliability[pos] = total / len(answers)

  return reliability

def DatasetQuestionReliabilityEmbedding(labelfilename,gtfilename):
  """Load one dataset, embed its answers and return the per-question reliability.

  Args:
    labelfilename: path of the worker-answer file, passed to loaddata().
    gtfilename: path of the ground-truth file, passed to loaddata().

  Returns:
    The result of QuestionReliabilityEmbedding() for this dataset.
  """
  (workers,sentences,swlabels,truelabels) = loaddata(labelfilename,gtfilename)
  (labellist,slabelidlists,sworkeridlists,wlabelidlists,wsentenceidlists) = labelformatconversion(workers,sentences,swlabels)
  (truelabellist) = truelabelformatonversion(sentences,truelabels)

  #embed = hub.Module(module_url)
  # The encoder is loaded again on every call.
  embed = hub.load(module_url)
  tf.logging.set_verbosity(tf.logging.ERROR)
  # Worker answers and true answers are embedded in two separate sessions;
  # each session runs the variable and table initializers before evaluating.
  with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    label_embeddings = session.run(embed(labellist))
  with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    truelabel_embeddings = session.run(embed(truelabellist))

  questionreliability = QuestionReliabilityEmbedding(sentences, slabelidlists, label_embeddings, truelabel_embeddings)

  return questionreliability

In [None]:
def QuestionReliabilityEmbeddingInter(sentences, slabelidlists, label_embeddings, truelabel_embeddings):
  """Mean pairwise embedding similarity among the answers to each question.

  Unlike QuestionReliabilityEmbedding, the answers are compared with each
  other, not with the true answer.

  Args:
    sentences: the questions; result position i belongs to sentences[i].
    slabelidlists: sentence -> row indices into `label_embeddings`.
    label_embeddings: embeddings of all worker answers (NumPy array).
    truelabel_embeddings: not used; kept so existing calls keep working.

  Returns:
    1-D float array of length len(sentences). A question with fewer than two
    answers has no pair to compare and gets NaN (this case used to raise
    ZeroDivisionError).
  """
  reliability = np.zeros(len(sentences))
  for i, sentence in enumerate(sentences):
    slabel_embeddings = label_embeddings[slabelidlists[sentence]]
    ns = len(slabel_embeddings)
    sim = 0
    count = 0
    # Every unordered pair (k1 < k2) is visited exactly once.
    for k1 in range(ns):
      label1_embedding = slabel_embeddings[k1]
      for k2 in range(k1+1,ns):
        sim += similarity(slabel_embeddings[k2],label1_embedding)
        count += 1
    reliability[i] = sim / count if count else np.nan

  return reliability

def DatasetQuestionReliabilityEmbeddingInter(labelfilename,gtfilename):
  """Load one dataset, embed its answers and return the per-question inter-answer similarity.

  Args:
    labelfilename: path of the worker-answer file, passed to loaddata().
    gtfilename: path of the ground-truth file, passed to loaddata().

  Returns:
    The result of QuestionReliabilityEmbeddingInter() for this dataset.
  """
  (workers,sentences,swlabels,truelabels) = loaddata(labelfilename,gtfilename)
  (labellist,slabelidlists,sworkeridlists,wlabelidlists,wsentenceidlists) = labelformatconversion(workers,sentences,swlabels)
  (truelabellist) = truelabelformatonversion(sentences,truelabels)

  #embed = hub.Module(module_url)
  # The encoder is loaded again on every call.
  embed = hub.load(module_url)
  tf.logging.set_verbosity(tf.logging.ERROR)
  # Worker answers and true answers are embedded in two separate sessions;
  # each session runs the variable and table initializers before evaluating.
  with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    label_embeddings = session.run(embed(labellist))
  with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    truelabel_embeddings = session.run(embed(truelabellist))

  questionreliability = QuestionReliabilityEmbeddingInter(sentences, slabelidlists, label_embeddings, truelabel_embeddings)

  return questionreliability

## GLEU Functions

### Worker GLEU

In [None]:
def WorkerReliabilityGLEU(workers, sentences, wlabelidlists, wsentenceidlists, labels, truelabels):
  """Mean sentence-level GLEU of each worker's answers against the true answers.

  Args:
    workers: worker identifiers; result position j belongs to workers[j].
    sentences: not used; kept so existing calls keep working.
    wlabelidlists: worker -> positions of the worker's answers in `labels`.
    wsentenceidlists: worker -> positions in `truelabels`, aligned entry by
      entry with wlabelidlists[worker].
    labels: all worker answers (strings); a NumPy array or a plain list.
    truelabels: true answers (strings) indexed by sentence position; a NumPy
      array or a plain list.

  Returns:
    1-D float array of length len(workers).

  Raises:
    ZeroDivisionError: if a worker has no answers.
  """
  reliability = np.zeros(len(workers))

  for j, worker in enumerate(workers):
    labelids = wlabelidlists[worker]
    sentenceids = wsentenceidlists[worker]

    # Elements are looked up one at a time rather than with a list index, so
    # callers are not forced to convert their lists to NumPy arrays first.
    totalsim = 0
    for labelid, sentenceid in zip(labelids, sentenceids):
      totalsim += gleu.sentence_gleu([truelabels[sentenceid].split()], labels[labelid].split())
    reliability[j] = totalsim / len(labelids)

  return reliability

### Question GLEU

In [None]:
def QuestionReliabilityGLEU(sentences, slabelidlists, labels, truelabels):
  """Mean sentence-level GLEU between each question's answers and its true answer.

  Args:
    sentences: the questions; result position i belongs to sentences[i].
    slabelidlists: sentence -> positions of its answers in `labels`.
    labels: all worker answers (NumPy array of strings).
    truelabels: true answers indexed by the position of the sentence in
      `sentences`.

  Returns:
    1-D float array of length len(sentences).
  """
  n_questions = len(sentences)
  reliability = np.zeros(n_questions)
  for pos, sentence in enumerate(sentences):
    reference = truelabels[sentences.index(sentence)]
    answers = labels[slabelidlists[sentence]]
    total = 0
    for answer in answers:
      total += gleu.sentence_gleu([reference.split()], answer.split())
    reliability[pos] = total / len(answers)

  return reliability

In [None]:
def QuestionReliabilityGLEUVec(sentences, slabelidlists, labels, truelabels):
  """Sentence-level GLEU of every individual answer, as a question-by-answer matrix.

  Args:
    sentences: the questions; row i belongs to sentences[i].
    slabelidlists: sentence -> positions of its answers in `labels`.
    labels: all worker answers (NumPy array of strings).
    truelabels: true answers indexed by the position of the sentence in
      `sentences`.

  Returns:
    2-D float array with one row per sentence. Column k holds the GLEU of the
    k-th answer to that sentence. The width is the largest number of answers
    any sentence has; rows with fewer answers are padded with zeros.
  """
  # Size the matrix by the longest answer list. Taking the width from the
  # first sentence only would overflow as soon as a later sentence has more
  # answers, and would fail outright for an empty sentence list.
  width = max((len(slabelidlists[sentence]) for sentence in sentences), default=0)
  reliabilityvec = np.zeros((len(sentences),width))
  for i, sentence in enumerate(sentences):
    truelabel = truelabels[sentences.index(sentence)]
    slabels = labels[slabelidlists[sentence]]
    for k in range(len(slabels)):
      reliabilityvec[i][k] = gleu.sentence_gleu([truelabel.split()], slabels[k].split())

  return reliabilityvec

In [None]:
def QuestionReliabilityGLEUInter(sentences, slabelidlists, labels, truelabels):
  """Mean pairwise sentence-level GLEU among the answers to each question.

  For every pair (k1 < k2) the earlier answer is used as the reference and the
  later one as the hypothesis.

  Args:
    sentences: the questions; result position i belongs to sentences[i].
    slabelidlists: sentence -> positions of its answers in `labels`.
    labels: all worker answers (NumPy array of strings).
    truelabels: not used; kept so existing calls keep working.

  Returns:
    1-D float array of length len(sentences). A question with fewer than two
    answers has no pair to compare and gets NaN (this case used to raise
    ZeroDivisionError).
  """
  reliabilityInter = np.zeros(len(sentences))
  for i, sentence in enumerate(sentences):
    slabels = labels[slabelidlists[sentence]]
    ns = len(slabels)
    sim = 0
    count = 0
    for k1 in range(ns):
      label1 = slabels[k1]
      for k2 in range(k1+1,ns):
        sim += gleu.sentence_gleu([label1.split()], slabels[k2].split())
        count += 1
    reliabilityInter[i] = sim / count if count else np.nan

  return reliabilityInter

## METEOR Functions

### Worker METEOR

In [None]:
def WorkerReliabilityMETEOR(workers, sentences, wlabelidlists, wsentenceidlists, labels, truelabels):
  """Mean METEOR score of each worker's answers against the true answers.

  Args:
    workers: worker identifiers; result position j belongs to workers[j].
    sentences: not used; kept so existing calls keep working.
    wlabelidlists: worker -> positions of the worker's answers in `labels`.
    wsentenceidlists: worker -> positions in `truelabels`, aligned entry by
      entry with wlabelidlists[worker].
    labels: all worker answers (strings); a NumPy array or a plain list.
    truelabels: true answers (strings) indexed by sentence position; a NumPy
      array or a plain list.

  Returns:
    1-D float array of length len(workers).

  Raises:
    ZeroDivisionError: if a worker has no answers.
  """
  reliability = np.zeros(len(workers))

  for j, worker in enumerate(workers):
    labelids = wlabelidlists[worker]
    sentenceids = wsentenceidlists[worker]

    # Elements are looked up one at a time rather than with a list index, so
    # callers are not forced to convert their lists to NumPy arrays first.
    totalsim = 0
    for labelid, sentenceid in zip(labelids, sentenceids):
      totalsim += meteor.meteor_score([truelabels[sentenceid].split()], labels[labelid].split())
    reliability[j] = totalsim / len(labelids)

  return reliability

### Question METEOR

In [None]:
def QuestionReliabilityMETEOR(sentences, slabelidlists, labels, truelabels):
  """Mean METEOR score between each question's answers and its true answer.

  Args:
    sentences: the questions; result position i belongs to sentences[i].
    slabelidlists: sentence -> positions of its answers in `labels`.
    labels: all worker answers (NumPy array of strings).
    truelabels: true answers indexed by the position of the sentence in
      `sentences`.

  Returns:
    1-D float array of length len(sentences).
  """
  n_questions = len(sentences)
  reliability = np.zeros(n_questions)
  for pos, sentence in enumerate(sentences):
    reference = truelabels[sentences.index(sentence)]
    answers = labels[slabelidlists[sentence]]
    total = 0
    for answer in answers:
      total += meteor.meteor_score([reference.split()], answer.split())
    reliability[pos] = total / len(answers)

  return reliability

In [None]:
def QuestionReliabilityMETEORInter(sentences, slabelidlists, labels, truelabels):
  """Mean pairwise METEOR score among the answers to each question.

  For every pair (k1 < k2) the earlier answer is used as the reference and the
  later one as the hypothesis.

  Args:
    sentences: the questions; result position i belongs to sentences[i].
    slabelidlists: sentence -> positions of its answers in `labels`.
    labels: all worker answers (NumPy array of strings).
    truelabels: not used; kept so existing calls keep working.

  Returns:
    1-D float array of length len(sentences). A question with fewer than two
    answers has no pair to compare and gets NaN (this case used to raise
    ZeroDivisionError).
  """
  reliability = np.zeros(len(sentences))
  for i, sentence in enumerate(sentences):
    slabels = labels[slabelidlists[sentence]]
    ns = len(slabels)
    sim = 0
    count = 0
    for k1 in range(ns):
      label1 = slabels[k1]
      for k2 in range(k1+1,ns):
        sim += meteor.meteor_score([label1.split()], slabels[k2].split())
        count += 1
    reliability[i] = sim / count if count else np.nan

  return reliability

## Crowd Only

### Embedding: Worker Reliability

In [None]:
# Worker reliability (embedding similarity) on the crowd label files of J1, T1 and T2.
crowd_files = [
  ('CrowdWSA2019_J1_label_anonymous.tsv', 'CrowdWSA2019_J1_gt.tsv'),
  ('CrowdWSA2019_T1_label_anonymous.tsv', 'CrowdWSA2019_T1_gt.tsv'),
  ('CrowdWSA2019_T2_label_anonymous.tsv', 'CrowdWSA2019_T2_gt.tsv'),
]
scores = []
for labelname, gtname in crowd_files:
  labelfilename = folder + labelname
  gtfilename = folder + gtname
  scores.append(DatasetWorkerReliabilityEmbedding(labelfilename,gtfilename))
(j1workerreliability, t1workerreliability, t2workerreliability) = scores

# One printed row per statistic (min, mean, max); columns are J1, T1, T2.
for stat in (np.min, np.mean, np.max):
  print([stat(j1workerreliability),stat(t1workerreliability),stat(t2workerreliability)])

[0.4233174607157707, 0.5826911341398955, 0.4725991033017635]
[0.6619557354187682, 0.7260422374769335, 0.7076946628163027]
[0.8794820020596187, 0.904344166815281, 0.8958068251609802]


### Embedding: Question Reliability

In [None]:
# Question reliability (embedding similarity) on the crowd label files of J1, T1 and T2.
crowd_files = [
  ('CrowdWSA2019_J1_label_anonymous.tsv', 'CrowdWSA2019_J1_gt.tsv'),
  ('CrowdWSA2019_T1_label_anonymous.tsv', 'CrowdWSA2019_T1_gt.tsv'),
  ('CrowdWSA2019_T2_label_anonymous.tsv', 'CrowdWSA2019_T2_gt.tsv'),
]
scores = []
for labelname, gtname in crowd_files:
  labelfilename = folder + labelname
  gtfilename = folder + gtname
  scores.append(DatasetQuestionReliabilityEmbedding(labelfilename,gtfilename))
(j1questionreliability, t1questionreliability, t2questionreliability) = scores

# One printed row per statistic (min, mean, max); columns are J1, T1, T2.
for stat in (np.min, np.mean, np.max):
  print([stat(j1questionreliability),stat(t1questionreliability),stat(t2questionreliability)])

[0.3222570061683655, 0.1263815749436617, 0.12064818777143956]
[0.6731647727542452, 0.7244636795036494, 0.7104700434468687]
[0.9459680497646332, 0.9583230972290039, 0.9546079576015473]


### Worker GLEU

In [None]:
# Per-worker GLEU on the crowd label files of J1, T1 and T2.
crowd_files = [
  ('CrowdWSA2019_J1_label_anonymous.tsv', 'CrowdWSA2019_J1_gt.tsv'),
  ('CrowdWSA2019_T1_label_anonymous.tsv', 'CrowdWSA2019_T1_gt.tsv'),
  ('CrowdWSA2019_T2_label_anonymous.tsv', 'CrowdWSA2019_T2_gt.tsv'),
]
scores = []
for labelname, gtname in crowd_files:
  labelfilename = folder + labelname
  gtfilename = folder + gtname

  (workers,sentences,swlabels,truelabels) = loaddata(labelfilename,gtfilename)
  (labellist,slabelidlists,sworkeridlists,wlabelidlists,wsentenceidlists) = labelformatconversion(workers,sentences,swlabels)
  truelabellist = truelabelformatonversion(sentences,truelabels)

  reliability = WorkerReliabilityGLEU(workers, sentences, wlabelidlists, wsentenceidlists, np.asarray(labellist), np.asarray(truelabellist))
  scores.append(reliability)
(j1workergleu, t1workergleu, t2workergleu) = scores

# One printed row per statistic (min, mean, max); columns are J1, T1, T2.
for stat in (np.min, np.mean, np.max):
  print([stat(j1workergleu),stat(t1workergleu),stat(t2workergleu)])

[0.07237291780635434, 0.06686639236772593, 0.050281626104241105]
[0.1868012686336684, 0.17641370367716142, 0.17155530496426646]
[0.5948484393066437, 0.5533564345646725, 0.4540110583805842]


### Question GLEU

In [None]:
# Per-question GLEU on the crowd label files of J1, T1 and T2.
crowd_files = [
  ('CrowdWSA2019_J1_label_anonymous.tsv', 'CrowdWSA2019_J1_gt.tsv'),
  ('CrowdWSA2019_T1_label_anonymous.tsv', 'CrowdWSA2019_T1_gt.tsv'),
  ('CrowdWSA2019_T2_label_anonymous.tsv', 'CrowdWSA2019_T2_gt.tsv'),
]
scores = []
for labelname, gtname in crowd_files:
  labelfilename = folder + labelname
  gtfilename = folder + gtname

  (workers,sentences,swlabels,truelabels) = loaddata(labelfilename,gtfilename)
  (labellist,slabelidlists,sworkeridlists,wlabelidlists,wsentenceidlists) = labelformatconversion(workers,sentences,swlabels)
  truelabellist = truelabelformatonversion(sentences,truelabels)

  quality = QuestionReliabilityGLEU(sentences, slabelidlists, np.asarray(labellist), np.asarray(truelabellist))
  scores.append(quality)
(j1questiongleu, t1questiongleu, t2questiongleu) = scores

# One printed row per statistic (min, mean, max); columns are J1, T1, T2.
for stat in (np.min, np.mean, np.max):
  print([stat(j1questiongleu),stat(t1questiongleu),stat(t2questiongleu)])

[0.006535947712418301, 0.006521739130434782, 0.009523809523809523]
[0.1929833036876219, 0.17403443483987385, 0.16158265928021784]
[0.6590909090909091, 0.4725563909774436, 0.4384761904761906]


### Worker METEOR

In [None]:
# Per-worker METEOR on the crowd label files of J1, T1 and T2.
crowd_files = [
  ('CrowdWSA2019_J1_label_anonymous.tsv', 'CrowdWSA2019_J1_gt.tsv'),
  ('CrowdWSA2019_T1_label_anonymous.tsv', 'CrowdWSA2019_T1_gt.tsv'),
  ('CrowdWSA2019_T2_label_anonymous.tsv', 'CrowdWSA2019_T2_gt.tsv'),
]
scores = []
for labelname, gtname in crowd_files:
  labelfilename = folder + labelname
  gtfilename = folder + gtname

  (workers,sentences,swlabels,truelabels) = loaddata(labelfilename,gtfilename)
  (labellist,slabelidlists,sworkeridlists,wlabelidlists,wsentenceidlists) = labelformatconversion(workers,sentences,swlabels)
  truelabellist = truelabelformatonversion(sentences,truelabels)

  reliability = WorkerReliabilityMETEOR(workers, sentences, wlabelidlists, wsentenceidlists, np.asarray(labellist), np.asarray(truelabellist))
  scores.append(reliability)
(j1workermeteor, t1workermeteor, t2workermeteor) = scores

# One printed row per statistic (min, mean, max); columns are J1, T1, T2.
for stat in (np.min, np.mean, np.max):
  print([stat(j1workermeteor),stat(t1workermeteor),stat(t2workermeteor)])

[0.2040843932501189, 0.17805898892215194, 0.15225078689455235]
[0.37615897743650556, 0.3770760280922258, 0.3630198403684655]
[0.7463835833007678, 0.7162633888123681, 0.647342194834532]


### Question METEOR

In [None]:
# Per-question METEOR on the crowd label files of J1, T1 and T2.
crowd_files = [
  ('CrowdWSA2019_J1_label_anonymous.tsv', 'CrowdWSA2019_J1_gt.tsv'),
  ('CrowdWSA2019_T1_label_anonymous.tsv', 'CrowdWSA2019_T1_gt.tsv'),
  ('CrowdWSA2019_T2_label_anonymous.tsv', 'CrowdWSA2019_T2_gt.tsv'),
]
scores = []
for labelname, gtname in crowd_files:
  labelfilename = folder + labelname
  gtfilename = folder + gtname

  (workers,sentences,swlabels,truelabels) = loaddata(labelfilename,gtfilename)
  (labellist,slabelidlists,sworkeridlists,wlabelidlists,wsentenceidlists) = labelformatconversion(workers,sentences,swlabels)
  truelabellist = truelabelformatonversion(sentences,truelabels)

  quality = QuestionReliabilityMETEOR(sentences, slabelidlists, np.asarray(labellist), np.asarray(truelabellist))
  scores.append(quality)
(j1questionmeteor, t1questionmeteor, t2questionmeteor) = scores

# One printed row per statistic (min, mean, max); columns are J1, T1, T2.
for stat in (np.min, np.mean, np.max):
  print([stat(j1questionmeteor),stat(t1questionmeteor),stat(t2questionmeteor)])

[0.057400309787111414, 0.015841769778777652, 0.030561754273594994]
[0.3860552547481079, 0.3785616206179462, 0.36038552223904163]
[0.8443340586752301, 0.7218620418700931, 0.6620051395112414]


## HumanAgg

### Embedding: Worker Reliability

In [None]:
# Worker reliability (embedding similarity) on the "humanagg" label files of J1, T1 and T2.
humanagg_files = [
  ('CrowdWSA2019_J1_label_anonymous_humanagg.tsv', 'CrowdWSA2019_J1_gt.tsv'),
  ('CrowdWSA2019_T1_label_anonymous_humanagg.tsv', 'CrowdWSA2019_T1_gt.tsv'),
  ('CrowdWSA2019_T2_label_anonymous_humanagg.tsv', 'CrowdWSA2019_T2_gt.tsv'),
]
scores = []
for labelname, gtname in humanagg_files:
  labelfilename = folder + labelname
  gtfilename = folder + gtname
  scores.append(DatasetWorkerReliabilityEmbedding(labelfilename,gtfilename))
(j1workerreliability, t1workerreliability, t2workerreliability) = scores

# One printed row per statistic (min, mean, max); columns are J1, T1, T2.
for stat in (np.min, np.mean, np.max):
  print([stat(j1workerreliability),stat(t1workerreliability),stat(t2workerreliability)])

[0.22428159415721893, 0.12815801799297333, 0.26995864510536194]
[0.6856763181317702, 0.7336201392129196, 0.7214679326086928]
[1.0, 1.0, 1.0000001192092896]


### Embedding: Question Reliability

In [None]:
# Question reliability (embedding similarity) on the "humanagg" label files of J1, T1 and T2.
humanagg_files = [
  ('CrowdWSA2019_J1_label_anonymous_humanagg.tsv', 'CrowdWSA2019_J1_gt.tsv'),
  ('CrowdWSA2019_T1_label_anonymous_humanagg.tsv', 'CrowdWSA2019_T1_gt.tsv'),
  ('CrowdWSA2019_T2_label_anonymous_humanagg.tsv', 'CrowdWSA2019_T2_gt.tsv'),
]
scores = []
for labelname, gtname in humanagg_files:
  labelfilename = folder + labelname
  gtfilename = folder + gtname
  scores.append(DatasetQuestionReliabilityEmbedding(labelfilename,gtfilename))
(j1questionreliability, t1questionreliability, t2questionreliability) = scores

# One printed row per statistic (min, mean, max); columns are J1, T1, T2.
for stat in (np.min, np.mean, np.max):
  print([stat(j1questionreliability),stat(t1questionreliability),stat(t2questionreliability)])

[0.23160867393016815, 0.1377883568406105, 0.032199914753437045]
[0.6982842175841331, 0.7429604621976612, 0.7281856881380081]
[0.9761502861976623, 0.9581658124923706, 0.9349516153335571]


### Worker GLEU

In [None]:
# Per-worker GLEU on the "humanagg" label files of J1, T1 and T2.
humanagg_files = [
  ('CrowdWSA2019_J1_label_anonymous_humanagg.tsv', 'CrowdWSA2019_J1_gt.tsv'),
  ('CrowdWSA2019_T1_label_anonymous_humanagg.tsv', 'CrowdWSA2019_T1_gt.tsv'),
  ('CrowdWSA2019_T2_label_anonymous_humanagg.tsv', 'CrowdWSA2019_T2_gt.tsv'),
]
scores = []
for labelname, gtname in humanagg_files:
  labelfilename = folder + labelname
  gtfilename = folder + gtname

  (workers,sentences,swlabels,truelabels) = loaddata(labelfilename,gtfilename)
  (labellist,slabelidlists,sworkeridlists,wlabelidlists,wsentenceidlists) = labelformatconversion(workers,sentences,swlabels)
  truelabellist = truelabelformatonversion(sentences,truelabels)

  reliability = WorkerReliabilityGLEU(workers, sentences, wlabelidlists, wsentenceidlists, np.asarray(labellist), np.asarray(truelabellist))
  scores.append(reliability)
(j1workergleu, t1workergleu, t2workergleu) = scores

# One printed row per statistic (min, mean, max); columns are J1, T1, T2.
for stat in (np.min, np.mean, np.max):
  print([stat(j1workergleu),stat(t1workergleu),stat(t2workergleu)])

[0.0, 0.021739130434782608, 0.0]
[0.21237081610419373, 0.18738060144552507, 0.18166888764533748]
[1.0, 0.7894736842105263, 0.6842105263157895]


### Question GLEU

In [None]:
# Per-question GLEU on the "humanagg" label files of J1, T1 and T2.
humanagg_files = [
  ('CrowdWSA2019_J1_label_anonymous_humanagg.tsv', 'CrowdWSA2019_J1_gt.tsv'),
  ('CrowdWSA2019_T1_label_anonymous_humanagg.tsv', 'CrowdWSA2019_T1_gt.tsv'),
  ('CrowdWSA2019_T2_label_anonymous_humanagg.tsv', 'CrowdWSA2019_T2_gt.tsv'),
]
scores = []
for labelname, gtname in humanagg_files:
  labelfilename = folder + labelname
  gtfilename = folder + gtname

  (workers,sentences,swlabels,truelabels) = loaddata(labelfilename,gtfilename)
  (labellist,slabelidlists,sworkeridlists,wlabelidlists,wsentenceidlists) = labelformatconversion(workers,sentences,swlabels)
  truelabellist = truelabelformatonversion(sentences,truelabels)

  quality = QuestionReliabilityGLEU(sentences, slabelidlists, np.asarray(labellist), np.asarray(truelabellist))
  scores.append(quality)
(j1questiongleu, t1questiongleu, t2questiongleu) = scores

# One printed row per statistic (min, mean, max); columns are J1, T1, T2.
for stat in (np.min, np.mean, np.max):
  print([stat(j1questiongleu),stat(t1questiongleu),stat(t2questiongleu)])

[0.0047619047619047615, 0.017391304347826087, 0.0]
[0.2260193818423277, 0.18188431807326452, 0.17690832435077364]
[0.8566433566433567, 0.5789473684210527, 0.5421052631578946]


### Worker METEOR

In [None]:
# Per-worker METEOR on the "humanagg" label files of J1, T1 and T2.
humanagg_files = [
  ('CrowdWSA2019_J1_label_anonymous_humanagg.tsv', 'CrowdWSA2019_J1_gt.tsv'),
  ('CrowdWSA2019_T1_label_anonymous_humanagg.tsv', 'CrowdWSA2019_T1_gt.tsv'),
  ('CrowdWSA2019_T2_label_anonymous_humanagg.tsv', 'CrowdWSA2019_T2_gt.tsv'),
]
scores = []
for labelname, gtname in humanagg_files:
  labelfilename = folder + labelname
  gtfilename = folder + gtname

  (workers,sentences,swlabels,truelabels) = loaddata(labelfilename,gtfilename)
  (labellist,slabelidlists,sworkeridlists,wlabelidlists,wsentenceidlists) = labelformatconversion(workers,sentences,swlabels)
  truelabellist = truelabelformatonversion(sentences,truelabels)

  reliability = WorkerReliabilityMETEOR(workers, sentences, wlabelidlists, wsentenceidlists, np.asarray(labellist), np.asarray(truelabellist))
  scores.append(reliability)
(j1workermeteor, t1workermeteor, t2workermeteor) = scores

# One printed row per statistic (min, mean, max); columns are J1, T1, T2.
for stat in (np.min, np.mean, np.max):
  print([stat(j1workermeteor),stat(t1workermeteor),stat(t2workermeteor)])

[0.04901960784313726, 0.03937007874015748, 0.04761904761904762]
[0.4156094318692456, 0.39085744805149214, 0.383969723883758]
[0.9985422740524781, 0.9169724770642201, 0.8287608596250572]


### Question METEOR

In [None]:
# Runs QuestionReliabilityMETEOR on the *_label_anonymous_humanagg.tsv label files.
# Each entry is (label file, ground-truth file), listed in J1, T1, T2 order.
_tsv_pairs = [
  ('CrowdWSA2019_J1_label_anonymous_humanagg.tsv', 'CrowdWSA2019_J1_gt.tsv'),
  ('CrowdWSA2019_T1_label_anonymous_humanagg.tsv', 'CrowdWSA2019_T1_gt.tsv'),
  ('CrowdWSA2019_T2_label_anonymous_humanagg.tsv', 'CrowdWSA2019_T2_gt.tsv'),
]

_scores = []
for _label_tsv, _gt_tsv in _tsv_pairs:
  labelfilename = folder + _label_tsv
  gtfilename = folder + _gt_tsv

  # Load the dataset and convert it to the list/dict layout the scorer takes.
  workers, sentences, swlabels, truelabels = loaddata(labelfilename, gtfilename)
  labellist, slabelidlists, sworkeridlists, wlabelidlists, wsentenceidlists = labelformatconversion(workers, sentences, swlabels)
  truelabellist = truelabelformatonversion(sentences, truelabels)

  quality = QuestionReliabilityMETEOR(sentences, slabelidlists, np.asarray(labellist), np.asarray(truelabellist))
  _scores.append(quality)

# After the loop the loader variables above hold the T2 data (the last pair).
j1questionmeteor, t1questionmeteor, t2questionmeteor = _scores

# One printed row per statistic (min, mean, max); columns are J1, T1, T2.
for _stat in (np.min, np.mean, np.max):
  print([_stat(j1questionmeteor), _stat(t1questionmeteor), _stat(t2questionmeteor)])

[0.04392998306041784, 0.03156632344033919, 0.03555764411027569]
[0.4397162519298426, 0.39265189819162094, 0.37463693494354033]
[0.9530227686703098, 0.8076119195645738, 0.7345149610271935]


## ChatGPT

### Embedding: Worker Reliability

In [None]:
# Runs DatasetWorkerReliabilityEmbedding on the 5temperature gpt-4-turbo-preview label files.
# Each entry is (label file, ground-truth file), listed in J1, T1, T2 order.
_tsv_pairs = [
  ('CrowdWSA2019_J1_label_5temperature_trial0_gpt-4-turbo-preview.tsv', 'CrowdWSA2019_J1_gt.tsv'),
  ('CrowdWSA2019_T1_label_5temperature_trial0_gpt-4-turbo-preview.tsv', 'CrowdWSA2019_T1_gt.tsv'),
  ('CrowdWSA2019_T2_label_5temperature_trial0_gpt-4-turbo-preview.tsv', 'CrowdWSA2019_T2_gt.tsv'),
]

_scores = []
for _label_tsv, _gt_tsv in _tsv_pairs:
  labelfilename = folder + _label_tsv
  gtfilename = folder + _gt_tsv
  _scores.append(DatasetWorkerReliabilityEmbedding(labelfilename, gtfilename))

# labelfilename / gtfilename are left pointing at the T2 files (the last pair).
j1workerreliability, t1workerreliability, t2workerreliability = _scores

# One printed row per statistic (min, mean, max, std); columns are J1, T1, T2.
for _stat in (np.min, np.mean, np.max, np.std):
  print([_stat(j1workerreliability), _stat(t1workerreliability), _stat(t2workerreliability)])

[0.7216353012546897, 0.7757952199876308, 0.7602277125418186]
[0.731328093700111, 0.778561170130968, 0.7651334211528301]
[0.7379359434843064, 0.7823864875733852, 0.7678966821730137]
[0.005317873534983634, 0.0028524968075661137, 0.003180362097036341]


### Embedding: Question Reliability

In [None]:
# Runs DatasetQuestionReliabilityEmbedding on the 3temperature gpt-4-turbo-preview label files.
# Each entry is (label file, ground-truth file), listed in J1, T1, T2 order.
_tsv_pairs = [
  ('CrowdWSA2019_J1_label_3temperature_trial0_gpt-4-turbo-preview.tsv', 'CrowdWSA2019_J1_gt.tsv'),
  ('CrowdWSA2019_T1_label_3temperature_trial0_gpt-4-turbo-preview.tsv', 'CrowdWSA2019_T1_gt.tsv'),
  ('CrowdWSA2019_T2_label_3temperature_trial0_gpt-4-turbo-preview.tsv', 'CrowdWSA2019_T2_gt.tsv'),
]

_scores = []
for _label_tsv, _gt_tsv in _tsv_pairs:
  labelfilename = folder + _label_tsv
  gtfilename = folder + _gt_tsv
  _scores.append(DatasetQuestionReliabilityEmbedding(labelfilename, gtfilename))

# labelfilename / gtfilename are left pointing at the T2 files (the last pair).
j1questionreliability, t1questionreliability, t2questionreliability = _scores

# One printed row per statistic (min, mean, max, std); columns are J1, T1, T2.
for _stat in (np.min, np.mean, np.max, np.std):
  print([_stat(j1questionreliability), _stat(t1questionreliability), _stat(t2questionreliability)])

[0.3219131529331207, 0.11951912939548492, 0.242936243613561]
[0.7287178416724006, 0.776251809646686, 0.7658956364790598]
[1.0000001192092896, 1.0000001192092896, 1.0]
[0.1463987707005359, 0.14334911865480052, 0.1497327417817354]


In [None]:
# Runs DatasetQuestionReliabilityEmbeddingInter on the 3temperature gpt-4-turbo-preview label files.
# NOTE: the results are stored under the same j1/t1/t2questionreliability names
# that the DatasetQuestionReliabilityEmbedding cell assigns, so running this
# cell replaces those earlier values.
# Each entry is (label file, ground-truth file), listed in J1, T1, T2 order.
_tsv_pairs = [
  ('CrowdWSA2019_J1_label_3temperature_trial0_gpt-4-turbo-preview.tsv', 'CrowdWSA2019_J1_gt.tsv'),
  ('CrowdWSA2019_T1_label_3temperature_trial0_gpt-4-turbo-preview.tsv', 'CrowdWSA2019_T1_gt.tsv'),
  ('CrowdWSA2019_T2_label_3temperature_trial0_gpt-4-turbo-preview.tsv', 'CrowdWSA2019_T2_gt.tsv'),
]

_scores = []
for _label_tsv, _gt_tsv in _tsv_pairs:
  labelfilename = folder + _label_tsv
  gtfilename = folder + _gt_tsv
  _scores.append(DatasetQuestionReliabilityEmbeddingInter(labelfilename, gtfilename))

# labelfilename / gtfilename are left pointing at the T2 files (the last pair).
j1questionreliability, t1questionreliability, t2questionreliability = _scores

# Only the mean is reported for this variant; columns are J1, T1, T2.
print('Inter-Annotator Agreement')
_means = [np.mean(_s) for _s in (j1questionreliability, t1questionreliability, t2questionreliability)]
print(_means)

Inter-Annotator Agreement
[0.9321979425748189, 0.9531522198518116, 0.9343030192454655]


### Worker GLEU

In [None]:
# Runs WorkerReliabilityGLEU on the 5temperature gpt-4-turbo-preview label files.
# Each entry is (label file, ground-truth file), listed in J1, T1, T2 order.
_tsv_pairs = [
  ('CrowdWSA2019_J1_label_5temperature_trial0_gpt-4-turbo-preview.tsv', 'CrowdWSA2019_J1_gt.tsv'),
  ('CrowdWSA2019_T1_label_5temperature_trial0_gpt-4-turbo-preview.tsv', 'CrowdWSA2019_T1_gt.tsv'),
  ('CrowdWSA2019_T2_label_5temperature_trial0_gpt-4-turbo-preview.tsv', 'CrowdWSA2019_T2_gt.tsv'),
]

_scores = []
for _label_tsv, _gt_tsv in _tsv_pairs:
  labelfilename = folder + _label_tsv
  gtfilename = folder + _gt_tsv

  # Load the dataset and convert it to the list/dict layout the scorer takes.
  workers, sentences, swlabels, truelabels = loaddata(labelfilename, gtfilename)
  labellist, slabelidlists, sworkeridlists, wlabelidlists, wsentenceidlists = labelformatconversion(workers, sentences, swlabels)
  truelabellist = truelabelformatonversion(sentences, truelabels)

  reliability = WorkerReliabilityGLEU(workers, sentences, wlabelidlists, wsentenceidlists, np.asarray(labellist), np.asarray(truelabellist))
  _scores.append(reliability)

# After the loop the loader variables above hold the T2 data (the last pair).
j1workergleu, t1workergleu, t2workergleu = _scores

# One printed row per statistic (min, mean, max, std); columns are J1, T1, T2.
for _stat in (np.min, np.mean, np.max, np.std):
  print([_stat(j1workergleu), _stat(t1workergleu), _stat(t2workergleu)])

[0.2705515066488463, 0.21435638120789244, 0.202531026383216]
[0.27285867768269834, 0.21841283025989106, 0.21362884604481375]
[0.2756277178785879, 0.2283486157506044, 0.2244390340335261]
[0.0017904976557243896, 0.005105379204494995, 0.008333882855115278]


### Question GLEU

In [None]:
# Runs QuestionReliabilityGLEU on the 3temperature gpt-4-turbo-preview label files.
# Each entry is (label file, ground-truth file), listed in J1, T1, T2 order.
_tsv_pairs = [
  ('CrowdWSA2019_J1_label_3temperature_trial0_gpt-4-turbo-preview.tsv', 'CrowdWSA2019_J1_gt.tsv'),
  ('CrowdWSA2019_T1_label_3temperature_trial0_gpt-4-turbo-preview.tsv', 'CrowdWSA2019_T1_gt.tsv'),
  ('CrowdWSA2019_T2_label_3temperature_trial0_gpt-4-turbo-preview.tsv', 'CrowdWSA2019_T2_gt.tsv'),
]

_scores = []
for _label_tsv, _gt_tsv in _tsv_pairs:
  labelfilename = folder + _label_tsv
  gtfilename = folder + _gt_tsv

  # Load the dataset and convert it to the list/dict layout the scorer takes.
  workers, sentences, swlabels, truelabels = loaddata(labelfilename, gtfilename)
  labellist, slabelidlists, sworkeridlists, wlabelidlists, wsentenceidlists = labelformatconversion(workers, sentences, swlabels)
  truelabellist = truelabelformatonversion(sentences, truelabels)

  quality = QuestionReliabilityGLEU(sentences, slabelidlists, np.asarray(labellist), np.asarray(truelabellist))
  _scores.append(quality)

# After the loop the loader variables above hold the T2 data (the last pair).
j1questiongleu, t1questiongleu, t2questiongleu = _scores

# One printed row per statistic (min, mean, max, std); columns are J1, T1, T2.
for _stat in (np.min, np.mean, np.max, np.std):
  print([_stat(j1questiongleu), _stat(t1questiongleu), _stat(t2questiongleu)])

[0.0, 0.0, 0.023809523809523808]
[0.27378122078105216, 0.21543527431494502, 0.21217187337826487]
[1.0, 0.7894736842105262, 0.7600000000000001]
[0.22504284384838327, 0.15869505448426044, 0.1629914542136718]


In [None]:
# Runs QuestionReliabilityGLEUInter on the 3temperature gpt-4-turbo-preview label files.
# NOTE: the results are stored under the same j1/t1/t2questiongleu names that
# the QuestionReliabilityGLEU cell assigns, so running this cell replaces those
# earlier values.
# Each entry is (label file, ground-truth file), listed in J1, T1, T2 order.
_tsv_pairs = [
  ('CrowdWSA2019_J1_label_3temperature_trial0_gpt-4-turbo-preview.tsv', 'CrowdWSA2019_J1_gt.tsv'),
  ('CrowdWSA2019_T1_label_3temperature_trial0_gpt-4-turbo-preview.tsv', 'CrowdWSA2019_T1_gt.tsv'),
  ('CrowdWSA2019_T2_label_3temperature_trial0_gpt-4-turbo-preview.tsv', 'CrowdWSA2019_T2_gt.tsv'),
]

_scores = []
for _label_tsv, _gt_tsv in _tsv_pairs:
  labelfilename = folder + _label_tsv
  gtfilename = folder + _gt_tsv

  # Load the dataset and convert it to the list/dict layout the scorer takes.
  workers, sentences, swlabels, truelabels = loaddata(labelfilename, gtfilename)
  labellist, slabelidlists, sworkeridlists, wlabelidlists, wsentenceidlists = labelformatconversion(workers, sentences, swlabels)
  truelabellist = truelabelformatonversion(sentences, truelabels)

  quality = QuestionReliabilityGLEUInter(sentences, slabelidlists, np.asarray(labellist), np.asarray(truelabellist))
  _scores.append(quality)

# After the loop the loader variables above hold the T2 data (the last pair).
j1questiongleu, t1questiongleu, t2questiongleu = _scores

# Only the mean is reported for this variant; columns are J1, T1, T2.
print('Inter-Annotator Agreement')
_means = [np.mean(_s) for _s in (j1questiongleu, t1questiongleu, t2questiongleu)]
print(_means)

Inter-Annotator Agreement
[0.7438322175614764, 0.7808905255240248, 0.7588866930519912]


### Worker METEOR

In [None]:
# Runs WorkerReliabilityMETEOR on the 5temperature gpt-4-turbo-preview label files.
# Each entry is (label file, ground-truth file), listed in J1, T1, T2 order.
_tsv_pairs = [
  ('CrowdWSA2019_J1_label_5temperature_trial0_gpt-4-turbo-preview.tsv', 'CrowdWSA2019_J1_gt.tsv'),
  ('CrowdWSA2019_T1_label_5temperature_trial0_gpt-4-turbo-preview.tsv', 'CrowdWSA2019_T1_gt.tsv'),
  ('CrowdWSA2019_T2_label_5temperature_trial0_gpt-4-turbo-preview.tsv', 'CrowdWSA2019_T2_gt.tsv'),
]

_scores = []
for _label_tsv, _gt_tsv in _tsv_pairs:
  labelfilename = folder + _label_tsv
  gtfilename = folder + _gt_tsv

  # Load the dataset and convert it to the list/dict layout the scorer takes.
  workers, sentences, swlabels, truelabels = loaddata(labelfilename, gtfilename)
  labellist, slabelidlists, sworkeridlists, wlabelidlists, wsentenceidlists = labelformatconversion(workers, sentences, swlabels)
  truelabellist = truelabelformatonversion(sentences, truelabels)

  reliability = WorkerReliabilityMETEOR(workers, sentences, wlabelidlists, wsentenceidlists, np.asarray(labellist), np.asarray(truelabellist))
  _scores.append(reliability)

# After the loop the loader variables above hold the T2 data (the last pair).
j1workermeteor, t1workermeteor, t2workermeteor = _scores

# One printed row per statistic (min, mean, max, std); columns are J1, T1, T2.
for _stat in (np.min, np.mean, np.max, np.std):
  print([_stat(j1workermeteor), _stat(t1workermeteor), _stat(t2workermeteor)])

[0.49874032517874584, 0.42472715150037915, 0.41024920189198716]
[0.50488034128122, 0.42940969523552575, 0.427814031381003]
[0.5102609856206389, 0.4391799480009519, 0.44071832388885007]
[0.0039061072151675884, 0.005101665333992812, 0.010294613225262518]


### Question METEOR

In [None]:
# Runs QuestionReliabilityMETEOR on the 3temperature gpt-4-turbo-preview label files.
# Each entry is (label file, ground-truth file), listed in J1, T1, T2 order.
_tsv_pairs = [
  ('CrowdWSA2019_J1_label_3temperature_trial0_gpt-4-turbo-preview.tsv', 'CrowdWSA2019_J1_gt.tsv'),
  ('CrowdWSA2019_T1_label_3temperature_trial0_gpt-4-turbo-preview.tsv', 'CrowdWSA2019_T1_gt.tsv'),
  ('CrowdWSA2019_T2_label_3temperature_trial0_gpt-4-turbo-preview.tsv', 'CrowdWSA2019_T2_gt.tsv'),
]

_scores = []
for _label_tsv, _gt_tsv in _tsv_pairs:
  labelfilename = folder + _label_tsv
  gtfilename = folder + _gt_tsv

  # Load the dataset and convert it to the list/dict layout the scorer takes.
  workers, sentences, swlabels, truelabels = loaddata(labelfilename, gtfilename)
  labellist, slabelidlists, sworkeridlists, wlabelidlists, wsentenceidlists = labelformatconversion(workers, sentences, swlabels)
  truelabellist = truelabelformatonversion(sentences, truelabels)

  quality = QuestionReliabilityMETEOR(sentences, slabelidlists, np.asarray(labellist), np.asarray(truelabellist))
  _scores.append(quality)

# After the loop the loader variables above hold the T2 data (the last pair).
j1questionmeteor, t1questionmeteor, t2questionmeteor = _scores

# One printed row per statistic (min, mean, max, std); columns are J1, T1, T2.
for _stat in (np.min, np.mean, np.max, np.std):
  print([_stat(j1questionmeteor), _stat(t1questionmeteor), _stat(t2questionmeteor)])

[0.0, 0.013333333333333334, 0.04732235573357069]
[0.5051334652022386, 0.42620994282241964, 0.4265553611202379]
[0.9994999999999999, 0.8251217578434704, 0.8630595523581134]
[0.2336206100690281, 0.19194781358823995, 0.19104400283960798]


In [None]:
# Runs QuestionReliabilityMETEORInter on the 3temperature gpt-4-turbo-preview label files.
# NOTE: the results are stored under the same j1/t1/t2questionmeteor names that
# the QuestionReliabilityMETEOR cell assigns, so running this cell replaces
# those earlier values.
# Each entry is (label file, ground-truth file), listed in J1, T1, T2 order.
_tsv_pairs = [
  ('CrowdWSA2019_J1_label_3temperature_trial0_gpt-4-turbo-preview.tsv', 'CrowdWSA2019_J1_gt.tsv'),
  ('CrowdWSA2019_T1_label_3temperature_trial0_gpt-4-turbo-preview.tsv', 'CrowdWSA2019_T1_gt.tsv'),
  ('CrowdWSA2019_T2_label_3temperature_trial0_gpt-4-turbo-preview.tsv', 'CrowdWSA2019_T2_gt.tsv'),
]

_scores = []
for _label_tsv, _gt_tsv in _tsv_pairs:
  labelfilename = folder + _label_tsv
  gtfilename = folder + _gt_tsv

  # Load the dataset and convert it to the list/dict layout the scorer takes.
  workers, sentences, swlabels, truelabels = loaddata(labelfilename, gtfilename)
  labellist, slabelidlists, sworkeridlists, wlabelidlists, wsentenceidlists = labelformatconversion(workers, sentences, swlabels)
  truelabellist = truelabelformatonversion(sentences, truelabels)

  quality = QuestionReliabilityMETEORInter(sentences, slabelidlists, np.asarray(labellist), np.asarray(truelabellist))
  _scores.append(quality)

# After the loop the loader variables above hold the T2 data (the last pair).
j1questionmeteor, t1questionmeteor, t2questionmeteor = _scores

# Only the mean is reported for this variant; columns are J1, T1, T2.
print('Inter-Annotator Agreement')
_means = [np.mean(_s) for _s in (j1questionmeteor, t1questionmeteor, t2questionmeteor)]
print(_means)

Inter-Annotator Agreement
[0.868317987887604, 0.8917817354834406, 0.8695972011693966]
