# Data Helper

In [None]:
# Mount Google Drive at /content/drive so later cells can read files stored there.
# google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Project locations: `workfolder` is entered relative to the Drive mount point,
# and `folder` is prepended to the dataset file names in the cells further down.
workfolder = "MyDrive/HumanLLMTextAnswerAgg/Gemini/"
folder = "data/"

import os
# The first chdir is absolute, so re-running this cell ends in the same directory.
os.chdir('/content/drive')
os.chdir(workfolder)
# List the working directory as a quick check that the mount and path are right.
!ls

In [3]:
import pandas as pd
import numpy as np

def loaddata(labelfilename,gtfilename):
  """Load worker answers and ground-truth answers from two tab-separated files.

  Args:
    labelfilename: path of a UTF-8 TSV file with the columns 'worker',
      'sentence' and 'workeranswer' (one row per submitted answer).
    gtfilename: path of a UTF-8 TSV file with the columns 'sentence' and
      'trueanswer'.

  Returns:
    A tuple (workers, sentences, swlabels, truelabels):
      workers: worker identifiers in order of first appearance in the label file.
      sentences: the 'sentence' column of the ground-truth file, in file order.
      swlabels: one (sentenceid, workerid, label) tuple per label row; the ids
        index into `sentences` and `workers`, `label` is the stripped answer.
      truelabels: dict mapping each sentence to its 'trueanswer'.

  Raises:
    ValueError: if a label row refers to a sentence that is not in the
      ground-truth file.
  """
  gtdf = pd.read_csv(gtfilename,sep='\t',encoding='utf-8')

  sentences = gtdf['sentence'].values.tolist()
  # For a repeated sentence the last row wins, as with sequential assignment.
  truelabels = dict(zip(sentences, gtdf['trueanswer'].values.tolist()))

  # Position of the first occurrence of each sentence (what list.index returns),
  # kept in a dict so that every label row is resolved in constant time.
  sentenceids = {}
  for position,sentence in enumerate(sentences):
    sentenceids.setdefault(sentence, position)

  labeldf = pd.read_csv(labelfilename,sep='\t',encoding='utf-8')

  workers = []
  workerids = {}
  swlabels = []

  rows = zip(labeldf['worker'].values.tolist(),
             labeldf['sentence'].values.tolist(),
             labeldf['workeranswer'].values.tolist())
  for worker,sentence,answer in rows:
    if worker not in workerids:
      workerids[worker] = len(workers)
      workers.append(worker)
    if sentence not in sentenceids:
      # Keep the exception type that list.index raised for unknown sentences.
      raise ValueError('sentence %r from %s is not in %s' % (sentence, labelfilename, gtfilename))
    swlabels.append((sentenceids[sentence], workerids[worker], answer.strip()))

  return (workers,sentences,swlabels,truelabels)

def labelformatconversion(workers,sentences,swlabels):
  """Index the flat answer triples both by sentence and by worker.

  Args:
    workers: list of worker identifiers.
    sentences: list of sentences.
    swlabels: iterable of (sentenceid, workerid, label) tuples whose ids index
      into `sentences` and `workers`.

  Returns:
    A tuple (labellist, slabelidlists, sworkeridlists, wlabelidlists,
    wsentenceidlists):
      labellist: the labels in the order of `swlabels`.
      slabelidlists / sworkeridlists: dicts keyed by sentence holding, for each
        of its answers, the position in `labellist` and the worker id.
      wlabelidlists / wsentenceidlists: dicts keyed by worker holding, for each
        of their answers, the position in `labellist` and the sentence id.
  """
  wlabelidlists = {w: [] for w in workers}
  wsentenceidlists = {w: [] for w in workers}
  slabelidlists = {s: [] for s in sentences}
  sworkeridlists = {s: [] for s in sentences}

  labellist = []
  for position,(sid,wid,answer) in enumerate(swlabels):
    labellist.append(answer)

    question = sentences[sid]
    slabelidlists[question].append(position)
    sworkeridlists[question].append(wid)

    author = workers[wid]
    wlabelidlists[author].append(position)
    wsentenceidlists[author].append(sid)

  return (labellist,slabelidlists,sworkeridlists,wlabelidlists,wsentenceidlists)

def truelabelformatonversion(sentences,truelabels):
  """Return the true answers as a list aligned with `sentences`.

  Args:
    sentences: list of sentences.
    truelabels: mapping from sentence to its true answer.

  Returns:
    A list whose i-th entry is truelabels[sentences[i]].
  """
  return [truelabels[sentence] for sentence in sentences]

# Universal Sentence Encoder

In [4]:
# Install the latest Tensorflow version.
# NOTE(review): ">=1.7" is only a lower bound, so the version actually installed
# depends on when this cell is run; the later cells use the tensorflow.compat.v1 API.
!pip3 install --quiet "tensorflow>=1.7"
# Install TF-Hub.
!pip3 install --quiet tensorflow-hub
# NOTE(review): seaborn is installed here but not imported in the visible cells — confirm it is still needed.
!pip3 install --quiet seaborn

In [5]:
#import tensorflow as tf
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import tensorflow_hub as hub
import os
import re

Instructions for updating:
non-resource variables are not supported in the long term


In [None]:
#module_url = "https://tfhub.dev/google/universal-sentence-encoder/2" #@param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"]

In [6]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"

# Evaluation Method

In [7]:
DEFAULT_SIM_TYPE = 'COSINE'
def similarity(embed1, embed2, simtype = DEFAULT_SIM_TYPE):
  """Similarity between two embedding vectors.

  Args:
    embed1: first vector (numpy array or any sequence numpy can convert).
    embed2: second vector, same length as `embed1`.
    simtype: name of the measure; only 'COSINE' is implemented.

  Returns:
    The cosine of the angle between the two vectors, i.e. their inner product
    divided by the product of their Euclidean norms. A zero vector makes the
    denominator zero, so the result is then not a finite number.

  Raises:
    ValueError: if `simtype` is not a supported measure.
  """
  # Fail with a clear message instead of reaching the return with no value bound.
  if simtype != 'COSINE':
    raise ValueError("unsupported simtype %r; supported: 'COSINE'" % (simtype,))

  # Accept plain lists/tuples as well as arrays.
  embed1 = np.asarray(embed1)
  embed2 = np.asarray(embed2)
  l1 = np.sqrt(np.sum(embed1**2))
  l2 = np.sqrt(np.sum(embed2**2))
  return np.inner(embed1,embed2) / (l1*l2)

## Embedding

In [8]:
def evaluationbyEmbedding(sentences, elabels, label_embeddings, truelabels, truelabel_embeddings):
  """Average similarity between the estimated answers and the true answers.

  Args:
    sentences: list of sentences; a sentence's position selects its row in
      `truelabel_embeddings`.
    elabels: mapping from sentence to the index (into `label_embeddings`) of
      the answer chosen for it.
    label_embeddings: indexable collection of answer embeddings.
    truelabels: not used by this function.
    truelabel_embeddings: indexable collection of true-answer embeddings.

  Returns:
    The mean of similarity(estimate, truth) over the sentences in `elabels`.
  """
  def score(sentence):
    estimate = label_embeddings[elabels[sentence]]
    reference = truelabel_embeddings[sentences.index(sentence)]
    return similarity(estimate,reference)

  return sum(score(sentence) for sentence in elabels) / len(elabels)


## GLEU

In [9]:
import nltk
import nltk.translate.gleu_score as gleu

# Download the NLTK 'punkt' tokenizer data only when it is not already available locally.
# NOTE(review): evaluationbyGLEU below tokenizes with str.split(), so punkt may not be needed here — confirm.
try:
  nltk.data.find('tokenizers/punkt')
except LookupError:
  nltk.download('punkt')

def evaluationbyGLEU(sentences, labels, elabelidxs, truelabels):
  """Average sentence-level GLEU of the estimated answers against the true answers.

  Args:
    sentences: list of sentences to evaluate.
    labels: indexable collection of answer strings.
    elabelidxs: mapping from sentence to the index in `labels` of the answer
      chosen for it.
    truelabels: mapping from sentence to its true answer string.

  Returns:
    The mean GLEU over `sentences`; both strings are tokenized on whitespace
    and the true answer is the single reference.
  """
  scores = []
  for sentence in sentences:
    estimate = labels[elabelidxs[sentence]]
    reference = truelabels[sentence]
    scores.append(gleu.sentence_gleu([reference.split()], estimate.split()))
  return sum(scores)/len(sentences)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## METEOR

In [10]:
import nltk
import nltk.translate.meteor_score as meteor

# Download the NLTK 'punkt' tokenizer data only when it is not already available locally.
try:
  nltk.data.find('tokenizers/punkt')
except LookupError:
  nltk.download('punkt')

# WordNet is stored under the 'corpora/' category of the NLTK data directory.
# Looking it up without that prefix never succeeds, which triggered the
# download on every run of this cell.
try:
  nltk.data.find('corpora/wordnet')
except LookupError:
  nltk.download('wordnet')

def evaluationbyMETEOR(sentences, labels, elabelidxs, truelabels):
  """Average METEOR score of the estimated answers against the true answers.

  Args:
    sentences: list of sentences to evaluate.
    labels: indexable collection of answer strings.
    elabelidxs: mapping from sentence to the index in `labels` of the answer
      chosen for it.
    truelabels: mapping from sentence to its true answer string.

  Returns:
    The mean METEOR score over `sentences`; both strings are tokenized on
    whitespace and the true answer is the single reference.
  """
  scores = []
  for sentence in sentences:
    estimate = labels[elabelidxs[sentence]]
    reference = truelabels[sentence]
    scores.append(meteor.meteor_score([reference.split()], estimate.split()))
  return sum(scores)/len(sentences)

[nltk_data] Downloading package wordnet to /root/nltk_data...


# Figures in Exploration Study

## Embedding Functions

### Worker Reliability

In [11]:
def WorkerReliabilityEmbedding(workers, sentences, wlabelidlists, wsentenceidlists, label_embeddings, truelabel_embeddings):
  """Per-worker mean similarity between the worker's answers and the true answers.

  Args:
    workers: list of worker identifiers.
    sentences: list of sentences (not used in the computation).
    wlabelidlists: mapping worker -> list of row indices into `label_embeddings`.
    wsentenceidlists: mapping worker -> list of row indices into
      `truelabel_embeddings`, aligned with `wlabelidlists[worker]`.
    label_embeddings: array of answer embeddings (indexed with a list of rows).
    truelabel_embeddings: array of true-answer embeddings.

  Returns:
    A float array with one entry per worker, in the order of `workers`.
  """
  reliability = np.zeros(len(workers))

  for widx,worker in enumerate(workers):
    references = truelabel_embeddings[wsentenceidlists[worker]]
    answers = label_embeddings[wlabelidlists[worker]]
    answercount = len(answers)
    total = sum(similarity(answers[k],references[k]) for k in range(answercount))
    reliability[widx] = total / answercount

  return reliability

def DatasetWorkerReliabilityEmbedding(labelfilename,gtfilename):
  """Compute the embedding-based reliability of every worker in one dataset.

  Loads the two files with loaddata, embeds all worker answers and all true
  answers with the TF-Hub module at `module_url`, and scores each worker with
  WorkerReliabilityEmbedding.

  Args:
    labelfilename: path of the worker-answer file passed to loaddata.
    gtfilename: path of the ground-truth file passed to loaddata.

  Returns:
    The array returned by WorkerReliabilityEmbedding (one value per worker).
  """
  (workers,sentences,swlabels,truelabels) = loaddata(labelfilename,gtfilename)
  (labellist,slabelidlists,sworkeridlists,wlabelidlists,wsentenceidlists) = labelformatconversion(workers,sentences,swlabels)
  (truelabellist) = truelabelformatonversion(sentences,truelabels)

  # Previous loading call, kept for reference:
  #embed = hub.Module(module_url)
  # NOTE(review): the module is loaded again on every call of this function — consider loading it once and reusing it.
  embed = hub.load(module_url)
  tf.logging.set_verbosity(tf.logging.ERROR)
  # Worker answers and true answers are embedded in two separate sessions,
  # each of which runs the variable and table initializers before evaluating.
  with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    label_embeddings = session.run(embed(labellist))
  with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    truelabel_embeddings = session.run(embed(truelabellist))

  workerreliability = WorkerReliabilityEmbedding(workers, sentences, wlabelidlists, wsentenceidlists, label_embeddings, truelabel_embeddings)

  return workerreliability


### Question Reliability

In [12]:
def QuestionReliabilityEmbedding(sentences, slabelidlists, label_embeddings, truelabel_embeddings):
  """Per-question mean similarity between the collected answers and the true answer.

  Args:
    sentences: list of sentences (questions).
    slabelidlists: mapping sentence -> list of indices into `label_embeddings`
      of the answers given for that sentence.
    label_embeddings: indexable collection of answer embeddings.
    truelabel_embeddings: indexable collection of true-answer embeddings,
      aligned with `sentences`.

  Returns:
    A float array with one entry per sentence. A sentence without any answer
    gets NaN (its mean is undefined); use np.nanmean and friends to aggregate
    when such sentences can occur.
  """
  # First-occurrence position of each sentence (what list.index returns),
  # computed once instead of scanning the list for every sentence.
  firstindex = {}
  for position,sentence in enumerate(sentences):
    firstindex.setdefault(sentence, position)

  reliability = np.zeros(len(sentences))
  for i,sentence in enumerate(sentences):
    labelids = slabelidlists[sentence]
    ns = len(labelids)
    if ns == 0:
      # No answers: report "undefined" instead of dividing by zero.
      reliability[i] = np.nan
      continue
    truelabel_embedding = truelabel_embeddings[firstindex[sentence]]
    sim = 0
    for labelid in labelids:
      sim += similarity(label_embeddings[labelid],truelabel_embedding)
    reliability[i] = sim / ns

  return reliability

def DatasetQuestionReliabilityEmbedding(labelfilename,gtfilename):
  """Compute the embedding-based answer quality of every question in one dataset.

  Loads the two files with loaddata, embeds all worker answers and all true
  answers with the TF-Hub module at `module_url`, and scores each question
  with QuestionReliabilityEmbedding.

  Args:
    labelfilename: path of the worker-answer file passed to loaddata.
    gtfilename: path of the ground-truth file passed to loaddata.

  Returns:
    The array returned by QuestionReliabilityEmbedding (one value per sentence).
  """
  (workers,sentences,swlabels,truelabels) = loaddata(labelfilename,gtfilename)
  (labellist,slabelidlists,sworkeridlists,wlabelidlists,wsentenceidlists) = labelformatconversion(workers,sentences,swlabels)
  (truelabellist) = truelabelformatonversion(sentences,truelabels)

  # Previous loading call, kept for reference:
  #embed = hub.Module(module_url)
  # NOTE(review): the module is loaded again on every call of this function — consider loading it once and reusing it.
  embed = hub.load(module_url)
  tf.logging.set_verbosity(tf.logging.ERROR)
  # Worker answers and true answers are embedded in two separate sessions,
  # each of which runs the variable and table initializers before evaluating.
  with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    label_embeddings = session.run(embed(labellist))
  with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    truelabel_embeddings = session.run(embed(truelabellist))

  questionreliability = QuestionReliabilityEmbedding(sentences, slabelidlists, label_embeddings, truelabel_embeddings)

  return questionreliability

In [13]:
def QuestionReliabilityEmbeddingInter(sentences, slabelidlists, label_embeddings, truelabel_embeddings):
  """Per-question agreement: mean similarity over all pairs of collected answers.

  Args:
    sentences: list of sentences (questions).
    slabelidlists: mapping sentence -> list of indices into `label_embeddings`
      of the answers given for that sentence.
    label_embeddings: indexable collection of answer embeddings.
    truelabel_embeddings: not used; kept so the signature matches
      QuestionReliabilityEmbedding.

  Returns:
    A float array with one entry per sentence. A sentence with fewer than two
    answers has no pair to compare and gets NaN; use np.nanmean and friends
    to aggregate when such sentences can occur.
  """
  reliability = np.zeros(len(sentences))
  for i,sentence in enumerate(sentences):
    labelids = slabelidlists[sentence]
    ns = len(labelids)
    if ns < 2:
      # No answer pair exists: report "undefined" instead of dividing by zero.
      reliability[i] = np.nan
      continue
    sim = 0
    count = 0
    for k1 in range(ns):
      label1_embedding = label_embeddings[labelids[k1]]
      # Only k2 > k1, so every unordered pair is counted exactly once.
      for k2 in range(k1+1,ns):
        label2_embedding = label_embeddings[labelids[k2]]
        sim += similarity(label2_embedding,label1_embedding)
        count += 1
    reliability[i] = sim / count

  return reliability

def DatasetQuestionReliabilityEmbeddingInter(labelfilename,gtfilename):
  """Compute the embedding-based inter-answer agreement of every question in one dataset.

  Loads the two files with loaddata, embeds all worker answers and all true
  answers with the TF-Hub module at `module_url`, and scores each question
  with QuestionReliabilityEmbeddingInter.

  Args:
    labelfilename: path of the worker-answer file passed to loaddata.
    gtfilename: path of the ground-truth file passed to loaddata.

  Returns:
    The array returned by QuestionReliabilityEmbeddingInter (one value per sentence).
  """
  (workers,sentences,swlabels,truelabels) = loaddata(labelfilename,gtfilename)
  (labellist,slabelidlists,sworkeridlists,wlabelidlists,wsentenceidlists) = labelformatconversion(workers,sentences,swlabels)
  (truelabellist) = truelabelformatonversion(sentences,truelabels)

  # Previous loading call, kept for reference:
  #embed = hub.Module(module_url)
  # NOTE(review): the module is loaded again on every call of this function — consider loading it once and reusing it.
  embed = hub.load(module_url)
  tf.logging.set_verbosity(tf.logging.ERROR)
  # Worker answers and true answers are embedded in two separate sessions,
  # each of which runs the variable and table initializers before evaluating.
  with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    label_embeddings = session.run(embed(labellist))
  with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    truelabel_embeddings = session.run(embed(truelabellist))

  questionreliability = QuestionReliabilityEmbeddingInter(sentences, slabelidlists, label_embeddings, truelabel_embeddings)

  return questionreliability

## GLEU Functions

### Worker GLEU

In [14]:
def WorkerReliabilityGLEU(workers, sentences, wlabelidlists, wsentenceidlists, labels, truelabels):
  """Per-worker mean sentence-level GLEU of the worker's answers against the true answers.

  Args:
    workers: list of worker identifiers.
    sentences: list of sentences (not used in the computation).
    wlabelidlists: mapping worker -> list of indices into `labels`.
    wsentenceidlists: mapping worker -> list of indices into `truelabels`,
      aligned with `wlabelidlists[worker]`.
    labels: numpy array of answer strings (indexed with a list of positions).
    truelabels: numpy array of true-answer strings.

  Returns:
    A float array with one entry per worker, in the order of `workers`.
    Strings are tokenized on whitespace; the true answer is the reference.
  """
  reliability = np.zeros(len(workers))

  for widx,worker in enumerate(workers):
    references = truelabels[wsentenceidlists[worker]]
    answers = labels[wlabelidlists[worker]]
    answercount = len(answers)
    total = sum(
        gleu.sentence_gleu([references[k].split()], answers[k].split())
        for k in range(answercount))
    reliability[widx] = total / answercount

  return reliability

### Question GLEU

In [15]:
def QuestionReliabilityGLEU(sentences, slabelidlists, labels, truelabels):
  """Per-question mean sentence-level GLEU of the collected answers against the true answer.

  Args:
    sentences: list of sentences (questions).
    slabelidlists: mapping sentence -> list of indices into `labels` of the
      answers given for that sentence.
    labels: indexable collection of answer strings.
    truelabels: indexable collection of true-answer strings, aligned with
      `sentences`.

  Returns:
    A float array with one entry per sentence. Strings are tokenized on
    whitespace and the true answer is the single reference. A sentence without
    any answer gets NaN (its mean is undefined); use np.nanmean and friends to
    aggregate when such sentences can occur.
  """
  # First-occurrence position of each sentence (what list.index returns),
  # computed once instead of scanning the list for every sentence.
  firstindex = {}
  for position,sentence in enumerate(sentences):
    firstindex.setdefault(sentence, position)

  reliability = np.zeros(len(sentences))
  for i,sentence in enumerate(sentences):
    labelids = slabelidlists[sentence]
    ns = len(labelids)
    if ns == 0:
      # No answers: report "undefined" instead of dividing by zero.
      reliability[i] = np.nan
      continue
    truelabel = truelabels[firstindex[sentence]]
    sim = 0
    for labelid in labelids:
      sim += gleu.sentence_gleu([truelabel.split()], labels[labelid].split())
    reliability[i] = sim / ns

  return reliability

In [16]:
def QuestionReliabilityGLEUVec(sentences, slabelidlists, labels, truelabels):
  """Per-answer sentence-level GLEU against the true answer, as a matrix.

  Args:
    sentences: list of sentences (questions).
    slabelidlists: mapping sentence -> list of indices into `labels` of the
      answers given for that sentence.
    labels: indexable collection of answer strings.
    truelabels: indexable collection of true-answer strings, aligned with
      `sentences`.

  Returns:
    A float array of shape (len(sentences), W) where W is the largest number
    of answers any sentence has; entry [i][k] is the GLEU of the k-th answer
    of sentence i. Rows of sentences with fewer than W answers are padded with
    zeros. Strings are tokenized on whitespace.
  """
  # Size the matrix by the widest row rather than by the first sentence only,
  # so an empty sentence list or a later sentence with more answers is handled.
  width = max((len(slabelidlists[sentence]) for sentence in sentences), default=0)
  reliabilityvec = np.zeros((len(sentences),width))

  # First-occurrence position of each sentence (what list.index returns).
  firstindex = {}
  for position,sentence in enumerate(sentences):
    firstindex.setdefault(sentence, position)

  for i,sentence in enumerate(sentences):
    truelabel = truelabels[firstindex[sentence]]
    for k,labelid in enumerate(slabelidlists[sentence]):
      reliabilityvec[i][k] = gleu.sentence_gleu([truelabel.split()], labels[labelid].split())

  return reliabilityvec

In [17]:
def QuestionReliabilityGLEUInter(sentences, slabelidlists, labels, truelabels):
  """Per-question agreement: mean sentence-level GLEU over all pairs of collected answers.

  Args:
    sentences: list of sentences (questions).
    slabelidlists: mapping sentence -> list of indices into `labels` of the
      answers given for that sentence.
    labels: indexable collection of answer strings.
    truelabels: not used; kept so the signature matches QuestionReliabilityGLEU.

  Returns:
    A float array with one entry per sentence. For each pair the earlier
    answer is the reference and the later one the hypothesis; strings are
    tokenized on whitespace. A sentence with fewer than two answers has no
    pair to compare and gets NaN; use np.nanmean and friends to aggregate
    when such sentences can occur.
  """
  reliabilityInter = np.zeros(len(sentences))
  for i,sentence in enumerate(sentences):
    labelids = slabelidlists[sentence]
    ns = len(labelids)
    if ns < 2:
      # No answer pair exists: report "undefined" instead of dividing by zero.
      reliabilityInter[i] = np.nan
      continue
    sim = 0
    count = 0
    for k1 in range(ns):
      label1 = labels[labelids[k1]]
      # Only k2 > k1, so every unordered pair is counted exactly once.
      for k2 in range(k1+1,ns):
        label2 = labels[labelids[k2]]
        sim += gleu.sentence_gleu([label1.split()], label2.split())
        count += 1
    reliabilityInter[i] = sim / count

  return reliabilityInter

## METEOR Functions

### Worker METEOR

In [18]:
def WorkerReliabilityMETEOR(workers, sentences, wlabelidlists, wsentenceidlists, labels, truelabels):
  """Per-worker mean METEOR score of the worker's answers against the true answers.

  Args:
    workers: list of worker identifiers.
    sentences: list of sentences (not used in the computation).
    wlabelidlists: mapping worker -> list of indices into `labels`.
    wsentenceidlists: mapping worker -> list of indices into `truelabels`,
      aligned with `wlabelidlists[worker]`.
    labels: numpy array of answer strings (indexed with a list of positions).
    truelabels: numpy array of true-answer strings.

  Returns:
    A float array with one entry per worker, in the order of `workers`.
    Strings are tokenized on whitespace; the true answer is the reference.
  """
  reliability = np.zeros(len(workers))

  for widx,worker in enumerate(workers):
    references = truelabels[wsentenceidlists[worker]]
    answers = labels[wlabelidlists[worker]]
    answercount = len(answers)
    total = sum(
        meteor.meteor_score([references[k].split()], answers[k].split())
        for k in range(answercount))
    reliability[widx] = total / answercount

  return reliability

### Question METEOR

In [19]:
def QuestionReliabilityMETEOR(sentences, slabelidlists, labels, truelabels):
  """Per-question mean METEOR score of the collected answers against the true answer.

  Args:
    sentences: list of sentences (questions).
    slabelidlists: mapping sentence -> list of indices into `labels` of the
      answers given for that sentence.
    labels: indexable collection of answer strings.
    truelabels: indexable collection of true-answer strings, aligned with
      `sentences`.

  Returns:
    A float array with one entry per sentence. Strings are tokenized on
    whitespace and the true answer is the single reference. A sentence without
    any answer gets NaN (its mean is undefined); use np.nanmean and friends to
    aggregate when such sentences can occur.
  """
  # First-occurrence position of each sentence (what list.index returns),
  # computed once instead of scanning the list for every sentence.
  firstindex = {}
  for position,sentence in enumerate(sentences):
    firstindex.setdefault(sentence, position)

  reliability = np.zeros(len(sentences))
  for i,sentence in enumerate(sentences):
    labelids = slabelidlists[sentence]
    ns = len(labelids)
    if ns == 0:
      # No answers: report "undefined" instead of dividing by zero.
      reliability[i] = np.nan
      continue
    truelabel = truelabels[firstindex[sentence]]
    sim = 0
    for labelid in labelids:
      sim += meteor.meteor_score([truelabel.split()], labels[labelid].split())
    reliability[i] = sim / ns

  return reliability

In [20]:
def QuestionReliabilityMETEORInter(sentences, slabelidlists, labels, truelabels):
  """Per-question agreement: mean METEOR score over all pairs of collected answers.

  Args:
    sentences: list of sentences (questions).
    slabelidlists: mapping sentence -> list of indices into `labels` of the
      answers given for that sentence.
    labels: indexable collection of answer strings.
    truelabels: not used; kept so the signature matches QuestionReliabilityMETEOR.

  Returns:
    A float array with one entry per sentence. For each pair the earlier
    answer is the reference and the later one the hypothesis; strings are
    tokenized on whitespace. A sentence with fewer than two answers has no
    pair to compare and gets NaN; use np.nanmean and friends to aggregate
    when such sentences can occur.
  """
  reliability = np.zeros(len(sentences))
  for i,sentence in enumerate(sentences):
    labelids = slabelidlists[sentence]
    ns = len(labelids)
    if ns < 2:
      # No answer pair exists: report "undefined" instead of dividing by zero.
      reliability[i] = np.nan
      continue
    sim = 0
    count = 0
    for k1 in range(ns):
      label1 = labels[labelids[k1]]
      # Only k2 > k1, so every unordered pair is counted exactly once.
      for k2 in range(k1+1,ns):
        label2 = labels[labelids[k2]]
        sim += meteor.meteor_score([label1.split()], label2.split())
        count += 1
    reliability[i] = sim / count

  return reliability

## Crowd Only

### Embedding: Worker Reliability

In [None]:
# Embedding-based worker reliability on the crowd label files of the three
# datasets (J1, T1, T2). The four prints list, per dataset, the min, mean,
# max and standard deviation over workers.
# J1
labelfilename = folder + 'CrowdWSA2019_J1_label_anonymous.tsv'
gtfilename = folder + 'CrowdWSA2019_J1_gt.tsv'
j1workerreliability = DatasetWorkerReliabilityEmbedding(labelfilename,gtfilename)

# T1
labelfilename = folder + 'CrowdWSA2019_T1_label_anonymous.tsv'
gtfilename = folder + 'CrowdWSA2019_T1_gt.tsv'
t1workerreliability = DatasetWorkerReliabilityEmbedding(labelfilename,gtfilename)

# T2
labelfilename = folder + 'CrowdWSA2019_T2_label_anonymous.tsv'
gtfilename = folder + 'CrowdWSA2019_T2_gt.tsv'
t2workerreliability = DatasetWorkerReliabilityEmbedding(labelfilename,gtfilename)

print([np.min(j1workerreliability),np.min(t1workerreliability),np.min(t2workerreliability)])
print([np.mean(j1workerreliability),np.mean(t1workerreliability),np.mean(t2workerreliability)])
print([np.max(j1workerreliability),np.max(t1workerreliability),np.max(t2workerreliability)])
print([np.std(j1workerreliability),np.std(t1workerreliability),np.std(t2workerreliability)])

[0.42331745848059654, 0.582691141217947, 0.4725991182029247]
[0.6619557306348586, 0.7260422321414901, 0.7076946570262197]
[0.8794819871584575, 0.9043441876769066, 0.895806810259819]
[0.06872838692431518, 0.06870664661190475, 0.08609710281500424]


### Embedding: Question Reliability

In [None]:
# Embedding-based per-question answer quality on the crowd label files of the
# three datasets (J1, T1, T2). The four prints list, per dataset, the min,
# mean, max and standard deviation over questions.
# J1
labelfilename = folder + 'CrowdWSA2019_J1_label_anonymous.tsv'
gtfilename = folder + 'CrowdWSA2019_J1_gt.tsv'
j1questionreliability = DatasetQuestionReliabilityEmbedding(labelfilename,gtfilename)

# T1
labelfilename = folder + 'CrowdWSA2019_T1_label_anonymous.tsv'
gtfilename = folder + 'CrowdWSA2019_T1_gt.tsv'
t1questionreliability = DatasetQuestionReliabilityEmbedding(labelfilename,gtfilename)

# T2
labelfilename = folder + 'CrowdWSA2019_T2_label_anonymous.tsv'
gtfilename = folder + 'CrowdWSA2019_T2_gt.tsv'
t2questionreliability = DatasetQuestionReliabilityEmbedding(labelfilename,gtfilename)

print([np.min(j1questionreliability),np.min(t1questionreliability),np.min(t2questionreliability)])
print([np.mean(j1questionreliability),np.mean(t1questionreliability),np.mean(t2questionreliability)])
print([np.max(j1questionreliability),np.max(t1questionreliability),np.max(t2questionreliability)])
print([np.std(j1questionreliability),np.std(t1questionreliability),np.std(t2questionreliability)])

[0.3222570225596428, 0.12638154476881028, 0.12064819484949112]
[0.673164767403735, 0.7244636745974421, 0.7104700366854666]
[0.9459681451320648, 0.9583230137825012, 0.9546079635620117]
[0.11685931751027732, 0.1314015847056896, 0.13862285350346543]


In [None]:
# Embedding-based agreement between the answers of each question on the crowd
# label files (J1, T1, T2); prints the mean over questions per dataset.
# NOTE(review): this cell assigns the same j1/t1/t2questionreliability names as
# the question-reliability cell, so their content depends on which cell ran last.
# J1
labelfilename = folder + 'CrowdWSA2019_J1_label_anonymous.tsv'
gtfilename = folder + 'CrowdWSA2019_J1_gt.tsv'
j1questionreliability = DatasetQuestionReliabilityEmbeddingInter(labelfilename,gtfilename)

# T1
labelfilename = folder + 'CrowdWSA2019_T1_label_anonymous.tsv'
gtfilename = folder + 'CrowdWSA2019_T1_gt.tsv'
t1questionreliability = DatasetQuestionReliabilityEmbeddingInter(labelfilename,gtfilename)

# T2
labelfilename = folder + 'CrowdWSA2019_T2_label_anonymous.tsv'
gtfilename = folder + 'CrowdWSA2019_T2_gt.tsv'
t2questionreliability = DatasetQuestionReliabilityEmbeddingInter(labelfilename,gtfilename)

print('Inter-Annotator Agreement')
print([np.mean(j1questionreliability),np.mean(t1questionreliability),np.mean(t2questionreliability)])

Inter-Annotator Agreement
[0.6582932666982628, 0.7251615675522224, 0.7060259345790577]


### Worker GLEU

In [None]:
# GLEU-based worker reliability on the crowd label files of the three datasets
# (J1, T1, T2). Each dataset is loaded, converted to the indexed format and
# scored; the label lists are turned into numpy arrays because
# WorkerReliabilityGLEU indexes them with lists of positions.
# The four prints list, per dataset, the min, mean, max and standard deviation.
# NOTE(review): the same load/convert/score sequence is repeated per dataset — a helper taking the dataset name would remove the duplication.
# J1
labelfilename = folder + 'CrowdWSA2019_J1_label_anonymous.tsv'
gtfilename = folder + 'CrowdWSA2019_J1_gt.tsv'

(workers,sentences,swlabels,truelabels) = loaddata(labelfilename,gtfilename)
(labellist,slabelidlists,sworkeridlists,wlabelidlists,wsentenceidlists) = labelformatconversion(workers,sentences,swlabels)
(truelabellist) = truelabelformatonversion(sentences,truelabels)

reliability = WorkerReliabilityGLEU(workers, sentences, wlabelidlists, wsentenceidlists, np.asarray(labellist), np.asarray(truelabellist))
j1workergleu = reliability

# T1
labelfilename = folder + 'CrowdWSA2019_T1_label_anonymous.tsv'
gtfilename = folder + 'CrowdWSA2019_T1_gt.tsv'

(workers,sentences,swlabels,truelabels) = loaddata(labelfilename,gtfilename)
(labellist,slabelidlists,sworkeridlists,wlabelidlists,wsentenceidlists) = labelformatconversion(workers,sentences,swlabels)
(truelabellist) = truelabelformatonversion(sentences,truelabels)

reliability = WorkerReliabilityGLEU(workers, sentences, wlabelidlists, wsentenceidlists, np.asarray(labellist), np.asarray(truelabellist))
t1workergleu = reliability

# T2
labelfilename = folder + 'CrowdWSA2019_T2_label_anonymous.tsv'
gtfilename = folder + 'CrowdWSA2019_T2_gt.tsv'

(workers,sentences,swlabels,truelabels) = loaddata(labelfilename,gtfilename)
(labellist,slabelidlists,sworkeridlists,wlabelidlists,wsentenceidlists) = labelformatconversion(workers,sentences,swlabels)
(truelabellist) = truelabelformatonversion(sentences,truelabels)

reliability = WorkerReliabilityGLEU(workers, sentences, wlabelidlists, wsentenceidlists, np.asarray(labellist), np.asarray(truelabellist))
t2workergleu = reliability

print([np.min(j1workergleu),np.min(t1workergleu),np.min(t2workergleu)])
print([np.mean(j1workergleu),np.mean(t1workergleu),np.mean(t2workergleu)])
print([np.max(j1workergleu),np.max(t1workergleu),np.max(t2workergleu)])
print([np.std(j1workergleu),np.std(t1workergleu),np.std(t2workergleu)])

[0.07237291780635434, 0.06686639236772593, 0.050281626104241105]
[0.1868012686336684, 0.17641370367716142, 0.17155530496426646]
[0.5948484393066437, 0.5533564345646725, 0.4540110583805842]
[0.09150140354861402, 0.08176752140395627, 0.08381023328768089]


### Question GLEU

In [None]:
# GLEU-based per-question answer quality on the crowd label files of the three
# datasets (J1, T1, T2). Each dataset is loaded, converted to the indexed
# format and scored with QuestionReliabilityGLEU.
# The four prints list, per dataset, the min, mean, max and standard deviation.
# NOTE(review): the same load/convert/score sequence is repeated per dataset — a helper taking the dataset name would remove the duplication.
# J1
labelfilename = folder + 'CrowdWSA2019_J1_label_anonymous.tsv'
gtfilename = folder + 'CrowdWSA2019_J1_gt.tsv'

(workers,sentences,swlabels,truelabels) = loaddata(labelfilename,gtfilename)
(labellist,slabelidlists,sworkeridlists,wlabelidlists,wsentenceidlists) = labelformatconversion(workers,sentences,swlabels)
(truelabellist) = truelabelformatonversion(sentences,truelabels)

quality = QuestionReliabilityGLEU(sentences, slabelidlists, np.asarray(labellist), np.asarray(truelabellist))
j1questiongleu = quality

# T1
labelfilename = folder + 'CrowdWSA2019_T1_label_anonymous.tsv'
gtfilename = folder + 'CrowdWSA2019_T1_gt.tsv'

(workers,sentences,swlabels,truelabels) = loaddata(labelfilename,gtfilename)
(labellist,slabelidlists,sworkeridlists,wlabelidlists,wsentenceidlists) = labelformatconversion(workers,sentences,swlabels)
(truelabellist) = truelabelformatonversion(sentences,truelabels)

quality = QuestionReliabilityGLEU(sentences, slabelidlists, np.asarray(labellist), np.asarray(truelabellist))
t1questiongleu = quality

# T2
labelfilename = folder + 'CrowdWSA2019_T2_label_anonymous.tsv'
gtfilename = folder + 'CrowdWSA2019_T2_gt.tsv'

(workers,sentences,swlabels,truelabels) = loaddata(labelfilename,gtfilename)
(labellist,slabelidlists,sworkeridlists,wlabelidlists,wsentenceidlists) = labelformatconversion(workers,sentences,swlabels)
(truelabellist) = truelabelformatonversion(sentences,truelabels)

quality = QuestionReliabilityGLEU(sentences, slabelidlists, np.asarray(labellist), np.asarray(truelabellist))
t2questiongleu = quality

print([np.min(j1questiongleu),np.min(t1questiongleu),np.min(t2questiongleu)])
print([np.mean(j1questiongleu),np.mean(t1questiongleu),np.mean(t2questiongleu)])
print([np.max(j1questiongleu),np.max(t1questiongleu),np.max(t2questiongleu)])
print([np.std(j1questiongleu),np.std(t1questiongleu),np.std(t2questiongleu)])

[0.006535947712418301, 0.006521739130434782, 0.009523809523809523]
[0.1929833036876219, 0.17403443483987385, 0.16158265928021784]
[0.6590909090909091, 0.4725563909774436, 0.4384761904761906]
[0.1008622248193353, 0.09880973187901006, 0.09119787478836808]


In [None]:
# GLEU-based agreement between the answers of each question on the crowd label
# files (J1, T1, T2); prints the mean over questions per dataset.
# NOTE(review): this cell assigns the same j1/t1/t2questiongleu names as the
# question-GLEU cell, so their content depends on which cell ran last.
# J1
labelfilename = folder + 'CrowdWSA2019_J1_label_anonymous.tsv'
gtfilename = folder + 'CrowdWSA2019_J1_gt.tsv'

(workers,sentences,swlabels,truelabels) = loaddata(labelfilename,gtfilename)
(labellist,slabelidlists,sworkeridlists,wlabelidlists,wsentenceidlists) = labelformatconversion(workers,sentences,swlabels)
(truelabellist) = truelabelformatonversion(sentences,truelabels)

quality = QuestionReliabilityGLEUInter(sentences, slabelidlists, np.asarray(labellist), np.asarray(truelabellist))
j1questiongleu = quality

# T1
labelfilename = folder + 'CrowdWSA2019_T1_label_anonymous.tsv'
gtfilename = folder + 'CrowdWSA2019_T1_gt.tsv'

(workers,sentences,swlabels,truelabels) = loaddata(labelfilename,gtfilename)
(labellist,slabelidlists,sworkeridlists,wlabelidlists,wsentenceidlists) = labelformatconversion(workers,sentences,swlabels)
(truelabellist) = truelabelformatonversion(sentences,truelabels)

quality = QuestionReliabilityGLEUInter(sentences, slabelidlists, np.asarray(labellist), np.asarray(truelabellist))
t1questiongleu = quality

# T2
labelfilename = folder + 'CrowdWSA2019_T2_label_anonymous.tsv'
gtfilename = folder + 'CrowdWSA2019_T2_gt.tsv'

(workers,sentences,swlabels,truelabels) = loaddata(labelfilename,gtfilename)
(labellist,slabelidlists,sworkeridlists,wlabelidlists,wsentenceidlists) = labelformatconversion(workers,sentences,swlabels)
(truelabellist) = truelabelformatonversion(sentences,truelabels)

quality = QuestionReliabilityGLEUInter(sentences, slabelidlists, np.asarray(labellist), np.asarray(truelabellist))
t2questiongleu = quality

print('Inter-Annotator Agreement')
print([np.mean(j1questiongleu),np.mean(t1questiongleu),np.mean(t2questiongleu)])

Inter-Annotator Agreement
[0.17984577986538539, 0.25332925704032905, 0.2376583436883194]


### Worker METEOR

In [None]:
# METEOR-based worker reliability on the crowd label files of the three
# datasets (J1, T1, T2). Each dataset is loaded, converted to the indexed
# format and scored; the label lists are turned into numpy arrays because
# WorkerReliabilityMETEOR indexes them with lists of positions.
# The four prints list, per dataset, the min, mean, max and standard deviation.
# NOTE(review): the same load/convert/score sequence is repeated per dataset — a helper taking the dataset name would remove the duplication.
# J1
labelfilename = folder + 'CrowdWSA2019_J1_label_anonymous.tsv'
gtfilename = folder + 'CrowdWSA2019_J1_gt.tsv'

(workers,sentences,swlabels,truelabels) = loaddata(labelfilename,gtfilename)
(labellist,slabelidlists,sworkeridlists,wlabelidlists,wsentenceidlists) = labelformatconversion(workers,sentences,swlabels)
(truelabellist) = truelabelformatonversion(sentences,truelabels)

reliability = WorkerReliabilityMETEOR(workers, sentences, wlabelidlists, wsentenceidlists, np.asarray(labellist), np.asarray(truelabellist))
j1workermeteor = reliability

# T1
labelfilename = folder + 'CrowdWSA2019_T1_label_anonymous.tsv'
gtfilename = folder + 'CrowdWSA2019_T1_gt.tsv'

(workers,sentences,swlabels,truelabels) = loaddata(labelfilename,gtfilename)
(labellist,slabelidlists,sworkeridlists,wlabelidlists,wsentenceidlists) = labelformatconversion(workers,sentences,swlabels)
(truelabellist) = truelabelformatonversion(sentences,truelabels)

reliability = WorkerReliabilityMETEOR(workers, sentences, wlabelidlists, wsentenceidlists, np.asarray(labellist), np.asarray(truelabellist))
t1workermeteor = reliability

# T2
labelfilename = folder + 'CrowdWSA2019_T2_label_anonymous.tsv'
gtfilename = folder + 'CrowdWSA2019_T2_gt.tsv'

(workers,sentences,swlabels,truelabels) = loaddata(labelfilename,gtfilename)
(labellist,slabelidlists,sworkeridlists,wlabelidlists,wsentenceidlists) = labelformatconversion(workers,sentences,swlabels)
(truelabellist) = truelabelformatonversion(sentences,truelabels)

reliability = WorkerReliabilityMETEOR(workers, sentences, wlabelidlists, wsentenceidlists, np.asarray(labellist), np.asarray(truelabellist))
t2workermeteor = reliability

print([np.min(j1workermeteor),np.min(t1workermeteor),np.min(t2workermeteor)])
print([np.mean(j1workermeteor),np.mean(t1workermeteor),np.mean(t2workermeteor)])
print([np.max(j1workermeteor),np.max(t1workermeteor),np.max(t2workermeteor)])
print([np.std(j1workermeteor),np.std(t1workermeteor),np.std(t2workermeteor)])

[0.2040843932501189, 0.17805898892215194, 0.15225078689455235]
[0.37615897743650556, 0.3770760280922258, 0.3630198403684655]
[0.7463835833007678, 0.7162633888123681, 0.647342194834532]
[0.10373272152854734, 0.09868627015984659, 0.10101248149742166]


### Question METEOR

In [None]:
# METEOR-based per-question answer quality on the crowd label files of the
# three datasets (J1, T1, T2). Each dataset is loaded, converted to the indexed
# format and scored with QuestionReliabilityMETEOR.
# The four prints list, per dataset, the min, mean, max and standard deviation.
# NOTE(review): the same load/convert/score sequence is repeated per dataset — a helper taking the dataset name would remove the duplication.
# J1
labelfilename = folder + 'CrowdWSA2019_J1_label_anonymous.tsv'
gtfilename = folder + 'CrowdWSA2019_J1_gt.tsv'

(workers,sentences,swlabels,truelabels) = loaddata(labelfilename,gtfilename)
(labellist,slabelidlists,sworkeridlists,wlabelidlists,wsentenceidlists) = labelformatconversion(workers,sentences,swlabels)
(truelabellist) = truelabelformatonversion(sentences,truelabels)

quality = QuestionReliabilityMETEOR(sentences, slabelidlists, np.asarray(labellist), np.asarray(truelabellist))
j1questionmeteor = quality

# T1
labelfilename = folder + 'CrowdWSA2019_T1_label_anonymous.tsv'
gtfilename = folder + 'CrowdWSA2019_T1_gt.tsv'

(workers,sentences,swlabels,truelabels) = loaddata(labelfilename,gtfilename)
(labellist,slabelidlists,sworkeridlists,wlabelidlists,wsentenceidlists) = labelformatconversion(workers,sentences,swlabels)
(truelabellist) = truelabelformatonversion(sentences,truelabels)

quality = QuestionReliabilityMETEOR(sentences, slabelidlists, np.asarray(labellist), np.asarray(truelabellist))
t1questionmeteor = quality

# T2
labelfilename = folder + 'CrowdWSA2019_T2_label_anonymous.tsv'
gtfilename = folder + 'CrowdWSA2019_T2_gt.tsv'

(workers,sentences,swlabels,truelabels) = loaddata(labelfilename,gtfilename)
(labellist,slabelidlists,sworkeridlists,wlabelidlists,wsentenceidlists) = labelformatconversion(workers,sentences,swlabels)
(truelabellist) = truelabelformatonversion(sentences,truelabels)

quality = QuestionReliabilityMETEOR(sentences, slabelidlists, np.asarray(labellist), np.asarray(truelabellist))
t2questionmeteor = quality

print([np.min(j1questionmeteor),np.min(t1questionmeteor),np.min(t2questionmeteor)])
print([np.mean(j1questionmeteor),np.mean(t1questionmeteor),np.mean(t2questionmeteor)])
print([np.max(j1questionmeteor),np.max(t1questionmeteor),np.max(t2questionmeteor)])
print([np.std(j1questionmeteor),np.std(t1questionmeteor),np.std(t2questionmeteor)])

[0.057400309787111414, 0.015841769778777652, 0.030561754273594994]
[0.3860552547481079, 0.3785616206179462, 0.36038552223904163]
[0.8443340586752301, 0.7218620418700931, 0.6620051395112414]
[0.14319194574220404, 0.1454309690357034, 0.13372475839451317]


In [None]:
# METEOR-based agreement between the answers of each question on the crowd
# label files (J1, T1, T2); prints the mean over questions per dataset.
# NOTE(review): this cell assigns the same j1/t1/t2questionmeteor names as the
# question-METEOR cell, so their content depends on which cell ran last.
# J1
labelfilename = folder + 'CrowdWSA2019_J1_label_anonymous.tsv'
gtfilename = folder + 'CrowdWSA2019_J1_gt.tsv'

(workers,sentences,swlabels,truelabels) = loaddata(labelfilename,gtfilename)
(labellist,slabelidlists,sworkeridlists,wlabelidlists,wsentenceidlists) = labelformatconversion(workers,sentences,swlabels)
(truelabellist) = truelabelformatonversion(sentences,truelabels)

quality = QuestionReliabilityMETEORInter(sentences, slabelidlists, np.asarray(labellist), np.asarray(truelabellist))
j1questionmeteor = quality

# T1
labelfilename = folder + 'CrowdWSA2019_T1_label_anonymous.tsv'
gtfilename = folder + 'CrowdWSA2019_T1_gt.tsv'

(workers,sentences,swlabels,truelabels) = loaddata(labelfilename,gtfilename)
(labellist,slabelidlists,sworkeridlists,wlabelidlists,wsentenceidlists) = labelformatconversion(workers,sentences,swlabels)
(truelabellist) = truelabelformatonversion(sentences,truelabels)

quality = QuestionReliabilityMETEORInter(sentences, slabelidlists, np.asarray(labellist), np.asarray(truelabellist))
t1questionmeteor = quality

# T2
labelfilename = folder + 'CrowdWSA2019_T2_label_anonymous.tsv'
gtfilename = folder + 'CrowdWSA2019_T2_gt.tsv'

(workers,sentences,swlabels,truelabels) = loaddata(labelfilename,gtfilename)
(labellist,slabelidlists,sworkeridlists,wlabelidlists,wsentenceidlists) = labelformatconversion(workers,sentences,swlabels)
(truelabellist) = truelabelformatonversion(sentences,truelabels)

quality = QuestionReliabilityMETEORInter(sentences, slabelidlists, np.asarray(labellist), np.asarray(truelabellist))
t2questionmeteor = quality

print('Inter-Annotator Agreement')
print([np.mean(j1questionmeteor),np.mean(t1questionmeteor),np.mean(t2questionmeteor)])

Inter-Annotator Agreement
[0.3791010870235044, 0.4762804678136589, 0.44018203073354956]


## HumanAgg

### Embedding: Worker Reliability

In [None]:
# Embedding-based worker reliability on the '_humanagg' label files of the
# three datasets (J1, T1, T2), scored against the same ground-truth files.
# The four prints list, per dataset, the min, mean, max and standard deviation.
# NOTE(review): this cell assigns the same j1/t1/t2workerreliability names as
# the crowd-only cell, so their content depends on which cell ran last.
# J1
labelfilename = folder + 'CrowdWSA2019_J1_label_anonymous_humanagg.tsv'
gtfilename = folder + 'CrowdWSA2019_J1_gt.tsv'
j1workerreliability = DatasetWorkerReliabilityEmbedding(labelfilename,gtfilename)

# T1
labelfilename = folder + 'CrowdWSA2019_T1_label_anonymous_humanagg.tsv'
gtfilename = folder + 'CrowdWSA2019_T1_gt.tsv'
t1workerreliability = DatasetWorkerReliabilityEmbedding(labelfilename,gtfilename)

# T2
labelfilename = folder + 'CrowdWSA2019_T2_label_anonymous_humanagg.tsv'
gtfilename = folder + 'CrowdWSA2019_T2_gt.tsv'
t2workerreliability = DatasetWorkerReliabilityEmbedding(labelfilename,gtfilename)

print([np.min(j1workerreliability),np.min(t1workerreliability),np.min(t2workerreliability)])
print([np.mean(j1workerreliability),np.mean(t1workerreliability),np.mean(t2workerreliability)])
print([np.max(j1workerreliability),np.max(t1workerreliability),np.max(t2workerreliability)])
print([np.std(j1workerreliability),np.std(t1workerreliability),np.std(t2workerreliability)])

[0.22428150475025177, 0.12815798819065094, 0.26995864510536194]
[0.6856763112775225, 0.7336201400943694, 0.7214679287495791]
[1.0, 1.0000001192092896, 1.0]
[0.11351834414516504, 0.15114783099464582, 0.12502008759691086]


### Embedding: Question Reliability

In [None]:
# Embedding-based per-question answer quality on the '_humanagg' label files of
# the three datasets (J1, T1, T2), scored against the same ground-truth files.
# The four prints list, per dataset, the min, mean, max and standard deviation.
# NOTE(review): this cell assigns the same j1/t1/t2questionreliability names as
# the crowd-only cells, so their content depends on which cell ran last.
# J1
labelfilename = folder + 'CrowdWSA2019_J1_label_anonymous_humanagg.tsv'
gtfilename = folder + 'CrowdWSA2019_J1_gt.tsv'
j1questionreliability = DatasetQuestionReliabilityEmbedding(labelfilename,gtfilename)

# T1
labelfilename = folder + 'CrowdWSA2019_T1_label_anonymous_humanagg.tsv'
gtfilename = folder + 'CrowdWSA2019_T1_gt.tsv'
t1questionreliability = DatasetQuestionReliabilityEmbedding(labelfilename,gtfilename)

# T2
labelfilename = folder + 'CrowdWSA2019_T2_label_anonymous_humanagg.tsv'
gtfilename = folder + 'CrowdWSA2019_T2_gt.tsv'
t2questionreliability = DatasetQuestionReliabilityEmbedding(labelfilename,gtfilename)

print([np.min(j1questionreliability),np.min(t1questionreliability),np.min(t2questionreliability)])
print([np.mean(j1questionreliability),np.mean(t1questionreliability),np.mean(t2questionreliability)])
print([np.max(j1questionreliability),np.max(t1questionreliability),np.max(t2questionreliability)])
print([np.std(j1questionreliability),np.std(t1questionreliability),np.std(t2questionreliability)])

[0.2316086158156395, 0.137788325548172, 0.032199906557798384]
[0.6982842117607594, 0.7429604615867136, 0.7281856786385178]
[0.9761502742767334, 0.9581658720970154, 0.9349515914916993]
[0.13401528846826255, 0.1371743215127005, 0.15612887389223598]


In [21]:
# Scores from DatasetQuestionReliabilityEmbeddingInter on the human-aggregated
# label files (J1, T1, T2), reported under the 'Inter-Annotator Agreement' label.
# NOTE(review): this rebinds j1/t1/t2questionreliability, the same names the
# DatasetQuestionReliabilityEmbedding cell assigns - confirm nothing later
# expects the earlier values.
_inter_by_dataset = {}
for _tag, _labelname, _gtname in (
  ('J1', 'CrowdWSA2019_J1_label_anonymous_humanagg.tsv', 'CrowdWSA2019_J1_gt.tsv'),
  ('T1', 'CrowdWSA2019_T1_label_anonymous_humanagg.tsv', 'CrowdWSA2019_T1_gt.tsv'),
  ('T2', 'CrowdWSA2019_T2_label_anonymous_humanagg.tsv', 'CrowdWSA2019_T2_gt.tsv'),
):
  labelfilename = folder + _labelname
  gtfilename = folder + _gtname
  _inter_by_dataset[_tag] = DatasetQuestionReliabilityEmbeddingInter(labelfilename, gtfilename)

j1questionreliability = _inter_by_dataset['J1']
t1questionreliability = _inter_by_dataset['T1']
t2questionreliability = _inter_by_dataset['T2']

print('Inter-Annotator Agreement')
# Mean score per dataset, in J1, T1, T2 order.
print([np.mean(_s) for _s in (j1questionreliability, t1questionreliability, t2questionreliability)])

Inter-Annotator Agreement
[0.728640569114685, 0.7850180976390839, 0.7751828557699918]


### Worker GLEU

In [None]:
# WorkerReliabilityGLEU on the human-aggregated label files for J1, T1 and T2.
# Each pass reloads the data and rebuilds the index structures, so the
# notebook-level names (workers, sentences, labellist, ...) end up holding the
# T2 values once the loop finishes.
_worker_gleu_runs = []
for _labelname, _gtname in (
  ('CrowdWSA2019_J1_label_anonymous_humanagg.tsv', 'CrowdWSA2019_J1_gt.tsv'),
  ('CrowdWSA2019_T1_label_anonymous_humanagg.tsv', 'CrowdWSA2019_T1_gt.tsv'),
  ('CrowdWSA2019_T2_label_anonymous_humanagg.tsv', 'CrowdWSA2019_T2_gt.tsv'),
):
  labelfilename = folder + _labelname
  gtfilename = folder + _gtname

  workers, sentences, swlabels, truelabels = loaddata(labelfilename, gtfilename)
  labellist, slabelidlists, sworkeridlists, wlabelidlists, wsentenceidlists = labelformatconversion(workers, sentences, swlabels)
  truelabellist = truelabelformatonversion(sentences, truelabels)

  reliability = WorkerReliabilityGLEU(
    workers, sentences, wlabelidlists, wsentenceidlists,
    np.asarray(labellist), np.asarray(truelabellist),
  )
  _worker_gleu_runs.append(reliability)

j1workergleu, t1workergleu, t2workergleu = _worker_gleu_runs

# Rows: min, mean, max, std. Columns: J1, T1, T2.
for _stat in (np.min, np.mean, np.max, np.std):
  print([_stat(j1workergleu), _stat(t1workergleu), _stat(t2workergleu)])

[0.0, 0.021739130434782608, 0.0]
[0.21237081610419373, 0.18738060144552507, 0.18166888764533748]
[1.0, 0.7894736842105263, 0.6842105263157895]
[0.13989523557680925, 0.1297426918952499, 0.11901746893535857]


### Question GLEU

In [None]:
# QuestionReliabilityGLEU on the human-aggregated label files, one run per
# dataset (J1, T1, T2). The loading helpers are re-run for every dataset.
_question_gleu = {}
for _tag, _labelname, _gtname in (
  ('J1', 'CrowdWSA2019_J1_label_anonymous_humanagg.tsv', 'CrowdWSA2019_J1_gt.tsv'),
  ('T1', 'CrowdWSA2019_T1_label_anonymous_humanagg.tsv', 'CrowdWSA2019_T1_gt.tsv'),
  ('T2', 'CrowdWSA2019_T2_label_anonymous_humanagg.tsv', 'CrowdWSA2019_T2_gt.tsv'),
):
  labelfilename = folder + _labelname
  gtfilename = folder + _gtname

  workers, sentences, swlabels, truelabels = loaddata(labelfilename, gtfilename)
  labellist, slabelidlists, sworkeridlists, wlabelidlists, wsentenceidlists = labelformatconversion(workers, sentences, swlabels)
  truelabellist = truelabelformatonversion(sentences, truelabels)

  quality = QuestionReliabilityGLEU(sentences, slabelidlists, np.asarray(labellist), np.asarray(truelabellist))
  _question_gleu[_tag] = quality

j1questiongleu = _question_gleu['J1']
t1questiongleu = _question_gleu['T1']
t2questiongleu = _question_gleu['T2']

# Summary rows (min, mean, max, std), each listing J1, T1, T2.
for _stat in (np.min, np.mean, np.max, np.std):
  print([_stat(j1questiongleu), _stat(t1questiongleu), _stat(t2questiongleu)])

[0.0047619047619047615, 0.017391304347826087, 0.0]
[0.2260193818423277, 0.18188431807326452, 0.17690832435077364]
[0.8566433566433567, 0.5789473684210527, 0.5421052631578946]
[0.14572121234084773, 0.11830057361303958, 0.12395305400266479]


In [None]:
# QuestionReliabilityGLEUInter on the human-aggregated label files (J1, T1, T2),
# reported under the 'Inter-Annotator Agreement' label.
# NOTE(review): the results are stored in j1/t1/t2questiongleu, the same names
# the QuestionReliabilityGLEU cell assigns - confirm the overwrite is intended.
_gleu_inter_runs = []
for _labelname, _gtname in (
  ('CrowdWSA2019_J1_label_anonymous_humanagg.tsv', 'CrowdWSA2019_J1_gt.tsv'),
  ('CrowdWSA2019_T1_label_anonymous_humanagg.tsv', 'CrowdWSA2019_T1_gt.tsv'),
  ('CrowdWSA2019_T2_label_anonymous_humanagg.tsv', 'CrowdWSA2019_T2_gt.tsv'),
):
  labelfilename = folder + _labelname
  gtfilename = folder + _gtname

  workers, sentences, swlabels, truelabels = loaddata(labelfilename, gtfilename)
  labellist, slabelidlists, sworkeridlists, wlabelidlists, wsentenceidlists = labelformatconversion(workers, sentences, swlabels)
  truelabellist = truelabelformatonversion(sentences, truelabels)

  quality = QuestionReliabilityGLEUInter(sentences, slabelidlists, np.asarray(labellist), np.asarray(truelabellist))
  _gleu_inter_runs.append(quality)

j1questiongleu, t1questiongleu, t2questiongleu = _gleu_inter_runs

print('Inter-Annotator Agreement')
# Mean score per dataset, in J1, T1, T2 order.
print([np.mean(_s) for _s in (j1questiongleu, t1questiongleu, t2questiongleu)])

Inter-Annotator Agreement
[0.27457611924125636, 0.3473106664655869, 0.33712483252936076]


### Worker METEOR

In [None]:
# WorkerReliabilityMETEOR on the human-aggregated label files for J1, T1, T2.
_worker_meteor = {}
for _tag, _labelname, _gtname in (
  ('J1', 'CrowdWSA2019_J1_label_anonymous_humanagg.tsv', 'CrowdWSA2019_J1_gt.tsv'),
  ('T1', 'CrowdWSA2019_T1_label_anonymous_humanagg.tsv', 'CrowdWSA2019_T1_gt.tsv'),
  ('T2', 'CrowdWSA2019_T2_label_anonymous_humanagg.tsv', 'CrowdWSA2019_T2_gt.tsv'),
):
  labelfilename = folder + _labelname
  gtfilename = folder + _gtname

  # Reload and re-index the annotations for this dataset.
  workers, sentences, swlabels, truelabels = loaddata(labelfilename, gtfilename)
  labellist, slabelidlists, sworkeridlists, wlabelidlists, wsentenceidlists = labelformatconversion(workers, sentences, swlabels)
  truelabellist = truelabelformatonversion(sentences, truelabels)

  reliability = WorkerReliabilityMETEOR(
    workers, sentences, wlabelidlists, wsentenceidlists,
    np.asarray(labellist), np.asarray(truelabellist),
  )
  _worker_meteor[_tag] = reliability

j1workermeteor = _worker_meteor['J1']
t1workermeteor = _worker_meteor['T1']
t2workermeteor = _worker_meteor['T2']

# Rows: min, mean, max, std. Columns: J1, T1, T2.
for _stat in (np.min, np.mean, np.max, np.std):
  print([_stat(j1workermeteor), _stat(t1workermeteor), _stat(t2workermeteor)])

[0.04901960784313726, 0.03937007874015748, 0.04761904761904762]
[0.4156094318692456, 0.39085744805149214, 0.383969723883758]
[0.9985422740524781, 0.9169724770642201, 0.8287608596250572]
[0.1568779534740165, 0.16659213948214727, 0.1619260199285394]


### Question METEOR

In [None]:
# QuestionReliabilityMETEOR on the human-aggregated label files for J1, T1, T2.
_question_meteor_runs = []
for _labelname, _gtname in (
  ('CrowdWSA2019_J1_label_anonymous_humanagg.tsv', 'CrowdWSA2019_J1_gt.tsv'),
  ('CrowdWSA2019_T1_label_anonymous_humanagg.tsv', 'CrowdWSA2019_T1_gt.tsv'),
  ('CrowdWSA2019_T2_label_anonymous_humanagg.tsv', 'CrowdWSA2019_T2_gt.tsv'),
):
  labelfilename = folder + _labelname
  gtfilename = folder + _gtname

  workers, sentences, swlabels, truelabels = loaddata(labelfilename, gtfilename)
  labellist, slabelidlists, sworkeridlists, wlabelidlists, wsentenceidlists = labelformatconversion(workers, sentences, swlabels)
  truelabellist = truelabelformatonversion(sentences, truelabels)

  quality = QuestionReliabilityMETEOR(sentences, slabelidlists, np.asarray(labellist), np.asarray(truelabellist))
  _question_meteor_runs.append(quality)

j1questionmeteor, t1questionmeteor, t2questionmeteor = _question_meteor_runs

# Summary rows (min, mean, max, std), each listing J1, T1, T2.
for _stat in (np.min, np.mean, np.max, np.std):
  print([_stat(j1questionmeteor), _stat(t1questionmeteor), _stat(t2questionmeteor)])

[0.04392998306041784, 0.03156632344033919, 0.03555764411027569]
[0.4397162519298426, 0.39265189819162094, 0.37463693494354033]
[0.9530227686703098, 0.8076119195645738, 0.7345149610271935]
[0.18562328684067483, 0.16796419415518357, 0.16579222728333315]


In [None]:
# QuestionReliabilityMETEORInter on the human-aggregated label files
# (J1, T1, T2), reported under the 'Inter-Annotator Agreement' label.
# NOTE(review): the results go into j1/t1/t2questionmeteor, the same names the
# QuestionReliabilityMETEOR cell assigns - confirm the overwrite is intended.
_meteor_inter = {}
for _tag, _labelname, _gtname in (
  ('J1', 'CrowdWSA2019_J1_label_anonymous_humanagg.tsv', 'CrowdWSA2019_J1_gt.tsv'),
  ('T1', 'CrowdWSA2019_T1_label_anonymous_humanagg.tsv', 'CrowdWSA2019_T1_gt.tsv'),
  ('T2', 'CrowdWSA2019_T2_label_anonymous_humanagg.tsv', 'CrowdWSA2019_T2_gt.tsv'),
):
  labelfilename = folder + _labelname
  gtfilename = folder + _gtname

  workers, sentences, swlabels, truelabels = loaddata(labelfilename, gtfilename)
  labellist, slabelidlists, sworkeridlists, wlabelidlists, wsentenceidlists = labelformatconversion(workers, sentences, swlabels)
  truelabellist = truelabelformatonversion(sentences, truelabels)

  quality = QuestionReliabilityMETEORInter(sentences, slabelidlists, np.asarray(labellist), np.asarray(truelabellist))
  _meteor_inter[_tag] = quality

j1questionmeteor = _meteor_inter['J1']
t1questionmeteor = _meteor_inter['T1']
t2questionmeteor = _meteor_inter['T2']

print('Inter-Annotator Agreement')
# Mean score per dataset, in J1, T1, T2 order.
print([np.mean(_s) for _s in (j1questionmeteor, t1questionmeteor, t2questionmeteor)])

Inter-Annotator Agreement
[0.4909968555244004, 0.5669534077252868, 0.5492090558645203]


## Gemini

### Embedding: Worker Reliability

In [None]:
# Worker reliability via DatasetWorkerReliabilityEmbedding on the Gemini label
# files for J1, T1 and T2.
# NOTE(review): this cell reads the '5temperature' label files, whereas the
# other Gemini cells in this section read '3temperature' - confirm this
# difference is intentional before comparing the numbers side by side.
_gemini_worker_inputs = (
  ('CrowdWSA2019_J1_label_5temperature_trial0_gemini-1.5-pro-latest.tsv', 'CrowdWSA2019_J1_gt.tsv'),
  ('CrowdWSA2019_T1_label_5temperature_trial0_gemini-1.5-pro-latest.tsv', 'CrowdWSA2019_T1_gt.tsv'),
  ('CrowdWSA2019_T2_label_5temperature_trial0_gemini-1.5-pro-latest.tsv', 'CrowdWSA2019_T2_gt.tsv'),
)

_worker_scores = []
for _labelname, _gtname in _gemini_worker_inputs:
  labelfilename = folder + _labelname
  gtfilename = folder + _gtname
  _worker_scores.append(DatasetWorkerReliabilityEmbedding(labelfilename, gtfilename))

j1workerreliability, t1workerreliability, t2workerreliability = _worker_scores

# Rows: min, mean, max, std. Columns: J1, T1, T2.
for _stat in (np.min, np.mean, np.max, np.std):
  print([_stat(j1workerreliability), _stat(t1workerreliability), _stat(t2workerreliability)])

[0.7265168557167053, 0.7566493032127619, 0.7458261470496654]
[0.7310330567359925, 0.7635132035166025, 0.7540152969062328]
[0.7373357325792312, 0.7718896312266588, 0.7684078720211983]
[0.003566330116766897, 0.0058377494613524724, 0.008632023874518177]


### Embedding: Question Reliability

In [None]:
# Question reliability via DatasetQuestionReliabilityEmbedding on the Gemini
# (3temperature, trial0) label files for J1, T1 and T2.
_gemini_question_scores = {}
for _tag, _labelname, _gtname in (
  ('J1', 'CrowdWSA2019_J1_label_3temperature_trial0_gemini-1.5-pro-latest.tsv', 'CrowdWSA2019_J1_gt.tsv'),
  ('T1', 'CrowdWSA2019_T1_label_3temperature_trial0_gemini-1.5-pro-latest.tsv', 'CrowdWSA2019_T1_gt.tsv'),
  ('T2', 'CrowdWSA2019_T2_label_3temperature_trial0_gemini-1.5-pro-latest.tsv', 'CrowdWSA2019_T2_gt.tsv'),
):
  labelfilename = folder + _labelname
  gtfilename = folder + _gtname
  _gemini_question_scores[_tag] = DatasetQuestionReliabilityEmbedding(labelfilename, gtfilename)

j1questionreliability = _gemini_question_scores['J1']
t1questionreliability = _gemini_question_scores['T1']
t2questionreliability = _gemini_question_scores['T2']

# Summary rows (min, mean, max, std), each listing J1, T1, T2.
for _stat in (np.min, np.mean, np.max, np.std):
  print([_stat(j1questionreliability), _stat(t1questionreliability), _stat(t2questionreliability)])

[0.25261545181274414, 0.06303130835294724, 0.19608191649119058]
[0.7289235450029373, 0.7592670890440543, 0.7474555508792399]
[1.0000001192092896, 1.0000001192092896, 1.0000001192092896]
[0.14549432727840597, 0.16255473168138804, 0.15438875585157777]


In [22]:
# Scores from DatasetQuestionReliabilityEmbeddingInter on the Gemini
# (3temperature, trial0) label files, reported under the
# 'Inter-Annotator Agreement' label.
# NOTE(review): this rebinds j1/t1/t2questionreliability, which other cells in
# this notebook also assign - confirm nothing later expects the earlier values.
_gemini_inter_scores = []
for _labelname, _gtname in (
  ('CrowdWSA2019_J1_label_3temperature_trial0_gemini-1.5-pro-latest.tsv', 'CrowdWSA2019_J1_gt.tsv'),
  ('CrowdWSA2019_T1_label_3temperature_trial0_gemini-1.5-pro-latest.tsv', 'CrowdWSA2019_T1_gt.tsv'),
  ('CrowdWSA2019_T2_label_3temperature_trial0_gemini-1.5-pro-latest.tsv', 'CrowdWSA2019_T2_gt.tsv'),
):
  labelfilename = folder + _labelname
  gtfilename = folder + _gtname
  _gemini_inter_scores.append(DatasetQuestionReliabilityEmbeddingInter(labelfilename, gtfilename))

j1questionreliability, t1questionreliability, t2questionreliability = _gemini_inter_scores

print('Inter-Annotator Agreement')
# Mean score per dataset, in J1, T1, T2 order.
print([np.mean(_s) for _s in (j1questionreliability, t1questionreliability, t2questionreliability)])

Inter-Annotator Agreement
[0.937320643723011, 0.9501944700876871, 0.9360006310542424]


### Worker GLEU

In [None]:
# WorkerReliabilityGLEU on the Gemini (3temperature, trial0) label files for
# J1, T1 and T2. The data is reloaded and re-indexed on every pass.
_gemini_worker_gleu = {}
for _tag, _labelname, _gtname in (
  ('J1', 'CrowdWSA2019_J1_label_3temperature_trial0_gemini-1.5-pro-latest.tsv', 'CrowdWSA2019_J1_gt.tsv'),
  ('T1', 'CrowdWSA2019_T1_label_3temperature_trial0_gemini-1.5-pro-latest.tsv', 'CrowdWSA2019_T1_gt.tsv'),
  ('T2', 'CrowdWSA2019_T2_label_3temperature_trial0_gemini-1.5-pro-latest.tsv', 'CrowdWSA2019_T2_gt.tsv'),
):
  labelfilename = folder + _labelname
  gtfilename = folder + _gtname

  workers, sentences, swlabels, truelabels = loaddata(labelfilename, gtfilename)
  labellist, slabelidlists, sworkeridlists, wlabelidlists, wsentenceidlists = labelformatconversion(workers, sentences, swlabels)
  truelabellist = truelabelformatonversion(sentences, truelabels)

  reliability = WorkerReliabilityGLEU(
    workers, sentences, wlabelidlists, wsentenceidlists,
    np.asarray(labellist), np.asarray(truelabellist),
  )
  _gemini_worker_gleu[_tag] = reliability

j1workergleu = _gemini_worker_gleu['J1']
t1workergleu = _gemini_worker_gleu['T1']
t2workergleu = _gemini_worker_gleu['T2']

# Rows: min, mean, max, std. Columns: J1, T1, T2.
for _stat in (np.min, np.mean, np.max, np.std):
  print([_stat(j1workergleu), _stat(t1workergleu), _stat(t2workergleu)])

[0.2587824530067214, 0.19676582151697755, 0.17072220447837905]
[0.26329381475652, 0.2015413430294043, 0.17511781344081512]
[0.2714006983158938, 0.20638460761265148, 0.1810178093075434]
[0.005744612634632056, 0.003927145131968486, 0.004335685618056471]


### Question GLEU

In [None]:
# QuestionReliabilityGLEU on the Gemini (3temperature, trial0) label files for
# J1, T1 and T2.
_gemini_question_gleu_runs = []
for _labelname, _gtname in (
  ('CrowdWSA2019_J1_label_3temperature_trial0_gemini-1.5-pro-latest.tsv', 'CrowdWSA2019_J1_gt.tsv'),
  ('CrowdWSA2019_T1_label_3temperature_trial0_gemini-1.5-pro-latest.tsv', 'CrowdWSA2019_T1_gt.tsv'),
  ('CrowdWSA2019_T2_label_3temperature_trial0_gemini-1.5-pro-latest.tsv', 'CrowdWSA2019_T2_gt.tsv'),
):
  labelfilename = folder + _labelname
  gtfilename = folder + _gtname

  workers, sentences, swlabels, truelabels = loaddata(labelfilename, gtfilename)
  labellist, slabelidlists, sworkeridlists, wlabelidlists, wsentenceidlists = labelformatconversion(workers, sentences, swlabels)
  truelabellist = truelabelformatonversion(sentences, truelabels)

  quality = QuestionReliabilityGLEU(sentences, slabelidlists, np.asarray(labellist), np.asarray(truelabellist))
  _gemini_question_gleu_runs.append(quality)

j1questiongleu, t1questiongleu, t2questiongleu = _gemini_question_gleu_runs

# Summary rows (min, mean, max, std), each listing J1, T1, T2.
for _stat in (np.min, np.mean, np.max, np.std):
  print([_stat(j1questiongleu), _stat(t1questiongleu), _stat(t2questiongleu)])

[0.0, 0.0, 0.0]
[0.26329381475651986, 0.20154134302940427, 0.17511781344081512]
[1.0, 0.7142857142857143, 0.7142857142857143]
[0.2055638208998254, 0.1485730584592399, 0.14489547952870194]


In [None]:
# QuestionReliabilityGLEUInter on the Gemini (3temperature, trial0) label files,
# reported under the 'Inter-Annotator Agreement' label.
# NOTE(review): the results are stored in j1/t1/t2questiongleu, names that the
# QuestionReliabilityGLEU cells also assign - confirm the overwrite is intended.
_gemini_gleu_inter = {}
for _tag, _labelname, _gtname in (
  ('J1', 'CrowdWSA2019_J1_label_3temperature_trial0_gemini-1.5-pro-latest.tsv', 'CrowdWSA2019_J1_gt.tsv'),
  ('T1', 'CrowdWSA2019_T1_label_3temperature_trial0_gemini-1.5-pro-latest.tsv', 'CrowdWSA2019_T1_gt.tsv'),
  ('T2', 'CrowdWSA2019_T2_label_3temperature_trial0_gemini-1.5-pro-latest.tsv', 'CrowdWSA2019_T2_gt.tsv'),
):
  labelfilename = folder + _labelname
  gtfilename = folder + _gtname

  workers, sentences, swlabels, truelabels = loaddata(labelfilename, gtfilename)
  labellist, slabelidlists, sworkeridlists, wlabelidlists, wsentenceidlists = labelformatconversion(workers, sentences, swlabels)
  truelabellist = truelabelformatonversion(sentences, truelabels)

  quality = QuestionReliabilityGLEUInter(sentences, slabelidlists, np.asarray(labellist), np.asarray(truelabellist))
  _gemini_gleu_inter[_tag] = quality

j1questiongleu = _gemini_gleu_inter['J1']
t1questiongleu = _gemini_gleu_inter['T1']
t2questiongleu = _gemini_gleu_inter['T2']

print('Inter-Annotator Agreement')
# Mean score per dataset, in J1, T1, T2 order.
print([np.mean(_s) for _s in (j1questiongleu, t1questiongleu, t2questiongleu)])

Inter-Annotator Agreement
[0.7749407862360516, 0.8028466366416024, 0.7853553973365477]


### Worker METEOR

In [None]:
# WorkerReliabilityMETEOR on the Gemini (3temperature, trial0) label files for
# J1, T1 and T2.
_gemini_worker_meteor_runs = []
for _labelname, _gtname in (
  ('CrowdWSA2019_J1_label_3temperature_trial0_gemini-1.5-pro-latest.tsv', 'CrowdWSA2019_J1_gt.tsv'),
  ('CrowdWSA2019_T1_label_3temperature_trial0_gemini-1.5-pro-latest.tsv', 'CrowdWSA2019_T1_gt.tsv'),
  ('CrowdWSA2019_T2_label_3temperature_trial0_gemini-1.5-pro-latest.tsv', 'CrowdWSA2019_T2_gt.tsv'),
):
  labelfilename = folder + _labelname
  gtfilename = folder + _gtname

  # Reload and re-index the annotations for this dataset.
  workers, sentences, swlabels, truelabels = loaddata(labelfilename, gtfilename)
  labellist, slabelidlists, sworkeridlists, wlabelidlists, wsentenceidlists = labelformatconversion(workers, sentences, swlabels)
  truelabellist = truelabelformatonversion(sentences, truelabels)

  reliability = WorkerReliabilityMETEOR(
    workers, sentences, wlabelidlists, wsentenceidlists,
    np.asarray(labellist), np.asarray(truelabellist),
  )
  _gemini_worker_meteor_runs.append(reliability)

j1workermeteor, t1workermeteor, t2workermeteor = _gemini_worker_meteor_runs

# Rows: min, mean, max, std. Columns: J1, T1, T2.
for _stat in (np.min, np.mean, np.max, np.std):
  print([_stat(j1workermeteor), _stat(t1workermeteor), _stat(t2workermeteor)])

[0.48688476642407547, 0.4165467270307604, 0.3754233042779632]
[0.4925553566644423, 0.42180923722406827, 0.3833222793399864]
[0.49931582980242156, 0.43096990143965963, 0.39407064961732075]
[0.005133141840172578, 0.006501471299814404, 0.00787486134782037]


### Question METEOR

In [None]:
# QuestionReliabilityMETEOR on the Gemini (3temperature, trial0) label files
# for J1, T1 and T2.
_gemini_question_meteor = {}
for _tag, _labelname, _gtname in (
  ('J1', 'CrowdWSA2019_J1_label_3temperature_trial0_gemini-1.5-pro-latest.tsv', 'CrowdWSA2019_J1_gt.tsv'),
  ('T1', 'CrowdWSA2019_T1_label_3temperature_trial0_gemini-1.5-pro-latest.tsv', 'CrowdWSA2019_T1_gt.tsv'),
  ('T2', 'CrowdWSA2019_T2_label_3temperature_trial0_gemini-1.5-pro-latest.tsv', 'CrowdWSA2019_T2_gt.tsv'),
):
  labelfilename = folder + _labelname
  gtfilename = folder + _gtname

  workers, sentences, swlabels, truelabels = loaddata(labelfilename, gtfilename)
  labellist, slabelidlists, sworkeridlists, wlabelidlists, wsentenceidlists = labelformatconversion(workers, sentences, swlabels)
  truelabellist = truelabelformatonversion(sentences, truelabels)

  quality = QuestionReliabilityMETEOR(sentences, slabelidlists, np.asarray(labellist), np.asarray(truelabellist))
  _gemini_question_meteor[_tag] = quality

j1questionmeteor = _gemini_question_meteor['J1']
t1questionmeteor = _gemini_question_meteor['T1']
t2questionmeteor = _gemini_question_meteor['T2']

# Summary rows (min, mean, max, std), each listing J1, T1, T2.
for _stat in (np.min, np.mean, np.max, np.std):
  print([_stat(j1questionmeteor), _stat(t1questionmeteor), _stat(t2questionmeteor)])

[0.0, 0.0, 0.014492753623188406]
[0.4925553566644425, 0.4218092372240683, 0.3833222793399865]
[0.9994999999999999, 0.8972972972972975, 0.8399159663865547]
[0.2262523063295898, 0.19312748818173903, 0.19455235270177124]


In [None]:
# QuestionReliabilityMETEORInter on the Gemini (3temperature, trial0) label
# files, reported under the 'Inter-Annotator Agreement' label.
# NOTE(review): the results go into j1/t1/t2questionmeteor, names that the
# QuestionReliabilityMETEOR cells also assign - confirm the overwrite is intended.
_gemini_meteor_inter_runs = []
for _labelname, _gtname in (
  ('CrowdWSA2019_J1_label_3temperature_trial0_gemini-1.5-pro-latest.tsv', 'CrowdWSA2019_J1_gt.tsv'),
  ('CrowdWSA2019_T1_label_3temperature_trial0_gemini-1.5-pro-latest.tsv', 'CrowdWSA2019_T1_gt.tsv'),
  ('CrowdWSA2019_T2_label_3temperature_trial0_gemini-1.5-pro-latest.tsv', 'CrowdWSA2019_T2_gt.tsv'),
):
  labelfilename = folder + _labelname
  gtfilename = folder + _gtname

  workers, sentences, swlabels, truelabels = loaddata(labelfilename, gtfilename)
  labellist, slabelidlists, sworkeridlists, wlabelidlists, wsentenceidlists = labelformatconversion(workers, sentences, swlabels)
  truelabellist = truelabelformatonversion(sentences, truelabels)

  quality = QuestionReliabilityMETEORInter(sentences, slabelidlists, np.asarray(labellist), np.asarray(truelabellist))
  _gemini_meteor_inter_runs.append(quality)

j1questionmeteor, t1questionmeteor, t2questionmeteor = _gemini_meteor_inter_runs

print('Inter-Annotator Agreement')
# Mean score per dataset, in J1, T1, T2 order.
print([np.mean(_s) for _s in (j1questionmeteor, t1questionmeteor, t2questionmeteor)])

Inter-Annotator Agreement
[0.8792202361105649, 0.8830406459094042, 0.8728183947814567]
