In [1]:
import pandas as pd
import glob
from collections import Counter, defaultdict
from statistics import stdev, mean, median

# Worker statistics

Compute statistics about the number of HITs per worker.

In [2]:
def count_ids(files):
    """Count how many rows each worker contributed across several spreadsheets.

    Args:
        files: iterable of paths to Excel files; every file is expected to
            contain a ``WorkerId`` column.

    Returns:
        A ``Counter`` mapping each worker ID to the number of rows in which it
        occurs, summed over all the files. An empty iterable yields an empty
        ``Counter``.
    """
    tally = Counter()
    for workbook in files:
        responses = pd.read_excel(workbook)
        # Each occurrence of a worker ID adds one to that worker's count.
        tally.update(responses['WorkerId'])
    return tally

metadata = dict()

# Sub-directory of ./Responses that holds the spreadsheets for each table row.
# "Overall" uses a wildcard so that it combines all the subtasks.
response_dirs = {
    "Overall": "*",
    "Grammaticality": "Grammaticality",
    "Repetition": "Repetition",
    "Coherence": "Coherence",
}

# Compute the same statistics for the combined data and for every subtask.
for category, subdir in response_dirs.items():
    pattern = f"./Responses/{subdir}/*.xlsx"
    files = glob.glob(pattern)
    if not files:
        # Without this check the failure would surface as an opaque
        # "min() arg is an empty sequence" further down.
        raise FileNotFoundError(f"No response files match {pattern!r}.")
    worker_ids = count_ids(files)
    # Number of HITs per worker; materialised once and reused for every statistic.
    hits_per_worker = list(worker_ids.values())
    metadata[f"{category}_num"] = len(hits_per_worker)
    metadata[f"{category}_min"] = min(hits_per_worker)
    metadata[f"{category}_max"] = max(hits_per_worker)
    metadata[f"{category}_mean"] = mean(hits_per_worker)
    # Sample standard deviation; needs at least two workers.
    metadata[f"{category}_std"] = stdev(hits_per_worker)

# Were attention checks used?
metadata["Overall_attention"] = "Mixed"
metadata["Coherence_attention"] = "Yes"
metadata["Repetition_attention"] = "Yes"
metadata["Grammaticality_attention"] = "No"

# Row order of the resulting table.
categories = ["Overall", "Coherence", "Repetition", "Grammaticality"]
header = ["Category", "Total", "Min", "Max", "Mean", "Stdev", "Attention check"]
body = [[category,
         metadata[f'{category}_num'],
         metadata[f'{category}_min'],
         metadata[f'{category}_max'],
         metadata[f'{category}_mean'],
         metadata[f'{category}_std'],
         metadata[f'{category}_attention']] for category in categories]

df = pd.DataFrame(columns=header, data=body)

with open("./Tables/worker_stats.tex", 'w') as f:
    # The formatter rounds the Mean and Stdev columns to two decimals, for presentation.
    # Hide axis 0 means the row numbers are hidden.
    # Hrules are rules from the Booktabs package.
    f.write(df.style.format(formatter="{:.2f}".format,
                            subset=["Mean", "Stdev"]).hide(axis=0).to_latex(hrules=True))

df

Unnamed: 0,Category,Total,Min,Max,Mean,Stdev,Attention check
0,Overall,216,1,67,8.333333,12.243271,Mixed
1,Coherence,119,1,36,5.042017,6.462926,Yes
2,Repetition,135,1,33,4.444444,6.365329,Yes
3,Grammaticality,80,1,30,7.5,7.789964,No


# Time taken

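Compute summary statistics (mean, median, standard deviation, minimum and maximum) of the time taken per task, in seconds, overall and for each subtask.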

In [3]:
# Gather the 'WorkTimeInSeconds' values of every response file, grouped by subtask.
seconds = defaultdict(list)
for category in ["Grammaticality", "Repetition", "Coherence"]:
    for path in glob.glob(f"./Responses/{category}/*.xlsx"):
        responses = pd.read_excel(path)
        seconds[category].extend(responses['WorkTimeInSeconds'])

# "Overall" pools the values of the three subtasks.
for key in ["Grammaticality", "Repetition", "Coherence"]:
    seconds['Overall'].extend(seconds[key])

# One table row per category; mean, median and stdev are rounded to whole seconds.
header = ["Category", "N", "Mean", "Median", "Stdev", "Min", "Max"]
data = [
    [
        label,
        len(seconds[label]),
        round(mean(seconds[label])),
        round(median(seconds[label])),
        round(stdev(seconds[label])),
        min(seconds[label]),
        max(seconds[label]),
    ]
    for label in ["Overall", "Coherence", "Repetition", "Grammaticality"]
]

df = pd.DataFrame(columns=header, data=data)

def display_time(seconds):
    """Format a duration in seconds as minutes and seconds.

    Args:
        seconds: the duration, in seconds.

    Returns:
        ``"<m>m<s>s"`` when the duration spans at least one full minute
        (e.g. ``90 -> "1m30s"``), otherwise just ``"<s>s"`` (e.g. ``45 -> "45s"``).
        Minutes are not broken down further into hours.
    """
    minutes, remainder = divmod(seconds, 60)
    if minutes == 0:
        return f"{remainder}s"
    return f"{minutes}m{remainder}s"

# Every column that holds a duration is rendered through display_time.
time_columns = ["Mean", "Stdev", "Median", "Min", "Max"]

with open("./Tables/seconds.tex", 'w') as tex_file:
    # The formatter shows the durations as minutes and seconds, for presentation.
    # Hide axis 0 means the row numbers are hidden.
    # Hrules are rules from the Booktabs package.
    styled = df.style.format({column: display_time for column in time_columns})
    tex_file.write(styled.hide(axis=0).to_latex(hrules=True))

df

Unnamed: 0,Category,N,Mean,Median,Stdev,Min,Max
0,Overall,1800,4429,2966,3919,31,14396
1,Coherence,600,4406,2806,4200,33,14396
2,Repetition,600,3156,1822,3377,63,14078
3,Grammaticality,600,5725,5949,3710,31,14279


In [4]:
# Carry out statistical analysis: do the three subtasks differ in time taken?
from scipy.stats import f_oneway, levene, kruskal

# Conventional significance level used to interpret Levene's test.
ALPHA = 0.05

# The samples to compare: the time values collected for each subtask.
groups = [seconds["Grammaticality"], seconds["Repetition"], seconds["Coherence"]]

print("We first try Levene's test:")
result_levene = levene(*groups)
print(result_levene)
print()

# Decide from the actual p-value rather than hard-coding the conclusion,
# so that the printed text stays correct when the data changes.
if result_levene.pvalue < ALPHA:
    # The variances differ between the groups, which violates an assumption of the ANOVA.
    print("The result is significant, so we cannot run the ANOVA.")
    print()

    print("Running Kruskal-Wallis test instead:")
    result_kruskal = kruskal(*groups)
    print(result_kruskal)
else:
    print("The result is not significant, so we run the ANOVA:")
    result_anova = f_oneway(*groups)
    print(result_anova)

We first try Levene's test:
LeveneResult(statistic=27.90535337913394, pvalue=1.1621102605865985e-12)

The result is significant, so we cannot run the ANOVA.

Running Kruskal-Wallis test instead:
KruskalResult(statistic=136.25804526849961, pvalue=2.5819118601502666e-30)
