In [1]:
import os
import sys

sys.path.append("./")
sys.path.append("../../")

import json
from collections import Counter

import pandas as pd

from src.config import *

### Combine annotations

In [2]:
# Directory holding one JSON export per annotator.
# NOTE(review): the absolute prefix is machine-specific; consider moving it into src.config.
path = "/scratch/salvi/wiki_image_classification/" + MTURK_PATH + "annotated/"
n = 100
seed = 1

# Per-annotator files share the f"{n}_{seed}_" prefix. The combined export written
# further down lands in the same directory with the same prefix, so it is excluded.
# os.listdir returns entries in arbitrary order; sorting makes the column order
# of the combined frame reproducible across runs and machines.
prefix = f"{n}_{seed}_"
annotation_files = sorted(
    file
    for file in os.listdir(path)
    if file.startswith(prefix) and file.endswith(".json") and "combined" not in file
)
if not annotation_files:
    raise FileNotFoundError(
        f"No annotation files matching '{prefix}*.json' found in {path}"
    )

df_list = []
ids = None
for file in annotation_files:
    # The fourth "_"-separated token (extension stripped, lower-cased) becomes the
    # column suffix — presumably the annotator name; confirm against the naming scheme.
    name = file.split("_")[3].split(".")[0].lower()
    with open(path + file, "r") as f:
        data = pd.DataFrame(json.load(f))
    # The frames are concatenated positionally below, so every file must list the
    # same images in the same order; otherwise labels would be paired with the wrong id.
    if ids is None:
        ids = data["id"]
    elif not data["id"].equals(ids):
        raise ValueError(
            f"'id' column of {file} does not match {annotation_files[0]}; "
            "annotations cannot be combined row by row"
        )
    df_list.append(data[["labels", "other_text"]].add_suffix("_" + name))

# One row per image: "id" first, then labels_<name> / other_text_<name> per file.
df = pd.concat(df_list, axis=1)
df.insert(0, "id", ids)

In [3]:
# Split the per-annotator columns into the label lists and the free-text answers.
label_columns = [column for column in df.columns if column.startswith("labels_")]
other_columns = [column for column in df.columns if column.startswith("other_text_")]
df_labels = df[label_columns]
df_other = df[other_columns]


def _label_sets(row):
    """Return each annotator's labels for one image as a list of sets."""
    return [set(labels) for labels in row.to_list()]


def _lowercased_texts(row):
    """Return the non-empty free-text answers for one image, lower-cased."""
    return [text.lower() for text in row if text]


def _needs_review(row):
    """True when annotators disagree on labels or anyone left a free-text answer."""
    # common_labels is a subset of all_labels, so equal sizes mean full agreement.
    labels_agree = len(row["common_labels"]) == len(row["all_labels"])
    no_free_text = len(row["all_others"]) == 0
    return not (labels_agree and no_free_text)


# Labels picked by every annotator vs. labels picked by at least one annotator.
df["common_labels"] = df_labels.apply(
    lambda row: set.intersection(*_label_sets(row)), axis=1
)
df["all_labels"] = df_labels.apply(lambda row: set.union(*_label_sets(row)), axis=1)
df["all_others"] = df_other.apply(_lowercased_texts, axis=1)
df["to_review"] = df.apply(_needs_review, axis=1)

In [4]:
# Write the combined frame next to the per-annotator files, one JSON record per image.
# The "combined" in the file name is what the loading cell uses to skip this file.
combined_file = f"{n}_{seed}_sample_combined.json"
df.to_json(path + combined_file, orient="records")

### Analyse annotations

In [5]:
# How many images have disagreeing labels or a free-text answer.
n_images = len(df)
n_to_review = sum(df.to_review)
print(f"Out of {n_images} images, {n_to_review} need to be reviewed")

Out of 100 images, 70 need to be reviewed


In [6]:
# Flatten every annotator's free-text answer into a single list for inspection.
# An explicit flatten is used instead of Series.sum(): summing lists concatenates
# them pairwise (quadratic in the number of answers) and yields 0, not [], when
# the frame is empty.
[text for texts in df.all_others for text in texts]

["i really don't know! ",
 'nothing',
 "hahahah what i didn't know there were gifs in this!",
 'non-scientific diagram',
 'military',
 'nothing',
 "no label. it seems he's wearing a sports shirt but...",
 'weird pic. composition',
 'nothing',
 'no label',
 'do we need monuments/memorials?']

In [7]:
# Conflicting labels: chosen by at least one annotator but not by all of them.
conf_counter = Counter(
    label
    for all_labels, common_labels in zip(df.all_labels, df.common_labels)
    for label in all_labels - common_labels
)
# Labels every annotator agreed on, counted over all images.
common_counter = Counter(
    label for image_labels in df.common_labels for label in image_labels
)
# Labels chosen by at least one annotator, counted over all images.
all_counter = Counter(
    label for image_labels in df.all_labels for label in image_labels
)

In [8]:
# Per-label number of images on which all annotators chose that label, most frequent first.
common_counter.most_common()

[('People', 19),
 ('Architecture', 10),
 ('Landscapes', 7),
 ('Animals', 7),
 ('Transportation', 6),
 ('Events', 6),
 ('Sports', 6),
 ('Technology_Engineering', 5),
 ('History', 5),
 ('Art', 5),
 ('Urban', 4),
 ('Maps_Flags', 4),
 ('Belief', 4),
 ('Music', 3),
 ('Space', 2),
 ('Diagrams', 1),
 ('Logos', 1),
 ('Entertainment', 1),
 ('Food', 1)]

In [9]:
# Per-label number of images on which at least one annotator chose that label, most frequent first.
all_counter.most_common()

[('Architecture', 29),
 ('People', 25),
 ('History', 22),
 ('Urban', 20),
 ('Landscapes', 16),
 ('Events', 14),
 ('Transportation', 12),
 ('Technology_Engineering', 11),
 ('Belief', 11),
 ('Entertainment', 10),
 ('Animals', 10),
 ('Sports', 9),
 ('Other', 8),
 ('Biology', 7),
 ('Art', 7),
 ('Diagrams', 5),
 ('Plants', 5),
 ('Fossils', 5),
 ('Maps_Flags', 4),
 ('Music', 4),
 ('Politics', 4),
 ('Food', 3),
 ('Space', 2),
 ('Logos', 2),
 ('Physics', 1),
 ('Chemistry', 1),
 ('Earth_sciences', 1)]

In [38]:
# Per-label number of images on which annotators disagreed about that label, most frequent first.
# NOTE(review): the saved output below looks stale. This cell's execution count (38) is out of
# sequence, and the counts cannot come from the same run as the all_counter output: conflicts
# are a subset of all_labels, yet 'Other' shows 21 here vs 8 there, and 'Earth_Environment'
# does not appear in all_counter at all. Restart the kernel and run all cells to refresh it.
conf_counter.most_common()

[('Other', 21),
 ('History', 20),
 ('People', 20),
 ('Architecture', 14),
 ('Earth_Environment', 13),
 ('Events', 10),
 ('Art', 9),
 ('Plants', 8),
 ('Technology_Engineering', 6),
 ('Entertainment', 5),
 ('Landscapes', 4),
 ('Politics', 4),
 ('Belief', 4),
 ('Transportation', 3),
 ('Biology', 3),
 ('Animals', 2),
 ('Sports', 2),
 ('Fossils', 2),
 ('Medicine_Health', 1),
 ('Food', 1),
 ('Mathematics', 1),
 ('Space', 1),
 ('Physics', 1),
 ('Literature', 1),
 ('Music', 1)]