In [124]:
import ast
import csv
import json
import re
from collections import defaultdict, Counter

In [88]:
# Convert the validation CSV into two JSONL files, both keyed by the ACL id:
#   to_score_val.jsonl      -> {"merged_id", "text"}  normalised title + abstract
#   fields_output_val.jsonl -> {"merged_id", "level1", "level2", "level3"}  gold labels
#
# Text normalisation: drop every character that is not a letter, digit or
# whitespace, drop stand-alone numbers, collapse line breaks/tabs to one space,
# lower-case and trim.
#
# Python's `re` has no POSIX classes: the former "[^[:alpha:]\s\d]" was parsed
# as the set "[^[:alph]" followed by the literal sequence "\s\d]", so
# punctuation was never actually removed. The class is spelled out explicitly.
# NOTE(review): \w keeps letters and digits of every script; if the downstream
# scorer expects ASCII-only text, compile this pattern with re.ASCII.
pattern = re.compile(r"([^\w\s]|_)|(\b\d+\b)")
new_pattern = re.compile(r"[\r\n\v\t]+")

# newline="" is what the csv module asks for so that line breaks embedded in
# quoted fields reach the parser untouched. The context managers guarantee the
# output files are flushed and closed even when a row raises.
with open("forc4cl-corpus/val.csv", "r", newline="") as csvfile, \
        open("to_score_val.jsonl", "w") as score_output, \
        open("fields_output_val.jsonl", "w") as forc_output:
    csvreader = csv.DictReader(csvfile)
    for row in csvreader:
        # Rows with neither a title nor an abstract have nothing to score.
        if not (row["title"] or row["abstract"]):
            continue
        text = f'{row["title"] or ""}. {row["abstract"] or ""}'
        text = pattern.sub("", text)
        text = new_pattern.sub(" ", text).lower().strip()
        score_output.write(json.dumps({"merged_id": row["acl_id"], "text": text}) + "\n")
        forc_output.write(json.dumps({"merged_id": row["acl_id"], "level1": row["Level1"], "level2": row["Level2"], "level3": row["Level3"]}) + "\n")

In [142]:
# Read the CSET <-> FoRC crosswalk and index it in both directions.
#   matchup          FOS name -> FoRC path; the CSV columns "level zero" /
#                    "level one" / "level two" are stored under the keys
#                    "level one" / "level two" / "level three"
#   reverse_matchup  deepest FoRC label filled in on the row -> FOS name
#   level_ones / level_twos / level_threes
#                    those deepest labels, grouped by how deep the row goes
level_ones, level_twos, level_threes = set(), set(), set()
matchup, reverse_matchup = {}, {}
with open("cset-forc-crosswalk.csv", "r") as csvfile:
    csvreader = csv.DictReader(csvfile)
    for row in csvreader:
        fos_name = row["FOS"]
        if not fos_name:
            # Rows without an FOS counterpart contribute nothing.
            continue
        initial = {"level one": row["level zero"]}
        if not row["level one"]:
            deepest_label, depth_bucket = row["level zero"], level_ones
        else:
            initial["level two"] = row["level one"]
            if not row["level two"]:
                deepest_label, depth_bucket = row["level one"], level_twos
            else:
                initial["level three"] = row["level two"]
                deepest_label, depth_bucket = row["level two"], level_threes
        depth_bucket.add(deepest_label)
        reverse_matchup[deepest_label] = fos_name
        matchup[fos_name] = initial

In [148]:
# Lower-cased names of every FOS that has a crosswalk entry.
considered_fos = {fos_name.lower() for fos_name in matchup}

In [149]:
# Display the lower-cased FOS names derived from the crosswalk keys.
considered_fos

{'active learning',
 'adversarial machine learning',
 'algorithmic bias',
 'argument mining',
 'automatic summarization',
 'chatbots',
 'cluster analysis',
 'collaborative filtering',
 'commonsense reasoning',
 'data augmentation',
 'dialogue systems',
 'distributed search engine',
 'ethics of artificial intelligence',
 'generative models',
 'information extraction',
 'information retrieval',
 'knowledge graphs',
 'knowledge representation and reasoning',
 'machine translation',
 'multi-agent systems',
 'multi-label classification',
 'multimodal learning',
 'one-shot learning',
 'ontology learning',
 'parsing',
 'question answering',
 'recommender systems',
 'reinforcement learning',
 'semantic parsing',
 'sentiment analysis',
 'speech recognition',
 'transfer learning',
 'unsupervised learning'}

In [129]:
# Index the field hierarchy file; every name is lower-cased.
#   levels           field name -> level
#   levels_reversed  level -> set of field names on that level
#   children         parent name -> list of child names (one entry per row)
#   parents          child name -> parent name (both also whitespace-stripped)
levels, parents = {}, {}
levels_reversed = defaultdict(set)
children = defaultdict(list)
with open("../../assets/fields/all_fields_hierarchy.jsonl", "r") as json_file:
    data = list(map(json.loads, json_file))
for row in data:
    child_name = row["child_display_name"].lower()
    child_level = int(row["child_level"])
    levels[child_name] = child_level
    levels_reversed[child_level].add(child_name)

    parent_name = row["display_name"].lower()
    # A parent is only registered here when it sits at level 0; deeper parents
    # get their level from the row on which they appear as a child.
    # NOTE(review): child_level is coerced with int() while parent_level is
    # compared raw -- confirm the file stores it as a number, not a string.
    if row["parent_level"] == 0:
        levels[parent_name] = row["parent_level"]
        levels_reversed[row["parent_level"]].add(parent_name)

    children[parent_name].append(child_name)
    # NOTE(review): only this mapping strips whitespace from its keys/values.
    parents[child_name.strip()] = parent_name.strip()

In [183]:
# For every scored paper keep the single best-scoring crosswalk field on each
# hierarchy level.
#   top_fields      merged_id -> {level: [lower-cased name of the top field]}
#   all_top_fields  merged_id -> set of those names across all levels
# Depends on `levels` and `considered_fos` built in earlier cells.
i = 0  # number of score rows processed
top_fields = {}
all_top_fields = {}
with open("forc4cl-corpus/en_scores.jsonl", "r") as jsonfile:
    data = [json.loads(line) for line in jsonfile]
for row in data:
    max_dict = defaultdict(list)
    for field in row["fields"]:
        field_name = field["id"].lower()
        # Filter before the level lookup: fields outside the crosswalk are
        # discarded anyway, so one that is missing from the hierarchy must not
        # abort the whole run with a KeyError.
        if field_name not in considered_fos:
            continue
        max_dict[levels[field_name]].append((field["score"], field["id"]))

    field_dict = {}
    all_fields = []
    for level, scored_fields in max_dict.items():
        # Highest (score, id) tuple first; only the top entry per level is kept.
        top_level_fields = [name.lower() for _, name in sorted(scored_fields, reverse=True)[:1]]
        field_dict[level] = top_level_fields
        all_fields.extend(top_level_fields)
    top_fields[row["merged_id"]] = field_dict
    all_top_fields[row["merged_id"]] = set(all_fields)
    i += 1

In [185]:
# Paper-level confusion counts of the predicted top fields against the gold
# FoRC labels. Only papers present in `forc_dict` are counted.
#   * The paper has at least one gold label covered by the crosswalk:
#       true positive  if any such label maps (via reverse_matchup) to one of
#                      the paper's predicted top fields, else false negative.
#   * The paper has no covered gold label:
#       false positive if any predicted top field is a considered FOS (the
#                      last one seen is recorded in false_pos_ids), else true
#                      negative.
# NOTE: uses `forc_dict`, `all_top_fields`, `reverse_matchup`, `considered_fos`
# and the level_* sets from other cells. `forc_dict` comes from the gold-label
# loading cell, which has to be executed before this one.
true_positives = false_positives = true_negatives = false_negatives = 0
false_pos_ids = {}
with open("forc4cl-corpus/en_scores.jsonl", "r") as jsonfile:
    data = [json.loads(line) for line in jsonfile]
for row in data:
    paper_id = row["merged_id"]
    if paper_id not in forc_dict:
        continue
    expected = forc_dict[paper_id]
    expected_filtered = (
        [label for label in expected["level1"] if label in level_ones]
        + [label for label in expected["level2"] if label in level_twos]
        + [label for label in expected["level3"] if label in level_threes]
    )
    predicted = all_top_fields[paper_id]
    if expected_filtered:
        matched_labels = [
            label for label in expected_filtered
            if reverse_matchup[label].lower() in predicted
        ]
        if matched_labels:
            true_positives += 1
        else:
            false_negatives += 1
    else:
        spurious_fields = [name for name in predicted if name in considered_fos]
        if spurious_fields:
            false_pos_ids[paper_id] = spurious_fields[-1]
            false_positives += 1
        else:
            true_negatives += 1
                    

In [107]:
# Load the gold FoRC labels of the validation and test splits into
#   forc_dict: merged_id -> {"level1": [...], "level2": [...], "level3": [...]}
# Each level is stored in the JSONL files as the text of a Python list.


def _parse_label_list(raw):
    """Turn the serialised list stored in a label column into a real list.

    Empty or missing values yield an empty list. `ast.literal_eval` only
    accepts Python literals, so -- unlike the `eval` used previously -- text
    coming from the data files can never execute code.
    """
    return ast.literal_eval(raw) if raw else []


# Context managers close both files even if a line fails to parse.
with open("forc4cl-corpus/fields_output_val.jsonl", "r") as val_json:
    forc_data = [json.loads(line) for line in val_json]
with open("forc4cl-corpus/fields_output_test.jsonl", "r") as test_json:
    test_data = [json.loads(line) for line in test_json]
forc_data.extend(test_data)

forc_dict = {}
for row in forc_data:
    # All three levels get the same empty-value handling; level1 used to be
    # evaluated unconditionally and would fail on an empty string.
    forc_dict[row["merged_id"]] = {
        "level1": _parse_label_list(row["level1"]),
        "level2": _parse_label_list(row["level2"]),
        "level3": _parse_label_list(row["level3"]),
    }

In [186]:
# Confusion counts, shown in the order (TP, FP, TN, FN).
true_positives, false_positives, true_negatives, false_negatives

(192, 130, 0, 128)

In [188]:
# Recall: TP / (TP + FN), i.e. the share of papers with a crosswalk-covered
# gold label for which a matching top field was predicted.
true_positives/(true_positives+false_negatives)

0.6

In [184]:
# Inspect the predictions: merged_id -> set of top predicted field names.
all_top_fields

{'L16-1238': {'information extraction', 'speech recognition'},
 '2020.trac-1.16': {'speech recognition', 'transfer learning'},
 '2020.parlaclarin-1.7': {'algorithmic bias', 'speech recognition'},
 '2020.paclic-1.39': {'semantic parsing', 'speech recognition'},
 'D16-1143': {'collaborative filtering', 'speech recognition'},
 'S17-1027': {'semantic parsing', 'speech recognition'},
 '2020.fnp-1.18': {'automatic summarization', 'reinforcement learning'},
 '2020.splu-1.2': {'information extraction', 'speech recognition'},
 'O16-2001': {'parsing', 'reinforcement learning'},
 '2020.emnlp-main.708': {'algorithmic bias', 'reinforcement learning'},
 '2020.vardial-1.24': {'information extraction', 'speech recognition'},
 '2016.amta-users.14': {'machine translation', 'speech recognition'},
 '2020.rail-1.5': {'parsing', 'speech recognition'},
 'U19-1004': {'generative models', 'speech recognition'},
 'S17-2168': {'automatic summarization', 'reinforcement learning'},
 'O17-1013': {'multimodal learni