# Evaluations on Marco's Annotations

We first read in the annotations from Marco's files, run the pipeline of those annotations using GPT3.5 and GPT4-turbo, and then evaluate the results.

## Functions

In [1]:
import os
import sys
import evaluate
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns

  from .autonotebook import tqdm as notebook_tqdm


## Read in the Annotations

In [2]:
# read in json file
df = pd.read_json("../../data/annotation/anno_matching.json")
df2 = pd.read_json("../../data/annotation/anno_matching_v2.json")

In [3]:
drop_cols = ["doc_type"]
job_anno = df[df["doc_type"] == "job"].drop(columns=drop_cols)
course_anno = df[df["doc_type"] == "course"].drop(columns=drop_cols)

job_anno_v2 = df2[df2["doc_type"] == "job"].drop(columns=drop_cols)
course_anno_v2 = df2[df2["doc_type"] == "course"].drop(columns=drop_cols)

In [4]:
# these were missing in the original annotations
replace_job_from_v2 = [15490, 17109]

In [5]:
job_anno

Unnamed: 0,sample_num,doc_id,extraction
0,49,15490,"[{'start': 1908, 'end': 1923, 'text': 'Selbstä..."
1,50,402,"[{'start': 516, 'end': 548, 'text': 'Selbststä..."
2,51,13004,"[{'start': 597, 'end': 646, 'text': 'Leitung v..."
3,53,5782,"[{'start': 161, 'end': 185, 'text': 'beraten u..."
4,54,792,"[{'start': 79, 'end': 117, 'text': 'Leitung un..."
5,55,934,"[{'start': 643, 'end': 654, 'text': 'dynamisch..."
6,56,9831,"[{'start': 782, 'end': 842, 'text': 'Aufrechte..."
7,57,17109,"[{'start': 104, 'end': 144, 'text': 'Konzeptio..."
8,58,469,"[{'start': 342, 'end': 352, 'text': 'motiviert..."
9,61,13965,"[{'start': 118, 'end': 144, 'text': 'Problemlö..."


In [6]:
# replace job annotations with v2
job_anno.loc[job_anno["doc_id"].isin(replace_job_from_v2)] = job_anno_v2[
    job_anno_v2["doc_id"].isin(replace_job_from_v2)
]

In [7]:
job_anno

Unnamed: 0,sample_num,doc_id,extraction
0,49,15490,"[{'start': 1908, 'end': 1924, 'text': 'Selbstä..."
1,50,402,"[{'start': 516, 'end': 548, 'text': 'Selbststä..."
2,51,13004,"[{'start': 597, 'end': 646, 'text': 'Leitung v..."
3,53,5782,"[{'start': 161, 'end': 185, 'text': 'beraten u..."
4,54,792,"[{'start': 79, 'end': 117, 'text': 'Leitung un..."
5,55,934,"[{'start': 643, 'end': 654, 'text': 'dynamisch..."
6,56,9831,"[{'start': 782, 'end': 842, 'text': 'Aufrechte..."
7,57,17109,"[{'start': 136, 'end': 167, 'text': 'Beratung ..."
8,58,469,"[{'start': 342, 'end': 352, 'text': 'motiviert..."
9,61,13965,"[{'start': 118, 'end': 144, 'text': 'Problemlö..."


### Dev/Test Split

We only have 10/15 annotations so we will use half of them for dev and half for test.


In [8]:
seed = 42
np.random.seed(seed)

job_anno_dev = job_anno.sample(frac=0.5, random_state=seed)
job_anno_test = job_anno.drop(job_anno_dev.index)

course_anno_dev = course_anno.sample(frac=0.5, random_state=seed)
course_anno_test = course_anno.drop(course_anno_dev.index)

In [9]:
# save dev and test sets

SAVE_TO_DISK = False

if SAVE_TO_DISK:
    job_anno_dev.to_json("../../data/annotation/anno_job_dev.json")
    job_anno_test.to_json("../../data/annotation/anno_job_test.json")

    course_anno_dev.to_json("../../data/annotation/anno_course_dev.json")
    course_anno_test.to_json("../../data/annotation/anno_course_test.json")

In [10]:
# get dev doc_ids
job_dev_doc_ids = job_anno_dev["doc_id"].unique()
course_dev_doc_ids = course_anno_dev["doc_id"].unique()

print("jobs dev set:", job_dev_doc_ids)
print("courses dev set:", course_dev_doc_ids)

jobs dev set: [13965  9577 15490  8811   934   469 13004   402]
courses dev set: [ 129 7855 7136 7679    9]


In [11]:
job_ids = job_anno["doc_id"].astype(str)
course_ids = course_anno["doc_id"].astype(str)

# output ids as txt files
with open("anno_job_ids.txt", "w") as f:
    for id in job_ids:
        f.write(id + "\n")

with open("anno_course_ids.txt", "w") as f:
    for id in course_ids:
        f.write(id + "\n")

job_ids = job_ids.astype(int)
course_ids = course_ids.astype(int)

In [12]:
# !python ../skillExtract/pipeline_jobs_courses.py --num-sentences 2 --do-extraction --detailed --max_tokens 2000 --do-matching --prompt_type wlevels --language de --datapath ../data/processed/job_evl_all.csv --candidates_method mixed --max_candidates 3 --ids evaluation/anno_job_ids.txt
# !python ../skillExtract/pipeline_jobs_courses.py --num-sentences 2 --do-extraction --detailed --max_tokens 2000 --do-matching --prompt_type wlevels --language de --datapath ../data/processed/course_evl_all.csv --candidates_method mixed --max_candidates 3 --ids evaluation/anno_course_ids.txt

In [13]:
taxonomy = pd.read_csv("../../data/taxonomy/taxonomy_V4.csv")
taxonomy.drop(columns=["name+definition"], inplace=True)
# drop duplicates
taxonomy.drop_duplicates(subset=["unique_id", "name"], inplace=True)

In [14]:
# taxonomy_old = pd.read_csv("../../data/taxonomy/taxonomy_V4_simple.csv")
# # check that the unique_id and name are the same
# assert taxonomy["unique_id"].equals(taxonomy_old["unique_id"])
# assert taxonomy["name"].equals(taxonomy_old["name"])

In [15]:
job_anno_exp = job_anno.explode("extraction").reset_index(drop=True)
course_anno_exp = course_anno.explode("extraction").reset_index(drop=True)

In [16]:
# # for each job_id, get the name2 as a list

# job_name2 = job_anno_exp.groupby("doc_id")["name2"].apply(list).reset_index()
# course_name2 = course_anno_exp.groupby("doc_id")["name2"].apply(list).reset_index()

In [17]:
# # arrange in the order of job_order

# job_name2 = job_name2[job_name2["doc_id"].isin(job_order)].reset_index(drop=True)

# course_name2

# # remove "NONE" and duplicates
# job_name2["name2"] = job_name2["name2"].apply(
#     lambda x: list(set(x) - {"NONE"} - {None})
# )
# course_name2["name2"] = course_name2["name2"].apply(
#     lambda x: list(set(x) - {"NONE"} - {None})
# )

In [18]:
# job_name2.to_csv("job_name2.csv", index=False)
# course_name2.to_csv("course_name2.csv", index=False)

In [19]:
_temp_df = pd.json_normalize(job_anno_exp["extraction"])
job_anno_exp = pd.concat([job_anno_exp, _temp_df], axis=1)

_temp_df = pd.json_normalize(course_anno_exp["extraction"])
course_anno_exp = pd.concat([course_anno_exp, _temp_df], axis=1)

print(job_anno_exp.shape)
print(course_anno_exp.shape)

(212, 19)
(165, 19)


In [20]:
rename_dict = {
    "doc_id": "doc_id",
    "text": "extracted",
    "label.text": "level",
    "req_status": "req_status",
    "match_1": "matched",
    "match_1s": "matched1s",
    "match_2": "matched2",
    "match_2s": "matched2s",
}  # keeping only the first match for now

job_anno_exp = job_anno_exp.rename(columns=rename_dict)
course_anno_exp = course_anno_exp.rename(columns=rename_dict)

# keep only columns in rename_dict value
job_anno_exp = job_anno_exp[rename_dict.values()]
course_anno_exp = course_anno_exp[rename_dict.values()]

In [21]:
# job_anno_exp_v2

In [22]:
# jobs_with_NA = job_anno_exp[job_anno_exp.isna().any(axis=1)]
# # export to csv
# jobs_with_NA.to_csv("jobs_with_NA.csv", index=False)

jobs_with_NA = job_anno_exp[job_anno_exp.isna().any(axis=1)]
jobs_with_NA.doc_id.unique()  # this showed the doc_ids with NaNs

array([], dtype=int64)

In [23]:
job_anno_exp = job_anno_exp.merge(
    taxonomy, how="left", left_on="matched", right_on="name"
).drop(columns=["matched"])
course_anno_exp = course_anno_exp.merge(
    taxonomy, how="left", left_on="matched", right_on="name"
).drop(columns=["matched"])

In [24]:
print(job_anno_exp.shape)
print(course_anno_exp.shape)

(212, 13)
(165, 13)


In [25]:
def get_level_3(row):
    """
    Returns the lowest level of the taxonomy that is not NaN in each
    """
    for level in ["Type Level 3", "Type Level 2", "Type Level 1"]:
        value = row[level]
        if not pd.isna(value):
            return value


def get_level_2(row):
    """
    Returns the lowest level of the taxonomy that is not NaN in each
    """
    for level in ["Type Level 2", "Type Level 1"]:
        value = row[level]
        if not pd.isna(value):
            return value


# get level 2 and 3
job_anno_exp["name3"] = job_anno_exp.apply(get_level_3, axis=1)
job_anno_exp["name2"] = job_anno_exp.apply(get_level_2, axis=1)

course_anno_exp["name3"] = course_anno_exp.apply(get_level_3, axis=1)
course_anno_exp["name2"] = course_anno_exp.apply(get_level_2, axis=1)

In [26]:
# get some summary stats
print("annotated levels:")
print("jobs:\n", job_anno_exp["level"].value_counts())
print("courses:\n", course_anno_exp["level"].value_counts())

print("\n")
print("annotated num_unique_skills per doc:")
# print("jobs:\n", job_anno_exp.groupby("doc_id")["unique_id"].nunique())
print("jobs avg:\n", job_anno_exp.groupby("doc_id")["unique_id"].nunique().mean())
# print("courses:\n", course_anno_exp.groupby("doc_id")["unique_id"].nunique())
print("courses avg:\n", course_anno_exp.groupby("doc_id")["unique_id"].nunique().mean())

print("\n")
print("average num_unique_skills per doc in dev vs test:")

job_anno_exp_dev = job_anno_exp[job_anno_exp["doc_id"].isin(job_dev_doc_ids)]
job_anno_exp_test = job_anno_exp[job_anno_exp["doc_id"].isin(job_dev_doc_ids) == False]
course_anno_exp_dev = course_anno_exp[
    course_anno_exp["doc_id"].isin(course_dev_doc_ids)
]
course_anno_exp_test = course_anno_exp[
    course_anno_exp["doc_id"].isin(course_dev_doc_ids) == False
]
print("jobs dev:\n", job_anno_exp_dev.groupby("doc_id")["unique_id"].nunique().mean())
print("jobs test:\n", job_anno_exp_test.groupby("doc_id")["unique_id"].nunique().mean())
print(
    "courses dev:\n",
    course_anno_exp_dev.groupby("doc_id")["unique_id"].nunique().mean(),
)
print(
    "courses test:\n",
    course_anno_exp_test.groupby("doc_id")["unique_id"].nunique().mean(),
)

annotated levels:
jobs:
 Unknown         150
Intermediate     27
Expert           24
Beginner         11
Name: level, dtype: int64
courses:
 Unknown         94
Beginner        68
Intermediate     3
Name: level, dtype: int64


annotated num_unique_skills per doc:
jobs avg:
 8.533333333333333
courses avg:
 4.6


average num_unique_skills per doc in dev vs test:
jobs dev:
 8.75
jobs test:
 8.285714285714286
courses dev:
 4.0
courses test:
 5.2


In [27]:
# bring back document text for each doc_id
course_data = pd.read_csv("../../data/processed/course_evl_de.csv")
job_data = pd.read_csv("../../data/processed/job_evl_all.csv")

# filter to only ids in anno df
course_data = course_data[course_data["id"].astype(str).isin(course_ids.astype(str))]
job_data = job_data[job_data["id"].astype(str).isin(job_ids.astype(str))]

job_data["id"] = job_data["id"].astype(int)
course_data["id"] = course_data["id"].astype(int)

# remove extra spaces in text
job_data["fulltext"] = job_data["fulltext"].str.replace(" +", " ", regex=True)
course_data["fulltext"] = course_data["fulltext"].str.replace(" +", " ", regex=True)

In [28]:
# get course data req status from text
try:
    course_anno_exp.drop(columns=["req_status"], inplace=True)
except:
    pass

_temp_merge_df = pd.merge(
    course_anno_exp[["doc_id", "extracted"]],
    course_data[["id", "skill_type", "fulltext"]],
    left_on="doc_id",
    right_on="id",
    how="outer",
)

_temp_merge_df["req_status"] = _temp_merge_df.apply(
    lambda x: x["skill_type"] if x["extracted"] in x["fulltext"] else None, axis=1
)

In [29]:
annotated_course_req = [7136, 6605, 9, 4599]

# show course_data that where the doc id is not in annotated_course_req and req_status is required

course_data[~course_data["id"].isin(annotated_course_req)][
    course_data["skill_type"] == "required"
]

  course_data[~course_data["id"].isin(annotated_course_req)][


Unnamed: 0,id,name,intro,key_benefits,learning_targets_description,target_group_description,structure_description,admission_criteria_description,level,study_ids,fulltext,skill_type,language
1571,7290,CAS Full-Stack Development,<p>Von der Konzeption und der Entwicklung bis ...,<p>Bei der Entwicklung professioneller Web-Anw...,<p>In dieser Weiterbildung werden Sie alle rel...,,<p>Das CAS Full-Stack Development behandelt di...,<p>Das CAS Full-Stack Development baut auf den...,"{'id': 2, 'name': 'Fortgeschritten', 'enum': '...",[5],Das CAS Full-Stack Development baut auf den A...,required,de
1573,129,CAS FH in Digital Business - Fernstudium/Onlin...,<p>CAS Digital Business Fernstudium: Sie erhal...,<p><strong>Weiterbildung Digital Business &nda...,<p><strong>Online-Studium Digital Business: Di...,"<p>Mitarbeitende in Grossunternehmen, KMUs, Be...",<p>Das Online-Studium Digital Business ist v&o...,<p>Formale Voraussetzung zur regul&auml;ren Zu...,"{'id': 1, 'name': 'Beginner', 'enum': 'BEGINNER'}",[1],Formale Voraussetzung zur regulären Zulassung...,required,de


In [30]:
course_data[course_data["id"] == 4599]

Unnamed: 0,id,name,intro,key_benefits,learning_targets_description,target_group_description,structure_description,admission_criteria_description,level,study_ids,fulltext,skill_type,language
1032,4599,Lateral Leadership,"<p>Wir mischen unsere eigene Erfahrung, fundie...",<p><strong>Im Training&nbsp;Lateral Leadership...,<p><strong>F&uuml;hren ohne Weisungsbefugnis&n...,<p>Du m&ouml;chtest Eigenverantwortung in dein...,<p>In drei Schritten zum besseren lateralen Le...,,,[9],Lateral Leadership Wir mischen unsere eigene E...,to_acquire,de
1568,4599,Lateral Leadership,"<p>Wir mischen unsere eigene Erfahrung, fundie...",<p><strong>Im Training&nbsp;Lateral Leadership...,<p><strong>F&uuml;hren ohne Weisungsbefugnis&n...,<p>Du m&ouml;chtest Eigenverantwortung in dein...,<p>In drei Schritten zum besseren lateralen Le...,,,[9],Du möchtest Eigenverantwortung in deinen Team...,required,de


In [31]:
# drop if req_status is None
_temp_merge_df.dropna(subset=["req_status"], inplace=True)

In [32]:
# _temp_merge_df keep only not None
# _temp_merge_df = _temp_merge_df[_temp_merge_df["req_status"].notna()]


course_anno_exp = pd.merge(
    course_anno_exp,
    _temp_merge_df[["doc_id", "extracted", "req_status"]],
    on=["doc_id", "extracted"],
    how="left",
    indicator=True,
).drop_duplicates()

In [33]:
course_anno_exp_temp = course_anno_exp.copy()
course_anno_exp_temp[course_anno_exp_temp["req_status"].isna()]

Unnamed: 0,doc_id,extracted,level,matched1s,matched2,matched2s,unique_id,Type Level 1,Type Level 2,Type Level 3,Type Level 4,name,name3,name2,req_status,_merge


In [34]:
course_anno_exp_temp[course_anno_exp_temp["req_status"].isna()].doc_id.unique()

array([], dtype=int64)

In [35]:
troubleshoot = 6605
course_anno_exp_temp[course_anno_exp_temp["doc_id"] == troubleshoot]
print(course_data[course_data["id"] == troubleshoot]["fulltext"].values[0])
print(course_data[course_data["id"] == troubleshoot]["fulltext"].values[1])

Arbeit 4.0 - New Work Fit für die Zukunft: Veränderung mit neuen Arbeitsmethoden &amp; Organisationsformen gestalten und mit der Weiterbildung zum zertifizierten und gefragten Experten für die Zukunft der Arbeit werden. Durch Digitalisierung, Globalisierung und Vernetzung verändert sich die Art und Weise unserer Zusammenarbeit in der Wissens- und Informationsgesellschaft nachhaltig. Agile Arbeitsmethoden und Organisationsformen sind nötig, um flexibel auf sich rasch wandelnde Anforderungen und Kundenbedürfnisse zu reagieren. Im Kurs lernen Sie wichtige Zusammenhänge und Methoden kennen, um sich für die Herausforderungen der Arbeitswelt 4.0 zu rüsten und wichtige Veränderungen im eigenen Aufgabenbereich, im Team und im Unternehmen anzustossen und selbst zu gestalten. . Der E-Learning Kurs Arbeit 4.0 hilft dabei, die aktuellen und zukünftigen Veränderungen durch neue Technologien und Arbeitsweisen einzuordnen und zu verstehen. Zudem bietet er Fach- und Führungskräften aus allen Branchen 

In [36]:
req_not_anno = [7290, 129]
course_anno_exp_temp[
    course_anno_exp_temp["doc_id"].isin(req_not_anno)
].req_status.value_counts()

to_acquire    21
Name: req_status, dtype: int64

In [37]:
_temp_merge_df_j = pd.merge(
    job_anno_exp[["doc_id", "extracted"]],
    job_data[["id", "fulltext"]],
    left_on="doc_id",
    right_on="id",
    how="outer",
)

_temp_merge_df_j["check_skill_in_text"] = _temp_merge_df_j.apply(
    lambda x: x["extracted"] in x["fulltext"], axis=1
)

In [38]:
_temp_merge_df_j[_temp_merge_df_j["check_skill_in_text"] == False]

Unnamed: 0,doc_id,extracted,id,fulltext,check_skill_in_text


## Read in the Extraction & Matching results

In [39]:
# c_versions = [
#     "course_gpt-3.5-turbo_ids_2sent_n10_mixed_V4_detailed_anno_oldrules.json",
#     "course_gpt-3.5-turbo_ids_2sent_n10_mixed_V4_detailed_anno_oldprompt.json",
#     "course_gpt-3.5-turbo_ids_2sent_n10_mixed_V4_detailed_anno_explicit.json",
#     "course_gpt-3.5-turbo_ids_2sent_n10_jBd_V4_detailed_anno_cand1.json",
#     "course_gpt-3.5-turbo_ids_2sent_n10_rules_V4_detailed_anno_cand1.json",
#     "course_gpt-3.5-turbo_ids_2sent_n10_mixed_V4_detailed_anno_noexplicit.json",
# ]
# j_versions = [
#     "job_gpt-3.5-turbo_ids_2sent_n10_mixed_V4_detailed_anno_oldrules.json",
#     "job_gpt-3.5-turbo_ids_2sent_n10_mixed_V4_detailed_anno_oldprompt.json",
#     "job_gpt-3.5-turbo_ids_2sent_n10_mixed_V4_detailed_anno_explicit.json",
#     "job_gpt-3.5-turbo_ids_2sent_n10_jBd_V4_detailed_anno_cand1.json",
#     "job_gpt-3.5-turbo_ids_2sent_n10_rules_V4_detailed_anno_cand1.json",
#     "job_gpt-3.5-turbo_ids_2sent_n10_mixed_V4_detailed_anno_noexplicit.json",
#     "job_gpt-3.5-turbo_ids_2sent_n10_mixed_V4_detailed_anno_explicit_lv.json",
#     "job_gpt-3.5-turbo_ids_2sent_n10_mixed_V4_detailed_anno_lvv2_paperprmpt.json",
#     "job_gpt-3.5-turbo_ids_2sent_n10_mixed_V4_detailed_anno_reqv2.json",
# ]

In [40]:
c_versions = [
    "course_gpt-3.5-turbo_ids_2sent_n10_jBd_V4_detailed_base_emb.json",
    "course_gpt-3.5-turbo_ids_2sent_n10_rules_V4_detailed_base_rules.json",
    "course_gpt-3.5-turbo_ids_2sent_n10_mixed_V4_detailed_ogprmpt_oldrules.json",
    "course_gpt-3.5-turbo_ids_2sent_n10_mixed_V4_detailed_ogprmpt_newrules.json",
    "course_gpt-3.5-turbo_ids_2sent_n10_mixed_V4_detailed_likepaper.json",
    "course_gpt-3.5-turbo_ids_2sent_n10_mixed_V4_detailed_explicit.json",
    "course_gpt-3.5-turbo_ids_2sent_n10_mixed_V4_detailed_skillsonly.json",
    "course_gpt-3.5-turbo_ids_2sent_n10_mixed_V4_detailed_sep_required.json",
    "course_gpt-3.5-turbo_ids_2sent_n10_mixed_V4_detailed_sep_toacquire.json",
]

j_versions = [
    "job_gpt-3.5-turbo_ids_2sent_n10_jBd_V4_detailed_base_emb.json",
    "job_gpt-3.5-turbo_ids_2sent_n10_rules_V4_detailed_base_rules.json",
    "job_gpt-3.5-turbo_ids_2sent_n10_mixed_V4_detailed_ogprmpt_oldrules.json",
    "job_gpt-3.5-turbo_ids_2sent_n10_mixed_V4_detailed_ogprmpt_newrules.json",
    "job_gpt-3.5-turbo_ids_2sent_n10_mixed_V4_detailed_likepaper.json",
    "job_gpt-3.5-turbo_ids_2sent_n10_mixed_V4_detailed_explicit.json",
    "job_gpt-3.5-turbo_ids_2sent_n10_mixed_V4_detailed_skillsonly.json",
    "job_gpt-3.5-turbo_ids_2sent_n10_mixed_V4_detailed_lvs_only.json",
]

In [41]:
c_file_name = c_versions[-1]
j_file_name = j_versions[6]

print(c_file_name)
print(j_file_name)
path = "../anno_results/"

with open(path + c_file_name, "r") as f:
    course_results = json.load(f)

with open(path + j_file_name, "r") as f:
    job_results = json.load(f)

course_gpt-3.5-turbo_ids_2sent_n10_mixed_V4_detailed_sep_toacquire.json
job_gpt-3.5-turbo_ids_2sent_n10_mixed_V4_detailed_skillsonly.json


In [42]:
def extract_skills(document):
    """Get extracted skills from each sentence in a document from the detailed output"""
    # extracted_skills_count = 0
    # matched_skills_count = 0
    extracted_skills_list = []
    # existing_skills = set()

    for sentences in document:
        # create zip dict of sentences["extracted_skills"] and sentences["extracted_skills_levels"]
        # if "extracted_skills_reqstatus" exists, use that as well
        extracted_skills = {}
        skill_level_dict = dict(
            zip(
                sentences.get("extracted_skills", [""]),
                sentences.get("extracted_skills_levels", [""]),
                # sentences.get("extracted_skills_reqstatus", [""]),
            )
        )
        # print("skill_level_dict")
        # print(skill_level_dict)

        if "extracted_skills_reqstatus" in sentences:
            skill_req_dict = dict(
                zip(
                    sentences.get("extracted_skills", [""]),
                    sentences.get("extracted_skills_reqstatus", [""]),
                )
            )
            # print("skill_req_dict")
            # print(skill_req_dict)
        # extracted_skills_count += len(skill_level_dict)
        matched_skill = sentences.get("matched_skills", {})
        # print("matched_skill")
        # print(matched_skill)
        # matched_skills_count += len(matched_skill)
        for skill_name, skill_level in skill_level_dict.items():
            unique_id = matched_skill.get(skill_name, {}).get("unique_id", None)
            skill_req = (
                skill_req_dict[skill_name]
                if "extracted_skills_reqstatus" in sentences
                else None
            )
            # if unique_id not in existing_skills:
            extracted_skills_list.append(
                (unique_id, skill_level, skill_req, skill_name)
            )
            # existing_skills.add(unique_id)
        # for skill_name, skill_info in matched_skill.items():
        #     unique_id = skill_info["unique_id"]
        #     skill_level = skill_level_dict[skill_name]
        #     skill_req = (
        #         skill_req_dict[skill_name]
        #         if "extracted_skills_reqstatus" in sentences
        #         else None
        #     )
        #     if unique_id not in existing_skills:
        #         extracted_skills.append((unique_id, skill_level, skill_req, skill_name))
        #         existing_skills.add(unique_id)
        # break
    return extracted_skills_list  # , extracted_skills_count, matched_skills_count


def get_clean_results(data):
    """get relevant information on matched skills from the detailed output"""
    clean_output_dict = {}
    # extract_skills_dict = {}
    # matched_skills_dict = {}

    for doc_id, doc_data in data.items():
        if isinstance(doc_data, list):
            # If the data is a list, directly extract matched skills
            clean_output_dict[doc_id] = extract_skills(doc_data)
            # clean_output_dict[doc_id], extract_skills_dict[doc_id], matched_skills_dict[doc_id] = extract_skills(doc_data)
        elif isinstance(doc_data, dict):
            # If the data is a dictionary, check for nested keys
            nested_keys = ["required", "to_acquire"]
            clean_output_dict[doc_id] = {}
            # extract_skills_dict[doc_id] = {}
            # matched_skills_dict[doc_id] = {}
            for key in nested_keys:
                key_dict = {}
                # ext_dict = {}
                # match_dict = {}
                if key in doc_data:
                    key_dict[key] = extract_skills(doc_data[key])
                    # key_dict[key], ext_dict[key], match_dict[key] = extract_skills(doc_data[key])
                    clean_output_dict[doc_id].update(key_dict)
                    # extract_skills_dict[doc_id].update(ext_dict)
                    # matched_skills_dict[doc_id].update(match_dict)

    return clean_output_dict  # , extract_skills_dict, matched_skills_dict

In [43]:
job_results = get_clean_results(job_results)
course_results = get_clean_results(course_results)

In [44]:
job_results_dev = {k: v for k, v in job_results.items() if int(k) in job_dev_doc_ids}
job_results_test = {
    k: v for k, v in job_results.items() if int(k) not in job_dev_doc_ids
}

course_results_dev = {
    k: v for k, v in course_results.items() if int(k) in course_dev_doc_ids
}
course_results_test = {
    k: v for k, v in course_results.items() if int(k) not in course_dev_doc_ids
}

In [45]:
def get_results_df(matched_dict, taxonomy, doc_type):
    processed_data = []
    if doc_type == "job":
        for doc_id, annotations in matched_dict.items():
            for skill_id, level, req_status, extracted in annotations:
                processed_data.append(
                    {
                        "doc_id": doc_id,
                        "skill_id": skill_id,
                        "level": level,
                        "extracted": extracted,
                        "req_status": req_status,
                    }
                )
    elif doc_type == "course":
        for doc_id, categories in matched_dict.items():
            for req_status, skills in categories.items():
                for skill_id, level, _, extracted in skills:
                    processed_data.append(
                        {
                            "doc_id": doc_id,
                            "skill_id": skill_id,
                            "level": level,
                            "extracted": extracted,
                            "req_status": req_status,
                        }
                    )
    else:
        print("doc_type must be 'job' or 'course'")
        return None
    # Create DataFrame
    df = pd.DataFrame(processed_data)

    # Merge with taxonomy

    taxonomy["name3"] = taxonomy.apply(get_level_3, axis=1)
    taxonomy["name2"] = taxonomy.apply(get_level_2, axis=1)

    df = df.merge(
        taxonomy[["unique_id", "name", "name2", "name3"]],
        how="left",
        left_on="skill_id",
        right_on="unique_id",
    ).drop(columns=["unique_id"])

    # convert ids into ints
    df["doc_id"] = df["doc_id"].astype(float)
    df["skill_id"] = df["skill_id"].astype(float)

    return df


job_results_dev_df = get_results_df(job_results_dev, taxonomy, "job")
job_results_test_df = get_results_df(job_results_test, taxonomy, "job")
course_results_dev_df = get_results_df(course_results_dev, taxonomy, "course")
course_results_test_df = get_results_df(course_results_test, taxonomy, "course")

In [46]:
job_results_df = get_results_df(job_results, taxonomy, "job")
course_results_df = get_results_df(course_results, taxonomy, "course")

In [47]:
course_results_df[
    course_results_df["doc_id"].isin(req_not_anno)
].req_status.value_counts()

to_acquire    54
required      23
Name: req_status, dtype: int64

In [48]:
# get some summary stats
print("levels:")
print("jobs:\n", job_results_df["level"].value_counts())
print("courses:\n", course_results_df["level"].value_counts())

print("\n \n")
print("num_unique_skills per doc:")
print("jobs:\n", job_results_df.groupby("doc_id")["skill_id"].nunique())
print("jobs avg:\n", job_results_df.groupby("doc_id")["skill_id"].nunique().mean())
print("courses:\n", course_results_df.groupby("doc_id")["skill_id"].nunique())
print(
    "courses avg:\n", course_results_df.groupby("doc_id")["skill_id"].nunique().mean()
)

levels:
jobs:
     70
Name: level, dtype: int64
courses:
 unknown         171
intermediate    132
beginner         86
expert           12
Name: level, dtype: int64

 

num_unique_skills per doc:
jobs:
 doc_id
469.0       2
934.0       1
1658.0      3
5782.0      3
6837.0      5
7031.0      1
8811.0      7
9577.0      6
9831.0      5
13004.0     5
13965.0    13
15490.0     4
17109.0     1
Name: skill_id, dtype: int64
jobs avg:
 4.3076923076923075
courses:
 doc_id
9.0       25
129.0     21
4599.0    16
6605.0    27
7136.0    27
7290.0    27
7679.0    10
7761.0    16
7829.0    27
7855.0    14
Name: skill_id, dtype: int64
courses avg:
 21.0


## Compute Some Metrics

### F1 Scores for Matching Results

Looking at how well for each document, the final matched result matches the annotations


#### Looking at Lowest Level


In [49]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.preprocessing import MultiLabelBinarizer
import warnings


warnings.filterwarnings("ignore", category=UserWarning)

In [50]:
course_anno_exp

Unnamed: 0,doc_id,extracted,level,matched1s,matched2,matched2s,unique_id,Type Level 1,Type Level 2,Type Level 3,Type Level 4,name,name3,name2,req_status,_merge
0,7679,wie Sie diese Fehler nicht nur bei anderen suc...,Beginner,,,,56.0,Sozial- & Selbstkompetenz,Kommunikation,,,Kommunikation,Kommunikation,Kommunikation,to_acquire,both
1,7679,Kommunikation so verbessern,Beginner,,,,56.0,Sozial- & Selbstkompetenz,Kommunikation,,,Kommunikation,Kommunikation,Kommunikation,to_acquire,both
2,7679,"Verstehen Sie, wie Stimmung, Körpersprache und...",Beginner,,,,56.0,Sozial- & Selbstkompetenz,Kommunikation,,,Kommunikation,Kommunikation,Kommunikation,to_acquire,both
3,7679,Kommunikationsfähigkeiten,Beginner,Sprachliche Fertigkeiten,,,56.0,Sozial- & Selbstkompetenz,Kommunikation,,,Kommunikation,Kommunikation,Kommunikation,to_acquire,both
4,7679,Nicht farbgerecht kommunizieren,Beginner,,,,,,,,,,,,to_acquire,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,4599,schaffen wir Vertrauen,Unknown,,,,35.0,Sozial- & Selbstkompetenz,Führung,,,Führung,Führung,Führung,to_acquire,both
174,4599,Zusammenarbeit fördern,Unknown,,Führung,,55.0,Sozial- & Selbstkompetenz,Kollaboration,,,Kollaboration,Kollaboration,Kollaboration,to_acquire,both
175,4599,gelingen uns echte Resultate,Unknown,,,,35.0,Sozial- & Selbstkompetenz,Führung,,,Führung,Führung,Führung,to_acquire,both
176,4599,Methoden und Werkzeuge der lateralen Führung,Unknown,,,,,,,,,,,,to_acquire,both


In [51]:
if "sep_required" in c_file_name:
    course_results_df = course_results_df[course_results_df["req_status"] == "required"]
    course_anno_exp = course_anno_exp[course_anno_exp["req_status"] == "required"]

if "sep_toacquire" in c_file_name:
    course_results_df = course_results_df[course_results_df["req_status"] != "required"]
    course_anno_exp = course_anno_exp[course_anno_exp["req_status"] != "required"]

In [52]:
print(course_results_df.req_status.value_counts())
print(course_anno_exp.req_status.value_counts())

to_acquire    349
Name: req_status, dtype: int64
to_acquire    149
Name: req_status, dtype: int64


In [53]:
def get_matched_df(results_df):
    results_df = results_df.copy()
    results_df.dropna(subset=["skill_id"], inplace=True)

    # convert skill_id to int
    results_df["skill_id"] = results_df["skill_id"].astype(int)
    return results_df


job_matched_df = get_matched_df(job_results_df)
course_matched_df = get_matched_df(course_results_df)

In [54]:
def get_acc_f1_r_p(
    doc_id, anno_df=job_anno_exp, matched_df=job_matched_df, column="name"
):
    true = list(set(anno_df[anno_df.doc_id == doc_id][column].dropna()))
    pred = list(set(matched_df[matched_df.doc_id == doc_id][column].dropna()))
    if not true or not pred:
        return 0.0, 0.0, 0.0  # or handle it in a way that makes sense for your analysis

    # print(true)
    # print(pred)
    TP = len(set(true) & set(pred))
    FP = len(set(pred) - set(true))
    FN = len(set(true) - set(pred))

    # print("TP:", TP)
    # print("FP:", FP)
    # print("FN:", FN)

    # calculate precision, recall and f1 score but handle 0 division
    try:
        precision = TP / (TP + FP)
    except ZeroDivisionError:
        precision = 0.0
    try:
        recall = TP / (TP + FN)
    except ZeroDivisionError:
        recall = 0.0
    try:
        f1 = 2 * (precision * recall) / (precision + recall)
    except ZeroDivisionError:
        f1 = 0.0

    return f1, recall, precision

In [55]:
print("JOB F1 SCORES ON MATCHED SKILLS - LOWEST LEVEL\n")

job_f1_scores = []
job_recall_scores = []
job_precision_scores = []

for job_id in job_dev_doc_ids:
    f1, recall, precision = get_acc_f1_r_p(job_id)
    job_f1_scores.append(f1)
    job_recall_scores.append(recall)
    job_precision_scores.append(precision)

print("f1:", np.mean(job_f1_scores))
print("recall:", np.mean(job_recall_scores))
print("precision:", np.mean(job_precision_scores))

JOB F1 SCORES ON MATCHED SKILLS - LOWEST LEVEL

f1: 0.15127923976608187
recall: 0.12614989177489178
precision: 0.28946886446886444


In [56]:
course_matched_df.doc_id.unique()

array([7855., 7829., 7679., 7761., 4599., 6605.,    9., 7290., 7136.,
        129.])

In [57]:
course_anno_exp.doc_id.unique()

array([7679, 7855, 7761, 7829, 7290, 7136, 6605,    9,  129, 4599])

In [58]:
print("COURSE F1 SCORES ON MATCHED SKILLS - LOWEST LEVEL\n")

course_f1_scores = []
course_recall_scores = []
course_precision_scores = []

for course_id in course_dev_doc_ids:
    f1, recall, precision = get_acc_f1_r_p(
        course_id, anno_df=course_anno_exp, matched_df=course_matched_df
    )
    course_f1_scores.append(f1)
    course_recall_scores.append(recall)
    course_precision_scores.append(precision)

print("f1:", np.mean(course_f1_scores))
print("recall:", np.mean(course_recall_scores))
print("precision:", np.mean(course_precision_scores))

COURSE F1 SCORES ON MATCHED SKILLS - LOWEST LEVEL

f1: 0.16830808080808082
recall: 0.545
precision: 0.10322510822510822


#### Looking at Level 3

In [59]:
print("JOB F1 SCORES ON MATCHED SKILLS - LEVEL 3\n")

job_f1_scores = []
job_recall_scores = []
job_precision_scores = []

for job_id in job_dev_doc_ids:
    f1, recall, precision = get_acc_f1_r_p(
        job_id, anno_df=job_anno_exp, matched_df=job_matched_df, column="name3"
    )
    job_f1_scores.append(f1)
    job_recall_scores.append(recall)
    job_precision_scores.append(precision)

print("f1:", np.mean(job_f1_scores))
print("recall:", np.mean(job_recall_scores))
print("precision:", np.mean(job_precision_scores))

JOB F1 SCORES ON MATCHED SKILLS - LEVEL 3

f1: 0.18603801169590642
recall: 0.1589556277056277
precision: 0.32815934065934066


In [60]:
course_matched_df.req_status.value_counts()

to_acquire    313
Name: req_status, dtype: int64

In [61]:
course_anno_exp.req_status.value_counts()

to_acquire    149
Name: req_status, dtype: int64

In [62]:
print("COURSE F1 SCORES ON MATCHED SKILLS - LEVEL 3\n")

course_f1_scores = []
course_recall_scores = []
course_precision_scores = []

for course_id in course_dev_doc_ids:
    f1, recall, precision = get_acc_f1_r_p(
        course_id,
        anno_df=course_anno_exp,
        matched_df=course_matched_df,
        column="name3",
    )
    course_f1_scores.append(f1)
    course_recall_scores.append(recall)
    course_precision_scores.append(precision)

print("f1:", np.mean(course_f1_scores))
print("recall:", np.mean(course_recall_scores))
print("precision:", np.mean(course_precision_scores))

COURSE F1 SCORES ON MATCHED SKILLS - LEVEL 3

f1: 0.1810064935064935
recall: 0.545
precision: 0.11345238095238094


#### Looking at Level 2

In [63]:
print("JOB F1 SCORES ON MATCHED SKILLS - LEVEL 2\n")

job_f1_scores = []
job_recall_scores = []
job_precision_scores = []

for job_id in job_dev_doc_ids:
    f1, recall, precision = get_acc_f1_r_p(
        job_id, anno_df=job_anno_exp, matched_df=job_matched_df, column="name2"
    )
    job_f1_scores.append(f1)
    job_recall_scores.append(recall)
    job_precision_scores.append(precision)

print("f1:", np.mean(job_f1_scores))
print("recall:", np.mean(job_recall_scores))
print("precision:", np.mean(job_precision_scores))

JOB F1 SCORES ON MATCHED SKILLS - LEVEL 2

f1: 0.31607142857142856
recall: 0.24791666666666667
precision: 0.5357142857142857


In [64]:
print("COURSE F1 SCORES ON MATCHED SKILLS - LEVEL 2\n")

course_f1_scores = []
course_recall_scores = []
course_precision_scores = []

for course_id in course_dev_doc_ids:
    f1, recall, precision = get_acc_f1_r_p(
        course_id,
        anno_df=course_anno_exp,
        matched_df=course_matched_df,
        column="name2",
    )
    course_f1_scores.append(f1)
    course_recall_scores.append(recall)
    course_precision_scores.append(precision)

print("f1:", np.mean(course_f1_scores))
print("recall:", np.mean(course_recall_scores))
print("precision:", np.mean(course_precision_scores))

COURSE F1 SCORES ON MATCHED SKILLS - LEVEL 2

f1: 0.2705882352941177
recall: 0.6866666666666668
precision: 0.18333333333333335


### Recall scores when we include more annotations

In [65]:
# job_anno_dev_dev["lv1s"] = # concatenate name + matched1s
job_anno_dev_lv1 = job_anno_exp_dev.copy()
job_anno_dev_lv2 = job_anno_exp_dev.copy()

keep_cols = ["name", "doc_id", "extracted", "level", "req_status"]

tolist_cols = ["name", "matched1s", "matched2", "matched2s"]
for col in tolist_cols:
    job_anno_dev_lv1[col] = job_anno_dev_lv1[col].apply(lambda x: [x] if x else [])
    job_anno_dev_lv2[col] = job_anno_dev_lv2[col].apply(lambda x: [x] if x else [])

job_anno_dev_lv1["name"] = job_anno_dev_lv1["name"] + job_anno_dev_lv1["matched1s"]
job_anno_dev_lv2["name"] = (
    job_anno_dev_lv2["name"]
    + job_anno_dev_lv2["matched1s"]
    + job_anno_dev_lv2["matched2"]
    + job_anno_dev_lv2["matched2s"]
)

job_anno_dev_lv1 = job_anno_dev_lv1[keep_cols]
job_anno_dev_lv2 = job_anno_dev_lv2[keep_cols]

# name and matched1s are strings, I wantt to join them into the same list
# job_anno_dev_lv1["lvs"] = job_anno_dev_lv1["lvs"].apply(lambda x: (x))
# job_anno_dev_lv2["lvs"] = job_anno_dev_lv2["lvs"].apply(lambda x: (x))

# make each item in the list in lvs column into it's own row
job_anno_dev_lv1 = job_anno_dev_lv1.explode("name").reset_index(drop=True)
print(job_anno_dev_lv1.shape)
job_anno_dev_lv2 = job_anno_dev_lv2.explode("name").reset_index(drop=True)
print(job_anno_dev_lv2.shape)

(124, 5)
(151, 5)


In [66]:
# course_anno_dev_dev["lv1s"] = # concatenate name + matched1s
course_anno_dev_lv1 = course_anno_exp_dev.copy()
course_anno_dev_lv2 = course_anno_exp_dev.copy()

keep_cols = ["name", "doc_id", "extracted", "level", "req_status"]

tolist_cols = ["name", "matched1s", "matched2", "matched2s"]
for col in tolist_cols:
    course_anno_dev_lv1[col] = course_anno_dev_lv1[col].apply(
        lambda x: [x] if x else []
    )
    course_anno_dev_lv2[col] = course_anno_dev_lv2[col].apply(
        lambda x: [x] if x else []
    )

course_anno_dev_lv1["name"] = (
    course_anno_dev_lv1["name"] + course_anno_dev_lv1["matched1s"]
)
course_anno_dev_lv2["name"] = (
    course_anno_dev_lv2["name"]
    + course_anno_dev_lv2["matched1s"]
    + course_anno_dev_lv2["matched2"]
    + course_anno_dev_lv2["matched2s"]
)

course_anno_dev_lv1 = course_anno_dev_lv1[keep_cols]
course_anno_dev_lv2 = course_anno_dev_lv2[keep_cols]

# name and matched1s are strings, I wantt to join them into the same list
# course_anno_dev_lv1["lvs"] = course_anno_dev_lv1["lvs"].apply(lambda x: (x))
# course_anno_dev_lv2["lvs"] = course_anno_dev_lv2["lvs"].apply(lambda x: (x))

# make each item in the list in lvs column into it's own row
course_anno_dev_lv1 = course_anno_dev_lv1.explode("name").reset_index(drop=True)
print(course_anno_dev_lv1.shape)
course_anno_dev_lv2 = course_anno_dev_lv2.explode("name").reset_index(drop=True)
print(course_anno_dev_lv2.shape)

(103, 5)
(117, 5)


In [67]:
for job_id in job_dev_doc_ids:
    f1, recall, precision = get_acc_f1_r_p(
        job_id,
        anno_df=job_anno_dev_lv1,
        matched_df=job_results_dev_df,
        column="name",
    )
    job_f1_scores.append(f1)
    job_recall_scores.append(recall)
    job_precision_scores.append(precision)

print("JOB RECALL ALL BEST MATCHES\n")
# print("f1:", np.mean(job_f1_scores))
print("recall:", np.mean(job_recall_scores))
# print("precision:", np.mean(job_precision_scores))

JOB RECALL ALL BEST MATCHES

recall: 0.18506944444444445


In [68]:
for job_id in job_dev_doc_ids:
    f1, recall, precision = get_acc_f1_r_p(
        job_id,
        anno_df=job_anno_dev_lv2,
        matched_df=job_results_dev_df,
        column="name",
    )
    job_f1_scores.append(f1)
    job_recall_scores.append(recall)
    job_precision_scores.append(precision)

print("JOB RECALL BEST AND BACKUP MATCHES\n")
# print("f1:", np.mean(job_f1_scores))
print("recall:", np.mean(job_recall_scores))
# print("precision:", np.mean(job_precision_scores))

JOB RECALL BEST AND BACKUP MATCHES

recall: 0.16077279202279202


In [69]:
for course_id in course_dev_doc_ids:
    f1, recall, precision = get_acc_f1_r_p(
        course_id,
        anno_df=course_anno_dev_lv1,
        matched_df=course_results_dev_df,
        column="name",
    )
    course_f1_scores.append(f1)
    course_recall_scores.append(recall)
    course_precision_scores.append(precision)

print("COURSE RECALL ALL BEST MATCHES\n")
# print("f1:", np.mean(course_f1_scores))
print("recall:", np.mean(course_recall_scores))
# print("precision:", np.mean(course_precision_scores))

COURSE RECALL ALL BEST MATCHES

recall: 0.5807792207792207


In [70]:
for course_id in course_dev_doc_ids:
    f1, recall, precision = get_acc_f1_r_p(
        course_id,
        anno_df=course_anno_dev_lv2,
        matched_df=course_results_dev_df,
        column="name",
    )
    course_f1_scores.append(f1)
    course_recall_scores.append(recall)
    course_precision_scores.append(precision)

print("COURSE RECALL BEST AND BACKUP MATCHES\n")
# print("f1:", np.mean(course_f1_scores))
print("recall:", np.mean(course_recall_scores))
# print("precision:", np.mean(course_precision_scores))

COURSE RECALL BEST AND BACKUP MATCHES

recall: 0.541948051948052


### F1 Scores on Extraction Results


In [71]:
# export job and course data to csv
job_data.to_csv("anno_job_data.csv", index=False)
course_data.to_csv("anno_course_data.csv", index=False)

In [72]:
print("num_job_ids:", len(job_data["id"].unique()))
print("num_course_ids:", len(course_data["id"].unique()))

num_job_ids: 15
num_course_ids: 10


In [73]:
from nltk.tokenize import word_tokenize
from fuzzywuzzy import process
from fuzzywuzzy import fuzz
import nltk

nltk.download("punkt")


def find_span_in_text(extracted_span, full_text):
    # Use fuzzy matching to find the closest match in the text
    # Returns the start and end positions of the match
    closest_match, _ = process.extractOne(
        extracted_span, full_text.split("."), scorer=fuzz.token_set_ratio
    )
    # print(closest_match)
    start = full_text.find(closest_match)
    end = start + len(closest_match)
    return start, end


def tokenize_and_tag(full_text, spans, label_prefix):
    tokens = word_tokenize(full_text)
    tags = ["O"] * len(tokens)

    # Function to update tags for a span
    def update_tags_for_span(span_start, span_end):
        token_index = 0
        for i, token in enumerate(tokens):
            token_start = full_text.find(token, token_index)
            token_end = token_start + len(token)
            token_index = token_end
            if token_start >= span_end:
                break
            if span_start <= token_start < span_end:
                tags[i] = (
                    f"B-{label_prefix}"
                    if token_start == span_start
                    else f"I-{label_prefix}"
                )

    # Update tags for spans
    for span_text in spans:
        span_start, span_end = find_span_in_text(span_text, full_text)
        update_tags_for_span(span_start, span_end)

    return list(zip(tokens, tags))

[nltk_data] Downloading package punkt to /Users/annadai/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [74]:
def process_data(ids, data, results_df, anno_df):
    """Process data for seqeval"""
    tagged_data = {}

    for doc_id in ids:
        full_text = data[data["id"] == doc_id]["fulltext"].values[0]
        extracted_spans = results_df[results_df["doc_id"] == doc_id]["extracted"].values
        annotated_spans = anno_df[anno_df["doc_id"] == doc_id]["extracted"].values

        # Tagging the annotated spans
        tagged_annotated = tokenize_and_tag(full_text, annotated_spans, "ANNOTATED")

        # Tagging the extracted spans
        tagged_extracted = tokenize_and_tag(full_text, extracted_spans, "EXTRACTED")

        # Combine the tagged data for seqeval
        tagged_data[doc_id] = (tagged_annotated, tagged_extracted)
    return tagged_data


def get_seqeval_metrics(tagged_data):
    true_labels = []
    pred_labels = []

    for doc_id, (tagged_annotated, tagged_extracted) in tagged_data.items():
        doc_true_labels = []
        doc_pred_labels = []

        # Process annotated spans
        for token, tag in tagged_annotated:
            label = (
                "B-SKILL"
                if "B-ANNOTATED" in tag
                else "I-SKILL"
                if "I-ANNOTATED" in tag
                else "O"
            )
            doc_true_labels.append(label)

        # Process extracted spans
        for token, tag in tagged_extracted:
            label = (
                "B-SKILL"
                if "B-EXTRACTED" in tag
                else "I-SKILL"
                if "I-EXTRACTED" in tag
                else "O"
            )
            doc_pred_labels.append(label)

        true_labels.append(doc_true_labels)
        pred_labels.append(doc_pred_labels)

    # Import seqeval metrics
    from seqeval.metrics import f1_score as seqeval_f1
    from seqeval.metrics import precision_score as seqeval_precision
    from seqeval.metrics import recall_score as seqeval_recall

    # print("true_labels:", true_labels)
    # print("pred_labels:", pred_labels)

    # Calculate metrics
    f1 = seqeval_f1(true_labels, pred_labels, average="micro")
    precision = seqeval_precision(true_labels, pred_labels, average="micro")
    recall = seqeval_recall(true_labels, pred_labels, average="micro")

    # Print results
    return f1, precision, recall

In [75]:
print("JOB F1 SCORES ON EXTRACTIONS SEQEVAL\n")

job_processed_data = process_data(
    job_dev_doc_ids, job_data, job_results_dev_df, job_anno_exp_dev
)
f1, precision, recall = get_seqeval_metrics(job_processed_data)

print("f1:", f1)
print("recall:", recall)
print("precision:", precision)

JOB F1 SCORES ON EXTRACTIONS SEQEVAL

f1: 0.41379310344827586
recall: 0.41379310344827586
precision: 0.41379310344827586


In [76]:
print("COURSE F1 SCORES ON EXTRACTIONS SEQEVAL\n")

course_processed_data = process_data(
    course_dev_doc_ids, course_data, course_results_dev_df, course_anno_exp_dev
)

f1, precision, recall = get_seqeval_metrics(course_processed_data)

print("f1:", f1)
print("recall:", recall)
print("precision:", precision)

COURSE F1 SCORES ON EXTRACTIONS SEQEVAL

f1: 0.660377358490566
recall: 0.875
precision: 0.5303030303030303


### Scores on Level Correctness

#### Extracted Levels
For any correct extraction, look at how many levels are correct

#### Matched Levels
For any correct matches, look at how many levels are correct

In [77]:
def get_correct_matches(
    doc_id, anno_df=job_anno_exp, matched_df=job_results_df, column="name"
):
    true_lab = list(set(anno_df[anno_df.doc_id == doc_id][column].dropna()))
    pred_lab = list(set(matched_df[matched_df.doc_id == doc_id][column].dropna()))

    correct_true = set(true_lab) & set(pred_lab)

    return correct_true

In [78]:
job_correct_matches = []

for job_id in job_dev_doc_ids:
    correct = get_correct_matches(job_id)
    job_correct_matches.append({"doc_id": job_id, "correct": correct})

job_correct_matches = pd.DataFrame(job_correct_matches).explode("correct").dropna()

course_correct_matches = []

for course_id in course_dev_doc_ids:
    correct = get_correct_matches(
        course_id, anno_df=course_anno_exp, matched_df=course_results_df
    )
    course_correct_matches.append({"doc_id": course_id, "correct": correct})

course_correct_matches = (
    pd.DataFrame(course_correct_matches).explode("correct").dropna()
)

In [79]:
# job_correct_matches_df = job_correct_matches merged with job_anno_exp on two columns (doc_id and correct)
job_correct_matches_df = (
    job_correct_matches.merge(
        job_anno_exp[["doc_id", "name", "level"]],
        how="left",
        left_on=["doc_id", "correct"],
        right_on=["doc_id", "name"],
    )
    .drop(columns=["name"])
    .rename(columns={"level": "anno_level"})
)

In [80]:
job_correct_matches_df = (
    job_correct_matches_df.merge(
        job_results_df[["doc_id", "name", "level"]],
        how="left",
        left_on=["doc_id", "correct"],
        right_on=["doc_id", "name"],
    )
    .drop(columns=["name"])
    .rename(columns={"level": "matched_level"})
)

In [81]:
# lower case all anno_level
job_correct_matches_df["anno_level"] = job_correct_matches_df["anno_level"].str.lower()
job_correct_matches_df["matched_level"] = job_correct_matches_df[
    "matched_level"
].str.lower()

print("JOB\nLEVEL F1 ON MATCHED SKILLS - LOWEST LEVEL\n")

print(
    f1_score(
        job_correct_matches_df["anno_level"],
        job_correct_matches_df["matched_level"],
        average="micro",
    ),
    accuracy_score(
        job_correct_matches_df["anno_level"], job_correct_matches_df["matched_level"]
    ),
)

JOB
LEVEL F1 ON MATCHED SKILLS - LOWEST LEVEL

0.0 0.0


In [82]:
# # look at mismatched levels
# job_correct_matches_df[
#     job_correct_matches_df["anno_level"] != job_correct_matches_df["matched_level"]
# ]

In [83]:
course_correct_matches_df = (
    course_correct_matches.merge(
        course_anno_exp[["doc_id", "name", "level"]],
        how="left",
        left_on=["doc_id", "correct"],
        right_on=["doc_id", "name"],
    )
    .drop(columns=["name"])
    .rename(columns={"level": "anno_level"})
)

course_correct_matches_df = (
    course_correct_matches_df.merge(
        course_results_df[["doc_id", "name", "level"]],
        how="left",
        left_on=["doc_id", "correct"],
        right_on=["doc_id", "name"],
    )
    .drop(columns=["name"])
    .rename(columns={"level": "matched_level"})
)
# drop nan
course_correct_matches_df = course_correct_matches_df.dropna()

# lower case all anno_level
course_correct_matches_df["anno_level"] = course_correct_matches_df[
    "anno_level"
].str.lower()

course_correct_matches_df["matched_level"] = course_correct_matches_df[
    "matched_level"
].str.lower()

In [84]:
course_correct_matches_df
print("COURSES")
print("LEVEL F1 ON MATCHED SKILLS - LOWEST LEVEL\n")

print(
    "f1:",
    f1_score(
        course_correct_matches_df["anno_level"],
        course_correct_matches_df["matched_level"],
        average="micro",
    ),
    precision_score(
        course_correct_matches_df["anno_level"],
        course_correct_matches_df["matched_level"],
        average="micro",
    ),
    recall_score(
        course_correct_matches_df["anno_level"],
        course_correct_matches_df["matched_level"],
        average="micro",
    ),
    accuracy_score(
        course_correct_matches_df["anno_level"],
        course_correct_matches_df["matched_level"],
    ),
)

COURSES
LEVEL F1 ON MATCHED SKILLS - LOWEST LEVEL

f1: 0.296875 0.296875 0.296875 0.296875


In [85]:
course_correct_matches_df

Unnamed: 0,doc_id,correct,anno_level,matched_level
0,7855,Resilienz,unknown,unknown
1,7855,Resilienz,unknown,unknown
2,7855,Resilienz,unknown,unknown
3,7855,Resilienz,unknown,unknown
4,7855,Resilienz,unknown,unknown
...,...,...,...,...
59,9,Management der ICT,unknown,unknown
60,9,Management der ICT,unknown,unknown
61,9,Management der ICT,unknown,unknown
62,9,Architekturen,unknown,expert


In [86]:
# # look at mismatched levels
# course_correct_matches_df[
#     course_correct_matches_df["anno_level"]
#     != course_correct_matches_df["matched_level"]
# ]

### Scores on Optionality Correctness (Jobs Only)

#### Extracted Levels
For any correct extraction, look at how many optionality are correct

#### Matched Levels
For any correct matches, look at how many optionality are correct

In [87]:
job_correct_matches_opt = (
    job_correct_matches.merge(
        job_anno_exp[["doc_id", "name", "req_status"]],
        how="left",
        left_on=["doc_id", "correct"],
        right_on=["doc_id", "name"],
    )
    .drop(columns=["name"])
    .rename(columns={"req_status": "anno_req_status"})
)

In [88]:
job_correct_matches_opt = (
    job_correct_matches_opt.merge(
        job_results_df[["doc_id", "name", "req_status"]],
        how="left",
        left_on=["doc_id", "correct"],
        right_on=["doc_id", "name"],
    )
    .drop(columns=["name"])
    .rename(columns={"req_status": "matched_req_status"})
)

In [89]:
# lower case all anno_level
job_correct_matches_opt["anno_req_status"] = job_correct_matches_opt[
    "anno_req_status"
].str.lower()
job_correct_matches_opt["matched_req_status"] = job_correct_matches_opt[
    "matched_req_status"
].str.lower()

# convert mandatory to required
job_correct_matches_opt["matched_req_status"] = job_correct_matches_opt[
    "matched_req_status"
].apply(lambda x: "required" if x == "mandatory" else x)


print("OPT F1 ON MATCHED SKILLS - LOWEST LEVEL\n")
print(
    "f1:",
    f1_score(
        job_correct_matches_opt["anno_req_status"],
        job_correct_matches_opt["matched_req_status"],
        average="micro",
    ),
    precision_score(
        job_correct_matches_opt["anno_req_status"],
        job_correct_matches_opt["matched_req_status"],
        average="micro",
    ),
    recall_score(
        job_correct_matches_opt["anno_req_status"],
        job_correct_matches_opt["matched_req_status"],
        average="micro",
    ),
    accuracy_score(
        job_correct_matches_opt["anno_req_status"],
        job_correct_matches_opt["matched_req_status"],
    ),
)

OPT F1 ON MATCHED SKILLS - LOWEST LEVEL



ValueError: Classification metrics can't handle a mix of multiclass and unknown targets

In [None]:
job_correct_matches_opt

Unnamed: 0,doc_id,correct,anno_req_status,matched_req_status
0,13965,Analysieren von Daten oder Informationen,required,required
1,13965,Problemlösungsfähigkeit,required,required
2,13965,Problemlösungsfähigkeit,required,required
3,13965,Problemlösungsfähigkeit,required,required
4,13965,Problemlösungsfähigkeit,required,required
5,13965,Architekturen,required,required
6,13965,Architekturen,required,required
7,9577,Business- & Requirements-Engineering,unknown,required
8,9577,Business- & Requirements-Engineering,unknown,required
9,9577,Business- & Requirements-Engineering,unknown,required


### Evaluation of candidate selection

See if the ground truth is in the top n candidates for each method - complicated bc we need skill-level match for this
