# Evaluations on Marco's Annotations

We first read in the annotations from Marco's files, run the pipeline of those annotations using GPT3.5 and GPT4-turbo, and then evaluate the results.

## Functions

In [107]:
import os
import sys
import evaluate
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns

In [108]:
def extract_skills(document):
    """Get extracted skills from each sentence in a document from the detailed output"""
    # extracted_skills_count = 0
    # matched_skills_count = 0
    matched_skills = []
    existing_skills = set()

    for sentences in document:
        # create zip dict of sentences["extracted_skills"] and sentences["extracted_skills_levels"]
        skill_level_dict = dict(
            zip(
                sentences.get("extracted_skills", [""]),
                sentences.get("extracted_skills_levels", [""]),
            )
        )
        # extracted_skills_count += len(skill_level_dict)
        matched_skill = sentences.get("matched_skills", {})
        # matched_skills_count += len(matched_skill)
        for skill_name, skill_info in matched_skill.items():
            unique_id = skill_info["unique_id"]
            skill_level = skill_level_dict[skill_name]
            if unique_id not in existing_skills:
                matched_skills.append((unique_id, skill_level, skill_name))
                existing_skills.add(unique_id)

    return matched_skills  # , extracted_skills_count, matched_skills_count


def get_matched_skills(data):
    """get relevant information on matched skills from the detailed output"""
    clean_output_dict = {}
    # extract_skills_dict = {}
    # matched_skills_dict = {}

    for doc_id, doc_data in data.items():
        if isinstance(doc_data, list):
            # If the data is a list, directly extract matched skills
            clean_output_dict[doc_id] = extract_skills(doc_data)
            # clean_output_dict[doc_id], extract_skills_dict[doc_id], matched_skills_dict[doc_id] = extract_skills(doc_data)
        elif isinstance(doc_data, dict):
            # If the data is a dictionary, check for nested keys
            nested_keys = ["required", "to_acquire"]
            clean_output_dict[doc_id] = {}
            # extract_skills_dict[doc_id] = {}
            # matched_skills_dict[doc_id] = {}
            for key in nested_keys:
                key_dict = {}
                # ext_dict = {}
                # match_dict = {}
                if key in doc_data:
                    key_dict[key] = extract_skills(doc_data[key])
                    # key_dict[key], ext_dict[key], match_dict[key] = extract_skills(doc_data[key])
                    clean_output_dict[doc_id].update(key_dict)
                    # extract_skills_dict[doc_id].update(ext_dict)
                    # matched_skills_dict[doc_id].update(match_dict)

    return clean_output_dict  # , extract_skills_dict, matched_skills_dict


def get_level_3(row):
    """
    Returns the lowest level of the taxonomy that is not NaN in each
    """
    for level in ["Type Level 3", "Type Level 2", "Type Level 1"]:
        value = row[level]
        if not pd.isna(value):
            return value


def get_level_2(row):
    """
    Returns the lowest level of the taxonomy that is not NaN in each
    """
    for level in ["Type Level 2", "Type Level 1"]:
        value = row[level]
        if not pd.isna(value):
            return value

## Read in the Annotations

In [109]:
# read in json file
df = pd.read_json("../../data/annotation/anno_matching.json")

In [110]:
drop_cols = ["doc_type", "sample_num"]
job_anno = df[df["doc_type"] == "job"].drop(columns=drop_cols)
course_anno = df[df["doc_type"] == "course"].drop(columns=drop_cols)

In [111]:
job_ids = job_anno["doc_id"].astype(str)
course_ids = course_anno["doc_id"].astype(str)

# output ids as txt files
with open("anno_job_ids.txt", "w") as f:
    for id in job_ids:
        f.write(id + "\n")

with open("anno_course_ids.txt", "w") as f:
    for id in course_ids:
        f.write(id + "\n")

job_ids = job_ids.astype(int)
course_ids = course_ids.astype(int)

In [112]:
# !python ../skillExtract/pipeline_jobs_courses.py --num-sentences 2 --do-extraction --detailed --max_tokens 2000 --do-matching --prompt_type wlevels --language de --datapath ../data/processed/job_evl_all.csv --candidates_method mixed --max_candidates 3 --ids evaluation/anno_job_ids.txt
# !python ../skillExtract/pipeline_jobs_courses.py --num-sentences 2 --do-extraction --detailed --max_tokens 2000 --do-matching --prompt_type wlevels --language de --datapath ../data/processed/course_evl_all.csv --candidates_method mixed --max_candidates 3 --ids evaluation/anno_course_ids.txt

In [113]:
taxonomy = pd.read_csv("../../data/taxonomy/taxonomy_V4.csv")
taxonomy.drop(columns=["name+definition"], inplace=True)
# drop duplicates
taxonomy.drop_duplicates(subset=["unique_id", "name"], inplace=True)

In [114]:
# taxonomy_old = pd.read_csv("../../data/taxonomy/taxonomy_V4_simple.csv")
# # check that the unique_id and name are the same
# assert taxonomy["unique_id"].equals(taxonomy_old["unique_id"])
# assert taxonomy["name"].equals(taxonomy_old["name"])

In [115]:
job_anno_exp = job_anno.explode("extraction").reset_index(drop=True)
course_anno_exp = course_anno.explode("extraction").reset_index(drop=True)

In [116]:
_temp_df = pd.json_normalize(job_anno_exp["extraction"])
job_anno_exp = pd.concat([job_anno_exp, _temp_df], axis=1)

_temp_df = pd.json_normalize(course_anno_exp["extraction"])
course_anno_exp = pd.concat([course_anno_exp, _temp_df], axis=1)

In [117]:
rename_dict = {
    "doc_id": "doc_id",
    "text": "extracted",
    "label.text": "level",
    "req_status": "req_status",
    "match_1": "matched",
    # "match_1s": "matched1",
}  # keeping only the first match for now

job_anno_exp = job_anno_exp.rename(columns=rename_dict)
course_anno_exp = course_anno_exp.rename(columns=rename_dict)

# keep only columns in rename_dict value
job_anno_exp = job_anno_exp[rename_dict.values()]
course_anno_exp = course_anno_exp[rename_dict.values()]

In [118]:
job_anno_exp

Unnamed: 0,doc_id,extracted,level,req_status,matched
0,15490,Selbständigkeit,Unknown,Unknown,Unabhängigkeit
1,15490,Zuverlässigkeit,Unknown,,
2,15490,Freude am Umgang mit anderen Menschen,Unknown,,
3,15490,Teamorientiert,Unknown,,
4,15490,Schnelle Auffassungsgabe,Unknown,,
...,...,...,...,...,...
207,6837,kundenorientiertes Denken,Unknown,Unknown,Kundenorientierung
208,6837,Qualitätsbewusstsein,Unknown,Unknown,Gewissenhaftigkeit
209,6837,interessiert Neues zu lernen,Unknown,Unknown,Lernbereitschaft
210,6837,Freude im Umgang mit Kunden,Unknown,Unknown,Kundenorientierung


In [119]:
job_anno_exp = job_anno_exp.merge(
    taxonomy, how="left", left_on="matched", right_on="name"
).drop(columns=["matched"])
course_anno_exp = course_anno_exp.merge(
    taxonomy, how="left", left_on="matched", right_on="name"
).drop(columns=["matched"])

## Read in the Extraction & Matching results

In [120]:
c_file_name = "course_gpt-3.5-turbo_ids_2sent_n10_mixed_V4_detailed_anno.json"
j_file_name = "job_gpt-3.5-turbo_ids_2sent_n10_mixed_V4_detailed_anno.json"
path = "../results/"

with open(path + c_file_name, "r") as f:
    course_results = json.load(f)

with open(path + j_file_name, "r") as f:
    job_results = json.load(f)

In [121]:
job_matched = get_matched_skills(job_results)
course_matched = get_matched_skills(course_results)

In [122]:
processed_data = []
for doc_id, categories in course_matched.items():
    for req_status, skills in categories.items():
        for skill_id, level, extracted in skills:
            processed_data.append(
                {
                    "doc_id": doc_id,
                    "skill_id": skill_id,
                    "level": level,
                    "extracted": extracted,
                    "req_status": req_status,
                }
            )

# Create DataFrame
course_matched_df = pd.DataFrame(processed_data)


processed_data = []
for doc_id, annotations in job_matched.items():
    for skill_id, level, extracted in annotations:
        processed_data.append(
            {
                "doc_id": doc_id,
                "skill_id": skill_id,
                "level": level,
                "extracted": extracted,
            }
        )

# Create DataFrame
job_matched_df = pd.DataFrame(processed_data)

In [123]:
job_matched_df = job_matched_df.merge(
    taxonomy, how="left", left_on="skill_id", right_on="unique_id"
).drop(columns=["unique_id"])


course_matched_df = course_matched_df.merge(
    taxonomy, how="left", left_on="skill_id", right_on="unique_id"
).drop(columns=["unique_id"])

# convert both ids to int
job_matched_df["doc_id"] = job_matched_df["doc_id"].astype(int)
course_matched_df["doc_id"] = course_matched_df["doc_id"].astype(int)
job_matched_df["skill_id"] = job_matched_df["skill_id"].astype(int)
course_matched_df["skill_id"] = course_matched_df["skill_id"].astype(int)

## Compute Some Metrics

### F1 Scores for Matching Results

Looking at how well for each document, the final matched result matches the annotations

#### Looking at Lowest Level

In [124]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from seqeval.metrics import f1_score as seqeval_f1
from sklearn.preprocessing import MultiLabelBinarizer
import warnings


warnings.filterwarnings("ignore", category=UserWarning)

In [125]:
def get_acc_f1_r_p(
    doc_id, anno_df=job_anno_exp, matched_df=job_matched_df, column="name"
):
    true = list(set(anno_df[anno_df.doc_id == doc_id][column].dropna()))
    pred = list(set(matched_df[matched_df.doc_id == doc_id][column].dropna()))
    if not true or not pred:
        return 0.0, 0.0, 0.0  # or handle it in a way that makes sense for your analysis

    # one-hot encode
    mlb = MultiLabelBinarizer()
    true = mlb.fit_transform([true])
    pred = mlb.transform([pred])

    f1 = f1_score(true, pred, average="micro")
    recall = recall_score(true, pred, average="micro")
    precision = precision_score(true, pred, average="micro")

    return f1, recall, precision

In [126]:
print("JOB F1 SCORES ON MATCHED SKILLS - LOWEST LEVEL\n")

job_f1_scores = []
job_recall_scores = []
job_precision_scores = []

for job_id in job_ids:
    f1, recall, precision = get_acc_f1_r_p(job_id)
    job_f1_scores.append(f1)
    job_recall_scores.append(recall)
    job_precision_scores.append(precision)

print("f1:", np.mean(job_f1_scores))
print("recall:", np.mean(job_recall_scores))
print("precision:", np.mean(job_precision_scores))

JOB F1 SCORES ON MATCHED SKILLS - LOWEST LEVEL

f1: 0.4341832677126795
recall: 0.31186628186628185
precision: 0.8


In [127]:
print("COURSE F1 SCORES ON MATCHED SKILLS - LOWEST LEVEL\n")

course_f1_scores = []
course_recall_scores = []
course_precision_scores = []

for course_id in course_ids:
    f1, recall, precision = get_acc_f1_r_p(
        course_id, anno_df=course_anno_exp, matched_df=course_matched_df
    )
    course_f1_scores.append(f1)
    course_recall_scores.append(recall)
    course_precision_scores.append(precision)

print("f1:", np.mean(course_f1_scores))
print("recall:", np.mean(course_recall_scores))
print("precision:", np.mean(course_precision_scores))

COURSE F1 SCORES ON MATCHED SKILLS - LOWEST LEVEL

f1: 0.2727272727272727
recall: 0.20714285714285713
precision: 0.4


#### Looking at Level 3

In [128]:
taxonomy["name3"] = taxonomy.apply(get_level_3, axis=1)
taxonomy["name2"] = taxonomy.apply(get_level_2, axis=1)

In [129]:
# merge in name2 and name3 to matched and anno dfs on name

job_matched_df = job_matched_df.merge(
    taxonomy[["name", "name2", "name3"]], how="left", on="name"
)
job_anno_exp = job_anno_exp.merge(
    taxonomy[["name", "name2", "name3"]], how="left", on="name"
)

course_matched_df = course_matched_df.merge(
    taxonomy[["name", "name2", "name3"]], how="left", on="name"
)
course_anno_exp = course_anno_exp.merge(
    taxonomy[["name", "name2", "name3"]], how="left", on="name"
)

In [130]:
print("JOB F1 SCORES ON MATCHED SKILLS - LEVEL 3\n")

job_f1_scores = []
job_recall_scores = []
job_precision_scores = []

for job_id in job_ids:
    f1, recall, precision = get_acc_f1_r_p(
        job_id, anno_df=job_anno_exp, matched_df=job_matched_df, column="name3"
    )
    job_f1_scores.append(f1)
    job_recall_scores.append(recall)
    job_precision_scores.append(precision)

print("f1:", np.mean(job_f1_scores))
print("recall:", np.mean(job_recall_scores))
print("precision:", np.mean(job_precision_scores))

JOB F1 SCORES ON MATCHED SKILLS - LEVEL 3

f1: 0.45825623178564356
recall: 0.3364069264069265
precision: 0.8


In [131]:
print("COURSE F1 SCORES ON MATCHED SKILLS - LEVEL 3\n")

course_f1_scores = []
course_recall_scores = []
course_precision_scores = []

for course_id in course_ids:
    f1, recall, precision = get_acc_f1_r_p(
        course_id, anno_df=course_anno_exp, matched_df=course_matched_df, column="name3"
    )
    course_f1_scores.append(f1)
    course_recall_scores.append(recall)
    course_precision_scores.append(precision)

print("f1:", np.mean(course_f1_scores))
print("recall:", np.mean(course_recall_scores))
print("precision:", np.mean(course_precision_scores))

COURSE F1 SCORES ON MATCHED SKILLS - LEVEL 3

f1: 0.2727272727272727
recall: 0.20714285714285713
precision: 0.4


#### Looking at Level 2

In [132]:
print("JOB F1 SCORES ON MATCHED SKILLS - LEVEL 2\n")

job_f1_scores = []
job_recall_scores = []
job_precision_scores = []

for job_id in job_ids:
    f1, recall, precision = get_acc_f1_r_p(
        job_id, anno_df=job_anno_exp, matched_df=job_matched_df, column="name2"
    )
    job_f1_scores.append(f1)
    job_recall_scores.append(recall)
    job_precision_scores.append(precision)

print("f1:", np.mean(job_f1_scores))
print("recall:", np.mean(job_recall_scores))
print("precision:", np.mean(job_precision_scores))

JOB F1 SCORES ON MATCHED SKILLS - LEVEL 2

f1: 0.650312476194829
recall: 0.5422222222222222
precision: 0.8666666666666667


In [133]:
print("COURSE F1 SCORES ON MATCHED SKILLS - LEVEL 2\n")

course_f1_scores = []
course_recall_scores = []
course_precision_scores = []

for course_id in course_ids:
    f1, recall, precision = get_acc_f1_r_p(
        course_id, anno_df=course_anno_exp, matched_df=course_matched_df, column="name2"
    )
    course_f1_scores.append(f1)
    course_recall_scores.append(recall)
    course_precision_scores.append(precision)

print("f1:", np.mean(course_f1_scores))
print("recall:", np.mean(course_recall_scores))
print("precision:", np.mean(course_precision_scores))

JOB F1 SCORES ON MATCHED SKILLS - LEVEL 2

f1: 0.43499999999999994
recall: 0.3975
precision: 0.5


### F1 Scores on Extraction Results


In [179]:
# bring back document text for each doc_id
course_data = pd.read_csv("../../data/processed/course_evl_de.csv")
job_data = pd.read_csv("../../data/processed/job_evl_all.csv")

# filter to only ids in anno df
course_data = course_data[course_data["id"].isin(course_ids.astype(str))]
job_data = job_data[job_data["id"].astype(str).isin(job_ids.astype(str))]

job_data["id"] = job_data["id"].astype(int)
course_data.id = course_data.id.astype(int)

In [170]:
print("num_job_ids:", len(job_data["id"].unique()))
print("num_course_ids:", len(course_data["id"].unique()))

num_job_ids: 15
num_course_ids: 10


In [171]:
from nltk.tokenize import word_tokenize
from fuzzywuzzy import process
from fuzzywuzzy import fuzz
import nltk

nltk.download("punkt")


def find_span_in_text(extracted_span, full_text):
    # Use fuzzy matching to find the closest match in the text
    # Returns the start and end positions of the match
    closest_match, _ = process.extractOne(
        extracted_span, full_text.split("."), scorer=fuzz.token_set_ratio
    )
    start = full_text.find(closest_match)
    end = start + len(closest_match)
    return start, end


def tokenize_and_tag(full_text, spans, label_prefix):
    tokens = word_tokenize(full_text)
    tags = ["O"] * len(tokens)

    # Function to update tags for a span
    def update_tags_for_span(span_start, span_end):
        token_index = 0
        for i, token in enumerate(tokens):
            token_start = full_text.find(token, token_index)
            token_end = token_start + len(token)
            token_index = token_end
            if token_start >= span_end:
                break
            if span_start <= token_start < span_end:
                tags[i] = (
                    f"B-{label_prefix}"
                    if token_start == span_start
                    else f"I-{label_prefix}"
                )

    # Update tags for spans
    for span_text in spans:
        span_start, span_end = find_span_in_text(span_text, full_text)
        update_tags_for_span(span_start, span_end)

    return list(zip(tokens, tags))

[nltk_data] Downloading package punkt to /Users/annadai/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [180]:
def process_data(ids, data, matched_df, anno_df):
    """Process data for seqeval"""
    tagged_data = {}

    for doc_id in ids:
        full_text = data[data["id"] == doc_id]["fulltext"].values[0]
        annotated_spans = matched_df[matched_df["doc_id"] == doc_id]["extracted"].values
        extracted_spans = anno_df[anno_df["doc_id"] == doc_id]["extracted"].values

        # Tagging the annotated spans
        tagged_annotated = tokenize_and_tag(full_text, annotated_spans, "ANNOTATED")

        # Tagging the extracted spans
        tagged_extracted = tokenize_and_tag(full_text, extracted_spans, "EXTRACTED")

        # Combine the tagged data for seqeval
        tagged_data[doc_id] = (tagged_annotated, tagged_extracted)
    return tagged_data


def get_seqeval_metrics(tagged_data):
    true_labels = []
    pred_labels = []

    for doc_id, (tagged_annotated, tagged_extracted) in tagged_data.items():
        doc_true_labels = []
        doc_pred_labels = []

        # Process annotated spans
        for token, tag in tagged_annotated:
            label = (
                "B-SKILL"
                if "B-ANNOTATED" in tag
                else "I-SKILL"
                if "I-ANNOTATED" in tag
                else "O"
            )
            doc_true_labels.append(label)

        # Process extracted spans
        for token, tag in tagged_extracted:
            label = (
                "B-SKILL"
                if "B-EXTRACTED" in tag
                else "I-SKILL"
                if "I-EXTRACTED" in tag
                else "O"
            )
            doc_pred_labels.append(label)

        true_labels.append(doc_true_labels)
        pred_labels.append(doc_pred_labels)

    # Import seqeval metrics
    from seqeval.metrics import precision_score, recall_score, f1_score

    # Calculate metrics
    f1 = f1_score(true_labels, pred_labels, average="micro")
    precision = precision_score(true_labels, pred_labels, average="micro")
    recall = recall_score(true_labels, pred_labels, average="micro")

    # Print results
    return f1, precision, recall

In [181]:
print("JOB F1 SCORES ON EXTRACTIONS SEQEVAL\n")

job_processed_data = process_data(job_ids, job_data, job_matched_df, job_anno_exp)
f1, precision, recall = get_seqeval_metrics(job_processed_data)

print("f1:", f1)
print("precision:", precision)
print("recall:", recall)

JOB F1 SCORES ON EXTRACTIONS SEQEVAL

f1: 0.6141732283464567
precision: 0.7647058823529411
recall: 0.5131578947368421


In [184]:
print("COURSE F1 SCORES ON EXTRACTIONS SEQEVAL\n")

course_processed_data = process_data(
    course_ids, course_data, course_matched_df, course_anno_exp
)
f1, precision, recall = get_seqeval_metrics(course_processed_data)

print("f1:", f1)
print("precision:", precision)
print("recall:", recall)

COURSE F1 SCORES ON EXTRACTIONS SEQEVAL

f1: 0.39455782312925175
precision: 0.3411764705882353
recall: 0.46774193548387094


### Scores on Level Correctness

#### Extracted Levels
For any correct extraction, look at how many levels are correct

#### Matched Levels
For any correct matches, look at how many levels are correct

### Scores on Optionality Correctness (Jobs Only)

#### Extracted Levels
For any correct extraction, look at how many optionality are correct

#### Matched Levels
For any correct matches, look at how many optionality are correct