In [1]:
# %%
import pandas as pd
import json
import os
from deepdiff import DeepDiff

os.getcwd()

# %%
# read in json from different folder
# os.chdir("../results")
with open("../results/job_gpt-3.5-turbo_2sent_n100_jBd_V4_detailed.json", "r") as f:
    jobs_v4 = json.load(f)

with open("../results/job_gpt-3.5-turbo_2sent_n100_jBd_esco_detailed.json", "r") as f:
    jobs_esco = json.load(f)

with open("../results/course_gpt-3.5-turbo_2sent_n100_jBd_V4_detailed.json", "r") as f:
    courses_v4 = json.load(f)

with open(
    "../results/course_gpt-3.5-turbo_2sent_n100_jBd_esco_detailed.json", "r"
) as f:
    courses_esco = json.load(f)
# %%
# read in taxonomy from different folder
os.chdir("../../data/taxonomy")
taxonomy = pd.read_csv("taxonomy_V4.csv")
keep_cols = [
    "unique_id",
    "Type Level 1",
    "Type Level 2",
    "Type Level 3",
    "Type Level 4",
]
taxonomy = taxonomy[keep_cols]

In [2]:
# get list of job_ids which is the key in the json
job_ids = list(jobs_v4.keys())
assert job_ids == list(jobs_esco.keys())

course_ids = list(courses_v4.keys())
assert course_ids == list(courses_esco.keys())

In [8]:
job_matched_esco = {}
job_matched_v4 = {}

course_matched_esco = {}
course_matched_v4 = {}

for job_id in job_ids:
    job_matched_esco[job_id] = []
    job_matched_v4[job_id] = []
    for sentence in jobs_v4[job_id]:
        job_matched_v4[job_id].append(sentence["matched_skills"])
    for sentence in jobs_esco[job_id]:
        job_matched_esco[job_id].append(sentence["matched_skills"])

for course_id in course_ids:
    course_matched_v4[course_id] = {}
    course_matched_esco[course_id] = {}
    for type_skill in courses_v4[course_id]:
        course_matched_v4[course_id][type_skill] = []
        course_matched_esco[course_id][type_skill] = []
        for sentence in courses_v4[course_id][type_skill]:
            course_matched_v4[course_id][type_skill].append(sentence["matched_skills"])
        for sentence in courses_esco[course_id][type_skill]:
            course_matched_esco[course_id][type_skill].append(
                sentence["matched_skills"]
            )

In [9]:
# get how many skills are matched in each
job_matched_v4_count = {}
job_matched_esco_count = {}
job_matched_v4_count["total"] = 0
job_matched_esco_count["total"] = 0

course_matched_v4_count = {}
course_matched_esco_count = {}
course_matched_v4_count["total"] = 0
course_matched_esco_count["total"] = 0

for job_id in job_ids:
    job_matched_v4_count[job_id] = 0
    job_matched_esco_count[job_id] = 0
    for sentence in job_matched_v4[job_id]:
        job_matched_v4_count[job_id] += len(sentence)
        job_matched_v4_count["total"] += len(sentence)
    for sentence in job_matched_esco[job_id]:
        job_matched_esco_count[job_id] += len(sentence)
        job_matched_esco_count["total"] += len(sentence)

for course_id in course_ids:
    course_matched_v4_count[course_id] = {}
    course_matched_esco_count[course_id] = {}
    for type_skill in courses_v4[course_id]:
        course_matched_v4_count[course_id][type_skill] = 0
        course_matched_esco_count[course_id][type_skill] = 0
        for sentence in course_matched_v4[course_id][type_skill]:
            course_matched_v4_count[course_id][type_skill] += len(sentence)
            course_matched_v4_count["total"] += len(sentence)
        for sentence in course_matched_esco[course_id][type_skill]:
            course_matched_esco_count[course_id][type_skill] += len(sentence)
            course_matched_esco_count["total"] += len(sentence)

print("job_matched_v4_count total", job_matched_v4_count["total"])
print("job_matched_esco_count total", job_matched_esco_count["total"])
print("course_matched_v4_count total", course_matched_v4_count["total"])
print("course_matched_esco_count total", course_matched_esco_count["total"])

job_matched_v4_count total 326
job_matched_esco_count total 255
course_matched_v4_count total 534
course_matched_esco_count total 494


In [10]:
# for each job_id or course_id, we want to see what are the skills that are matched in common between v4 and esco

job_matched_common = {}
job_matched_common["total"] = 0
course_matched_common = {}
course_matched_common["total"] = 0

for job_id in job_ids:
    job_matched_common[job_id] = []
    for v4_extracted in job_matched_v4[job_id]:
        for v4_matched_skill in v4_extracted:
            for esco_extracted in job_matched_esco[job_id]:
                for esco_matched_skill in esco_extracted:
                    if v4_matched_skill == esco_matched_skill:
                        job_matched_common[job_id].append(v4_matched_skill)
                        job_matched_common["total"] += 1

for course_id in course_ids:
    course_matched_common[course_id] = {}
    for type_skill in courses_v4[course_id]:
        course_matched_common[course_id][type_skill] = []
        for v4_extracted in course_matched_v4[course_id][type_skill]:
            for v4_matched_skill in v4_extracted:
                for esco_extracted in course_matched_esco[course_id][type_skill]:
                    for esco_matched_skill in esco_extracted:
                        if v4_matched_skill == esco_matched_skill:
                            course_matched_common[course_id][type_skill].append(
                                v4_matched_skill
                            )
                            course_matched_common["total"] += 1

print("job_matched_common total", job_matched_common["total"])
print("course_matched_common total", course_matched_common["total"])

job_matched_common total 166
course_matched_common total 365


In [35]:
# find which matched skills in common also match to the same "name+definition" in our output

job_matched_same_match = []
job_matched_same_match["total"] = 0
course_matched_same_match = []
course_matched_same_match["total"] = 0

job_matched_diff_match = []
job_matched_diff_match["total"] = 0
course_matched_diff_match = []
course_matched_diff_match["total"] = 0


for esco_sentence in job_matched_esco.values():
    for v4_sentence in job_matched_v4.values():
        for esco_skills in esco_sentence:
            for v4_skills in v4_sentence:
                for esco_skill in esco_skills.values():
                    for v4_skill in v4_skills.values():
                        if esco_skill["name+definition"] == v4_skill["name+definition"]:
                            job_matched_same_match["total"] += 1
                            job_matched_same_match.append(esco_skill)
                        else:
                            job_matched_diff_match["total"] += 1
                            job_matched_diff_match.append(esco_skill)

    # if esco_skill["name+definition"] == v4_skill["name+definition"]:
    #     job_matched_same_match["total"] += 1
    #     job_matched_same_match[job_id] = esco_skill
    # break
    # if esco_skill["name+definition"] == v4_skill["name+definition"]:
    #     job_matched_same_match["total"] += 1
    #     job_matched_same_match[job_id] = esco_skill

TypeError: list indices must be integers or slices, not str

In [34]:
job_matched_same_match

{'total': 5755,
 '10389': {'unique_id': 108,
  'name+definition': 'Soziale Fertigkeit: Fertigkeiten, um in der Zusammenarbeit mit anderen Menschen Ziele zu erreichen'}}

In [7]:
# # check which of the common matched jobs and courses are matched to the same items in the taxonomy (same unique id)

# job_matched_common_taxonomy = {}
# job_matched_common_taxonomy["total"] = 0
# course_matched_common_taxonomy = {}
# course_matched_common_taxonomy["total"] = 0

# for job_id in job_ids:
#     job_matched_common_taxonomy[job_id] = []
#     for v4_extracted in job_matched_v4[job_id]:
#         for v4_matched_skill in v4_extracted:
#             for esco_extracted in job_matched_esco[job_id]:
#                 for esco_matched_skill in esco_extracted:
#                     if v4_matched_skill == esco_matched_skill:
#                         for index, row in taxonomy.iterrows():
#                             if row["unique_id"] == v4_matched_skill:
#                                 job_matched_common_taxonomy[job_id].append(
#                                     v4_matched_skill
#                                 )
#                                 job_matched_common_taxonomy["total"] += 1

Notes: in skill_candidates, we observe candidates generated from ESCO but we actually don't see any items from ESCO selected
Concerns: we do however see different items in v4 taxonomy selected

In [6]:
# get how many matched skills in esco that has unique ids over 999

job_esco_matches = {}
job_esco_matches["total"] = 0

for job_id in job_ids:
    job_esco_matches[job_id] = []
    for job_skill in job_matched_esco[job_id]:
        if job_skill["unique_id"] > 999:
            job_esco_matches[job_id].append(job_skill)
            job_esco_matches["total"] += 1

KeyError: 'unique_id'

In [26]:
job_esco_matches["total"]

508