# Entity Linking
This EL script works on two levels: 
1) Extracted phrases matched to ESCO skill labels.
2) Extracted units matched to ESCO skill descriptions.

The final output is a table that stores all the matches for each level, as well as the aggregated *final_meta* column, which contains the top matches from both levels and additional information about them.

In [3]:
import json
import sqlite3
import random
from typing import List, Dict, Tuple
from pathlib import Path
import numpy as np
import pandas as pd
import regex as re

import config as C
from utils import get_stopwords_de, tokenize


# --- Run configuration: every value below is read from config.py (imported as C) ---

# Location of the SQLite database, wrapped in a Path object.
SQLITE_PATH = Path(C.SQLITE_PATH)

# Job selection: an explicit list of job ids, or a number of jobs to sample.
JOB_IDS: List[int] | None = C.JOB_IDS
SAMPLE_JOBS = C.SAMPLE_JOBS

# Result-size limits for the phrase level, the unit level and the aggregated list.
TOP_K_PHRASE = C.TOP_K_PHRASE
MAX_UNIT_MATCHES = C.MAX_UNIT_MATCHES
FINAL_TOP = C.FINAL_TOP

# Weights of the three score components.
W_SEMANTIC = C.W_SEMANTIC
W_LEXICAL = C.W_LEXICAL
W_CONTEXT = C.W_CONTEXT

# Human-readable description of the job selection: an empty or missing
# JOB_IDS is reported as "All Jobs".
l = JOB_IDS if JOB_IDS else "All Jobs"

print("Jobs that are going to be matched:", l)
print("Random number of jobs to be matched (if present):", SAMPLE_JOBS)
print("Top-k matches per skillType on phrase level:", TOP_K_PHRASE)
print("Top-k matches per skillType on unit level:", MAX_UNIT_MATCHES)
print("Final number of ESCO matches in the aggregated list:", FINAL_TOP)


Jobs that are going to be matched: All Jobs
Random number of jobs to be matched (if present): None
Top-k matches per skillType on phrase level: 20
Top-k matches per skillType on unit level: 20
Final number of ESCO matches in the aggregated list: 40


### Before we proceed, additional helper functions need to be defined

In [6]:
def add_percentile_norm(items, score_key="score", out_key="norm"):
    """Attach a rank-based percentile to every item, in place.

    The scores of each matching level are normalized this way so that the
    levels can later be ranked together. The lowest score receives ``1/n``
    and the highest receives ``1.0`` (``n`` is the number of items).

    Args:
        items: list of dicts; a missing ``score_key`` counts as 0.0.
        score_key: key under which the raw score is stored.
        out_key: key under which the percentile is written.

    Returns:
        The same ``items`` list (each dict now carries ``out_key``).
    """
    if not items:
        return items

    values = np.array(
        [float(entry.get(score_key, 0.0)) for entry in items],
        dtype=float,
    )

    # order[r] is the position of the item with rank r (ascending by score)
    order = np.argsort(values)

    total = float(len(values))
    norm = np.empty_like(values, dtype=float)
    norm[order] = np.arange(1, len(values) + 1) / total

    for entry, percentile in zip(items, norm):
        entry[out_key] = float(percentile)

    return items


def jaccard(a, b):
    """Jaccard similarity of two sets: |a ∩ b| / |a ∪ b|.

    Returns 0.0 when either set is empty or when the sets share no element.
    """
    if not (a and b):
        return 0.0

    shared = len(a & b)
    if not shared:
        return 0.0

    return shared / float(len(a | b))


def parse_vec(json_str):
    """Decode a JSON-encoded list of numbers into a float32 numpy array."""
    return np.asarray(json.loads(json_str), dtype=np.float32)


def normalize_rows(mat):
    """Scale every row of a 2-D array to unit L2 length.

    Vectors are re-normalized to keep them consistent after conversions and
    storage. An empty array is returned unchanged. Row norms are floored at
    1e-12 so that all-zero rows do not cause a division by zero.
    """
    if mat.size:
        row_lengths = np.linalg.norm(mat, axis=1, keepdims=True)
        return mat / np.maximum(row_lengths, 1e-12)

    return mat

def to_py_int_or_none(x):
    """Convert an int-like value (numpy/pandas scalar, str, ...) to a plain ``int``.

    Used for character offsets so they can be serialized to JSON.

    Returns:
        ``int(x)`` when the conversion succeeds; ``None`` when ``x`` is
        ``None``, a float NaN, or anything ``int()`` cannot convert.
    """
    if x is None:
        return None

    # NaN is the only float that is not equal to itself
    if isinstance(x, float) and x != x:
        return None

    try:
        return int(x)
    except Exception:
        # best effort: anything that cannot be converted is treated as missing
        return None

def load_table(conn, name):
    """Read every row and column of table ``name`` into a DataFrame.

    ``name`` is concatenated into the SQL text as-is, so it must be a
    trusted table name.
    """
    return pd.read_sql_query("SELECT * FROM " + name, conn)


def greedy_funk(items, key="label"):
    """Drop duplicate dicts, comparing the string stored under ``key``.

    Values are compared after ``str()``, stripping and lower-casing. The
    first occurrence wins; items whose value is missing, ``None`` or blank
    are dropped entirely.

    Returns:
        A new list holding the kept items (same dict objects, original order).
    """
    kept = []
    already_seen = set()

    for entry in items:
        raw = entry.get(key)
        normalized = "" if raw is None else str(raw).strip().lower()

        if normalized and normalized not in already_seen:
            already_seen.add(normalized)
            kept.append(entry)

    return kept


def find_char_span(text, fragment):
    """Locate the first case-insensitive occurrence of ``fragment`` in ``text``.

    Returns:
        ``{"start": ..., "end": ...}`` (end exclusive), or ``None`` when
        either argument is empty or the fragment does not occur.

    Note:
        The search runs on the lower-cased strings while ``end`` uses the
        length of the original fragment, so the offsets are exact only when
        lower-casing does not change string lengths.
    """
    if not (text and fragment):
        return None

    position = text.lower().find(fragment.lower())
    if position < 0:  # str.find signals "not found" with -1
        return None

    return {"start": int(position), "end": int(position + len(fragment))}

# Confirmation that the helper-function cell ran to completion.
print("Done")

Done


### Load all needed tables from SQLite

In [9]:
# Read every table the pipeline needs. The try/finally guarantees the
# connection is released even if one of the reads fails (e.g. missing table).
conn = sqlite3.connect(str(SQLITE_PATH))
try:
    jobs_clean = load_table(conn, "jobs_clean")
    job_phr = load_table(conn, "job_phrases")
    job_units = load_table(conn, "job_units")
    esco_labels = load_table(conn, "esco_labels")
    esco_desc = load_table(conn, "esco_desc")
    esco_skills = load_table(conn, "esco_skills")
finally:
    conn.close()

# Keep only the columns used further down; .copy() detaches each selection
# from the frame it was taken from.
jobs = jobs_clean[["job_id", "title", "text_deduped"]].copy()
job_phr = job_phr[["job_id", "phrase_surface", "phrase_start", "phrase_end", "embedding_json"]].copy()
job_units = job_units[["job_id", "unit_text", "unit_start", "unit_end", "embedding_json"]].copy()
esco_labels = esco_labels[["esco_id", "label", "skillType", "embedding_json"]].copy()
esco_desc = esco_desc[["esco_id", "description", "skillType", "embedding_json"]].copy()

### Filter by Job ID or Sample
(If specific/random jobs are to be put through the pipeline)

In [12]:
# Row positions of the explicitly requested jobs (stays empty when JOB_IDS is not used).
selected_jobs = []

if JOB_IDS is not None and len(JOB_IDS) > 0:
    # Vectorized membership test instead of a per-row lookup in the id list.
    is_requested = jobs["job_id"].isin(JOB_IDS)
    selected_jobs = [pos for pos, hit in enumerate(is_requested) if hit]

    # keep only the selected rows
    jobs = jobs.iloc[selected_jobs].copy()
    jobs = jobs.reset_index(drop=True)

elif SAMPLE_JOBS is not None and SAMPLE_JOBS > 0:
    # Random job selection, capped at the number of available jobs.
    # NOTE(review): no random seed is set in the visible cells, so the sample
    # differs between runs - seed `random` if reproducibility is needed.
    max_jobs = len(jobs)
    how_many = min(SAMPLE_JOBS, max_jobs)

    random_indices = random.sample(range(max_jobs), how_many)

    jobs = jobs.iloc[random_indices].copy()
    jobs = jobs.reset_index(drop=True)

# otherwise we keep all jobs

# .tolist() yields plain Python scalars, which keeps the printed list readable.
job_id_list = jobs["job_id"].tolist()

print("Selected job_ids:", job_id_list)
print("Number of selected jobs:", len(jobs))


Selected job_ids: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218,

### Prepare stopwords and ESCO tokens
This step is done to support the hybrid matching model that uses lexical overlap

In [15]:
# Stopword set handed to tokenize() for labels, phrases and titles.
STOP_DE = set(get_stopwords_de())

# One token set per ESCO label; non-string labels are tokenized as "".
label_tokens_list = [
    set(tokenize(label if isinstance(label, str) else "", stopwords=STOP_DE))
    for label in esco_labels["label"]
]

esco_labels["label_tokens"] = label_tokens_list
display(esco_labels[["label", "label_tokens"]].head(5))



Unnamed: 0,label,label_tokens
0,Musikpersonal verwalten,"{verwalten, musikpersonal}"
1,Strafvollzugsverfahren beaufsichtigen,"{beaufsichtigen, strafvollzugsverfahren}"
2,nicht unterdrückende Praktiken anwenden,"{anwenden, unterdrückende, praktiken}"
3,Einhaltung von Vorschriften von Eisenbahnfahrz...,"{überprüfen, einhaltung, vorschriften, eisenba..."
4,verfügbare Dienste ermitteln,"{ermitteln, verfügbare, dienste}"


### Split ESCO into knowledge vs skill, build embeddings

In [18]:
#each list will store either knowledge or skill concepts
#we start with the labels
# True for every row whose skillType mentions "knowledge" (case-insensitive).
TF_kn_lbl = ["knowledge" in str(st).lower() for st in esco_labels["skillType"]]

# knowledge labels
lbl_kn = esco_labels[TF_kn_lbl].copy().reset_index(drop=True)

# skill labels: every row that is not a knowledge row
lbl_sk = esco_labels[[not flag for flag in TF_kn_lbl]].copy().reset_index(drop=True)


In [20]:
#same thing with descriptions
#however, although we match descriptions with units, we output the corresponding label, as it will be easier to interpret and compare with phrase-level results

#for this, there is a small lookup table
# esco_id -> label lookup (for a repeated esco_id the last row wins)
id_to_label = dict(zip(esco_skills["esco_id"], esco_skills["label"]))

# True for every description row whose skillType mentions "knowledge".
TF_kn_desc = ["knowledge" in str(st).lower() for st in esco_desc["skillType"]]

desc_kn = esco_desc[TF_kn_desc].copy().reset_index(drop=True)
desc_sk = esco_desc[[not flag for flag in TF_kn_desc]].copy().reset_index(drop=True)

# Add the label column to desc_kn and desc_sk; ids missing from the lookup get "".
label_list_kn = [id_to_label.get(esco_id, "") for esco_id in desc_kn["esco_id"]]
desc_kn["label"] = label_list_kn

label_list_sk = [id_to_label.get(esco_id, "") for esco_id in desc_sk["esco_id"]]
desc_sk["label"] = label_list_sk

print("Label: knowledge =", len(lbl_kn), "skill =", len(lbl_sk))
print("Desc: knowledge =", len(desc_kn), "skill =", len(desc_sk))


Label: knowledge = 3219 skill = 10720
Desc: knowledge = 3219 skill = 10720


In [22]:
#find out the embedding dimensions from row 1
EMB_DIM = 384  # fallback, used when there is no ESCO label row to inspect

if len(esco_labels) > 0:
    # the length of the first stored vector gives the embedding dimension
    first_emb_str = esco_labels["embedding_json"].iloc[0]
    example_vec = parse_vec(first_emb_str)
    EMB_DIM = example_vec.shape[0]

In [24]:
#helper matrix function
def build_embedding_matrix(df, col_name="embedding_json"):
    """Stack the JSON-encoded embeddings of ``df`` into a row-normalized matrix.

    Args:
        df: DataFrame with one JSON-encoded vector per row in ``col_name``.
        col_name: name of the column holding the JSON strings.

    Returns:
        A 2-D array with one L2-normalized row per row of ``df`` (in row
        order). For an empty ``df`` an array of shape ``(0, EMB_DIM)`` is
        returned instead of failing inside ``np.vstack``.
    """
    # Iterate over the column values directly: this follows row order and does
    # not require the frame to carry a 0..n-1 index.
    vec_list = [parse_vec(emb_str) for emb_str in df[col_name]]

    if not vec_list:
        # np.vstack raises on an empty list; EMB_DIM is the module-level
        # embedding dimension determined from the ESCO labels.
        return np.empty((0, EMB_DIM), dtype=np.float32)

    # build the matrix and normalize
    return normalize_rows(np.vstack(vec_list))

In [26]:
#build the matrix

# One normalized embedding matrix per (level, skillType) bucket.
# The embedding column is "embedding_json", the default of build_embedding_matrix.
mx_lbl_kn = build_embedding_matrix(lbl_kn)
mx_lbl_sk = build_embedding_matrix(lbl_sk)
mx_desc_kn = build_embedding_matrix(desc_kn)
mx_desc_sk = build_embedding_matrix(desc_sk)

# Sanity check: print the shape of every matrix.
print("mx_lbl_kn =", mx_lbl_kn.shape)
print("mx_lbl_sk =", mx_lbl_sk.shape)
print("mx_desc_kn =", mx_desc_kn.shape)
print("mx_desc_sk =", mx_desc_sk.shape)


mx_lbl_kn = (3219, 384)
mx_lbl_sk = (10720, 384)
mx_desc_kn = (3219, 384)
mx_desc_sk = (10720, 384)


### Build phrase and units lists per job
- On phrase level: job_id -> list of (text, vector, token_set, start, end)
- On unit level: job_id -> list of (text, vector, start, end)

In [29]:
# job_id -> list of (text, vector, token_set, start, end)
jobid_to_phr = {}
# job_id -> list of (text, vector, start, end)
jobid_to_units = {}

# phrase level
# Iterating the columns in parallel avoids building one Series per row.
for jid, txt, emb_str, p_start, p_end in zip(
    job_phr["job_id"],
    job_phr["phrase_surface"],
    job_phr["embedding_json"],
    job_phr["phrase_start"],  # span computed during preprocessing
    job_phr["phrase_end"],
):
    # A missing (non-string) phrase text would crash on .strip(); treat it as "".
    txt = txt.strip() if isinstance(txt, str) else ""

    vec = parse_vec(emb_str)  # convert json embedding to a vector
    tok_set = set(tokenize(txt, stopwords=STOP_DE))  # tokenize phrase text

    jobid_to_phr.setdefault(jid, []).append((txt, vec, tok_set, p_start, p_end))


# unit level
for jid, txt, emb_str, u_start, u_end in zip(
    job_units["job_id"],
    job_units["unit_text"],
    job_units["embedding_json"],
    job_units["unit_start"],  # span computed during preprocessing
    job_units["unit_end"],
):
    # Same guard as on the phrase level.
    txt = txt.strip() if isinstance(txt, str) else ""

    vec = parse_vec(emb_str)

    jobid_to_units.setdefault(jid, []).append((txt, vec, u_start, u_end))

### Hybrid Scoring function (phrase level)
Score ESCO labels for one job using:
- semantic similarity (cosine sim via dot product)
- lexical Jaccard (phrase tokens vs label tokens)
- context Jaccard (title/phrase-union vs label tokens)

In [32]:
def score_bucket_labels(
    Phr_emb, #phrase embeddings (matrix)
    phr_txt, #list of phrase strings
    phr_tok_list, #list of sets of tokens for each phrase
    phr_tok_union,
    title_tok,
    Esco_lbl_emb, #ESCO label embeddings (matrix)
    esco_lbl_df, #ESCO labels dataframe
    phr_spans  #list of (start, end) per phrase
):
    """Score the ESCO labels of one bucket against the phrases of one job.

    The score of a label is the weighted sum (module-level ``W_SEMANTIC``,
    ``W_LEXICAL``, ``W_CONTEXT``) of:

    * semantic: best dot product between the label embedding and any phrase
      embedding (equals cosine similarity when both matrices are
      row-normalized - presumably guaranteed by the caller);
    * lexical: best Jaccard overlap between the label tokens and the token
      set of any single phrase;
    * context: the larger Jaccard overlap of the label tokens with the title
      tokens and with the union of all phrase tokens.

    Args:
        Phr_emb: phrase embeddings, one row per phrase.
        phr_txt: phrase strings, aligned with the rows of ``Phr_emb``.
        phr_tok_list: one token set per phrase.
        phr_tok_union: union of all phrase token sets.
        title_tok: token set of the job title.
        Esco_lbl_emb: ESCO label embeddings, one row per row of ``esco_lbl_df``.
        esco_lbl_df: frame with ``label``, ``skillType`` and ``label_tokens``.
        phr_spans: ``(start, end)`` per phrase; entries may be ``(None, None)``.

    Returns:
        Up to ``TOP_K_PHRASE`` dicts with keys ``label``, ``skillType``,
        ``score``, ``matched_phrase`` and ``char_span``, best score first;
        repeated labels among the top entries are dropped (first one kept).
    """
    n_labels = Esco_lbl_emb.shape[0]

    #semantic similarity
    if Phr_emb.shape[0] > 0:
        # S is a matrix: rows = phrases, columns = ESCO labels
        #each value is the dot product between phrase embedding and label embedding
        S = np.dot(Phr_emb, Esco_lbl_emb.T)

        #for each ESCO label: best semantic score and the phrase that produced it
        sem_scores = S.max(axis=0)
        best_phrase_idx = S.argmax(axis=0)
    else:
        # Without phrases a max over the phrase axis is undefined; fall back to a
        # zero semantic score so that only the lexical/context parts contribute.
        sem_scores = np.zeros(n_labels, dtype=np.float32)
        best_phrase_idx = np.zeros(n_labels, dtype=int)

    # Pull the columns out once: a per-label `iloc[j][...]` lookup builds a new
    # row object on every access, which is costly inside the loops below.
    label_token_sets = esco_lbl_df["label_tokens"].tolist()
    label_texts = esco_lbl_df["label"].tolist()
    skill_types = esco_lbl_df["skillType"].tolist()

    lex_scores = np.zeros(n_labels, dtype=np.float32)
    ctx_scores = np.zeros(n_labels, dtype=np.float32)

    for j in range(n_labels):
        label_tokens = label_token_sets[j]

        #lexical Jaccard: best overlap with any single phrase (0.0 without phrases)
        lex_scores[j] = max(
            (jaccard(label_tokens, phrase_tokens) for phrase_tokens in phr_tok_list),
            default=0.0,
        )

        #context Jaccard: the better of title overlap and phrase-union overlap
        ctx_scores[j] = max(
            jaccard(label_tokens, title_tok),
            jaccard(label_tokens, phr_tok_union),
        )

    #combine scores using the predefined weights
    combined_score = (W_SEMANTIC * sem_scores + W_LEXICAL  * lex_scores + W_CONTEXT  * ctx_scores)

    #pick the top k labels (indices sorted by descending score)
    k = min(TOP_K_PHRASE, len(combined_score))
    top_idx = np.argsort(-combined_score)[:k]

    results = []

    for j in top_idx:
        #best phrase for this label, together with its stored span
        if len(phr_txt) > 0:
            p_idx = int(best_phrase_idx[j])
            matched_phrase = phr_txt[p_idx]

            if p_idx < len(phr_spans):
                raw_start, raw_end = phr_spans[p_idx]
            else:
                raw_start, raw_end = (None, None)
        else:
            matched_phrase = ""
            raw_start, raw_end = (None, None)

        #convert to plain Python ints (or None) so JSON can handle it
        p_start = to_py_int_or_none(raw_start)
        p_end = to_py_int_or_none(raw_end)

        has_span = p_start is not None and p_end is not None

        results.append({
            "label": label_texts[j],
            "skillType": str(skill_types[j]),
            "score": float(combined_score[j]),
            "matched_phrase": matched_phrase,
            "char_span": {"start": p_start, "end": p_end} if has_span else None,
        })

    #remove duplicate labels, keep first
    return greedy_funk(results, key="label")


### Unit Level matching helper
The unit texts are matched to ESCO descriptions using cosine similarity, but the labels of the matched ESCO concepts are returned. Duplicate labels are removed, and candidates are collected until the top-k is reached.

In [35]:
def match_units_to_desc_return_label_capped(units_list, E_desc, esco_lbl_df, max_matches, min_window=50):
    """Match unit embeddings to ESCO description embeddings and return labels.

    Every unit is compared with every ESCO description by dot product of the
    row-normalized unit vectors with ``E_desc``. The best candidates of all
    units are merged, sorted by score, and collected until ``max_matches``
    distinct labels have been found.

    Args:
        units_list: list of ``(text, vector, start, end)`` tuples of one job.
        E_desc: ESCO description embeddings, one row per row of ``esco_lbl_df``
            (presumably already row-normalized - confirm with the caller).
        esco_lbl_df: frame providing ``label`` and ``skillType`` per row.
        max_matches: maximum number of distinct labels to return.
        min_window: lower bound for the number of ESCO candidates kept per
            unit; the window is ``max(min_window, max_matches * 3)``, capped
            at the number of ESCO rows.

    Returns:
        List of dicts (``label``, ``skillType``, ``score``, ``matched_unit``,
        ``char_span``), best score first, with unique labels. Empty when there
        are no units or ``max_matches`` is not positive.
    """
    # np.vstack fails on an empty list, and a non-positive cap means "no results".
    if not units_list or max_matches <= 0:
        return []

    #first we build the unit embedding matrix
    U = np.vstack([vec.astype(np.float32) for _, vec, _, _ in units_list])
    U = normalize_rows(U)

    #next we compute the cosine similarity
    S = np.dot(U, E_desc.T)

    n_units = S.shape[0]
    n_esco = S.shape[1]

    #how many ESCO candidates we keep per unit
    per_unit_window = min(max(min_window, max_matches * 3), n_esco)

    #next we build a big list of (score, unit_index, esco_index)
    candidates = []

    for i in range(n_units):
        sim_row = S[i]  #similarities for unit i to all ESCO descriptions
        top_idx = np.argsort(-sim_row)[:per_unit_window]  #best first

        for j in top_idx:
            candidates.append((float(sim_row[j]), i, j))

    #sort all candidates globally by score
    candidates.sort(key=lambda x: -x[0])

    # Pull the columns out once instead of building a row object per candidate.
    label_list = esco_lbl_df["label"].astype(str).tolist()
    skill_types = [str(st) for st in esco_lbl_df["skillType"]]

    #deduplicate labels and cap at max_matches
    used_labels = set()
    out = []

    for score_val, unit_idx, esco_idx in candidates:
        lab_text = label_list[esco_idx].strip()
        key = lab_text.lower()

        if key == "" or key in used_labels:
            continue

        #get unit text + span for this unit
        u_text, _, raw_start, raw_end = units_list[unit_idx]

        u_start = to_py_int_or_none(raw_start)
        u_end = to_py_int_or_none(raw_end)

        has_span = u_start is not None and u_end is not None

        out.append({
            "label": lab_text,
            "skillType": skill_types[esco_idx],
            "score": float(score_val),
            "matched_unit": u_text,  # original unit text
            "char_span": {"start": u_start, "end": u_end} if has_span else None,
        })
        used_labels.add(key)

        if len(out) >= max_matches:
            break

    return out


### Main matching algorithm

In [39]:
def _as_combined_items(matches, source, bucket, context_key):
    """Re-shape one match list into the common layout of the aggregated ranking.

    ``context_key`` names the field holding the matched job text
    ("matched_phrase" or "matched_unit").
    """
    return [
        {
            "source": source,
            "bucket": bucket,
            "label": m["label"],
            "score": float(m["score"]),
            "norm": float(m.get("norm", 0.0)),
            "context": m.get(context_key, ""),
            "char_span": m.get("char_span"),
        }
        for m in matches
    ]


rows = []

for i in range(len(jobs)):
    jr = jobs.iloc[i]  # positional access: independent of the index labels
    jid = jr["job_id"]

    # title and cleaned job text (non-string values are treated as "")
    title = jr.get("title", "")
    title = title.strip() if isinstance(title, str) else ""

    tded = jr.get("text_deduped", "")
    tded = tded.strip() if isinstance(tded, str) else ""

    #phrase level
    phrases = jobid_to_phr.get(jid, [])
    phrase_matches_kn = []
    phrase_matches_sk = []

    if len(phrases) > 0:
        phr_txt = [p[0] for p in phrases]  #text
        phr_tok_list = [p[2] for p in phrases]  #token set
        #(start, end) per phrase; tuples without span fields get (None, None)
        phr_spans = [(p[3], p[4]) if len(p) >= 5 else (None, None) for p in phrases]

        #matrix of phrase embeddings, one normalized row per phrase
        Phr_emb = np.vstack([p[1] for p in phrases]).astype(np.float32)
        Phr_emb = normalize_rows(Phr_emb)

        #union of all phrase tokens
        phr_tok_union = set().union(*phr_tok_list)

        #title tokens
        title_tok = set(tokenize(title, stopwords=STOP_DE))

        #score ESCO labels for knowledge and skill buckets
        phrase_matches_kn = score_bucket_labels(
            Phr_emb, phr_txt, phr_tok_list, phr_tok_union, title_tok,
            mx_lbl_kn, lbl_kn, phr_spans
        )
        phrase_matches_sk = score_bucket_labels(
            Phr_emb, phr_txt, phr_tok_list, phr_tok_union, title_tok,
            mx_lbl_sk, lbl_sk, phr_spans
        )

    #unit level
    units = jobid_to_units.get(jid, [])
    unit_matches_kn = []
    unit_matches_sk = []

    if len(units) > 0:
        unit_matches_kn = match_units_to_desc_return_label_capped(
            units, mx_desc_kn, desc_kn, MAX_UNIT_MATCHES
        )
        unit_matches_sk = match_units_to_desc_return_label_capped(
            units, mx_desc_sk, desc_sk, MAX_UNIT_MATCHES
        )

    #final_meta table (with normalized results)
    #each list is normalized separately, which adds a "norm" field to its items
    add_percentile_norm(phrase_matches_kn)
    add_percentile_norm(phrase_matches_sk)
    add_percentile_norm(unit_matches_kn)
    add_percentile_norm(unit_matches_sk)

    #the concatenation order matters: the sort below is stable, so items with
    #equal norm keep this order
    combined = (
        _as_combined_items(phrase_matches_kn, "phrase", "knowledge", "matched_phrase")
        + _as_combined_items(phrase_matches_sk, "phrase", "skill", "matched_phrase")
        + _as_combined_items(unit_matches_kn, "unit", "knowledge", "matched_unit")
        + _as_combined_items(unit_matches_sk, "unit", "skill", "matched_unit")
    )

    #sort by normalized score
    combined.sort(key=lambda z: -z["norm"])

    #keep the best entry per label until FINAL_TOP entries are collected
    best_overall = []
    seen_labels = set()

    for x in combined:
        label_key = x["label"].strip().lower()

        if label_key == "" or label_key in seen_labels:
            continue

        #character span was already computed in preprocessing and propagated
        best_overall.append(dict(x))
        seen_labels.add(label_key)

        if len(best_overall) >= FINAL_TOP:
            break

    #store summary for this job
    rows.append({
        "job_id": jid,
        "title": title,
        "text_deduped": tded,
        "phrase_matches_knowledge": json.dumps(phrase_matches_kn, ensure_ascii=False),
        "phrase_matches_skills": json.dumps(phrase_matches_sk, ensure_ascii=False),
        "unit_matches_knowledge": json.dumps(unit_matches_kn, ensure_ascii=False),
        "unit_matches_skills": json.dumps(unit_matches_sk, ensure_ascii=False),
        "final_meta": json.dumps(best_overall, ensure_ascii=False),
    })

print("Finished matching all selected jobs.")
print("Total jobs matched:", len(rows))


Finished matching all selected jobs.
Total jobs matched: 400


### Save matched results to SQLite

In [42]:
# One row per matched job; the match lists are stored as JSON strings.
res = pd.DataFrame(rows)

print("Result DataFrame shape:", res.shape)

# Overwrite the "matched_results" table. The try/finally releases the
# connection even if the write fails.
conn = sqlite3.connect(str(SQLITE_PATH))
try:
    res.to_sql("matched_results", conn, if_exists="replace", index=False)
finally:
    conn.close()


print("Rows written:", len(res))

# Echo the configuration of this run next to the results.
print("Configuration that was used:")
print("JOB_IDS =", JOB_IDS)
print("SAMPLE_JOBS =", SAMPLE_JOBS)
print("TOP_K_PHRASE =", TOP_K_PHRASE)
print("MAX_UNIT_MATCHES =", MAX_UNIT_MATCHES)
print("FINAL_TOP =", FINAL_TOP)

print("Preview (first 3 rows):")
display(res.head(3))


Result DataFrame shape: (400, 8)
Rows written: 400

Configuration that was used:
 JOB_IDS = []
 SAMPLE_JOBS = None
 TOP_K_PHRASE = 20
 MAX_UNIT_MATCHES = 20
 FINAL_TOP = 40

Preview (first 3 rows):


Unnamed: 0,job_id,title,text_deduped,phrase_matches_knowledge,phrase_matches_skills,unit_matches_knowledge,unit_matches_skills,final_meta
0,1,Vertragshochschullehrperson/Hochschullehrperso...,Die Verwendung als Vertragshochschullehrperson...,"[{""label"": ""Beratung"", ""skillType"": ""knowledge...","[{""label"": ""Qualitätssicherung durchführen"", ""...","[{""label"": ""IKT-Qualitätspolitik"", ""skillType""...","[{""label"": ""Kontakt zur Qualitätssicherung auf...","[{""source"": ""phrase"", ""bucket"": ""knowledge"", ""..."
1,2,Vertragshochschullehrperson/Hochschullehrperso...,Die Verwendung als Vertragshochschullehrperson...,"[{""label"": ""Beratung"", ""skillType"": ""knowledge...","[{""label"": ""Publikationspläne vorlegen"", ""skil...","[{""label"": ""Fachkenntnisse im Bereich Ausbildu...","[{""label"": ""mit Bildungsträgern zusammenarbeit...","[{""source"": ""phrase"", ""bucket"": ""knowledge"", ""..."
2,3,Vertragshochschullehrperson/Hochschullehrperso...,Die Verwendung als Vertragshochschullehrperson...,"[{""label"": ""Ziele für nachhaltige Entwicklung""...","[{""label"": ""nachhaltigen Beschaffung umsetzen""...","[{""label"": ""organisatorischer Aufbau"", ""skillT...","[{""label"": ""organisatorische Richtlinien festl...","[{""source"": ""phrase"", ""bucket"": ""knowledge"", ""..."
