In [None]:
# !pip install --upgrade pip
# !pip install pigeonXT
# !pip install jupyter_innotater

In [29]:
import html

import jupyterannotate
import pigeonXT as pixt
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, clear_output
from IPython.display import HTML

import numpy as np


def load_data(type):
    """Load the raw job or course records as a DataFrame.

    Parameters
    ----------
    type : str
        ``"job"`` reads ``../raw/vacancies.json`` and adds two identical
        columns, ``full_text`` and ``fulltext``, holding
        ``name + "\\n" + description``.
        ``"course"`` reads ``../raw/learning_opportunities.json`` and keeps
        only rows whose ``active`` equals True and whose ``study_ids`` share
        at least one id with {1, 5, 9}.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If ``type`` is neither ``"job"`` nor ``"course"``.
    """
    # Validate up front: previously an unknown value fell through both
    # branches and failed with an UnboundLocalError at `return df`.
    if type not in ("job", "course"):
        raise ValueError(f"Unknown data type {type!r}; expected 'job' or 'course'")

    if type == "job":
        df = pd.read_json("../raw/vacancies.json")
        combined = df["name"] + "\n" + df["description"]
        # Both spellings are kept so existing consumers of either column work.
        df["full_text"] = combined
        df["fulltext"] = combined
    else:
        # NOTE(review): this branch does not create a "full_text" column, which
        # the module-level DOCUMENTS line selects -- confirm before using
        # "course" there.
        df = pd.read_json("../raw/learning_opportunities.json")
        df = df[df["active"].eq(True)]
        keep_ids = {1, 5, 9}
        df = df[df["study_ids"].apply(lambda x: bool(set(x) & keep_ids))]

    return df


def load_lv2(path="../taxonomy/taxonomy_V4.csv"):
    """Return the distinct "Type Level 2" labels of the taxonomy plus two extras.

    Parameters
    ----------
    path : str, optional
        Location of the comma-separated taxonomy file. Defaults to the
        project's ``../taxonomy/taxonomy_V4.csv``.

    Returns
    -------
    list
        Unique "Type Level 2" values in order of first appearance, taken only
        from rows where both "Definition" and "Type Level 2" are present,
        followed by the literal choices ``"NONE"`` and ``"ADD_NEW"``.
    """
    taxonomy = pd.read_csv(path, sep=",")
    # Rows without a definition or without a level-2 type are not offered.
    taxonomy = taxonomy.dropna(subset=["Definition", "Type Level 2"])
    level2_labels = taxonomy["Type Level 2"].drop_duplicates().tolist()
    return level2_labels + ["NONE", "ADD_NEW"]


def doc_widget(n_docs=100):
    """Build the document-number selector and the CSS that enlarges it.

    Parameters
    ----------
    n_docs : int, optional
        Highest selectable document number (the input is 1-based). Defaults
        to 100.

    Returns
    -------
    tuple
        ``(style, text_input)``: an ``HTML`` widget carrying a ``<style>``
        block, and a ``BoundedIntText`` starting at 1 and limited to
        ``1..n_docs``.
    """
    text_input = widgets.BoundedIntText(
        value=1,
        min=1,
        max=n_docs,
        description="DOC:",
        # The width used to be created as "70%" and immediately overwritten
        # with "20%"; only the effective value is kept.
        layout=widgets.Layout(width="20%"),
    )
    style = widgets.HTML(
        "<style>.widget-text .widget-label, .widget-text input {font-size: 20px; font-weight: bold;}</style>"
    )
    return style, text_input


def get_skills_per_doc(span):
    """Turn the marked spans of one document into a one-column DataFrame.

    Each item of ``span`` must provide a ``"text"`` entry; its value is
    stripped of surrounding whitespace and stored under the column
    ``"example"``. An empty ``span`` yields an empty frame with no columns.
    """
    records = [{"example": item["text"].strip()} for item in span]
    return pd.DataFrame(records)


# Inputs for this annotation session: the job postings and the taxonomy labels.
df = load_data("job")
tax = load_lv2()

# SKILL_LABELS = ["Skill"]
LEVEL_LABELS = ["Beginner", "Intermediate", "Expert", "Unknown"]

# Fixed-seed sample of 100 postings, stored as a list of
# {"id": ..., "full_text": ...} dicts.
_id_and_text = df[["id", "full_text"]]
_sampled_postings = _id_and_text.sample(100, random_state=42)
DOCUMENTS = _sampled_postings.to_dict("records")

In [2]:
# Build the document selector, then render its CSS block followed by the input box.
style, text_input = doc_widget()
display(style, text_input)

HTML(value='<style>.widget-text .widget-label, .widget-text input {font-size: 20px; font-weight: bold;}</style…

BoundedIntText(value=1, description='DOC:', layout=Layout(width='20%'), min=1)

In [3]:
# The selector is 1-based; DOCUMENTS is indexed from 0.
doc_idx = text_input.value - 1
current_doc = DOCUMENTS[doc_idx]

# Heading with the position and id of the document being annotated. The total
# is taken from DOCUMENTS instead of a hard-coded 100 so it stays correct if
# the sample size changes.
text_widget = widgets.HTML(
    value=f"<h2 style='font-size: 20px; font-weight: bold;'>Annotating document {doc_idx + 1} of {len(DOCUMENTS)} "
    f"(Job/Course ID: {current_doc['id']})</h2>",
)
display(text_widget)

# Span-annotation widget for the selected document's text; the selectable
# labels are the proficiency levels.
# TODO: change size of text
annotation_widget = jupyterannotate.AnnotateWidget(
    docs=current_doc["full_text"],
    labels=LEVEL_LABELS,
)

annotation_widget

HTML(value="<h2 style='font-size: 20px; font-weight: bold;'>Annotating document 1 of 100 (Job/Course ID: 6469)…

AnnotateWidget(value=None, docs=['Embedded Software Engineer (m/w) 100%\nMit dem ersten automatischen Türantri…

In [4]:
# Heading for the classification step. It is derived from doc_idx (set when the
# span-annotation widget was built) rather than from the live selector value,
# so the displayed number and the displayed id always describe the same document.
text_widget = widgets.HTML(
    value=f"<h2 style='font-size: 20px; font-weight: bold;'>Annotating document {doc_idx + 1} of {len(DOCUMENTS)} "
    f"(Job/Course ID: {DOCUMENTS[doc_idx]['id']})</h2>",
)
display(text_widget)

# Spans marked for the (single) document shown in the annotation widget.
# NOTE(review): assumes `spans` is a sequence with one entry per document and
# is empty/None before anything is marked -- confirm against jupyterannotate.
marked_spans = annotation_widget.spans
doc_spans = marked_spans[0] if marked_spans else []

if not doc_spans:
    print("No annotations found for this document")
else:
    try:
        # One multi-label question per marked skill, answered with taxonomy labels.
        annotations = pixt.annotate(
            get_skills_per_doc(doc_spans),
            options=tax,
            task_type="multilabel-classification",
            buttons_in_a_row=5,
            reset_buttons_after_click=True,
        )

        # Show the source text below the buttons for reference. The text is
        # escaped so characters such as "<" or "&" in a posting cannot break
        # or inject markup; pre-wrap keeps its line breaks visible.
        fulltext = DOCUMENTS[doc_idx]["full_text"]
        text_widget = widgets.HTML(
            value="<h2 style='font-size: 14px; font-weight: normal; white-space: pre-wrap;'>"
            f"FOR REFERENCE:<br>{html.escape(fulltext)}</h2>",
        )
        display(text_widget)
    except Exception as exc:
        # Still best-effort (the notebook keeps running), but the real cause is
        # reported instead of being hidden, and KeyboardInterrupt is not caught.
        print(f"Could not start skill classification: {exc!r}")

HTML(value="<h2 style='font-size: 20px; font-weight: bold;'>Annotating document 1 of 100 (Job/Course ID: 6469)…

HTML(value='0 of 6 Examples annotated, Current Position: 0 ')

VBox(children=(HBox(children=(ToggleButton(value=False, description='Kognitive Fertigkeit'), ToggleButton(valu…

Output()

HTML(value="<h2 style='font-size: 14px; font-weight: normal;'>FOR REFERENCE:<br>Embedded Software Engineer (m/…

In [None]:
# Save annotations
# We need to save the extracted/matched skills in a separate file and allow overwriting them if the annotation is redone.
# We also want to save intermediate results once we are done with just the skills.

In [41]:
# Accumulator for the skill-id selections: the Save button callback appends one
# list of ids per click. No `global` statement is needed here -- at module
# level a plain assignment already creates a global name.
# Note: re-running this cell resets the list and discards earlier selections.
selected_skills_list = []


In [42]:
# Taxonomy table that the keyword search looks through.
# NOTE(review): load_lv2 reads "../taxonomy/taxonomy_V4.csv"; confirm this
# relative path refers to the same file.
# TODO: keep only a few columns
t = pd.read_csv("taxonomy_V4.csv", sep=",")

def search_keyword(keyword, table=None):
    """Return the taxonomy rows whose "Definition" contains ``keyword``.

    Parameters
    ----------
    keyword : str
        Text to look for. It is matched literally (not as a regular
        expression) and case-insensitively, so inputs such as "C++" or "(m/w"
        are safe.
    table : pandas.DataFrame, optional
        Table to search; must have a "Definition" column. Defaults to the
        module-level taxonomy ``t``.

    Returns
    -------
    pandas.DataFrame
        The matching rows. Rows with a missing definition never match.
    """
    if table is None:
        table = t
    # TODO: look in other columns as well
    # na=False: without it, missing definitions put NaN into the mask and
    # boolean indexing raises.
    matches = table["Definition"].str.contains(
        keyword, case=False, regex=False, na=False
    )
    return table[matches]

# Free-text box for the keyword to look up in the taxonomy.
keyword_widget = widgets.Text(
    value="",
    placeholder="Type a keyword",
    description="Keyword:",
    disabled=False,
)

# Free-text box for the ids of the skills to keep (comma-separated integers,
# as parsed by process_selection).
selection_widget = widgets.Text(
    value="",
    placeholder="Type skill ids",
    description="Selected:",
    disabled=False,
)

# Stores the current selection when clicked.
save_button = widgets.Button(description="Save")



def on_save_button_click(b):
    """Save the skill ids currently typed into ``selection_widget``.

    The parsed ids are appended as one list to ``selected_skills_list`` and
    echoed. If the text cannot be parsed (``process_selection`` raises
    ``ValueError`` for non-integer entries), nothing is appended and a message
    is printed instead of letting the error escape the callback.

    Parameters
    ----------
    b : ipywidgets.Button
        The clicked button (unused; required by the ``on_click`` signature).
    """
    raw_selection = selection_widget.value
    try:
        selected_skills = process_selection(raw_selection)
    except ValueError:
        print(
            f"Nothing saved: could not read skill ids from {raw_selection!r} "
            "(expected comma-separated integers)"
        )
        return

    selected_skills_list.append(selected_skills)
    print(f'Content saved: {selected_skills}')


save_button.on_click(on_save_button_click)


def process_selection(selection):
    """Parse a comma-separated string of skill ids into a list of ints.

    Parameters
    ----------
    selection : str
        For example ``"239, 218"``. Whitespace around ids is ignored, and so
        are empty entries (a trailing comma, doubled commas, or blank input),
        which previously crashed on ``int("")``.

    Returns
    -------
    list of int
        The ids in the order typed; ``[]`` when nothing was entered.

    Raises
    ------
    ValueError
        If a non-empty entry is not an integer.
    """
    if not selection:
        return []
    tokens = (token.strip() for token in selection.split(","))
    return [int(token) for token in tokens if token]
    

# Search results get their own output area. Previously the callback cleared the
# whole cell output and re-displayed only some of the controls, which dropped
# the Save button after the first search.
results_output = widgets.Output()


def on_button_click(b):
    """Search the taxonomy for the typed keyword and show the matching rows.

    Only ``results_output`` is refreshed; the input boxes and buttons stay in
    place.

    Parameters
    ----------
    b : ipywidgets.Button
        The clicked button (unused; required by the ``on_click`` signature).
    """
    results = search_keyword(keyword_widget.value)
    with results_output:
        # wait=True swaps the old table for the new one without flicker.
        clear_output(wait=True)
        display(results)


search_button = widgets.Button(description='Search')
search_button.on_click(on_button_click)


# The whole form: keyword + search, results, then id selection + save.
display(
    widgets.VBox(
        [keyword_widget, search_button, results_output, selection_widget, save_button]
    )
)
# TODO: append the saved selection to a DataFrame, one entry per annotated element.


VBox(children=(Text(value='', description='Keyword:', placeholder='Type a keyword'), Button(description='Searc…

Content saved: [239, 218]
Content saved: [212]


In [44]:
# Show every selection saved so far (one inner list of skill ids per Save click).
selected_skills_list

[[239, 218], [212]]