In [205]:
import jupyterannotate
import pandas as pd
from ipywidgets import widgets, interact, Layout, HBox, VBox
from IPython.display import display
from IPython.display import HTML
import json


def load_data(data_type):
    """
    Read the pre-sampled documents of the requested kind into a DataFrame.

    ``data_type`` selects the file ``../processed/{data_type}_sample_100.json``
    and must be either "job" or "course"; any other value raises ValueError
    before the file system is touched.
    """
    if data_type not in ("job", "course"):
        raise ValueError("data_type must be 'job' or 'course'")
    sample_path = f"../processed/{data_type}_sample_100.json"
    return pd.read_json(sample_path)


def get_lowest_level(row):
    """
    Return the most specific taxonomy level that is filled in for ``row``.

    The columns are probed from "Type Level 4" down to "Type Level 1" and the
    first value that is not missing (per ``pd.isna``) is returned. When all
    four are missing the result is None.
    """
    # NOTE: only the value is returned. The level name is not appended, so the
    # same name used on two different levels cannot be told apart by callers.
    ordered_levels = ("Type Level 4", "Type Level 3", "Type Level 2", "Type Level 1")
    candidates = (row[column] for column in ordered_levels)
    return next((value for value in candidates if not pd.isna(value)), None)


def load_taxonomy(level="lowest"):
    """
    Build the list of taxonomy labels offered as options in the matching step.

    Reads ``../taxonomy/taxonomy_V4.csv``, drops rows that lack a
    "Definition" or a "Type Level 2" value, and collects the distinct labels.

    Args:
        level: "lowest" to take, per row, the most specific level that is
            filled in (via ``get_lowest_level``), or "level2" to take the
            "Type Level 2" column.

    Returns:
        The distinct labels in first-seen order, followed by the two special
        entries "NONE" and "ADD_NEW".

    Raises:
        ValueError: if ``level`` is neither "lowest" nor "level2".
    """
    # Validate before any file I/O. An unknown level used to fall through both
    # branches and fail with UnboundLocalError on ``list_levels``.
    if level not in ("lowest", "level2"):
        raise ValueError("level must be 'lowest' or 'level2'")

    df = pd.read_csv("../taxonomy/taxonomy_V4.csv", sep=",")
    df = df.dropna(subset=["Definition", "Type Level 2"])
    keep_cols = [
        "Type Level 1",
        "Type Level 2",
        "Type Level 3",
        "Type Level 4",
    ]
    df = df[keep_cols]

    if level == "lowest":
        labels = df.apply(get_lowest_level, axis=1)
    else:
        labels = df["Type Level 2"]

    # dict.fromkeys removes duplicates while keeping file order, so the option
    # list is the same in every session (iteration order of a set of strings
    # is not).
    return list(dict.fromkeys(labels)) + ["NONE", "ADD_NEW"]


def doc_widget(max_docs=100):
    """
    Create the document picker shown above the annotation widgets.

    Args:
        max_docs: highest document number that can be entered (default 100).

    Returns:
        A ``(style, text_input)`` tuple: an HTML widget carrying the CSS that
        enlarges and bolds the picker's label and input, and the
        BoundedIntText picker itself (starts at 1, minimum 1).
    """
    text_input = widgets.BoundedIntText(
        value=1,  # Initial value
        min=1,  # Minimum value
        max=max_docs,  # Maximum value
        description="DOC:",
        # The layout used to be created with width="70%" and then immediately
        # overwritten with "20%"; only the effective width is kept.
        layout=widgets.Layout(width="20%"),
    )
    style = widgets.HTML(
        "<style>.widget-text .widget-label, .widget-text input {font-size: 20px; font-weight: bold;}</style>"
    )
    return style, text_input


def get_skills_per_doc(span):
    """
    Turn a list of annotated spans into a one-column DataFrame.

    Each element of ``span`` must provide a "text" entry; its value, with
    surrounding whitespace stripped, becomes one row of the "example" column.
    An empty ``span`` gives an empty DataFrame.
    """
    rows = [{"example": entry["text"].strip()} for entry in span]
    return pd.DataFrame(rows)


def extraction_step(text_input, doc_idx):
    """
    Show the header for the selected document and build its annotation widget.

    The header displays the number currently entered in ``text_input`` and
    the ``id`` of ``DOCUMENTS[doc_idx]``. The returned AnnotateWidget is
    created for that document's ``fulltext`` with ``LEVEL_LABELS`` as labels;
    it is returned without being displayed.
    """
    header_html = (
        f"<h2 style='font-size: 20px; font-weight: bold;'>Annotating document {text_input.value} of 100 "
        f"(Job/Course ID: {DOCUMENTS[doc_idx]['id']})</h2>"
    )
    display(widgets.HTML(value=header_html))

    # Only constructed here; showing the widget is left to the caller.
    return jupyterannotate.AnnotateWidget(
        docs=DOCUMENTS[doc_idx]["fulltext"],
        labels=LEVEL_LABELS,
    )


def save_extractions(doc_idx, extraction_widget):
    """
    Persist the highlighted spans of one document to ``anno_extractions.json``.

    The file holds a JSON list of ``{"doc_id", "extraction"}`` records. A
    record with the same ``doc_id`` is replaced in place; otherwise the new
    record is appended. A missing file is treated as an empty list. When the
    widget holds no spans for the document, a message is printed and the file
    is left untouched.

    Args:
        doc_idx: index into the module-level ``DOCUMENTS`` list.
        extraction_widget: annotation widget whose ``spans[0]`` holds the
            spans of the displayed document.
    """
    try:
        spans = extraction_widget.spans[0]
    except (IndexError, TypeError):
        # A bare ``except`` used to print this message and then carry on,
        # crashing with IndexError on the empty record list further down.
        print("No terms highlighted from extraction step")
        return

    record = {"doc_id": DOCUMENTS[doc_idx]["id"], "extraction": spans}

    try:
        with open("anno_extractions.json", "r", encoding="utf-8") as f:
            extractions = json.load(f)
    except FileNotFoundError:
        extractions = []

    # Replace the record of an already-annotated document, else append.
    for idx, existing in enumerate(extractions):
        if existing["doc_id"] == record["doc_id"]:
            extractions[idx] = record
            break
    else:
        extractions.append(record)

    # Writing the updated/modified data back to the file
    with open("anno_extractions.json", "w", encoding="utf-8") as f:
        json.dump(extractions, f, ensure_ascii=False)


def matching_step(extraction_widget):
    """
    Display the matching form for every highlighted span and save on submit.

    For each span in ``extraction_widget.spans[0]`` a group is displayed with
    the span text, a Required/Optional radio button and three taxonomy
    comboboxes (options from ``TAX_ELEMENTS``). Whenever any widget value
    changes, the current values of all groups are copied into an in-memory
    dict. Pressing "Submit" merges those values into the records loaded from
    ``anno_extractions.json`` and writes the result to ``anno_matching.json``.

    Notes:
        * Values are stored under "Match 1" .. "Match 4" in widget order, so
          "Match 1" is the Required/Optional choice and "Match 2" ..
          "Match 4" are Label 1 .. Label 3.
        * Inputs are keyed by span text; on submit they are applied to every
          stored extraction with that text, whichever document it belongs to.
        * Nothing is recorded until at least one widget value has changed.
    """
    user_inputs = {}
    widget_sets = []

    for item in extraction_widget.spans[0]:
        # create a text widget
        text_widget = widgets.HTML(
            value=f"<h2 style='font-size: 18px; font-weight: bold;'>{item['text']}</h2>",
        )

        req_v_optional = widgets.RadioButtons(
            options=["SELECT_BELOW", "Required", "Optional"],
            description="Required or Optional Skill:",
            disabled=False,
        )

        # The three label pickers differ only in description and placeholder.
        match_1, match_2, match_3 = (
            widgets.Combobox(
                options=TAX_ELEMENTS,
                placeholder=placeholder,
                ensure_option=True,
                description=description,
            )
            for description, placeholder in (
                ("Label 1:", "Select or type to add"),
                ("Label 2:", "NONE"),
                ("Label 3:", "NONE"),
            )
        )

        widget_group = VBox([text_widget, req_v_optional, match_1, match_2, match_3])
        display(widget_group)

        # Save each widget set with the associated text item
        widget_sets.append(
            {
                "text": item["text"],
                "widgets": (req_v_optional, match_1, match_2, match_3),
            }
        )

    # Function to capture user inputs and associate them with the text
    def on_value_change(change):
        for widget_set in widget_sets:
            labels = {}
            for idx, widget in enumerate(widget_set["widgets"]):
                labels[f"Match {idx+1}"] = widget.value
            user_inputs[widget_set["text"]] = labels

    # Capture and save user inputs when the widget values change
    for widget_set in widget_sets:
        for widget in widget_set["widgets"]:
            widget.observe(on_value_change, names="value")

    try:
        # The file is written as UTF-8 with ensure_ascii=False, so it must be
        # read as UTF-8 too rather than with the platform default encoding.
        with open("anno_extractions.json", "r", encoding="utf-8") as f:
            extractions = json.load(f)
    except FileNotFoundError:
        print("File not found - please check")
        # Sentinel checked on submit; the name used to stay unbound here and
        # Submit then failed with NameError.
        extractions = None

    def submit_button_matching(_):
        if extractions is None:
            # Nothing to merge into, and nothing is written.
            print("File not found - please check")
            return

        for entry in extractions:
            for extraction in entry["extraction"]:
                text = extraction["text"]
                if text in user_inputs:
                    extraction.update(user_inputs[text])

        with open("anno_matching.json", "w", encoding="utf-8") as f:
            json.dump(extractions, f, ensure_ascii=False)

    # Create a submit button
    submit_button = widgets.Button(description="Submit", button_style="success")

    # Assign the submit_button_matching function to the button's on_click event
    submit_button.on_click(submit_button_matching)

    # Display the button widget
    display(VBox([submit_button]))


# Load the job sample; load_data also accepts "course".
df = load_data("job")
# SKILL_LABELS = ["Skill"]
# Labels passed to the annotation widget in the extraction step.
LEVEL_LABELS = ["Beginner", "Intermediate", "Expert", "Unknown"]
# 100 sampled rows as a list of {"id", "fulltext"} dicts; the fixed
# random_state makes the sample reproducible between runs.
DOCUMENTS = df[["id", "fulltext"]].sample(100, random_state=42).to_dict("records")
# Options offered by the label comboboxes in the matching step.
TAX_ELEMENTS = load_taxonomy("lowest")

In [192]:
# Build and show the document picker; its value is read by the next cell.
style, text_input = doc_widget()
display(style, text_input)

HTML(value='<style>.widget-text .widget-label, .widget-text input {font-size: 20px; font-weight: bold;}</style…

BoundedIntText(value=1, description='DOC:', layout=Layout(width='20%'), min=1)

In [198]:
# The picker starts at 1 while DOCUMENTS is indexed from 0.
doc_idx = text_input.value - 1
extraction_widget = extraction_step(text_input, doc_idx=doc_idx)
display(extraction_widget)

HTML(value="<h2 style='font-size: 20px; font-weight: bold;'>Annotating document 11 of 100 (Job/Course ID: 6096…

AnnotateWidget(value=None, docs=['ICT Business Partner/in\nWas erwartet Sie?   Sie übernehmen und betreuen rel…

In [201]:
# Inspect the spans highlighted so far for the displayed document.
extraction_widget.spans[0]

[{'start': 556,
  'end': 578,
  'text': 'ICT-Providermanagement',
  'label': {'text': 'Beginner', 'color': 'red'}},
 {'start': 921,
  'end': 936,
  'text': 'flexiblen Team ',
  'label': {'text': 'Intermediate', 'color': 'cyan'}},
 {'start': 1131,
  'end': 1148,
  'text': 'Bachelorabschluss',
  'label': {'text': 'Intermediate', 'color': 'cyan'}},
 {'start': 1467,
  'end': 1485,
  'text': 'nformationsanalyse',
  'label': {'text': 'Unknown', 'color': 'violet'}},
 {'start': 1512,
  'end': 1536,
  'text': 'Computerhardwaresystemen',
  'label': {'text': 'Intermediate', 'color': 'cyan'}},
 {'start': 434,
  'end': 454,
  'text': 'Bewerbungsunterlagen',
  'label': {'text': 'Unknown', 'color': 'violet'}}]

In [206]:
# Save the spans first: matching_step reads them back from
# anno_extractions.json.
save_extractions(doc_idx, extraction_widget)
matching_step(extraction_widget)

VBox(children=(HTML(value="<h2 style='font-size: 18px; font-weight: bold;'>ICT-Providermanagement</h2>"), Radi…

VBox(children=(HTML(value="<h2 style='font-size: 18px; font-weight: bold;'>flexiblen Team </h2>"), RadioButton…

VBox(children=(HTML(value="<h2 style='font-size: 18px; font-weight: bold;'>Bachelorabschluss</h2>"), RadioButt…

VBox(children=(HTML(value="<h2 style='font-size: 18px; font-weight: bold;'>nformationsanalyse</h2>"), RadioBut…

VBox(children=(HTML(value="<h2 style='font-size: 18px; font-weight: bold;'>Computerhardwaresystemen</h2>"), Ra…

VBox(children=(HTML(value="<h2 style='font-size: 18px; font-weight: bold;'>Bewerbungsunterlagen</h2>"), RadioB…

VBox(children=(Button(button_style='success', description='Submit', style=ButtonStyle()),))

In [None]:
# import sqlite3


# class Database:
#     """SQLite database to store annotations."""

#     def __init__(self):
#         self.con = sqlite3.connect("annotations.db")
#         self.cursor = self.con.cursor()

#         self.cursor.execute(
#             """
#                 CREATE TABLE annotations (
#                 entity_id TEXT,
#                 text_id INTEGER,
#                 extracted_term TEXT,
#                 match_1 TEXT,
#                 match_2 TEXT,
#                 match_3 TEXT
#                 );
#             """
#         )

#     def insert_term(self, row: dict):
#         self.cursor.execute(
#             """
#             INSERT INTO annotations VALUES (:entity_id, :text_id, :extracted_term, :match_1, :match_2, :match_3)
#             """,
#             row,
#         )
#         self.con.commit()


# db = Database()

In [41]:
# Collects one list of selected skill ids per click on the "Save" button.
# A module-level assignment already creates a global name, so the former
# ``global`` statement (a no-op at this level) has been dropped.
selected_skills_list = []

In [207]:
# Taxonomy table that search_keyword() filters.
# NOTE(review): load_taxonomy() reads "../taxonomy/taxonomy_V4.csv"; confirm
# that this relative path resolves to the same file.
t = pd.read_csv("taxonomy_V4.csv", sep=",")
# TODO: keep only a few columns


def search_keyword(keyword):
    """
    Return the rows of the taxonomy table ``t`` whose "Definition" contains
    ``keyword``.

    The match is case-insensitive and literal (the keyword is not interpreted
    as a regular expression), so terms such as "C++" can be searched. Rows
    without a definition never match.
    """
    # TODO: look in other columns as well
    # na=False: a missing Definition would otherwise put NaN into the mask and
    # make the boolean indexing below raise.
    # regex=False: keywords containing regex metacharacters no longer raise.
    matches = t["Definition"].str.contains(
        keyword, case=False, regex=False, na=False
    )
    return t[matches]


# Text box for the keyword that is searched in the taxonomy definitions
keyword_widget = widgets.Text(
    value="", placeholder="Type a keyword", description="Keyword:", disabled=False
)

# Text box for the comma-separated skill ids to save (parsed by process_selection)
selection_widget = widgets.Text(
    value="", placeholder="Type skill ids", description="Selected:", disabled=False
)
# Button that stores the ids currently typed into selection_widget
save_button = widgets.Button(description="Save")


def on_save_button_click(b):
    """
    Click handler for the "Save" button.

    Parses the ids typed into ``selection_widget`` and appends them, as one
    list, to ``selected_skills_list``. If the text cannot be parsed, nothing
    is appended and a message is printed instead.
    """
    try:
        selected_skills = process_selection(selection_widget.value)
    except ValueError as err:
        # int() rejects non-numeric ids; tell the annotator instead of letting
        # the exception escape from the UI callback.
        print(f"Selection not saved: {err}")
        return
    selected_skills_list.append(selected_skills)
    print(f"Content saved: {selected_skills}")


save_button.on_click(on_save_button_click)


def process_selection(selection):
    """
    Parse a comma-separated string of skill ids into a list of ints.

    Whitespace around each id is ignored, and empty pieces (for example from
    a trailing comma, as in "239, 218,") are skipped instead of being passed
    to int(). An empty or whitespace-only string gives an empty list.

    Raises:
        ValueError: if a non-empty piece is not an integer.
    """
    if not selection:
        return []
    tokens = (piece.strip() for piece in selection.split(","))
    return [int(token) for token in tokens if token]


def on_button_click(b):
    """
    Click handler for the "Search" button.

    Looks up the keyword currently typed into ``keyword_widget``, clears the
    previous output and re-displays the search controls, the matching
    taxonomy rows and the selection controls.
    """
    # clear_output is not among the names imported at the top of the notebook,
    # so import it here to avoid a NameError on click.
    from IPython.display import clear_output

    keyword = keyword_widget.value
    results = search_keyword(keyword)
    clear_output()  # Clear previous output
    display(keyword_widget)
    display(search_button)
    display(results)
    display(selection_widget)
    # The initial layout also contains the Save button; show it again so it is
    # still available after the output has been cleared.
    display(save_button)


# Create the Search button and attach its click handler.
search_button = widgets.Button(description="Search")
search_button.on_click(on_button_click)


# Initial layout: keyword box, Search button, selection box, Save button.
display(widgets.VBox([keyword_widget, search_button, selection_widget, save_button]))
# TODO: append this to a dataframe for each element to annotate

VBox(children=(Text(value='', description='Keyword:', placeholder='Type a keyword'), Button(description='Searc…

In [44]:
selected_skills_list

[[239, 218], [212]]