In [None]:
# !pip install --upgrade pip
# !pip install pigeonXT
# !pip install jupyter_innotater

In [134]:
# import sqlite3


# class Database:
#     """SQLite database to store annotations."""

#     def __init__(self):
#         self.con = sqlite3.connect("annotations.db")
#         self.cursor = self.con.cursor()

#         self.cursor.execute(
#             """
#                 CREATE TABLE annotations (
#                 entity_id TEXT,
#                 text_id INTEGER,
#                 extracted_term TEXT,
#                 match_1 TEXT,
#                 match_2 TEXT,
#                 match_3 TEXT
#                 );
#             """
#         )

#     def insert_term(self, row: dict):
#         self.cursor.execute(
#             """
#             INSERT INTO annotations VALUES (:entity_id, :text_id, :extracted_term, :match_1, :match_2, :match_3)
#             """,
#             row,
#         )
#         self.con.commit()


# db = Database()

In [169]:
import jupyterannotate

# import pigeonXT as pixt
import pandas as pd
from ipywidgets import widgets, interact, Layout, HBox, VBox
from IPython.display import display
from IPython.display import HTML
import json

import numpy as np


def load_data(type):
    """
    Load the raw documents to annotate.

    Parameters
    ----------
    type : str
        "job" reads ../raw/vacancies.json and builds the text to annotate by
        joining the "name" and "description" columns with a newline.
        "course" reads ../raw/learning_opportunities.json and keeps only the
        active rows whose "study_ids" contain at least one of 1, 5 or 9.

    Returns
    -------
    pandas.DataFrame
        The loaded (and, for courses, filtered) records.

    Raises
    ------
    ValueError
        If type is neither "job" nor "course". Previously this fell through
        both branches and failed with an UnboundLocalError on `return df`.
    """
    if type == "job":
        df = pd.read_json("../raw/vacancies.json")
        full_text = df["name"] + "\n" + df["description"]
        df["full_text"] = full_text
        # The original chained assignment also created this second column;
        # it is kept in case other code reads "fulltext".
        df["fulltext"] = full_text

    elif type == "course":
        df = pd.read_json("../raw/learning_opportunities.json")
        df = df[df["active"] == True]
        keep_ids = {1, 5, 9}
        df = df[df["study_ids"].apply(lambda x: bool(set(x) & keep_ids))]
        # NOTE(review): no "full_text" column is built for courses, yet the
        # module-level code selects df[["id", "full_text"]] — confirm which
        # course columns should make up the text before using this branch.

    else:
        raise ValueError(f"Unknown data type {type!r}; expected 'job' or 'course'")

    return df


def get_lowest_level(row):
    """
    Pick the most specific taxonomy level that is filled in for a row.

    The levels are inspected from "Type Level 4" down to "Type Level 1" and
    the first value that is not NaN is returned. When all four are NaN the
    result is None.
    """
    for depth in (4, 3, 2, 1):
        candidate = row[f"Type Level {depth}"]
        if pd.notna(candidate):
            return candidate
    return None


def load_taxonomy(level="lowest"):
    """
    Build the list of taxonomy labels offered in the matching step.

    Reads ../taxonomy/taxonomy_V4.csv, drops rows without a "Definition" or a
    "Type Level 2" value, and collects the distinct labels at the requested
    granularity. The sentinel options "NONE" and "ADD_NEW" are appended last.

    Parameters
    ----------
    level : str
        "lowest" uses the deepest non-NaN level of every row (see
        get_lowest_level); "level2" uses the "Type Level 2" column.

    Returns
    -------
    list
        Distinct labels sorted by their string form, followed by "NONE" and
        "ADD_NEW".

    Raises
    ------
    ValueError
        If level is neither "lowest" nor "level2". Previously this surfaced
        as an UnboundLocalError after the CSV had already been read.
    """
    if level not in ("lowest", "level2"):
        raise ValueError(
            f"Unknown taxonomy level {level!r}; expected 'lowest' or 'level2'"
        )

    df = pd.read_csv("../taxonomy/taxonomy_V4.csv", sep=",")
    df = df.dropna(subset=["Definition", "Type Level 2"])
    keep_cols = [
        "Type Level 1",
        "Type Level 2",
        "Type Level 3",
        "Type Level 4",
    ]
    df = df[keep_cols]

    if level == "lowest":
        # Iterate rows explicitly: DataFrame.apply(axis=1) on an empty frame
        # hands back a DataFrame, and set() over that would collect the
        # column names instead of labels.
        labels = {get_lowest_level(row) for _, row in df.iterrows()}
    else:
        labels = set(df["Type Level 2"])

    # A set has no stable order; sort so the option list is the same on
    # every run. key=str keeps the sort safe if a label is not a string.
    return sorted(labels, key=str) + ["NONE", "ADD_NEW"]


def doc_widget():
    """
    Create the document selector shown at the top of the annotation flow.

    Returns a (style, text_input) pair: an HTML widget carrying the CSS that
    enlarges the selector's label and input, and a BoundedIntText limited to
    1..100 that starts at 1.
    """
    css = (
        "<style>.widget-text .widget-label, .widget-text input "
        "{font-size: 20px; font-weight: bold;}</style>"
    )
    selector = widgets.BoundedIntText(
        value=1,
        min=1,
        max=100,
        description="DOC:",
        # 20% is the width the widget ends up with in the original code,
        # which set 70% and then overwrote it.
        layout=widgets.Layout(width="20%"),
    )
    return widgets.HTML(css), selector


def get_skills_per_doc(span):
    """
    Turn the highlighted spans of one document into a DataFrame.

    Each element of span must have a "text" entry; its surrounding whitespace
    is stripped and the result is stored under the "example" column, one row
    per span. An empty span yields an empty DataFrame.
    """
    records = [{"example": entry["text"].strip()} for entry in span]
    return pd.DataFrame(records)


def extraction_step(text_input, doc_idx):
    """
    Show the heading for one document and build its annotation widget.

    The heading displays the number currently held by text_input together
    with the id of DOCUMENTS[doc_idx]. The returned AnnotateWidget is created
    from that document's "full_text" with LEVEL_LABELS as the available
    labels; the caller is responsible for displaying it.
    """
    shown_number = text_input.value
    document = DOCUMENTS[doc_idx]

    heading_html = (
        f"<h2 style='font-size: 20px; font-weight: bold;'>Annotating document {shown_number} of 100 "
        f"(Job/Course ID: {document['id']})</h2>"
    )
    display(widgets.HTML(value=heading_html))

    # TODO: change size of text
    return jupyterannotate.AnnotateWidget(
        docs=document["full_text"],
        labels=LEVEL_LABELS,
    )


def save_extractions(doc_idx, extraction_widget):
    """
    Persist the spans highlighted for one document to extractions.json.

    The file holds a list of {"doc_id", "extraction"} records. If a record
    for this document already exists it is replaced, otherwise a new one is
    appended. A missing file is treated as an empty list.

    Parameters
    ----------
    doc_idx : int
        Index into DOCUMENTS of the document that was annotated.
    extraction_widget
        Annotation widget whose spans[0] holds the highlighted terms.

    Returns
    -------
    None
        If the widget has no spans, a message is printed and nothing is
        written. Previously the function printed the message and then
        crashed with an IndexError on the empty record list.
    """
    # Looked up outside the try block so a bad doc_idx is reported as such
    # instead of being mistaken for "nothing highlighted".
    doc_id = DOCUMENTS[doc_idx]["id"]

    try:
        spans = extraction_widget.spans[0]
    except (AttributeError, IndexError, TypeError):
        print("No terms highlighted from extraction step")
        return

    record = {"doc_id": doc_id, "extraction": spans}

    try:
        with open("extractions.json", "r") as f:
            extractions = json.load(f)
    except FileNotFoundError:
        extractions = []

    for idx, existing in enumerate(extractions):
        if existing["doc_id"] == doc_id:
            extractions[idx] = record
            break
    else:
        # No record for this document yet.
        extractions.append(record)

    # Writing the updated/modified data back to the file
    with open("extractions.json", "w") as f:
        json.dump(extractions, f)


def matching_step(extraction_widget):
    """
    Let the annotator map each highlighted term to up to three taxonomy labels.

    For every span in extraction_widget.spans[0] a heading with the span text
    and three Combobox widgets (options: TAX_ELEMENTS) are displayed. Changes
    are collected per span text under the keys "Match 1".."Match 3". A Submit
    button merges the collected labels into the records loaded from
    extractions.json and writes the result to matching.json.

    If extractions.json does not exist, a message is printed and no Submit
    button is shown, because there is nothing to merge the labels into.

    Parameters
    ----------
    extraction_widget
        Annotation widget whose spans[0] holds the highlighted terms; each
        term must have a "text" entry.

    Returns
    -------
    None
    """
    user_inputs = {}
    widget_sets = []

    for item in extraction_widget.spans[0]:
        # create a text widget
        text_widget = widgets.HTML(
            value=f"<h2 style='font-size: 18px; font-weight: bold;'>{item['text']}</h2>",
        )

        # Three label pickers; only the first one prompts for input, the
        # other two hint that they may stay empty.
        matches = tuple(
            widgets.Combobox(
                options=TAX_ELEMENTS,
                placeholder=placeholder,
                ensure_option=True,
                description=f"Label {number}:",
            )
            for number, placeholder in enumerate(
                ("Select or type to add", "NONE", "NONE"), start=1
            )
        )

        display(VBox([text_widget, *matches]))

        # Save each widget set with the associated text item
        widget_sets.append({"text": item["text"], "widgets": matches})

    # Function to capture user inputs and associate them with the text
    def on_value_change(change):
        # NOTE(review): labels are keyed by span text, so two spans with the
        # same text share one entry (the later widget set wins).
        for widget_set in widget_sets:
            user_inputs[widget_set["text"]] = {
                f"Match {idx + 1}": widget.value
                for idx, widget in enumerate(widget_set["widgets"])
            }

    # Capture and save user inputs when the widget values change
    for widget_set in widget_sets:
        for widget in widget_set["widgets"]:
            widget.observe(on_value_change, names="value")

    try:
        with open("extractions.json", "r") as f:
            extractions = json.load(f)
    except FileNotFoundError:
        # Without the extractions there is nothing to attach labels to;
        # previously the click handler failed later with a NameError.
        print("File not found - please check")
        return

    def submit_button_matching(_):
        # NOTE(review): the lookup is by text only, so a label is applied to
        # every saved extraction with that text, in any document.
        for entry in extractions:
            for extraction in entry["extraction"]:
                text = extraction["text"]
                if text in user_inputs:
                    extraction.update(user_inputs[text])

        with open("matching.json", "w") as f:
            json.dump(extractions, f)

    # Create a submit button
    submit_button = widgets.Button(description="Submit", button_style="success")

    # Assign the submit_button_matching function to the button's on_click event
    submit_button.on_click(submit_button_matching)

    # A bare expression inside a function is not rendered by the notebook,
    # so the button has to be displayed explicitly.
    display(VBox([submit_button]))


# Module-level configuration read by the functions defined above.
df = load_data("job")
# SKILL_LABELS = ["Skill"]
# Labels offered by the annotation widget built in extraction_step.
LEVEL_LABELS = ["Beginner", "Intermediate", "Expert", "Unknown"]
# 100 documents drawn with a fixed random_state, so the selection is
# repeatable for a given input file. The sample size matches the hard-coded
# max=100 in doc_widget and the "of 100" heading in extraction_step.
DOCUMENTS = df[["id", "full_text"]].sample(100, random_state=42).to_dict("records")
# Options for the Combobox widgets in matching_step.
TAX_ELEMENTS = load_taxonomy("lowest")

In [157]:
# Build and show the document selector; text_input.value is read by the next cell.
style, text_input = doc_widget()
display(style, text_input)

HTML(value='<style>.widget-text .widget-label, .widget-text input {font-size: 20px; font-weight: bold;}</style…

BoundedIntText(value=1, description='DOC:', layout=Layout(width='20%'), min=1)

In [161]:
# The selector is 1-based (min=1); DOCUMENTS is indexed from 0.
doc_idx = text_input.value - 1
extraction_widget = extraction_step(text_input, doc_idx=doc_idx)
display(extraction_widget)

HTML(value="<h2 style='font-size: 20px; font-weight: bold;'>Annotating document 60 of 100 (Job/Course ID: 1535…

AnnotateWidget(value=None, docs=["Director - Microsoft Technology Capability and Go-To-Market Lead (all gender…

In [170]:
# NOTE(review): matching_step reads extractions.json, which is written by
# save_extractions; no call to save_extractions is visible in this notebook —
# confirm it has been run for the current document first.
matching_step(extraction_widget)

VBox(children=(HTML(value="<h2 style='font-size: 18px; font-weight: bold;'>he ability to build relationships</…