Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions doccano_mini/examples/named_entity_recognition.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[
{"text": "EU rejects German call to boycott British lamb."},
{"text": "Peter Blackburn"},
{"text": "BRUSSELS 1996-08-22"}
]
4 changes: 4 additions & 0 deletions doccano_mini/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,15 @@ def make_prompt(self, examples: List[Dict]) -> FewShotPromptTemplate:
def prepare_inputs(self, columns: List[str]) -> Dict:
    """Return the input values for ``columns``.

    The base implementation has nothing to offer and always raises, so a
    concrete page has to override this method.

    Raises:
        NotImplementedError: always, in this base implementation.
    """
    raise NotImplementedError

def annotate(self, examples: List[Dict]) -> List[Dict]:
    """Annotation hook; the default is a pass-through.

    Args:
        examples: the example records about to be turned into a prompt.

    Returns:
        The very same ``examples`` object, untouched. Pages that need an
        interactive annotation step override this method.
    """
    unchanged = examples
    return unchanged

def render(self) -> None:
st.title(self.title)
st.header("Annotate your data")
columns = self.columns
examples = self.make_examples(columns)
examples = self.annotate(examples)

prompt = self.make_prompt(examples)
prompt = task_instruction_editor(prompt)
Expand Down
7 changes: 7 additions & 0 deletions doccano_mini/models/entity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from typing import TypedDict


class Entity(TypedDict):
    """A labelled span inside a piece of text."""

    # Offset where the span begins; used as the slice start into the text.
    start: int
    # Offset where the span stops; used as the (exclusive) slice end.
    end: int
    # Name of the entity type assigned to the span.
    label: str
28 changes: 28 additions & 0 deletions doccano_mini/models/stepper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
class Stepper:
    """Zero-based cursor over a collection of ``total`` items.

    The cursor wraps around at both ends when moved with ``increment`` /
    ``decrement`` and can be clamped back into range with ``fit``.
    """

    def __init__(self, step=0):
        self._step = step

    @property
    def step(self) -> int:
        """Current zero-based position."""
        return self._step

    def fit(self, total: int):
        """Clamp the current step into ``[0, total - 1]``.

        An empty collection (``total <= 0``) resets the step to 0 rather than
        leaving a negative index behind.
        """
        last = max(total - 1, 0)
        self._step = min(max(self._step, 0), last)

    def at(self, step: int, total: int):
        """Jump to ``step``.

        Raises:
            ValueError: if ``step`` is negative or not less than ``total``.
        """
        if step >= total:
            raise ValueError(f"step must be less than {total}")
        if step < 0:
            # Zero is a valid position, so the bound is inclusive.
            raise ValueError("step must be greater than or equal to 0")
        self._step = step

    def increment(self, total: int):
        """Move one position forward, wrapping to 0 past the last item."""
        self._step += 1
        if self._step >= total:
            self._step = 0

    def decrement(self, total: int):
        """Move one position back, wrapping to the last item before 0."""
        self._step -= 1
        if self._step < 0:
            # With nothing to step through, stay at 0 instead of -1.
            self._step = max(total - 1, 0)
60 changes: 60 additions & 0 deletions doccano_mini/pages/05_Named_Entity_Recognition.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from typing import Dict, List

import pandas as pd
import streamlit as st
from st_ner_annotate import st_ner_annotate

from doccano_mini.layout import BasePage
from doccano_mini.prompts import make_named_entity_recognition_prompt
from doccano_mini.storages.entity import EntitySessionStorage
from doccano_mini.storages.stepper import StepperSessionStorage


class NamedEntityRecognitionPage(BasePage):
    """Page for marking entity spans in example texts and building an NER prompt.

    Entity annotations and the index of the example being edited are kept in
    session-backed repositories so they survive Streamlit reruns.
    """

    example_path = "named_entity_recognition.json"

    def __init__(self, title: str) -> None:
        super().__init__(title)
        # Filled by define_entity_types(); read later by make_prompt().
        self.types: List[str] = []
        self.entity_repository = EntitySessionStorage()
        self.stepper_repository = StepperSessionStorage()

    def define_entity_types(self):
        """Render the editable entity-type table and return the chosen types.

        Returns:
            A plain list of non-blank type names, also stored on ``self.types``.
        """
        st.subheader("Define entity types")
        default_types = pd.DataFrame({"type": ["ORG", "LOC", "PER"]})
        edited_df = st.experimental_data_editor(default_types, num_rows="dynamic", width=1000)
        # Rows added in the dynamic editor but not filled in come back as
        # None/NaN; drop those and blank strings so they never become a
        # selectable type or a "- nan" line in the prompt.
        types = [str(value) for value in edited_df["type"].dropna() if str(value).strip()]
        self.types = types
        return types

    def annotate(self, examples: List[Dict]) -> List[Dict]:
        """Show the annotation widgets for the current example.

        Args:
            examples: records with a ``"text"`` key.

        Returns:
            ``examples`` unchanged (annotations go to the entity repository),
            or an empty list when there is nothing to annotate.
        """
        total = len(examples)
        if total == 0:
            return []

        types = self.define_entity_types()
        selected_type = st.selectbox("Select an entity type", types)

        col1, col2, _ = st.columns([1, 1, 8])
        col1.button("Prev", on_click=self.stepper_repository.decrement, args=(total,))
        col2.button("Next", on_click=self.stepper_repository.increment, args=(total,))

        # The number of examples may have shrunk since the step was stored.
        self.stepper_repository.fit(total)
        step = self.stepper_repository.get_step()
        text = examples[step]["text"]
        entities = self.entity_repository.find_by_text(text)
        # NOTE(review): st_ner_annotate is a third-party component; presumably
        # it returns the updated entity list for this text - confirm.
        entities = st_ner_annotate(selected_type, text, entities, key=text)
        self.entity_repository.store_by_text(text, entities)
        return examples

    def make_prompt(self, examples: List[Dict]):
        """Attach the stored entities to each example and build the prompt."""
        examples = [
            {**example, "entities": self.entity_repository.find_by_text(example["text"])} for example in examples
        ]
        return make_named_entity_recognition_prompt(examples, types=self.types)

    def prepare_inputs(self, columns: List[str]):
        """Render the free-text input area and return its value under ``"text"``."""
        return {"text": st.text_area(label="Please enter your text.", value="", height=300)}


# Build and draw the page at module level: everything happens when this file
# is executed. NOTE(review): the file lives under pages/, so presumably
# Streamlit's multipage mechanism runs it on each rerun - confirm.
page = NamedEntityRecognitionPage(title="Named Entity Recognition")
page.render()
34 changes: 34 additions & 0 deletions doccano_mini/prompts.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
from typing import List

from langchain.prompts.few_shot import FewShotPromptTemplate
Expand Down Expand Up @@ -79,3 +80,36 @@ def make_task_free_prompt(examples: List[dict]) -> FewShotPromptTemplate:
input_variables=columns[:-1],
)
return prompt


def make_named_entity_recognition_prompt(examples: List[dict], **kwargs) -> FewShotPromptTemplate:
    """Build a few-shot prompt for named entity recognition.

    Args:
        examples: records with a ``"text"`` string and an ``"entities"`` list
            whose items carry ``"start"``, ``"end"`` and ``"label"``.
        **kwargs: ``types`` - iterable of allowed entity type names
            (defaults to none).

    Returns:
        A ``FewShotPromptTemplate`` taking a single ``text`` input.

    The input ``examples`` are left untouched; the JSON-encoded entities are
    written to copies.
    """
    task_instruction = (
        "You are a highly intelligent and accurate Named-entity recognition(NER) system. "
        "You take Passage as input and your task is to recognize and extract specific types of "
        "named entities in that given passage and classify into a set of entity types.\n"
    )
    entity_types = kwargs.get("types", [])
    task_instruction += "The following entity types are allowed:\n"
    task_instruction += "".join(f"- {entity_type}\n" for entity_type in entity_types)

    formatted_examples = []
    for example in examples:
        text = example["text"]
        mentions = [
            {"mention": text[entity["start"] : entity["end"]], "type": entity["label"]}
            for entity in example["entities"]
        ]
        # ensure_ascii=False keeps non-ASCII mentions readable and identical
        # to the raw text shown next to them, instead of \uXXXX escapes.
        formatted_examples.append({**example, "entities": json.dumps(mentions, ensure_ascii=False)})

    example_prompt = PromptTemplate(
        input_variables=["text", "entities"],
        template="text: {text}\nentities: {entities}",
    )
    # NOTE(review): jinja2 is presumably chosen so the braces in the JSON
    # examples are not read as f-string placeholders - confirm.
    prompt = FewShotPromptTemplate(
        examples=formatted_examples,
        example_prompt=example_prompt,
        prefix=task_instruction,
        suffix="text: {{text}}",
        input_variables=["text"],
        template_format="jinja2",
    )
    return prompt
22 changes: 22 additions & 0 deletions doccano_mini/storages/entity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from collections import defaultdict
from typing import List

import streamlit as st

from doccano_mini.models.entity import Entity
from doccano_mini.storages.session_storage import SessionStorage


class EntitySessionStorage:
    """Keeps entity annotations in Streamlit session state, keyed by text."""

    def __init__(self) -> None:
        session = SessionStorage(state=st.session_state)
        # Only creates the mapping when the session has none yet.
        session.init_state("entities", defaultdict(list))
        self.storage = session

    def find_by_text(self, text: str) -> List[Entity]:
        """Return the entities stored for ``text``, or an empty list."""
        by_text = self.storage.get_state("entities")
        # Membership test first: a plain lookup on the defaultdict would
        # insert an entry for unknown texts.
        if text in by_text:
            return by_text[text]
        return []

    def store_by_text(self, text: str, entities: List[Entity]) -> None:
        """Replace the entities stored for ``text``."""
        by_text = self.storage.get_state("entities")
        by_text[text] = entities
        self.storage.set_state("entities", by_text)
21 changes: 21 additions & 0 deletions doccano_mini/storages/session_storage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from typing import Any

from streamlit.runtime.state import SessionStateProxy


class SessionStorage:
    """Thin key/value wrapper around a Streamlit session-state mapping."""

    def __init__(self, state: SessionStateProxy) -> None:
        self.state = state

    def init_state(self, key: str, value: Any) -> None:
        """Store ``value`` under ``key`` only when the key is absent."""
        if key in self.state:
            return
        self.state[key] = value

    def set_state(self, key: str, value: Any, *, do_init: bool = False) -> None:
        """Store ``value`` under ``key``, overwriting any previous value.

        NOTE(review): ``do_init`` does not change the outcome - the value is
        assigned unconditionally afterwards. Confirm what was intended.
        """
        if do_init:
            self.init_state(key, value)
        self.state[key] = value

    def get_state(self, key: str) -> Any:
        """Return the value stored under ``key``, or ``None`` when missing."""
        return self.state.get(key)
31 changes: 31 additions & 0 deletions doccano_mini/storages/stepper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import streamlit as st

from doccano_mini.models.stepper import Stepper
from doccano_mini.storages.session_storage import SessionStorage


class StepperSessionStorage:
    """Persists the current step in Streamlit session state under ``"step"``."""

    def __init__(self) -> None:
        self.storage = SessionStorage(state=st.session_state)
        # Starts at 0 only if the session holds no step yet.
        self.storage.init_state("step", 0)

    def get_step(self) -> int:
        """Return the step currently held in the session."""
        return self.storage.get_state("step")

    def _apply(self, operation, total: int) -> None:
        """Load the step, run one ``Stepper`` operation on it, save the result."""
        stepper = Stepper(self.storage.get_state("step"))
        operation(stepper, total)
        self.storage.set_state("step", stepper.step)

    def fit(self, total: int) -> None:
        """Apply ``Stepper.fit`` to the stored step."""
        self._apply(Stepper.fit, total)

    def increment(self, total: int) -> None:
        """Apply ``Stepper.increment`` to the stored step."""
        self._apply(Stepper.increment, total)

    def decrement(self, total: int) -> None:
        """Apply ``Stepper.decrement`` to the stored step."""
        self._apply(Stepper.decrement, total)
Loading