# NER with Data from Doccano

<div>
<img src="https://github.com/flxst/nerblackbox/blob/master/docs/docs/images/nerblackbox_doccano.png?raw=true" width="800"/>
</div>

# Preparations

### nerblackbox: Create a Store for data, models, results etc.

In [None]:
# import main classes from nerblackbox
from nerblackbox import AnnotationTool, Store, Dataset, Experiment, Model

In [None]:
# create the store, i.e. the local folder (listed below as ./store) for data, models and results
Store.create()

In [None]:
# view the content of the newly created folder
# (the leading "!" is an IPython shell escape: the rest of the line runs as a shell command)
!ls store

### Doccano: Run Server & Connect

In [None]:
# check that Doccano server is running by querying its health endpoint on localhost:8082
!curl http://localhost:8082/v1/health/

In [None]:
# show configuration file for Doccano (the same file is passed to AnnotationTool.from_config below)
!cat doccano.ini

In [None]:
# connect to Doccano for the dataset "ehealth_kd", using the settings in doccano.ini
annotation_tool = AnnotationTool.from_config(dataset_name="ehealth_kd", config_file="doccano.ini")

# stop early with a readable hint if the connection could not be established;
# all later upload/download cells rely on this connection
assert annotation_tool.connected is True, (
    "could not connect to Doccano - check that the server is running and that doccano.ini is correct"
)

# 0. Create human-annotated data

In [None]:
from utils import prepare_raw_data, upload_raw_data, simulate_annotation

### 0a. Prepare raw data

In [None]:
# prepare the raw data for the dataset "ehealth_kd" (helper imported from the local utils module)
prepare_raw_data("ehealth_kd")

In [None]:
# check that the *_data.jsonl files exist in the dataset folder of the store
!ls ./store/datasets/ehealth_kd/*_data.jsonl

In [None]:
# inspect the first row of "annotated_data.jsonl"
# (at this stage presumably the file written by prepare_raw_data above - TODO confirm)
!head -n 1 ./store/datasets/ehealth_kd/annotated_data.jsonl

### 0b. Upload raw data

In [None]:
# upload the raw data of "ehealth_kd" via the connected annotation tool (helper from utils)
upload_raw_data("ehealth_kd", annotation_tool)

### 0c. Simulate annotation

In [None]:
# simulate the annotation step for "ehealth_kd" via the annotation tool (helper from utils);
# presumably this stands in for manual labelling in Doccano - see utils for details
simulate_annotation("ehealth_kd", annotation_tool)

In [None]:
# inspect the first row of "annotated_data.jsonl" (local file in the store) after the simulated annotation
!head -n 1 ./store/datasets/ehealth_kd/annotated_data.jsonl

# 1. Download human-annotated data

In [None]:
# download the project "annotated_data" through the annotation tool
annotation_tool.download(project_name="annotated_data")

In [None]:
# inspect the first row of "annotated_data.jsonl" after the download
# (presumably the download above wrote this file - TODO confirm)
!head -n 1 ./store/datasets/ehealth_kd/annotated_data.jsonl

# 2. Train a model

### 2a. Prepare data

In [None]:
# look up the local file that belongs to the project "annotated_data"
file_path = annotation_tool.get_file_path(project_name="annotated_data")

# describe the dataset "ehealth_kd" based on that file ...
dataset = Dataset(
    name="ehealth_kd",
    source="LF",
    pretokenized=False,
    split=False,
    file_path=file_path,
)

# ... and set it up so that it can be referenced by name in the experiment
dataset.set_up()

### 2b. Train

In [None]:
# define the experiment "doccano": dataset "ehealth_kd" combined with a pretrained model
experiment = Experiment(
    "doccano",
    dataset="ehealth_kd",
    model="mrm8488/electricidad-base-discriminator",
)

In [None]:
# run the experiment, i.e. the training step (this will take at least a few minutes)
experiment.run()

In [None]:
# inspect the model performance:
# micro-averaged f1 score on entity level for the test phase (shown as cell output)
experiment.get_result(
    label="micro",
    level="entity",
    metric="f1",
    phase="test",
)

# 3. Evaluate the model

In [None]:
# load the model that belongs to the experiment "doccano" defined and run above
model = Model.from_experiment("doccano")

In [None]:
# evaluate the model on the test phase of the dataset "ehealth_kd" (given in "jsonl" format)
results = model.evaluate_on_dataset("ehealth_kd", "jsonl", phase="test")
# get the micro-averaged f1 score on entity level (shown as cell output)
results["micro"]["entity"]["f1"]

# 4. Apply the model

### 4a. Single example

In [None]:
# predict on a single example sentence; the prediction is shown as cell output
model.predict("La vitamina D ayuda al cuerpo a absorber el calcio.")

### 4b. Raw data

In [None]:
# download the raw data, i.e. the project "raw_data"
annotation_tool.download(project_name="raw_data")

In [None]:
# let the model annotate the raw data:
# the input is the file of the project "raw_data",
# the output goes to the file of the project "raw_data_model_annotated"
input_file = annotation_tool.get_file_path(project_name="raw_data")
output_file = annotation_tool.get_file_path(project_name="raw_data_model_annotated")

model.predict_on_file(input_file, output_file)

In [None]:
# inspect the first row of "raw_data_model_annotated.jsonl", the output file of the prediction above
!head -n 1 ./store/datasets/ehealth_kd/raw_data_model_annotated.jsonl

# 5. Upload model-annotated data

In [None]:
# upload the model-annotated data as project "raw_data_model_annotated" through the annotation tool
annotation_tool.upload(project_name="raw_data_model_annotated")