# NER with Data from HuggingFace

<div>
<img src="https://github.com/flxst/nerblackbox/blob/master/docs/docs/images/nerblackbox.png?raw=true" width="800"/>
</div>

# Preparations

### nerblackbox: Create a Store for data, models, results etc.

In [None]:
# import main classes from nerblackbox
from nerblackbox import AnnotationTool, Store, Dataset, Experiment, Model

In [None]:
# Create the nerblackbox Store that will hold data, models, results etc.
# The next cell lists a local `store` folder, which this call is expected to create.
Store.create()

In [None]:
# view the content of the newly created folder
# (the leading `!` passes the line to the system shell, so this cell needs a notebook/IPython session)
!ls store

# 1. Download human-annotated data

In [None]:
# Select the "ehealth_kd" dataset and set it up.
# NOTE(review): the section title says this step downloads the data, and the next cell
# reads ./store/datasets/ehealth_kd/train.jsonl — confirm that set_up() is what writes it.
dataset = Dataset("ehealth_kd")
dataset.set_up()

In [None]:
# inspect the first row of the training data
# (presumably one JSON record per line, so `head -n 1` prints a single example — verify)
!head -n 1 ./store/datasets/ehealth_kd/train.jsonl

# 2. Train a model

In [None]:
# Define the experiment. The first argument, "huggingface", identifies it; the same
# string is passed to Model.from_experiment() later in this notebook.
# It pairs the "ehealth_kd" dataset with the given model identifier.
experiment = Experiment(
    "huggingface",
    dataset="ehealth_kd",
    model="mrm8488/electricidad-base-discriminator",
)

In [None]:
# run the experiment (this will take at least a few minutes)
# its results are queried afterwards through experiment.get_result(...)
experiment.run()

In [None]:
# Inspect the model performance: micro-averaged, entity-level f1 for the test phase.
# Kept as the cell's trailing expression so the notebook displays the value.
experiment.get_result(
    label="micro",
    level="entity",
    metric="f1",
    phase="test",
)

# 3. Evaluate the model

### 3a. Evaluate our model

In [None]:
# load the model
# "huggingface" is the same string given as the first argument to Experiment(...) above,
# so this presumably loads the model produced by that experiment — verify.
model = Model.from_experiment("huggingface")

In [None]:
# Evaluate our model on the "ehealth_kd" test phase (dataset given in "jsonl" format),
# then display the micro-averaged, entity-level f1 score from the returned dict.
evaluation_dict = model.evaluate_on_dataset(
    "ehealth_kd",
    "jsonl",
    phase="test",
)
evaluation_dict["micro"]["entity"]["f1"]

### 3b. Evaluate another model from HuggingFace

In [None]:
# Load a publicly available HuggingFace model that was trained on (nearly) the same
# data, to compare it against our own model.
model_huggingface = Model.from_huggingface(
    "fmmolina/bert-base-spanish-wwm-uncased-finetuned-NER-medical"
)

In [None]:
# Evaluate the HuggingFace model on the same "ehealth_kd" test phase ("jsonl" format),
# then display its micro-averaged, entity-level f1 score.
evaluation_dict_huggingface = model_huggingface.evaluate_on_dataset(
    "ehealth_kd",
    "jsonl",
    phase="test",
)
evaluation_dict_huggingface["micro"]["entity"]["f1"]

# 4. Apply the model

### 4a. Single example

In [None]:
# predict on a single example sentence (Spanish, matching the language of the models used above)
# the call is the cell's trailing expression, so the notebook displays its return value
model.predict("La vitamina D ayuda al cuerpo a absorber el calcio.")

### 4b. Test data

In [None]:
# let the model annotate the test data
# input_file: the test split inside the store's ehealth_kd dataset folder
input_file = "./store/datasets/ehealth_kd/test.jsonl"
# output_file: sits next to the input; the following cell inspects this path,
# so predict_on_file is expected to write the annotated rows there — verify
output_file = "./store/datasets/ehealth_kd/test_model_annotated.jsonl"

model.predict_on_file(input_file, output_file)

In [None]:
# inspect the first row of "test_model_annotated.jsonl"
# (same path as `output_file` in the previous cell; `!` runs the command in the system shell)
!head -n 1 ./store/datasets/ehealth_kd/test_model_annotated.jsonl