Skip to content

Commit

Permalink
feat: add Ontonotes NER with Senna
Browse files Browse the repository at this point in the history
* feat: Ontonotes NER added

* chore: train part removed from config

* fix: readme dataset_iterator fixed, json removed from striong

* feat: raw version of test added

* fix: test modes

* fix: folder name in ontonotes config and download path now consistent

* fix: skip tests
  • Loading branch information
mu-arkhipov authored and seliverstov committed Apr 2, 2018
1 parent 737b8a7 commit ae91d8f
Show file tree
Hide file tree
Showing 7 changed files with 461 additions and 66 deletions.
3 changes: 2 additions & 1 deletion deeppavlov/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,13 @@
import deeppavlov.models.embedders.glove_embedder
import deeppavlov.models.encoders.bow
import deeppavlov.models.ner.slotfill
import deeppavlov.models.ner.ner
import deeppavlov.models.ner.ner_ontonotes
import deeppavlov.models.spellers.error_model.error_model
import deeppavlov.models.trackers.hcn_at
import deeppavlov.models.trackers.hcn_et
import deeppavlov.models.preprocessors.str_lower
import deeppavlov.models.preprocessors.squad_preprocessor
import deeppavlov.models.ner.ner
import deeppavlov.models.tokenizers.spacy_tokenizer
import deeppavlov.models.tokenizers.split_tokenizer
import deeppavlov.models.squad.squad
Expand Down
52 changes: 52 additions & 0 deletions deeppavlov/configs/ner/ner_ontonotes.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
{
"dataset_reader": {
"name": "conll2003_reader",
"data_path": "ontonotes/"
},
"dataset_iterator": {
"name": "basic_dataset_iterator"
},
"chainer": {
"in": ["x"],
"pipe": [
{
"id": "pos_vocab",
"name": "default_vocab",
"load_path": "ner_ontonotes_senna/pos.dict",
"save_path": "ner_ontonotes_senna/pos.dict"
},
{
"id": "tag_vocab",
"name": "default_vocab",
"load_path": "ner_ontonotes_senna/tag.dict",
"save_path": "ner_ontonotes_senna/tag.dict"
},
{
"id": "ner_vocab",
"name": "default_vocab",
"load_path": "ner_ontonotes_senna/ner.dict",
"save_path": "ner_ontonotes_senna/ner.dict"
},
{
"id": "glove_emb",
"name": "glove",
"load_path": "embeddings/glove.6B.100d.txt",
"save_path": "embeddings/glove.6B.100d.txt"
},
{
"in": ["x"],
"out": ["y_predicted"],
"name": "ner_ontonotes",
"main": true,
"save_path": "ner_ontonotes_senna/model.ckpt",
"load_path": "ner_ontonotes_senna/model.ckpt",
"ner_vocab": "#ner_vocab",
"tag_vocab": "#tag_vocab",
"pos_vocab": "#pos_vocab",
"embedder": "#glove_emb"
}
],
"out": ["y_predicted"]
}
}

2 changes: 2 additions & 0 deletions deeppavlov/core/data/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@
'http://lnsigo.mipt.ru/export/deeppavlov_data/squad_model.tar.gz',
'http://lnsigo.mipt.ru/export/deeppavlov_data/seq2seq_go_bot.tar.gz',
'http://lnsigo.mipt.ru/export/deeppavlov_data/ner_ontonotes.tar.gz',
'http://lnsigo.mipt.ru/export/deeppavlov_data/ner_ontonotes_senna.tar.gz',
'http://lnsigo.mipt.ru/export/deeppavlov_data/senna.tar.gz'
}

OPT_URLS = {
Expand Down
121 changes: 90 additions & 31 deletions deeppavlov/models/ner/README_NER.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ Configuration of the model can be performed in code or in JSON configuration fil
the model you need to specify four groups of parameters:

- **`dataset_reader`**
- **`dataset`**
- **`dataset_iterator`**
- **`chainer`**
- **`train`**

Expand All @@ -89,7 +89,7 @@ In the subsequent text we show the parameter specification in config file. Howev
The dataset reader is a class which reads and parses the data. It returns a dictionary with
three fields: "train", "test", and "valid". The basic dataset reader is "ner_dataset_reader."
The dataset reader config part with "ner_dataset_reader" should look like:
```json
```
"dataset_reader": {
"name": "ner_dataset_reader",
"data_path": "/home/user/Data/conll2003/"
Expand All @@ -102,13 +102,13 @@ contain data in the format presented in *Training data* section. Each line in th
may contain additional information such as POS tags. However, the token must be the first in
line and NER tag must be the last.

### Dataset
### Dataset Iterator

For simple batching and shuffling you can use "basic_dataset". The part of the
For simple batching and shuffling you can use "basic_dataset_iterator". The part of the
configuration file for the dataset looks like:
```json
"dataset": {
"name": "basic_dataset"
```
"dataset_iterator": {
"name": "basic_dataset_iterator"
}
```

Expand All @@ -119,7 +119,7 @@ There is no additional parameters in this part.
The chainer part of the configuration file contains the specification of the neural network
model and supplementary things such as vocabularies. Chainer should be defined as follows:

```json
```
"chainer": {
"in": ["x"],
"in_y": ["y"],
Expand All @@ -137,7 +137,7 @@ predictions.
The major part of "chainer" is "pipe". The "pipe" contains network and vocabularies. Firstly
we define vocabularies needed to build the neural network:

```json
```
"pipe": [
{
"id": "word_vocab",
Expand Down Expand Up @@ -255,7 +255,7 @@ works well in most of the cases

After the "chainer" part you should specify the "train" part:

```json
```
"train": {
"epochs": 100,
"batch_size": 64,
Expand All @@ -280,14 +280,14 @@ training parameters are:


And now all parts together:
```json
```
{
"dataset_reader": {
"name": "ner_dataset_reader",
"data_path": "conll2003/"
},
"dataset": {
"name": "basic_dataset"
"dataset_iterator": {
"name": "basic_dataset_iterator"
},
"chainer": {
"in": ["x"],
Expand Down Expand Up @@ -372,43 +372,102 @@ interact_model(PIPELINE_CONFIG_PATH)
This example assumes that the working directory is deeppavlov.


## OntoNotes NER

A pre-trained model for the OntoNotes task can be used as follows:
```python
from deeppavlov.core.commands.infer import interact_model
interact_model('deeppavlov/configs/ner/ner_ontonotes.json')
```
Or from command line:

```bash
python deeppavlov/deep.py interact deeppavlov/configs/ner/ner_ontonotes.json
```

Since the model is built with the cuDNN version of LSTM, a GPU with the cuDNN library installed is required to run this model.
The F1 score of this model on the test part of OntoNotes is presented in the table below.

| Model | F1 score |
|----------------------------|:----------------:|
|DeepPavlov |**87.07** ± 0.21 |
|Strubell et al. (2017) [1]|86.84 ± 0.19 |
|Chiu and Nichols (2016) [2]|86.19 ± 0.25 |
|Spacy |85.85 |
|Durrett and Klein (2014) [3]|84.04 |
|Ratinov and Roth (2009) [4]|83.45 |

Scores by entity type are presented in the table below:

|Tag |F1 score|
|------ |:------:|
|TOTAL | 87.07 |
|CARDINAL |82.80|
|DATE |84.87|
|EVENT |68.39 |
|FAC |68.07|
|GPE |94.61|
|LANGUAGE |62.91|
|LAW |48.27|
|LOC |72.39|
|MONEY |87.79|
|NORP |94.27|
|ORDINAL |79.53|
|ORG |85.59|
|PERCENT |89.41|
|PERSON |91.67|
|PRODUCT |58.90|
|QUANTITY |77.93|
|TIME |62.50|
|WORK_OF_ART |53.17|


## Results

The NER network component reproduces the architecture from the paper "_Application of a Hybrid Bi-LSTM-CRF model to the task of Russian Named Entity Recognition_" https://arxiv.org/pdf/1709.09686.pdf, which is inspired by LSTM+CRF architecture from https://arxiv.org/pdf/1603.01360.pdf.

Bi-LSTM architecture of NER network was tested on three datasets:
- Gareev corpus [1] (obtainable by request to authors)
- FactRuEval 2016 [2]
- Persons-1000 [3]
- Gareev corpus [5] (obtainable by request to authors)
- FactRuEval 2016 [6]
- Persons-1000 [7]

The F1 measure for our model along with the results of other published solutions are provided in the table below:

| Models | Gareev’s dataset | Persons-1000 | FactRuEval 2016 |
|---------------------- |:----------------:|:------------:|:---------------:|
| Gareev et al. [1] | 75.05 | | |
| Malykh et al. [4] | 62.49 | | |
| Trofimov [5] | | 95.57 | |
| Rubaylo et al. [6] | | | 78.13 |
| Sysoev et al. [7] | | | 74.67 |
| Ivanitsky et al. [7] | | | **87.88** |
| Mozharova et al. [8] | | 97.21 | |
| Gareev et al. [5] | 75.05 | | |
| Malykh et al. [8] | 62.49 | | |
| Trofimov [13] | | 95.57 | |
| Rubaylo et al. [9] | | | 78.13 |
| Sysoev et al. [10] | | | 74.67 |
| Ivanitsky et al. [11]| | | **87.88** |
| Mozharova et al. [12]| | 97.21 | |
| Our (Bi-LSTM+CRF) | **87.17** | **99.26** | 82.10 ||

## Literature
[1] - Strubell, Emma, et al. "Fast and accurate entity recognition with iterated dilated convolutions." Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing. 2017.

[2] - Jason PC Chiu and Eric Nichols. 2016. Named entity recognition with bidirectional lstm-cnns. Transactions of the Association for Computational Linguistics, 4:357–370.

[3] - Greg Durrett and Dan Klein. 2014. A joint model for entity analysis: Coreference, typing and linking. Transactions of the Association for Computational Linguistics, 2:477–490.

[1] - Rinat Gareev, Maksim Tkachenko, Valery Solovyev, Andrey Simanovsky, Vladimir Ivanov: Introducing Baselines for Russian Named Entity Recognition. Computational Linguistics and Intelligent Text Processing, 329 -- 342 (2013).
[4] - Lev Ratinov and Dan Roth. 2009. Design challenges and misconceptions in named entity recognition. In Proceedings of the Thirteenth Conference on Computational Natural Language Learning, pages 147–155. Association for Computational Linguistics.

[2] - https://github.com/dialogue-evaluation/factRuEval-2016
[5] - Rinat Gareev, Maksim Tkachenko, Valery Solovyev, Andrey Simanovsky, Vladimir Ivanov: Introducing Baselines for Russian Named Entity Recognition. Computational Linguistics and Intelligent Text Processing, 329 -- 342 (2013).

[3] - http://ai-center.botik.ru/Airec/index.php/ru/collections/28-persons-1000
[6] - https://github.com/dialogue-evaluation/factRuEval-2016

[4] - Reproducing Russian NER Baseline Quality without Additional Data. In proceedings of the 3rd International Workshop on ConceptDiscovery in Unstructured Data, Moscow, Russia, 54 – 59 (2016)
[7] - http://ai-center.botik.ru/Airec/index.php/ru/collections/28-persons-1000

[5] - Rubaylo A. V., Kosenko M. Y.: Software utilities for natural language information
[8] - Malykh, Valentin, and Alexey Ozerin. "Reproducing Russian NER Baseline Quality without Additional Data." CDUD@ CLA. 2016.

[9] - Rubaylo A. V., Kosenko M. Y.: Software utilities for natural language information
retrievial. Almanac of modern science and education, Volume 12 (114), 87 – 92.(2016)

[6] - Sysoev A. A., Andrianov I. A.: Named Entity Recognition in Russian: the Power of Wiki-Based Approach. dialog-21.ru
[10] - Sysoev A. A., Andrianov I. A.: Named Entity Recognition in Russian: the Power of Wiki-Based Approach. dialog-21.ru

[11] - Ivanitskiy Roman, Alexander Shipilo, Liubov Kovriguina: Russian Named Entities Recognition and Classification Using Distributed Word and Phrase Representations. In SIMBig, 150 – 156. (2016).

[7] - Ivanitskiy Roman, Alexander Shipilo, Liubov Kovriguina: Russian Named Entities Recognition and Classification Using Distributed Word and Phrase Representations. In SIMBig, 150156. (2016).
[12] - Mozharova V., Loukachevitch N.: Two-stage approach in Russian named entity recognition. In Intelligence, Social Media and Web (ISMW FRUCT), 2016 International FRUCT Conference, 16 (2016)

[8] - Mozharova V., Loukachevitch N.: Two-stage approach in Russian named entity recognition. In Intelligence, Social Media and Web (ISMW FRUCT), 2016 International FRUCT Conference, 16 (2016)
[13] - Trofimov, I.V.: Person name recognition in news articles based on the persons-1000/1111-F collections. In: 16th All-Russian Scientific Conference Digital Libraries: Advanced Methods and Technologies, Digital Collections, RCDL 2014, pp. 217–221 (2014).
79 changes: 79 additions & 0 deletions deeppavlov/models/ner/ner_ontonotes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
"""
Copyright 2017 Neural Networks and Deep Learning lab, MIPT
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import tensorflow as tf
from overrides import overrides
from copy import deepcopy
import inspect
import json

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.utils import tokenize_reg
from deeppavlov.models.ner.network_ontonotes import NerNetwork
from deeppavlov.core.models.tf_model import TFModel
from deeppavlov.core.common.log import get_logger

log = get_logger(__name__)


@register('ner_ontonotes')
class NER(TFModel):
    """TensorFlow NER model for OntoNotes, wrapping ``NerNetwork`` for inference.

    Expects keyword arguments holding the network hyperparameters plus a
    ``'vocabs'`` dict (ner/tag/pos vocabularies) which is flattened into
    the option dict before the underlying network is constructed.
    """

    def __init__(self, **kwargs):
        # Keep an independent copy of the config so mutating self.opt
        # (popping 'vocabs' below) does not affect the kwargs still
        # passed to super().__init__ at the end.
        self.opt = deepcopy(kwargs)
        vocabs = self.opt.pop('vocabs')
        self.opt.update(vocabs)

        # Find all input parameters of the network init
        network_parameter_names = list(inspect.signature(NerNetwork.__init__).parameters)
        # Fill all provided parameters from opt
        network_parameters = {par: self.opt[par] for par in network_parameter_names if par in self.opt}

        self.sess = tf.Session()
        network_parameters['sess'] = self.sess
        self._network_parameters = network_parameters
        self._net = NerNetwork(**network_parameters)

        # Try to load the model (if there are some model files the model will be loaded from them)
        super().__init__(**kwargs)
        if self.load_path is not None:
            self.load()

    def load(self, *args, **kwargs):
        """Restore model weights via the parent ``TFModel`` loader."""
        super().load(*args, **kwargs)

    def save(self, *args, **kwargs):
        """Save model weights, then dump the config parameters as JSON."""
        super().save(*args, **kwargs)
        self.save_params()

    def save_params(self):
        """Serialize graph parameters and vocabularies next to the checkpoint.

        NOTE(review): relies on ``self.GRAPH_PARAMS`` and ``self.VOCABS``,
        neither of which is defined in this file — confirm they are
        provided by ``TFModel`` or another parent, otherwise this raises
        ``AttributeError`` on save.
        """
        params_to_save = {param: self.opt.get(param, None) for param in self.GRAPH_PARAMS}
        for vocab in self.VOCABS:
            # Materialize each vocabulary as a plain list so it is JSON-serializable.
            params_to_save[vocab] = [self.opt[vocab][i] for i in range(len(self.opt[vocab]))]
        path = str(self.save_path.with_suffix('.json').resolve())
        log.info('[saving parameters to {}]'.format(path))
        with open(path, 'w') as fp:
            json.dump(params_to_save, fp, indent=4)

    def train_on_batch(self, batch_x, batch_y):
        """Training is not supported; this wrapper is inference-only."""
        raise NotImplementedError

    @overrides
    def __call__(self, batch, *args, **kwargs):
        """Predict NER tags for a batch of utterances.

        Accepts either raw strings (tokenized here with ``tokenize_reg``)
        or already-tokenized lists of tokens; returns the network's
        per-token predictions for the batch.
        """
        if isinstance(batch[0], str):
            batch = [tokenize_reg(utterance) for utterance in batch]
        return self._net.predict_on_batch(batch)

    def shutdown(self):
        """Release resources held by the underlying network (e.g. the TF session)."""
        self._net.shutdown()

0 comments on commit ae91d8f

Please sign in to comment.