Skip to content

Commit

Permalink
feat: morphological tagging model which uses Pymorphy as an additiona…
Browse files Browse the repository at this point in the history
…l input

* feature: dictionary-based vectorizer

* feature: dictionary-based vectorizer

* feature: dictionary-based vectorizer

* feature: Pymorphy-based vectorizer for Russian

* feature: Pymorphy-based vectorizer for Russian

* test: update tests and requirements for morpho_tagger

* fix: fix pymorphy_vectorizer bug in interaction

* test: uncomment tests

* test: uncomment tests

* update: update registry

* docs: update morpho_tagger readme, configs and requirements

* refactor: move requirements to dp_requirements

* fix: change requirements to dp_requirements in morpho_tagger config

* refactor: introduce basic class WordIndexVectorizer for word-level vectorizers

* docs: update requirements for morpho_tagger
  • Loading branch information
AlexeySorokin authored and seliverstov committed Jul 27, 2018
1 parent 3aa7ca0 commit 64252de
Show file tree
Hide file tree
Showing 13 changed files with 582 additions and 24 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@
"epochs": 50,
"batch_size": 32,

"metrics": ["per_token_accuracy"],
"metrics": ["per_token_accuracy", "accuracy"],
"validation_patience": 10,
"val_every_n_epochs": 1,
"log_every_n_epochs": 1
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
{
"dataset_reader": {
"name": "morphotagger_dataset_reader",
"data_path": "UD2.0_source",
"language": "hu", "data_types": ["train", "dev", "test"]
},
"dataset_iterator": {
"name": "morphotagger_dataset"
},
"chainer": {
"in": ["x"],
"in_y": ["y"],
"pipe": [
{
"id": "lowercase_preprocessor",
"name": "lowercase_preprocessor",
"in": ["x"],
"out": ["x_processed"]
},
{
"id": "tag_vocab",
"name": "default_vocab",
"fit_on": ["y"],
"level": "token",
"special_tokens": ["PAD", "BEGIN", "END"],
"save_path": "morpho_tagger/UD2.0/hu/tag.dict",
"load_path": "morpho_tagger/UD2.0/hu/tag.dict"
},
{
"id": "char_vocab",
"name": "default_vocab",
"min_freq": 3,
"fit_on": ["x_processed"],
"special_tokens": ["PAD", "BEGIN", "END"],
"level": "char",
"save_path": "morpho_tagger/UD2.0/hu/char.dict",
"load_path": "morpho_tagger/UD2.0/hu/char.dict"
},
{
"id": "dictionary_vectorizer",
"name": "dictionary_vectorizer",
"save_path": "/home/alexeysorokin/data/DeepPavlov/download/UD2.0_dict",
"load_path": "/home/alexeysorokin/data/DeepPavlov/download/UD2.0_dict",
"in": ["x"],
"out": ["x_possible_tags"]
},
{
"in": ["x_processed", "x_possible_tags"],
"in_y": ["y"],
"out": ["y_predicted"],
"name": "morpho_tagger",
"main": true,
"save_path": "morpho_tagger/UD2.0/hu/model_dict.hdf5",
"load_path": "morpho_tagger/UD2.0/hu/model_dict.hdf5",
"tags": "#tag_vocab",
"symbols": "#char_vocab",
"verbose": 1,
"char_embeddings_size": 32, "char_window_size": [1, 2, 3, 4, 5, 6, 7],
"word_lstm_units": 128, "conv_dropout": 0.0, "char_conv_layers": 1,
"char_highway_layers": 1, "highway_dropout": 0.0, "word_lstm_layers": 1,
"char_filter_multiple": 50, "intermediate_dropout": 0.0, "word_dropout": 0.2,
"lstm_dropout": 0.3, "regularizer": 0.01, "lm_dropout": 0.3,
"word_vectorizers": [["#dictionary_vectorizer.dim", 128]]
}
],
"out": ["y_predicted"]
},
"train": {
"epochs": 2,
"batch_size": 32,

"metrics": ["per_token_accuracy"],
"validation_patience": 10,
"val_every_n_epochs": 1,
"log_every_n_epochs": 1
},
"metadata": {
"download": [
"http://lnsigo.mipt.ru/export/deeppavlov_data/morpho_tagger.tar.gz",
{
"url": "http://lnsigo.mipt.ru/export/datasets/UD2.0_source/hu.tar.gz",
"subdir": "UD2.0_source/hu"
}
],
"telegram_utils": "MorphoTaggerModel"
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
{
"dataset_reader": {
"name": "morphotagger_dataset_reader",
"data_path": "UD2.0_source",
"language": "ru_syntagrus", "data_types": ["test"]
},
"dataset_iterator": {
"name": "morphotagger_dataset"
},
"chainer": {
"in": ["x"],
"in_y": ["y"],
"pipe": [
{
"id": "lowercase_preprocessor",
"name": "lowercase_preprocessor",
"in": ["x"],
"out": ["x_processed"]
},
{
"id": "tag_vocab",
"name": "default_vocab",
"level": "token",
"special_tokens": ["PAD", "BEGIN", "END"],
"save_path": "morpho_tagger/UD2.0/ru_syntagrus/tag.dict",
"load_path": "morpho_tagger/UD2.0/ru_syntagrus/tag.dict"
},
{
"id": "char_vocab",
"name": "default_vocab",
"min_freq": 3,
"special_tokens": ["PAD", "BEGIN", "END"],
"level": "char",
"save_path": "morpho_tagger/UD2.0/ru_syntagrus/char.dict",
"load_path": "morpho_tagger/UD2.0/ru_syntagrus/char.dict"
},
{
"id": "pymorphy_vectorizer",
"name": "pymorphy_vectorizer",
"save_path": "morpho_tagger/UD2.0/ru_syntagrus/tags_russian.txt",
"load_path": "morpho_tagger/UD2.0/ru_syntagrus/tags_russian.txt",
"max_pymorphy_variants": 5,
"in": ["x"],
"out": ["x_possible_tags"]
},
{
"in": ["x_processed", "x_possible_tags"],
"in_y": ["y"],
"out": ["y_predicted"],
"name": "morpho_tagger",
"main": true,
"save_path": "morpho_tagger/UD2.0/ru_syntagrus/model_pymorphy.hdf5",
"load_path": "morpho_tagger/UD2.0/ru_syntagrus/model_pymorphy.hdf5",
"tags": "#tag_vocab",
"symbols": "#char_vocab",
"verbose": 1,
"char_embeddings_size": 32, "char_window_size": [1, 2, 3, 4, 5, 6, 7],
"word_lstm_units": 128, "conv_dropout": 0.0, "char_conv_layers": 1,
"char_highway_layers": 1, "highway_dropout": 0.0, "word_lstm_layers": 1,
"char_filter_multiple": 50, "intermediate_dropout": 0.0,
"word_dropout": 0.2, "lstm_dropout": 0.2, "regularizer": 0.01,
"word_vectorizers": [["#pymorphy_vectorizer.dim", 128]]
},
{
"in": ["x", "y_predicted"],
"out": ["y_prettified"],
"name": "tag_output_prettifier",
"end": "\n"
}
],
"out": ["y_prettified"]
},
"predict": {
"batch_size": 32,
"outfile": "results/ud_ru_syntagrus_test.res"
},
"metadata": {
"requirements": [
"../dp_requirements/tf.txt", "../dp_requirements/morpho_tagger.txt"
],
"download": [
"http://lnsigo.mipt.ru/export/deeppavlov_data/morpho_tagger.tar.gz",
{
"url": "http://lnsigo.mipt.ru/export/datasets/UD2.0_source/ru_syntagrus.tar.gz",
"subdir": "UD2.0_source/ru_syntagrus"
}
],
"telegram_utils": "MorphoTaggerPymorphyModel"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@
"epochs": 50,
"batch_size": 32,

"metrics": ["per_token_accuracy"],
"metrics": ["per_token_accuracy", "accuracy"],
"validation_patience": 10,
"val_every_n_epochs": 1,
"log_every_n_epochs": 1
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
{
"dataset_reader": {
"name": "morphotagger_dataset_reader",
"language": "ru_syntagrus",
"data_types": ["train", "dev", "test"]
},
"dataset_iterator": {
"name": "morphotagger_dataset"
},
"chainer": {
"in": ["x"],
"in_y": ["y"],
"pipe": [
{
"id": "lowercase_preprocessor",
"name": "lowercase_preprocessor",
"in": ["x"],
"out": ["x_processed"]
},
{
"id": "tag_vocab",
"name": "default_vocab",
"fit_on": ["y"],
"level": "token",
"special_tokens": ["PAD", "BEGIN", "END"],
"save_path": "morpho_tagger/UD2.0/ru_syntagrus/tag.dict",
"load_path": "morpho_tagger/UD2.0/ru_syntagrus/tag.dict"
},
{
"id": "char_vocab",
"name": "default_vocab",
"min_freq": 3,
"fit_on": ["x_processed"],
"special_tokens": ["PAD", "BEGIN", "END"],
"level": "char",
"save_path": "morpho_tagger/UD2.0/ru_syntagrus/char.dict",
"load_path": "morpho_tagger/UD2.0/ru_syntagrus/char.dict"
},
{
"id": "pymorphy_vectorizer",
"name": "pymorphy_vectorizer",
"save_path": "morpho_tagger/UD2.0/ru_syntagrus/tags_russian.txt",
"load_path": "morpho_tagger/UD2.0/ru_syntagrus/tags_russian.txt",
"max_pymorphy_variants": 5,
"in": ["x"],
"out": ["x_possible_tags"]
},
{
"in": ["x_processed", "x_possible_tags"],
"in_y": ["y"],
"out": ["y_predicted"],
"name": "morpho_tagger",
"main": true,
"save_path": "morpho_tagger/UD2.0/ru_syntagrus/model_pymorphy.hdf5",
"load_path": "morpho_tagger/UD2.0/ru_syntagrus/model_pymorphy.hdf5",
"tags": "#tag_vocab",
"symbols": "#char_vocab",
"verbose": 1,
"char_embeddings_size": 32, "char_window_size": [1, 2, 3, 4, 5, 6, 7],
"word_lstm_units": 128, "conv_dropout": 0.0, "char_conv_layers": 1,
"char_highway_layers": 1, "highway_dropout": 0.0, "word_lstm_layers": 1,
"char_filter_multiple": 50, "intermediate_dropout": 0.0, "word_dropout": 0.2,
"lstm_dropout": 0.2, "regularizer": 0.01,
"word_vectorizers": [["#pymorphy_vectorizer.dim", 128]]
}
],
"out": ["y_predicted"]
},
"train": {
"epochs": 75,
"batch_size": 32,

"metrics": ["per_token_accuracy", "accuracy"],
"validation_patience": 10,
"val_every_n_epochs": 1,
"log_every_n_epochs": 1
},
"metadata": {
"requirements": [
"../dp_requirements/tf.txt", "../dp_requirements/morpho_tagger.txt"
],
"download": [
"http://lnsigo.mipt.ru/export/deeppavlov_data/morpho_tagger.tar.gz",
{
"url": "http://lnsigo.mipt.ru/export/datasets/UD2.0_source/ru_syntagrus.tar.gz",
"subdir": "UD2.0_source/ru_syntagrus"
}
],
"telegram_utils": "MorphoTaggerPymorphyModel"
}
}
6 changes: 5 additions & 1 deletion deeppavlov/core/commands/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,11 @@ def train_evaluate_model_from_config(config: [str, Path, dict], to_train=True, t
raise e
else:
reader = get_model(reader_config.pop('name'))()
data_path = expand_path(reader_config.pop('data_path', ''))
data_path = reader_config.pop('data_path', '')
if isinstance(data_path, list):
data_path = [expand_path(x) for x in data_path]
else:
data_path = expand_path(data_path)
data = reader.read(data_path, **reader_config)
else:
log.warning("No dataset reader is provided in the JSON config.")
Expand Down
2 changes: 2 additions & 0 deletions deeppavlov/core/common/registry.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
"dialog_iterator": "deeppavlov.dataset_iterators.dialog_iterator:DialogDatasetIterator",
"dialog_vocab": "deeppavlov.core.data.simple_vocab:DialogVocab",
"dict_emb": "deeppavlov.models.embedders.dict_embedder:DictEmbedder",
"dictionary_vectorizer": "deeppavlov.models.vectorizers.word_vectorizer:DictionaryVectorizer",
"dirty_comments_preprocessor": "deeppavlov.models.preprocessors.dirty_comments_preprocessor:DirtyCommentsPreprocessor",
"dstc2_intents_iterator": "deeppavlov.dataset_iterators.dstc2_intents_iterator:Dstc2IntentsDatasetIterator",
"dstc2_ner_iterator": "deeppavlov.dataset_iterators.dstc2_ner_iterator:Dstc2NerDatasetIterator",
Expand Down Expand Up @@ -50,6 +51,7 @@
"ontonotes_reader": "deeppavlov.dataset_readers.ontonotes_reader:OntonotesReader",
"params_evolution": "deeppavlov.models.evolution.evolution_param_generator:ParamsEvolution",
"pymorphy_russian_lemmatizer": "deeppavlov.models.preprocessors.russian_lemmatizer:PymorphyRussianLemmatizer",
"pymorphy_vectorizer": "deeppavlov.models.vectorizers.word_vectorizer:PymorphyVectorizer",
"random": "deeppavlov.models.commutators.random_commutator:RandomCommutator",
"random_emb_mat": "deeppavlov.models.preprocessors.assemble_embeddins_matrix:RandomEmbeddingsMatrix",
"ranking_iterator": "deeppavlov.dataset_iterators.ranking_iterator:RankingIterator",
Expand Down
9 changes: 2 additions & 7 deletions deeppavlov/dataset_iterators/morphotagger_iterator.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,14 +60,9 @@ class MorphoTaggerDatasetIterator(DataLearningIterator):

def __init__(self, data, seed=None, shuffle=True,
             validation_split=0.2, bucket=True):
    """Initialize the morphological tagging data iterator.

    Args:
        data: dict mapping data-type keys (e.g. 'train'/'valid'/'test')
            to lists of samples; passed through to the base iterator
            unchanged — preprocessing is done by pipeline components,
            not here.
        seed: random seed forwarded to the base iterator.
        shuffle: whether the base iterator shuffles samples.
        validation_split: fraction of training data to hold out as a
            validation set when none is provided (used by ``split``).
        bucket: presumably enables length-bucketed batching in
            ``gen_batches`` — TODO confirm against that method.

    Note: removed the commented-out preprocessing block and the dead
    ``processed_data`` alias left over from an earlier revision.
    """
    self.bucket = bucket
    self.validation_split = validation_split
    super().__init__(data, seed, shuffle)

def split(self):
if len(self.valid) == 0:
Expand All @@ -91,7 +86,7 @@ def gen_batches(self, batch_size: int, data_type: str = 'train',
batch_size = L
for start in range(0, L, batch_size):
indexes_to_yield = indexes[start:start+batch_size]
data_to_yield = tuple(zip(*([data[i] for i in indexes_to_yield])))
data_to_yield = tuple(list(x) for x in zip(*([data[i] for i in indexes_to_yield])))
if return_indexes:
yield indexes_to_yield, data_to_yield
else:
Expand Down

0 comments on commit 64252de

Please sign in to comment.