Release 0.1.1

deeppavlov · Dec 12, 2018 · 4686425 · 4686425
2 parents 3aeecc7 + 5280613
commit 4686425
Show file tree

Hide file tree

Showing 56 changed files with 1,181 additions and 405 deletions.
diff --git a/.readthedocs.yml b/.readthedocs.yml
@@ -9,4 +9,4 @@ python:
   pip_install: true
   version: 3.6
   extra_requirements:
-        - docs
+        - docs
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -9,6 +9,7 @@ node('gpu') {
             }
             stage('Setup') {
                 env.CUDA_VISIBLE_DEVICES=0
+                env.TFHUB_CACHE_DIR="tfhub_cache"
                 sh """
                     virtualenv --python=python3 '.venv-$BUILD_NUMBER'
                     . '.venv-$BUILD_NUMBER/bin/activate'

diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
+[![License Apache 2.0](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/deepmipt/DeepPavlov/blob/master/LICENSE)
 ![Python 3.6](https://img.shields.io/badge/python-3.6-green.svg)
 [![Downloads](https://pepy.tech/badge/deeppavlov)](https://pepy.tech/project/deeppavlov)
-[![License Apache 2.0](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/deepmipt/DeepPavlov/blob/master/LICENSE)
 
 DeepPavlov is an open-source conversational AI library built on [TensorFlow](https://www.tensorflow.org/) and [Keras](https://keras.io/). It is designed for
  * development of production ready chat-bots and complex conversational systems,
@@ -71,7 +71,7 @@ print(HelloBot(['Hello!', 'Boo...', 'Bye.']))
 
 [Goal(Task)-oriented Bot](http://docs.deeppavlov.ai/en/latest/skills/go_bot.html) | [Seq2seq Goal-Oriented bot](http://docs.deeppavlov.ai/en/latest/skills/seq2seq_go_bot.html)
 
-[Open Domain Questions Answering](http://docs.deeppavlov.ai/en/latest/skills/odqa.html) | [eCommerce Bot](http://docs.deeppavlov.ai/en/latest/skills/ecommerce_bot_skill.html) 
+[Open Domain Questions Answering](http://docs.deeppavlov.ai/en/latest/skills/odqa.html) | [eCommerce Bot](http://docs.deeppavlov.ai/en/master/skills/ecommerce.html) 
 
 [Frequently Asked Questions Answering](http://docs.deeppavlov.ai/en/latest/skills/faq.html) | [Pattern Matching](http://docs.deeppavlov.ai/en/latest/skills/pattern_matching.html) 
 
@@ -89,7 +89,7 @@ print(HelloBot(['Hello!', 'Boo...', 'Bye.']))
 
 0. Currently we support `Linux` and `Windows` platforms and `Python 3.6` 
     * **`Python 3.5` is not supported!**
-    * **`Windows` platform requires `Visual Studio 2015/2017` with `C++` build tools installed!**
+    * **`Windows` platform requires `Git` (for example, [Git for Windows](https://git-scm.com/download/win)) and `Visual Studio 2015/2017` with `C++` build tools installed!**
 
 1. Create a virtual environment with `Python 3.6`:
     ```

diff --git a/deeppavlov/__init__.py b/deeppavlov/__init__.py
@@ -24,13 +24,13 @@
     from .download import deep_download
 
     # TODO: make better and add typing
-    def train_model(config, download=False):
-        train_evaluate_model_from_config(config, download=download)
+    def train_model(config, download=False, recursive=False):
+        train_evaluate_model_from_config(config, download=download, recursive=recursive)
         return build_model(config, load_trained=True)
 except ImportError:
     'Assuming that requirements are not yet installed'
 
-__version__ = '0.1.0'
+__version__ = '0.1.1'
 __author__ = 'Neural Networks and Deep Learning lab, MIPT'
 __description__ = 'An open source library for building end-to-end dialog systems and training chatbots.'
 __keywords__ = ['NLP', 'NER', 'SQUAD', 'Intents', 'Chatbot']

diff --git a/deeppavlov/agents/ecommerce_agent/ecommerce_agent.py b/deeppavlov/agents/ecommerce_agent/ecommerce_agent.py
@@ -180,7 +180,7 @@ def make_agent() -> EcommerceAgent:
         agent: created Ecommerce agent
     """
 
-    config_path = find_config('bleu_retrieve')
+    config_path = find_config('tfidf_retrieve')
     skill = build_model(config_path)
     agent = EcommerceAgent(skills=[skill])
     return agent

diff --git a/deeppavlov/configs/doc_retrieval/en_ranker_tfidf_enwiki20161221.json b/deeppavlov/configs/doc_retrieval/en_ranker_tfidf_enwiki20161221.json
@@ -0,0 +1,81 @@
+{
+  "dataset_reader": {
+    "class_name": "odqa_reader",
+    "data_path": "{DOWNLOADS_PATH}/odqa/enwiki20161221",
+    "save_path": "{DOWNLOADS_PATH}/odqa/enwiki20161221.db",
+    "dataset_format": "wiki"
+  },
+  "dataset_iterator": {
+    "class_name": "sqlite_iterator",
+    "shuffle": false,
+    "load_path": "{DOWNLOADS_PATH}/odqa/enwiki20161221.db"
+  },
+  "chainer": {
+    "in": [
+      "x"
+    ],
+    "out": [
+      "y"
+    ],
+    "pipe": [
+      {
+        "class_name": "hashing_tfidf_vectorizer",
+        "id": "vectorizer",
+        "fit_on_batch": [
+          "x"
+        ],
+        "save_path": "{MODELS_PATH}/odqa/enwiki20161221_tfidf_matrix.npz",
+        "load_path": "{MODELS_PATH}/odqa/enwiki20161221_tfidf_matrix.npz",
+        "tokenizer": {
+          "class_name": "stream_spacy_tokenizer",
+          "lemmas": true,
+          "ngram_range": [
+            1,
+            2
+          ]
+        }
+      },
+      {
+        "class_name": "tfidf_ranker",
+        "top_n": 25,
+        "in": [
+          "x"
+        ],
+        "out": [
+          "y",
+          "score"
+        ],
+        "vectorizer": "#vectorizer"
+      }
+    ]
+  },
+  "train": {
+    "validate_best": false,
+    "test_best": false,
+    "batch_size": 10000
+  },
+  "metadata": {
+    "variables": {
+      "ROOT_PATH": "~/.deeppavlov",
+      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
+      "MODELS_PATH": "{ROOT_PATH}/models"
+    },
+    "requirements": [
+      "{DEEPPAVLOV_PATH}/requirements/spacy.txt",
+      "{DEEPPAVLOV_PATH}/requirements/en_core_web_sm.txt"
+    ],
+    "labels": {
+      "server_utils": "Ranker"
+    },
+    "download": [
+      {
+        "url": "http://files.deeppavlov.ai/datasets/wikipedia/enwiki20161221.tar.gz",
+        "subdir": "{DOWNLOADS_PATH}"
+      },
+      {
+        "url": "http://files.deeppavlov.ai/deeppavlov_data/en_odqa_enwiki20161221.tar.gz",
+        "subdir": "{MODELS_PATH}"
+      }
+    ]
+  }
+}
diff --git a/deeppavlov/configs/doc_retrieval/en_ranker_tfidf_wiki.json b/deeppavlov/configs/doc_retrieval/en_ranker_tfidf_wiki.json
@@ -12,7 +12,11 @@
   },
   "chainer": {
     "in": [
-      "x"
+      "docs"
+    ],
+    "in_y": [
+      "doc_ids",
+      "doc_nums"
     ],
     "out": [
       "y"
@@ -22,7 +26,9 @@
         "class_name": "hashing_tfidf_vectorizer",
         "id": "vectorizer",
         "fit_on_batch": [
-          "x"
+          "docs",
+          "doc_ids",
+          "doc_nums"
         ],
         "save_path": "{MODELS_PATH}/odqa/enwiki_tfidf_matrix.npz",
         "load_path": "{MODELS_PATH}/odqa/enwiki_tfidf_matrix.npz",
@@ -37,9 +43,9 @@
       },
       {
         "class_name": "tfidf_ranker",
-        "top_n": 5,
+        "top_n": 25,
         "in": [
-          "x"
+          "docs"
         ],
         "out": [
           "y",

diff --git a/deeppavlov/configs/doc_retrieval/ru_ranker_tfidf_wiki.json b/deeppavlov/configs/doc_retrieval/ru_ranker_tfidf_wiki.json
@@ -12,7 +12,11 @@
   },
   "chainer": {
     "in": [
-      "x"
+      "docs"
+    ],
+    "in_y": [
+      "doc_ids",
+      "doc_nums"
     ],
     "out": [
       "y"
@@ -22,7 +26,9 @@
         "class_name": "hashing_tfidf_vectorizer",
         "id": "vectorizer",
         "fit_on_batch": [
-          "x"
+          "docs",
+          "doc_ids",
+          "doc_nums"
         ],
         "save_path": "{MODELS_PATH}/odqa/ruwiki_tfidf_matrix.npz",
         "load_path": "{MODELS_PATH}/odqa/ruwiki_tfidf_matrix.npz",
@@ -39,7 +45,7 @@
         "class_name": "tfidf_ranker",
         "top_n": 5,
         "in": [
-          "x"
+          "docs"
         ],
         "out": [
           "y",

diff --git a/deeppavlov/configs/elmo/elmo-lm-ready4fine-tuning-ru-news-simple.json b/deeppavlov/configs/elmo/elmo-lm-ready4fine-tuning-ru-news-simple.json
@@ -0,0 +1,82 @@
+{
+  "dataset_reader": {
+    "class_name": "file_paths_reader",
+    "data_path": "{DOWNLOADS_PATH}/elmo-lm-ready4fine-example-data/data/",
+    "train": "train/*",
+    "valid": "heldout/*"
+  },
+  "dataset_iterator": {
+    "class_name": "elmo_file_paths_iterator",
+    "seed": 31415,
+    "unroll_steps": 20,
+    "max_word_length": 50,
+    "n_gpus": 1,
+    "shuffle": false,
+    "bos":"<S>",
+    "eos":"</S>",
+    "save_path": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-news-simple/vocab.txt",
+    "load_path":"{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-news-simple/vocab.txt"
+  },
+  "chainer": {
+    "in": [
+      "x_char_ids"
+    ],
+    "in_y": [
+      "y_token_ids"
+    ],
+    "pipe": [
+      {
+        "class_name": "elmo_model",
+        "options_json_path": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-news-simple/options.json",
+        "unroll_steps": 20,
+        "batch_size": 128,
+        "save_path": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-news-simple/saves/model",
+        "load_path": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-news-simple/saves/model",
+        "in": ["x_char_ids", "y_token_ids"],
+        "in_y": [],
+        "n_gpus": 1,
+        "out": ["loss"]
+      }
+    ],
+   "out": [
+    "x_char_ids",
+    "y_token_ids"
+  ]
+  },
+  "train": {
+    "epochs": 20,
+    "batch_size": 128,
+    "log_every_n_batches": 100,
+    "val_every_n_epochs": 1,
+    "validation_patience": 4,
+    "metric_optimization": "minimize",
+    "metrics": [
+      {
+        "name": "elmo_loss2ppl",
+        "inputs": ["loss"]
+      }
+    ],
+    "tensorboard_log_dir": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-news-simple/logs"
+  },
+  "metadata": {
+    "variables": {
+      "ROOT_PATH": "~/.deeppavlov",
+      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
+      "MODELS_PATH": "{ROOT_PATH}/models"
+    },
+    "requirements": [
+      "{DEEPPAVLOV_PATH}/requirements/tf.txt",
+      "{DEEPPAVLOV_PATH}/requirements/tf-hub.txt"
+    ],
+    "download": [
+      {
+        "url": "http://files.deeppavlov.ai/deeppavlov_data/elmo-lm-ready4fine-example-data.tar.gz",
+        "subdir": "{DOWNLOADS_PATH}/"
+      },
+      {
+        "url": "http://files.deeppavlov.ai/deeppavlov_data/elmo-lm-ready4fine-tuning-ru-news-simple.tar.gz",
+        "subdir": "{MODELS_PATH}/"
+      }
+    ]
+  }
+}
diff --git a/deeppavlov/configs/elmo/elmo-lm-ready4fine-tuning-ru-news.json b/deeppavlov/configs/elmo/elmo-lm-ready4fine-tuning-ru-news.json
@@ -0,0 +1,82 @@
+{
+  "dataset_reader": {
+    "class_name": "file_paths_reader",
+    "data_path": "{DOWNLOADS_PATH}/elmo-lm-ready4fine-example-data/data/",
+    "train": "train/*",
+    "valid": "heldout/*"
+  },
+  "dataset_iterator": {
+    "class_name": "elmo_file_paths_iterator",
+    "seed": 31415,
+    "unroll_steps": 20,
+    "max_word_length": 50,
+    "n_gpus": 1,
+    "shuffle": false,
+    "bos":"<S>",
+    "eos":"</S>",
+    "save_path": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-news/vocab.txt",
+    "load_path":"{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-news/vocab.txt"
+  },
+  "chainer": {
+    "in": [
+      "x_char_ids"
+    ],
+    "in_y": [
+      "y_token_ids"
+    ],
+    "pipe": [
+      {
+        "class_name": "elmo_model",
+        "options_json_path": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-news/options.json",
+        "unroll_steps": 20,
+        "batch_size": 128,
+        "save_path": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-news/saves/model",
+        "load_path": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-news/saves/model",
+        "in": ["x_char_ids", "y_token_ids"],
+        "in_y": [],
+        "n_gpus": 1,
+        "out": ["loss"]
+      }
+    ],
+   "out": [
+    "x_char_ids",
+    "y_token_ids"
+  ]
+  },
+  "train": {
+    "epochs": 20,
+    "batch_size": 128,
+    "log_every_n_batches": 100,
+    "val_every_n_epochs": 1,
+    "validation_patience": 4,
+    "metric_optimization": "minimize",
+    "metrics": [
+      {
+        "name": "elmo_loss2ppl",
+        "inputs": ["loss"]
+      }
+    ],
+    "tensorboard_log_dir": "{MODELS_PATH}/elmo-lm-ready4fine-tuning-ru-news/logs"
+  },
+  "metadata": {
+    "variables": {
+      "ROOT_PATH": "~/.deeppavlov",
+      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
+      "MODELS_PATH": "{ROOT_PATH}/models"
+    },
+    "requirements": [
+      "{DEEPPAVLOV_PATH}/requirements/tf.txt",
+      "{DEEPPAVLOV_PATH}/requirements/tf-hub.txt"
+    ],
+    "download": [
+      {
+        "url": "http://files.deeppavlov.ai/deeppavlov_data/elmo-lm-ready4fine-example-data.tar.gz",
+        "subdir": "{DOWNLOADS_PATH}/"
+      },
+      {
+        "url": "http://files.deeppavlov.ai/deeppavlov_data/elmo-lm-ready4fine-tuning-ru-news.tar.gz",
+        "subdir": "{MODELS_PATH}/"
+      }
+    ]
+  }
+}