Merge branches 'dev' and 'docs/readthedocs_integration'
# Conflicts:
#	deeppavlov/models/ner/README.md
#	deeppavlov/models/slotfill/README.md
#	deeppavlov/models/spelling_correction/README.md
#   resolved
nikolay-bushkov committed Jul 24, 2018
2 parents 5e6633a + 405fa5b commit 0e9904d
Showing 48 changed files with 105 additions and 96 deletions.
3 changes: 2 additions & 1 deletion deeppavlov/configs/go_bot/gobot_dstc2.json
@@ -86,7 +86,8 @@
"requirements": [
"../dp_requirements/tf.txt",
"../dp_requirements/fasttext.txt",
"../dp_requirements/spacy.txt"
"../dp_requirements/spacy.txt",
"../dp_requirements/en_core_web_sm.txt"
],
"labels": {
"telegram_utils": "GoalOrientedBot",
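Note: the same two-line requirements change recurs in the seven other configs below. `en_core_web_sm.txt` presumably pins the download of spaCy's English model, so that installing `spacy.txt` alone no longer leaves the model missing at runtime. A minimal sanity-check sketch — the model name `en_core_web_sm` is assumed from the requirements file name:

```python
# Hedged sketch: confirm the spaCy English model installed by the new
# requirements file is importable. spacy.load raises OSError if it is absent.
import spacy

nlp = spacy.load('en_core_web_sm')
print([t.text for t in nlp("DeepPavlov's go-bot tokenizes turns with spaCy.")])
```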
3 changes: 2 additions & 1 deletion deeppavlov/configs/go_bot/gobot_dstc2_all.json
@@ -91,7 +91,8 @@
"requirements": [
"../dp_requirements/tf.txt",
"../dp_requirements/fasttext.txt",
"../dp_requirements/spacy.txt"
"../dp_requirements/spacy.txt",
"../dp_requirements/en_core_web_sm.txt"
],
"labels": {
"telegram_utils": "GoalOrientedBot",
3 changes: 2 additions & 1 deletion deeppavlov/configs/go_bot/gobot_dstc2_best.json
@@ -98,7 +98,8 @@
"requirements": [
"../dp_requirements/tf.txt",
"../dp_requirements/fasttext.txt",
"../dp_requirements/spacy.txt"
"../dp_requirements/spacy.txt",
"../dp_requirements/en_core_web_sm.txt"
],
"labels": {
"telegram_utils": "GoalOrientedBot",
3 changes: 2 additions & 1 deletion deeppavlov/configs/go_bot/gobot_dstc2_minimal.json
@@ -70,7 +70,8 @@
"requirements": [
"../dp_requirements/tf.txt",
"../dp_requirements/fasttext.txt",
"../dp_requirements/spacy.txt"
"../dp_requirements/spacy.txt",
"../dp_requirements/en_core_web_sm.txt"
],
"epochs": 200,
"batch_size": 4,
3 changes: 2 additions & 1 deletion deeppavlov/configs/odqa/en_odqa_infer_wiki.json
@@ -48,7 +48,8 @@
"metadata": {
"requirements": [
"../dp_requirements/tf-gpu.txt",
"../dp_requirements/spacy.txt"
"../dp_requirements/spacy.txt",
"../dp_requirements/en_core_web_sm.txt"
],
"labels": {
"server_utils": "ODQA"
3 changes: 2 additions & 1 deletion deeppavlov/configs/ranking/en_ranker_tfidf_wiki.json
@@ -52,7 +52,8 @@
},
"metadata": {
"requirements": [
"../dp_requirements/spacy.txt"
"../dp_requirements/spacy.txt",
"../dp_requirements/en_core_web_sm.txt"
],
"labels": {
"server_utils": "Ranker"
3 changes: 2 additions & 1 deletion deeppavlov/configs/seq2seq_go_bot/bot_kvret.json
@@ -109,7 +109,8 @@
"metadata": {
"requirements": [
"../dp_requirements/tf.txt",
"../dp_requirements/spacy.txt"
"../dp_requirements/spacy.txt",
"../dp_requirements/en_core_web_sm.txt"
],
"labels": {
"telegram_utils": "Seq2SeqGoalOrientedBot",
3 changes: 2 additions & 1 deletion deeppavlov/configs/seq2seq_go_bot/bot_kvret_infer.json
@@ -84,7 +84,8 @@
"metadata": {
"requirements": [
"../dp_requirements/tf.txt",
"../dp_requirements/spacy.txt"
"../dp_requirements/spacy.txt",
"../dp_requirements/en_core_web_sm.txt"
],
"labels": {
"telegram_utils": "Seq2SeqGoalOrientedBot",
2 changes: 1 addition & 1 deletion deeppavlov/core/commands/infer.py
@@ -114,7 +114,7 @@ def predict_on_stream(config_path, batch_size=1, file_path=None):
raise RuntimeError('To process data from terminal please use interact mode')
f = sys.stdin
else:
-f = open(file_path)
+f = open(file_path, encoding='utf8')

config = read_json(config_path)
model: Chainer = build_model_from_config(config)
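Note: this commit's recurring fix — an explicit `encoding='utf8'` on every file open — matters because Python 3's `open()` otherwise falls back to `locale.getpreferredencoding()`, which is often cp1252 on Windows and then fails on the UTF-8 data files DeepPavlov ships. A self-contained illustration (`demo.txt` is a scratch file, not part of the repository):

```python
# Why the explicit encoding: the default is platform-dependent, so non-ASCII
# text only round-trips reliably when utf8 is pinned.
import locale

print(locale.getpreferredencoding(False))  # e.g. 'cp1252' on many Windows setups

with open('demo.txt', 'w', encoding='utf8') as f:
    f.write('привет, мир')  # Cyrillic survives only with a known encoding

with open('demo.txt', encoding='utf8') as f:
    assert f.read() == 'привет, мир'
```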
4 changes: 2 additions & 2 deletions deeppavlov/core/common/file.py
@@ -19,12 +19,12 @@


def read_json(fpath):
-with open(fpath) as fin:
+with open(fpath, encoding='utf8') as fin:
return json.load(fin)


def save_json(data, fpath):
-with open(fpath, 'w') as fout:
+with open(fpath, 'w', encoding='utf8') as fout:
return json.dump(data, fout, ensure_ascii=False, indent=2)


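Note: `save_json` dumps with `ensure_ascii=False`, so the output contains raw non-ASCII characters — which is exactly why the writer must also be opened as UTF-8. A round-trip sketch of the same pattern (file name invented):

```python
# Round trip mirroring read_json/save_json: ensure_ascii=False keeps Cyrillic
# literal in the file, which requires the explicit utf8 on the write side.
import json

data = {'text': 'привет'}
with open('demo.json', 'w', encoding='utf8') as fout:
    json.dump(data, fout, ensure_ascii=False, indent=2)
with open('demo.json', encoding='utf8') as fin:
    assert json.load(fin) == data
```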
2 changes: 1 addition & 1 deletion deeppavlov/core/common/log.py
@@ -29,7 +29,7 @@ def get_logger(logger_name):
config_dir = Path(__file__).resolve().parent
log_config_path = Path(config_dir, '..', '..', LOG_CONFIG_FILENAME).resolve()

-with open(log_config_path) as log_config_json:
+with open(log_config_path, encoding='utf8') as log_config_json:
log_config = json.load(log_config_json)

configured_loggers = [log_config.get('root', {})] + log_config.get('loggers', [])
4 changes: 2 additions & 2 deletions deeppavlov/core/data/simple_vocab.py
@@ -81,7 +81,7 @@ def __call__(self, batch, **kwargs):

def save(self):
log.info("[saving vocabulary to {}]".format(self.save_path))
-with self.save_path.open('wt') as f:
+with self.save_path.open('wt', encoding='utf8') as f:
for n in range(len(self)):
token = self._i2t[n]
cnt = self.freqs[token]
@@ -93,7 +93,7 @@ def load(self):
if self.load_path.is_file():
log.info("[loading vocabulary from {}]".format(self.load_path))
tokens, counts = [], []
-for ln in self.load_path.open('r'):
+for ln in self.load_path.open('r', encoding='utf8'):
token, cnt = ln.split('\t', 1)
tokens.append(token)
counts.append(int(cnt))
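Note: as the `load()` loop implies, the vocabulary file holds one `token<TAB>count` pair per line. A standalone sketch of that format (path and contents invented):

```python
# Hypothetical vocab file in the 'token<TAB>count' format that the
# save()/load() pair above uses.
from pathlib import Path

path = Path('vocab_demo.txt')
path.write_text('hello\t3\nworld\t1\n', encoding='utf8')

tokens, counts = [], []
for ln in path.open('r', encoding='utf8'):
    token, cnt = ln.split('\t', 1)
    tokens.append(token)
    counts.append(int(cnt))
print(dict(zip(tokens, counts)))  # {'hello': 3, 'world': 1}
```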
2 changes: 1 addition & 1 deletion deeppavlov/core/data/utils.py
@@ -235,7 +235,7 @@ def copytree(src: Path, dest: Path):

def load_vocab(vocab_path):
vocab_path = Path(vocab_path)
-with vocab_path.open() as f:
+with vocab_path.open(encoding='utf8') as f:
return f.read().split()


2 changes: 1 addition & 1 deletion deeppavlov/dataset_iterators/dstc2_ner_iterator.py
@@ -42,7 +42,7 @@ def __init__(self, data, dataset_path, seed=None, shuffle=False):
# TODO: include slot vals to dstc2.tar.gz
dataset_path = expand_path(dataset_path) / 'slot_vals.json'
self._build_slot_vals(dataset_path)
-with open(dataset_path) as f:
+with open(dataset_path, encoding='utf8') as f:
self._slot_vals = json.load(f)
for data_type in ['train', 'test', 'valid']:
bio_markup_data = self._preprocess(data.get(data_type, []))
6 changes: 3 additions & 3 deletions deeppavlov/dataset_readers/babi_reader.py
@@ -44,7 +44,7 @@ def read(self, file_path):
responses = self._get_responses(file_path, dialogs)

responses_path = Path(paths.deeppavlov_root) / 'responses.txt'
-responses_path.write_text('\n'.join(responses))
+responses_path.write_text('\n'.join(responses), encoding='utf8')

trainset = [{'context': u, 'response': r} for u, r in zip(utterances, responses)]

@@ -72,7 +72,7 @@ def filter_(dialogs):
filtered_.append(row)
return filtered_

-with open(file_path) as f:
+with open(file_path, encoding='utf8') as f:
dialogs = filter_([rm_index(row.split('\t')) for row in f.read().split('\n')])
# organize dialogs -> dialog_indices
prev_idx = -1
@@ -110,7 +110,7 @@ def _get_responses(self, file_path, dialogs=None):
#TODO: move save_vocab to babi_dataset
@staticmethod
def save_vocab(dialogs, fpath):
-with open(fpath, 'w') as f:
+with open(fpath, 'w', encoding='utf8') as f:
words = sorted(list(set(chain.from_iterable(
[instance['context'].split() for dialog in dialogs for instance in dialog]))))
f.write(' '.join(words))
2 changes: 1 addition & 1 deletion deeppavlov/dataset_readers/conll2003_reader.py
@@ -30,7 +30,7 @@ def read(self, dir_path: str, dataset_name='conll2003', provide_pos=False):

def parse_ner_file(self, file_name: Path):
samples = []
-with file_name.open() as f:
+with file_name.open(encoding='utf8') as f:
tokens = ['<DOCSTART>']
pos_tags = ['O']
tags = ['O']
4 changes: 2 additions & 2 deletions deeppavlov/dataset_readers/dstc2_reader.py
@@ -108,7 +108,7 @@ def _format_turn(turn):

@staticmethod
def _iter_file(file_path):
-for ln in open(file_path, 'rt'):
+for ln in open(file_path, 'rt', encoding='utf8'):
if ln.strip():
yield json.loads(ln)
else:
@@ -236,7 +236,7 @@ def _format_turn(turn):

@staticmethod
def _iter_file(file_path):
-for ln in open(file_path, 'rt'):
+for ln in open(file_path, 'rt', encoding='utf8'):
if ln.strip():
yield json.loads(ln)
else:
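Note: `_iter_file` reads the DSTC2 logs as JSON lines, with blank lines separating dialogs. The same pattern in miniature (sample data and field names invented):

```python
# JSON-lines iteration in the style of _iter_file above.
import json
from io import StringIO

raw = '{"text": "hi"}\n{"text": "hello"}\n\n{"text": "bye"}\n'
for ln in StringIO(raw):
    if ln.strip():
        print(json.loads(ln)['text'])
    # a blank line would mark a dialog boundary
```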
10 changes: 5 additions & 5 deletions deeppavlov/dataset_readers/insurance_reader.py
@@ -30,19 +30,19 @@ def download_data(self, data_path):

def _build_context2toks_vocabulary(self, train_f, val_f, test_f):
contexts = []
-with open(train_f, 'r') as f:
+with open(train_f, 'r', encoding='utf8') as f:
data = f.readlines()
for eli in data:
eli = eli[:-1]
c, _ = eli.split('\t')
contexts.append(c)
-with open(val_f, 'r') as f:
+with open(val_f, 'r', encoding='utf8') as f:
data = f.readlines()
for eli in data:
eli = eli[:-1]
_, c, _ = eli.split('\t')
contexts.append(c)
-with open(test_f, 'r') as f:
+with open(test_f, 'r', encoding='utf8') as f:
data = f.readlines()
for eli in data:
eli = eli[:-1]
@@ -55,7 +55,7 @@ def preprocess_data_train(self, fname):
positive_responses_pool = []
contexts = []
responses = []
-with open(fname, 'r') as f:
+with open(fname, 'r', encoding='utf8') as f:
data = f.readlines()
for eli in data:
eli = eli[:-1]
@@ -75,7 +75,7 @@ def preprocess_data_valid_test(self, fname):
neg_responses_pool = []
contexts = []
pos_responses = []
-with open(fname, 'r') as f:
+with open(fname, 'r', encoding='utf8') as f:
data = f.readlines()
for eli in data:
eli = eli[:-1]
4 changes: 3 additions & 1 deletion deeppavlov/dataset_readers/kvret_reader.py
@@ -111,6 +111,7 @@ def _check_dialog(dialog):
# return False
return True

+@staticmethod
def _filter_duplicates(dialog):
last_turn, last_utter = None, None
for turn in dialog:
@@ -121,7 +122,8 @@

@classmethod
def _iter_file(cls, file_path):
-data = json.load(open(file_path, 'rt'))
+with open(file_path, 'rt', encoding='utf8') as f:
+    data = json.load(f)
for i, sample in enumerate(data):
dialog = list(cls._filter_duplicates(sample['dialogue']))
if cls._check_dialog(dialog):
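Note: besides pinning the encoding, this hunk replaces `json.load(open(...))` with a `with` block, so the file handle is closed deterministically instead of waiting for the garbage collector. The same pattern as a helper:

```python
# Context-managed JSON loading: json.load(open(path)) leaves the handle open
# until GC (and can emit ResourceWarning); the with-block closes it at once.
import json

def load_json(path):
    with open(path, 'rt', encoding='utf8') as f:
        return json.load(f)
```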
3 changes: 2 additions & 1 deletion deeppavlov/dataset_readers/squad_dataset_reader.py
@@ -53,7 +53,8 @@ def read(self, dir_path: str, dataset='SQuAD'):

dataset = {}
for f in required_files:
-data = json.load((dir_path / f).open('r'))
+with dir_path.joinpath(f).open('r', encoding='utf8') as fp:
+    data = json.load(fp)
if f == 'dev-v1.1.json':
dataset['valid'] = data
else:
6 changes: 3 additions & 3 deletions deeppavlov/dataset_readers/typos_reader.py
@@ -45,7 +45,7 @@ def build(data_path: str):
@classmethod
def read(cls, data_path: str, *args, **kwargs):
fname = cls.build(data_path)
-with fname.open(newline='') as tsvfile:
+with fname.open(newline='', encoding='utf8') as tsvfile:
reader = csv.reader(tsvfile, delimiter='\t')
next(reader)
res = [(mistake, correct) for mistake, correct in reader]
@@ -73,7 +73,7 @@ def build(data_path: str):
data.append([typo.strip(), correct.strip()])

fname.parent.mkdir(parents=True, exist_ok=True)
-with fname.open('w', newline='') as tsvfile:
+with fname.open('w', newline='', encoding='utf8') as tsvfile:
writer = csv.writer(tsvfile, delimiter='\t')
for line in data:
writer.writerow(line)
@@ -108,7 +108,7 @@ def build(data_path: str):
@staticmethod
def read(data_path: str, *args, **kwargs):
fname = TyposKartaslov.build(data_path)
-with open(str(fname), newline='') as csvfile:
+with open(str(fname), newline='', encoding='utf8') as csvfile:
reader = csv.reader(csvfile, delimiter=';')
next(reader)
res = [(mistake, correct) for correct, mistake, weight in reader]
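Note: `newline=''` is the csv module's documented requirement (the reader handles line endings itself); these hunks only add the explicit encoding on top. A minimal round trip in the same style (file name and rows invented):

```python
# csv round trip with newline='' plus explicit encoding, mirroring
# TyposKartaslov.read above.
import csv

with open('typos_demo.csv', 'w', newline='', encoding='utf8') as f:
    csv.writer(f, delimiter=';').writerows(
        [('correct', 'mistake', 'weight'), ('привет', 'privet', '1')])

with open('typos_demo.csv', newline='', encoding='utf8') as f:
    reader = csv.reader(f, delimiter=';')
    next(reader)  # skip header
    print([(mistake, correct) for correct, mistake, weight in reader])
```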
2 changes: 1 addition & 1 deletion deeppavlov/evolve.py
@@ -277,7 +277,7 @@ def results_to_table(population, evolution, considered_metrics, result_file, res
for i in range(population_size):
with open(str(expand_path(Path(evolution.get_value_from_config(
population[i],
-evolution.main_model_path + ["save_path"])).parent.joinpath("out.txt"))), "r") as fout:
+evolution.main_model_path + ["save_path"])).parent.joinpath("out.txt"))), "r", encoding='utf8') as fout:
reports_data = fout.read().splitlines()[-2:]
reports = []
for j in range(2):
4 changes: 2 additions & 2 deletions deeppavlov/metrics/mrr_classification.py
@@ -29,7 +29,7 @@ def calc_mrr(rank):

def mrr_from_json(fname):
data = []
-with open(fname) as f:
+with open(fname, encoding='utf8') as f:
for line in f.readlines():
data += [json.loads(line)]
rank_i = []
Expand All @@ -55,7 +55,7 @@ def mrr_from_dict(data):

def make_json_predictions(fname, predictions):
data = []
-with open(fname) as f:
+with open(fname, encoding='utf8') as f:
for line in f.readlines():
data += [json.loads(line)]

2 changes: 1 addition & 1 deletion deeppavlov/models/embedders/dict_embedder.py
@@ -51,7 +51,7 @@ def load(self):
else:
log.info('Loading existing dictionary of embeddings from {}'.format(self.load_path))

-with open(str(self.load_path)) as fin:
+with open(self.load_path, encoding='utf8') as fin:
for line in fin:
values = line.rsplit(sep=' ', maxsplit=self.dim)
assert (len(values) == self.dim + 1)
2 changes: 1 addition & 1 deletion deeppavlov/models/embedders/glove_embedder.py
@@ -49,7 +49,7 @@ def load(self, *args, **kwargs):
"""

# Check that header with n_words emb_dim present
-with open(self.load_path) as f:
+with open(self.load_path, encoding='utf8') as f:
header = f.readline()
if len(header.split()) != 2:
raise RuntimeError('The GloVe file must start with number_of_words embeddings_dim line! '
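Note: the loader expects a word2vec-style first line, `number_of_words embeddings_dim`, which stock GloVe dumps lack. A sketch of the check it performs, against a tiny hand-made file (name and vectors invented):

```python
# Header check in the style of GloVeEmbedder.load above.
with open('glove_demo.txt', 'w', encoding='utf8') as f:
    f.write('2 3\nhello 0.1 0.2 0.3\nworld 0.4 0.5 0.6\n')

with open('glove_demo.txt', encoding='utf8') as f:
    header = f.readline()
    if len(header.split()) != 2:
        raise RuntimeError('The GloVe file must start with number_of_words embeddings_dim line!')
```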
2 changes: 1 addition & 1 deletion deeppavlov/models/evolution/Results_analysis.ipynb
@@ -37,7 +37,7 @@
"KEY_MAIN_MODEL = \"main\"\n",
"POPULATION_SIZE = 2\n",
" \n",
"with open(CONFIG_FILE, \"r\") as f:\n",
"with open(CONFIG_FILE, \"r\", encoding='utf8') as f:\n",
" basic_params = json.load(f)\n",
"\n",
"set_deeppavlov_root(basic_params)\n",
4 changes: 2 additions & 2 deletions deeppavlov/models/go_bot/network.py
@@ -330,13 +330,13 @@ def save(self, *args, **kwargs):
def save_params(self):
path = str(self.save_path.with_suffix('.json').resolve())
log.info('[saving parameters to {}]'.format(path))
-with open(path, 'w') as fp:
+with open(path, 'w', encoding='utf8') as fp:
json.dump(self.opt, fp)

def load_params(self):
path = str(self.load_path.with_suffix('.json').resolve())
log.info('[loading parameters from {}]'.format(path))
-with open(path, 'r') as fp:
+with open(path, 'r', encoding='utf8') as fp:
params = json.load(fp)
for p in self.GRAPH_PARAMS:
if self.opt.get(p) != params.get(p):
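Note: `save_params`/`load_params` persist the network's hyperparameters as JSON next to the checkpoint, and loading rejects a file whose graph-defining options disagree with the current config. A sketch of that guard — `GRAPH_PARAMS` here is an invented subset, not the real list from `go_bot/network.py`:

```python
# Hypothetical version of the load-time compatibility check.
import json

GRAPH_PARAMS = ['hidden_size', 'obs_size']  # assumed names, for illustration

def load_params(opt, path):
    with open(path, 'r', encoding='utf8') as fp:
        params = json.load(fp)
    for p in GRAPH_PARAMS:
        if opt.get(p) != params.get(p):
            raise ValueError('`{}` differs from the saved model'.format(p))
    return params
```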
9 changes: 5 additions & 4 deletions deeppavlov/models/go_bot/templates.py
@@ -175,13 +175,14 @@ def templates(self):
return self._templates

def load(self, filename):
-for ln in open(filename, 'r'):
-    act, template = ln.strip('\n').split('\t', 1)
-    self.__setitem__(act, self.ttype.from_str(template))
+with open(filename, 'r', encoding='utf8') as fp:
+    for ln in fp:
+        act, template = ln.strip('\n').split('\t', 1)
+        self.__setitem__(act, self.ttype.from_str(template))
return self

def save(self, filename):
-with open(filename, 'w') as outfile:
+with open(filename, 'w', encoding='utf8') as outfile:
for act in sorted(self.actions):
template = self.__getitem__(act)
outfile.write('{}\t{}\n'.format(act, template))
2 changes: 1 addition & 1 deletion deeppavlov/models/preprocessors/squad_preprocessor.py
@@ -227,7 +227,7 @@ def fit(self, contexts, questions, *args, **kwargs):
else:
raise RuntimeError("SquadVocabEmbedder::fit: Unknown level: {}".format(self.level))

-with (self.emb_folder / self.emb_file_name).open('r') as femb:
+with (self.emb_folder / self.emb_file_name).open('r', encoding='utf8') as femb:
emb_voc_size, self.emb_dim = map(int, femb.readline().split())
for line in tqdm(femb, total=emb_voc_size):
line_split = line.strip().split(' ')
