
merge master
chenjunwen committed Aug 8, 2019
1 parent 72b961c commit 2a4f7bb
Showing 82 changed files with 839 additions and 545 deletions.
38 changes: 38 additions & 0 deletions .github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,38 @@
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: ''
assignees: ''

---

**Describe the bug**
A clear and concise description of what the bug is.

**To Reproduce**
Steps to reproduce the behavior:
1. Go to '...'
2. Click on '....'
3. Scroll down to '....'
4. See error

**Expected behavior**
A clear and concise description of what you expected to happen.

**Screenshots**
If applicable, add screenshots to help explain your problem.

**Desktop (please complete the following information):**
- OS: [e.g. iOS]
- Browser [e.g. chrome, safari]
- Version [e.g. 22]

**Smartphone (please complete the following information):**
- Device: [e.g. iPhone6]
- OS: [e.g. iOS8.1]
- Browser [e.g. stock browser, safari]
- Version [e.g. 22]

**Additional context**
Add any other context about the problem here.
20 changes: 20 additions & 0 deletions .github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,20 @@
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: ''
assignees: ''

---

**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]

**Describe the solution you'd like**
A clear and concise description of what you want to happen.

**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.

**Additional context**
Add any other context or screenshots about the feature request here.
4 changes: 4 additions & 0 deletions .pip/pip.conf
@@ -0,0 +1,4 @@
[global]
index-url = https://mirrors.ustc.edu.cn/pypi/web/simple
extra-index-url = https://pypi.tuna.tsinghua.edu.cn/simple
format = columns
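
The file above is plain ini-format pip configuration pointing at two PyPI mirrors. As a quick sanity check (a minimal sketch; it assumes the file sits where pip searches for configuration, e.g. pointed at via the `PIP_CONFIG_FILE` environment variable), the settings can be read back with the standard library:

```python
# Minimal sketch: parse .pip/pip.conf with configparser to confirm the
# ini syntax; pip itself picks the file up from its config search path
# or via the PIP_CONFIG_FILE environment variable.
import configparser

cfg = configparser.ConfigParser()
cfg.read(".pip/pip.conf")
print(cfg["global"]["index-url"])        # USTC mirror
print(cfg["global"]["extra-index-url"])  # Tsinghua TUNA mirror
```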
29 changes: 27 additions & 2 deletions README.md
@@ -1,19 +1,24 @@
-# DELTA - a DEep Language Technology plAtform
<div align="center">
<img src="docs/delta_logo_1.png">
</div>


[![Build Status](https://travis-ci.org/didi/delta.svg?branch=master)](https://travis-ci.org/didi/delta)
[![Contributions welcome](https://img.shields.io/badge/contributions-welcome-brightgreen.svg)](CONTRIBUTING.md)
[![GitHub top language](https://img.shields.io/github/languages/top/didi/delta)](https://img.shields.io/github/languages/top/didi/delta)
[![GitHub Issues](https://img.shields.io/github/issues/didi/delta.svg)](https://github.com/didi/delta/issues)
[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/didi/delta/blob/master/LICENSE)

+# DELTA - A DEep learning Language Technology plAtform


## What is DELTA?

**DELTA** is a deep learning based end-to-end natural language and speech processing platform.
DELTA aims to provide easy and fast experiences for using, deploying, and developing natural language processing and speech models
for both academia and industry use cases. DELTA is mainly implemented using TensorFlow and Python 3.

-For details of DELTA, please refer to this [paper](docs/DELTA.pdf).
+For details of DELTA, please refer to this [paper](https://arxiv.org/abs/1908.01853).

## What can DELTA do?

@@ -39,6 +44,7 @@ It helps you to train, develop, and deploy NLP and/or speech models, featuring:
- [Benchmarks](#benchmarks)
- [FAQ](#faq)
- [Contributing](#contributing)
- [References](#references)
- [License](#license)
- [Acknowledgement](#acknowledgement)

@@ -203,6 +209,25 @@ See [FAQ](docs/faq.md) for more information.
Any contribution is welcome. All issues and pull requests are highly appreciated!
For more details, please refer to [the contribution guide](CONTRIBUTING.md).

## References

Please cite this [paper](https://arxiv.org/abs/1908.01853) when referencing DELTA.
```
@ARTICLE{delta,
author = {{Han}, Kun and {Chen}, Junwen and {Zhang}, Hui and {Xu}, Haiyang and
{Peng}, Yiping and {Wang}, Yun and {Ding}, Ning and {Deng}, Hui and
{Gao}, Yonghu and {Guo}, Tingwei and {Zhang}, Yi and {He}, Yahao and
{Ma}, Baochang and {Zhou}, Yulong and {Zhang}, Kangli and {Liu}, Chao and
{Lyu}, Ying and {Wang}, Chenxi and {Gong}, Cheng and {Wang}, Yunbo and
{Zou}, Wei and {Song}, Hui and {Li}, Xiangang},
title = "{DELTA: A DEep learning based Language Technology plAtform}",
journal = {arXiv e-prints},
year = "2019",
url = {https://arxiv.org/abs/1908.01853},
}
```

## License

The DELTA platform is licensed under the terms of the Apache license.
27 changes: 16 additions & 11 deletions delta/data/feat/speech_ops.py
@@ -269,10 +269,12 @@ def extract_feature(waveforms, params):
], 0)
return feats # shape [nframes, feature_size, channels]


def _new_tensor_array(name, size, dtype=None):
''' create empty TensorArray which can store size elements.'''
return tf.TensorArray(dtype, size, name=name)


def batch_extract_feature(waveforms, params):
''' waveforms: [batch, samples, audio_channels]
return: features [batch, nframes, feat_size, channels]
@@ -326,21 +328,23 @@ def splice(feat, left_context, right_context):
https://github.com/kaldi-asr/kaldi/src/feat/feature-functions.cc#L205:6
'''

-def _loop_continue(time, end_time, context, unused_left_context, right_context,
-unused_output_tas):
+def _loop_continue(time, end_time, context, unused_left_context,
+right_context, unused_output_tas):
del unused_output_tas
del unused_left_context
return time < end_time

-def _loop_body(time, end_time, context, left_context, right_context, output_tas):
+def _loop_body(time, end_time, context, left_context, right_context,
+output_tas):
shape = tf.shape(context)
B, _, D = shape[0], shape[1], shape[2]
N = (1 + left_context + right_context) * D

new_feat = context[:, time:time + left_context + 1 + right_context, :]
new_feat = tf.reshape(new_feat, [B, N])
new_output_tas = output_tas.write(time, new_feat)
-return (time + 1, end_time, context, left_context, right_context, new_output_tas)
+return (time + 1, end_time, context, left_context, right_context,
+new_output_tas)

with tf.control_dependencies([
tf.assert_greater_equal(left_context, 0),
@@ -360,13 +364,14 @@ def _loop_body(time, end_time, context, left_context, right_context, output_tas)
shape_invariants = tf.contrib.framework.nest.map_structure(
lambda t: tf.TensorShape(None), loop_vars)

-(time, end_time, context, left_context, right_context, output_tas) = tf.while_loop(
-_loop_continue,
-_loop_body,
-loop_vars=loop_vars,
-shape_invariants=shape_invariants,
-parallel_iterations=parallel_iterations,
-swap_memory=False)
+(time, end_time, context, left_context, right_context,
+output_tas) = tf.while_loop(
+_loop_continue,
+_loop_body,
+loop_vars=loop_vars,
+shape_invariants=shape_invariants,
+parallel_iterations=parallel_iterations,
+swap_memory=False)
del context
del left_context
del right_context
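
For readers unfamiliar with feature splicing, here is a minimal NumPy sketch of the operation the while loop above implements (following Kaldi's `splice-feats` convention of repeating edge frames; an illustration only, not DELTA's API):

```python
# Illustration only: each output frame concatenates the current frame
# with `left` frames before it and `right` frames after it; the first
# and last frames are repeated to pad the edges.
import numpy as np

def splice_np(feat, left, right):
    """feat: [nframes, dim] -> [nframes, (left + 1 + right) * dim]"""
    padded = np.concatenate([
        np.repeat(feat[:1], left, axis=0),    # pad head with frame 0
        feat,
        np.repeat(feat[-1:], right, axis=0),  # pad tail with the last frame
    ], axis=0)
    n = len(feat)
    return np.concatenate(
        [padded[t:t + n] for t in range(left + 1 + right)], axis=1)

feat = np.arange(12, dtype=np.float32).reshape(4, 3)  # 4 frames, dim 3
print(splice_np(feat, 1, 1).shape)  # (4, 9)
```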
1 change: 0 additions & 1 deletion delta/data/preprocess/text_ops.py
@@ -83,4 +83,3 @@ def char_cut_tf(input_str):
raise Exception("Error input shape for input_str.")
output_str = tf.strings.strip(output_str)
return output_str

22 changes: 12 additions & 10 deletions delta/data/preprocess/text_seq2seq_preparer.py
@@ -50,7 +50,9 @@ def prepare_raw_data(self, pre_process_pipeline):
for mode in self.all_modes:
paths = self.config["data"][mode]['paths']
paths = [paths['source'], paths['target']]
-paths_after_pre_process = [[one_path + ".after" for one_path in path] for path in paths]
+paths_after_pre_process = [
+[one_path + ".after" for one_path in path] for path in paths
+]
logging.debug(
"paths_after_pre_process: {}".format(paths_after_pre_process))

@@ -59,9 +61,9 @@ def prepare_raw_data(self, pre_process_pipeline):
for one_path_text, one_path_target, \
one_path_text_after, one_path_target_after in zip(*paths, *paths_after_pre_process):
self.prepare_one_raw_data((one_path_text, one_path_target),
-(one_path_text_after, one_path_target_after), mode,
-infer_without_label, pre_process_pipeline,
-all_texts, all_labels)
+(one_path_text_after, one_path_target_after),
+mode, infer_without_label,
+pre_process_pipeline, all_texts, all_labels)
return all_texts, all_labels

def load_a_raw_file(self, one_path, mode, infer_without_label):
@@ -94,12 +96,12 @@ def prepare_label_vocab(self, all_labels):
if os.path.exists(self.label_vocab_file_paths[i]) and \
self.use_custom_vocab:
logging.info("Reuse label vocab file: {}".format(
self.label_vocab_file_paths[i]))
else:
prepare_vocab(
self.label_vocab_file_paths[i],
all_labels[i],
min_frequency=self.vocab_min_frequency,
use_default_dict=True)
logging.info("Generate label vocab file: {}".format(
self.label_vocab_file_paths[i]))
20 changes: 11 additions & 9 deletions delta/data/task/base_text_task.py
@@ -43,7 +43,8 @@ def __init__(self, config, mode):
self.data_config = config['data']
self.task_config = self.data_config['task']

-self.infer_no_label = self.data_config[utils.INFER].get('infer_no_label', False)
+self.infer_no_label = self.data_config[utils.INFER].get(
+'infer_no_label', False)
if self.mode == utils.INFER and self.infer_no_label:
self.infer_without_label = True
else:
@@ -92,20 +93,21 @@ def pre_process_pipeline(self, input_sentences):
main_root = os.environ["MAIN_ROOT"]
dict_path = os.path.join(main_root,
"tools/cppjieba/dict/jieba.dict.utf8")
-hmm_path = os.path.join(main_root, "tools/cppjieba/dict/hmm_model.utf8")
+hmm_path = os.path.join(main_root,
+"tools/cppjieba/dict/hmm_model.utf8")
user_dict_path = os.path.join(main_root,
"tools/cppjieba/dict/user.dict.utf8")
idf_path = os.path.join(main_root, "tools/cppjieba/dict/idf.utf8")
stop_word_path = os.path.join(main_root,
"tools/cppjieba/dict/stop_words.utf8")
batch = py_x_ops.jieba_cut(
input_sentences,
hmm=True,
dict_path=dict_path,
hmm_path=hmm_path,
user_dict_path=user_dict_path,
idf_path=idf_path,
stop_word_path=stop_word_path)
else:
batch = char_cut_tf(input_sentences)
return batch
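
As a plain-Python reference for the segmentation the cppjieba-backed op performs (an illustration with the standalone `jieba` package, not DELTA's `py_x_ops.jieba_cut` custom op, which wraps cppjieba with the dictionary files listed above):

```python
# Illustration only: Chinese word segmentation via `pip install jieba`.
import jieba

print(list(jieba.cut("我爱自然语言处理")))
# e.g. ['我', '爱', '自然语言', '处理'] (the exact split depends on the dictionary)
```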
11 changes: 7 additions & 4 deletions delta/data/task/speaker_cls_task.py
@@ -197,6 +197,7 @@ def next_part_utts(self, part_size):
utt_meta = self.meta.utts[utt_key]
yield (utt_key, utt_meta)


class DataQueueAsync():
''' Sample from raw data. '''

@@ -278,8 +279,9 @@ def append_requests(self):
Returns:
a bool, True if still has unconsumed data, False otherwise.
'''
-pool_args = [(None, x) for x in
-self.sampler.next_part_utts(self.pool_chunk_size)]
+pool_args = [
+(None, x) for x in self.sampler.next_part_utts(self.pool_chunk_size)
+]
self.pool_res = self.pool.imap_unordered(
self.sampler.get_bare_sampler(), pool_args, chunksize=100)
if pool_args:
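
The surrounding pattern, chunked argument tuples fed to `Pool.imap_unordered`, is plain `multiprocessing`; a self-contained sketch with a toy worker (hypothetical, not DELTA's sampler):

```python
# Toy sketch: imap_unordered yields results in completion order, so one
# slow chunk does not block the rest of the pool.
from multiprocessing import Pool

def work(args):
    seed, utt = args  # mirrors the (None, utt) tuples built above
    return utt * 2    # stands in for the real per-utterance sampling

if __name__ == "__main__":
    with Pool(processes=4) as pool:
        jobs = [(None, i) for i in range(10)]
        for result in pool.imap_unordered(work, jobs, chunksize=2):
            print(result)
```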
@@ -549,13 +551,14 @@ def dataset(self, mode, batch_size, num_epoch): # pylint: disable=unused-argument
if mode == utils.TRAIN:
data = data.shuffle(buffer_size=buffer_size)
if self.uniform_resample:

def class_func(inputs, texts, labels, filenames, clip_ids, soft_labels):
''' Return the label of a sample tuple. '''
return labels
target_dist = tf.ones((self.num_class,), dtype=tf.float32) / \
self.num_class
-data = data.apply(tf.data.experimental.rejection_resample(
-class_func, target_dist))
+data = data.apply(
+tf.data.experimental.rejection_resample(class_func, target_dist))

def make_example(inputs, texts, labels, filenames, clip_ids, soft_labels):
features = {
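
For context, `tf.data.experimental.rejection_resample` rebalances a dataset toward a target class distribution by accepting or rejecting elements. A standalone sketch on hypothetical 3-class toy data (TF 1.x-era API; to the best of my knowledge the transformation emits `(class, example)` pairs, which callers usually map away):

```python
# Standalone sketch, toy data: resample a 3-class dataset toward a
# uniform class distribution (assumption: the transformed dataset
# yields (class, example) pairs, hence the trailing map).
import tensorflow as tf

num_class = 3
features = tf.random.uniform([1000, 8])
labels = tf.random.uniform([1000], maxval=num_class, dtype=tf.int32)
data = tf.data.Dataset.from_tensor_slices((features, labels))

target_dist = tf.ones((num_class,), dtype=tf.float32) / num_class
data = data.apply(tf.data.experimental.rejection_resample(
    class_func=lambda x, y: y, target_dist=target_dist))
data = data.map(lambda _, example: example)  # drop the class label again
```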
2 changes: 1 addition & 1 deletion delta/data/task/text_cls_task.py
@@ -51,7 +51,7 @@ def __init__(self, config, mode):
self.dense_npy = config["data"][self.mode]["dense_npy"]
self.paths = self.data_config[mode]['paths']
self.paths_after_pre_process = [
one_path + ".after" for one_path in self.paths
]
self.prepare()

10 changes: 5 additions & 5 deletions delta/data/task/text_cls_task_test.py
@@ -76,7 +76,7 @@ def test_english(self):
[data["input_x_dict"]["input_x"], data["input_y_dict"]["input_y"]])
logging.debug(res[0][0][:5])
logging.debug(res[1][0][:5])
-self.assertAllEqual(res[0][0][:5], [6, 7, 8, 0, 0])
+self.assertAllEqual(res[0][0][:5], [3, 4, 5, 0, 0])
self.assertEqual(np.shape(res[1]), (32, class_num))

# test online data
@@ -181,7 +181,7 @@ def test_chinese_split_by_space(self):
[data["input_x_dict"]["input_x"], data["input_y_dict"]["input_y"]])
logging.debug(res[0][0])
logging.debug(res[1][0])
-self.assertAllEqual(res[0][0][:5], [4, 5, 0, 0, 0])
+self.assertAllEqual(res[0][0][:5], [2, 3, 0, 0, 0])
self.assertEqual(np.shape(res[1]), (32, class_num))

# test online data
@@ -232,7 +232,7 @@ def test_chinese_word(self):
[data["input_x_dict"]["input_x"], data["input_y_dict"]["input_y"]])
logging.debug(res[0][0])
logging.debug(res[1][0])
-self.assertAllEqual(res[0][0][:5], [4, 5, 0, 0, 0])
+self.assertAllEqual(res[0][0][:5], [2, 0, 0, 0, 0])
self.assertEqual(np.shape(res[1]), (32, class_num))

# test online data
@@ -289,7 +289,7 @@ def test_chinese_char(self):
])
logging.debug(res[0][0])
logging.debug(res[1][0])
-self.assertAllEqual(res[0][0][:5], [5, 6, 0, 0, 0])
+self.assertAllEqual(res[0][0][:5], [2, 3, 4, 0, 0])
self.assertEqual(np.shape(res[0]), (32, max_len))
self.assertEqual(np.shape(res[1]), (32, class_num))
self.assertEqual(np.shape(res[2]), (32,))
@@ -345,7 +345,7 @@ def test_chinese_with_split_token(self):
])
logging.debug(res[0][0][:10])
logging.debug(res[1][0])
-self.assertAllEqual(res[0][0][:10], [4, 5, 0, 6, 9, 10, 0, 0, 0, 0])
+self.assertAllEqual(res[0][0][:10], [2, 0, 0, 0, 6, 2, 0, 0, 8, 0])#[2,3,0,0,6,2,0,0,8,0]
self.assertEqual(np.shape(res[0]), (32, max_len))
self.assertEqual(np.shape(res[1]), (32, class_num))
self.assertEqual(np.shape(res[2]), (32,))
2 changes: 1 addition & 1 deletion delta/data/task/text_match_task.py
@@ -44,7 +44,7 @@ def __init__(self, config, mode):

self.paths = self.data_config[mode]['paths']
self.paths_after_pre_process = [
one_path + ".after" for one_path in self.paths
]

self.prepare()
