This repository has been archived by the owner on Sep 9, 2020. It is now read-only.

Merge branch 'master' into kleschenko-patch-1
farafarafara committed Aug 14, 2019
2 parents 3a3019d + 2092ca6 commit 9ecbb2c
Showing 17 changed files with 731 additions and 43 deletions.
7 changes: 7 additions & 0 deletions CHANGES.rst
@@ -1,3 +1,10 @@
1.16.2 (2019 Jul 25)
====================

Bugfixes
-------------
* Fix ``utf-8`` encoding error for Japanese data sets.

1.16.1 (2019 May 27)
====================

20 changes: 15 additions & 5 deletions DEV_README.rst
@@ -14,6 +14,10 @@ Install package in virtualenv::

Now the ``batch_scoring`` script should be available on your PATH.

Make sure your pip version is lower than 10.0.0; if it is newer, downgrade it::

$ pip install pip==9.0.1

You can also create virtualenvs with different python versions::

$ mkvirtualenv batch_scoring_3 -p /usr/bin/python3.5
@@ -51,11 +55,12 @@ Release
5. build the **PyInstaller** and **Offlinebundle** for Linux

1. build the image for PyInstaller with ``docker-compose build centos5pyinstaller``
2. build both releases with ``make build_release_dockerized``. This will add the following to the dist dir:
2. build both releases with ``make build_release_dockerized`` (this does not work on macOS; use Linux instead). This will add the following to the dist dir:

- datarobot_batch_scoring_<TAG>_offlinebundle.zip
- datarobot_batch_scoring_<TAG>_offlinebundle.tar
- datarobot_batch_scoring_<TAG>_executables.Linux.x86_64.tar
- datarobot_batch_scoring_<TAG>_executables.Linux.x86_64.zip

6. Collect the **PyInstaller** artifacts for OSX and Windows from S3

@@ -68,14 +73,19 @@ Release

7. upload the builds produced by steps 5 and 6

1. find the release page for the version that was pushed at https://github.com/datarobot/batch-scoring/releases
2. ``edit`` the release on github and attach the 5 files:
1. find the new version tag that was pushed at https://github.com/datarobot/batch-scoring/tags
2. ``create`` the release on github for the selected tag and attach the 6 files:

- datarobot_batch_scoring_<TAG>_executables.Linux.x86_64.tar
- datarobot_batch_scoring_<TAG>_executables.Linux.x86_64.zip
- datarobot_batch_scoring_<TAG>_offlinebundle.zip
- datarobot_batch_scoring_<TAG>_offlinebundle.tar
- datarobot_batch_scoring_<TAG>_executables.Windows.x86_64.zip
- datarobot_batch_scoring_<TAG>_executables.OSX.x86_64.tar
- datarobot_batch_scoring_<TAG>_executables.Linux.x86_64.tar
- datarobot_batch_scoring_<TAG>_executables.Windows.x86_64.zip

3. update release subject and description
4. publish new release


Offline Bundle
--------------
2 changes: 1 addition & 1 deletion datarobot_batch_scoring/__init__.py
@@ -1 +1 @@
__version__ = '1.16.1'
__version__ = '1.16.3'
19 changes: 14 additions & 5 deletions datarobot_batch_scoring/api_response_handlers/pred_api_v10.py
@@ -112,7 +112,8 @@ def rows_generator():

def prediction_explanation_fields(
result_sorted,
prediction_explanations_key
prediction_explanations_key,
num_prediction_explanations,
):
""" Generate prediction explanations fields
@@ -122,17 +123,20 @@ def prediction_explanation_fields(
list of results sorted by rowId
prediction_explanations_key : str
key of results by which explanation fields can be found
num_prediction_explanations : int
the total number of prediction explanations in the response.
Note that if the output contains fewer explanations than this limit,
the remaining columns are padded with empty values so the response
keeps a fixed column count.
Returns
-------
header: list[str]
row_generator: iterator
"""
headers = []
single_row = result_sorted[0]

num_reason_codes = len(single_row[prediction_explanations_key])
for num in range(1, num_reason_codes + 1):
for num in range(1, num_prediction_explanations + 1):
headers += [
'explanation_{0}_feature'.format(num),
'explanation_{0}_strength'.format(num)
@@ -146,6 +150,9 @@ def rows_generator():
raw_reason_code['feature'],
raw_reason_code['strength']
]
# pad response with empty values in case not all prediction
# explanations are present
reason_codes += [''] * (len(headers) - len(reason_codes))
yield reason_codes

return headers, rows_generator()
@@ -219,6 +226,7 @@ def format_data(result, batch, **opts):
skip_row_id = opts.get('skip_row_id')
fast_mode = opts.get('fast_mode')
input_delimiter = opts.get('delimiter')
max_prediction_explanations = opts.get('max_prediction_explanations')

result_sorted = sorted(
result,
@@ -258,7 +266,8 @@ def _find_prediction_explanations_key():
fields.append(
prediction_explanation_fields(
result_sorted,
prediction_explanations_key
prediction_explanations_key,
max_prediction_explanations,
)
)

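The hunks above size the explanation headers to the requested ``num_prediction_explanations`` and pad short rows. A minimal standalone sketch of that padding behavior (the ``explanations`` key and the sample rows are made up for illustration, not taken from the API)::

    def explanation_fields(result_sorted, key, num_explanations):
        # Headers are always sized to the requested number of explanations.
        headers = []
        for num in range(1, num_explanations + 1):
            headers += ['explanation_{0}_feature'.format(num),
                        'explanation_{0}_strength'.format(num)]

        def rows_generator():
            for row in result_sorted:
                cells = []
                for code in row[key]:
                    cells += [code['feature'], code['strength']]
                # Pad with empty strings so every row matches the header width,
                # even when fewer explanations were returned than requested.
                cells += [''] * (len(headers) - len(cells))
                yield cells

        return headers, rows_generator()

    results = [
        {'explanations': [{'feature': 'age', 'strength': 0.42}]},
        {'explanations': []},
    ]
    headers, rows = explanation_fields(results, 'explanations', 2)
    print(headers)      # 4 header columns
    print(list(rows))   # both rows have 4 cells; missing values are ''
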
4 changes: 3 additions & 1 deletion datarobot_batch_scoring/batch_scoring.py
@@ -202,7 +202,9 @@ def run_batch_predictions(base_url, base_headers, user, pwd,
lid, keep_cols, n_retry, delimiter,
dataset, pred_name, ui, fast_mode,
encoding, skip_row_id, output_delimiter,
pred_threshold_name, pred_decision_name))
pred_threshold_name, pred_decision_name,
max_prediction_explanations)
)

n_batches_checkpointed_init = len(ctx.db['checkpoints'])
ui.debug('number of batches checkpointed initially: {}'
56 changes: 34 additions & 22 deletions datarobot_batch_scoring/reader.py
@@ -17,7 +17,9 @@
REPORT_INTERVAL,
ProgressQueueMsg)
from datarobot_batch_scoring.detect import Detector
from datarobot_batch_scoring.utils import get_rusage, SerializableDialect
from datarobot_batch_scoring.utils import (get_rusage,
gzip_with_encoding,
SerializableDialect)


if six.PY2:
@@ -383,7 +385,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):
self.p.terminate()


def sniff_dialect(sample, encoding, sep, skip_dialect, ui):
def sniff_dialect(sample, sep, skip_dialect, ui):
t1 = time()
try:
if skip_dialect:
@@ -396,11 +398,11 @@ def sniff_dialect(sample, encoding, sep, skip_dialect, ui):
dialect = csv.get_dialect('dataset_dialect')
else:
sniffer = csv.Sniffer()
dialect = sniffer.sniff(sample.decode(encoding), delimiters=sep)
dialect = sniffer.sniff(sample, delimiters=sep)
ui.debug('investigate_encoding_and_dialect - seconds to detect '
'csv dialect: {}'.format(time() - t1))
except csv.Error:
decoded_one = sample.decode(encoding)
decoded_one = sample
t2 = time()
detector = Detector()
delimiter, resampled = detector.detect(decoded_one)
@@ -443,12 +445,8 @@ def investigate_encoding_and_dialect(dataset, sep, ui, fast=False,
sample_size = DETECT_SAMPLE_SIZE_FAST
else:
sample_size = DETECT_SAMPLE_SIZE_SLOW

is_gz = dataset.endswith('.gz')
opener, mode = (
(gzip.open, 'rb') if is_gz
else (open, ('rU' if six.PY2 else 'rb'))
)
opener, mode = _get_opener_and_mode(is_gz)
with opener(dataset, mode) as dfile:
sample = dfile.read(sample_size)

@@ -462,8 +460,12 @@
encoding = encoding.lower()
sample[:1000].decode(encoding) # Fail here if the encoding is invalid

opener, mode = _get_opener_and_mode(is_gz, text=True)
with opener(dataset, mode, encoding=encoding) as dfile:
sample = dfile.read(sample_size)

try:
dialect = sniff_dialect(sample, encoding, sep, skip_dialect, ui)
dialect = sniff_dialect(sample, sep, skip_dialect, ui)
except csv.Error as ex:
ui.fatal(ex)
if len(sample) < 10:
@@ -518,15 +520,11 @@ def auto_sampler(dataset, encoding, ui):

sample_size = AUTO_SAMPLE_SIZE
is_gz = dataset.endswith('.gz')
opener, mode = (gzip.open, 'rb') if is_gz else (open, 'rU')
with opener(dataset, mode) as dfile:
opener, mode = _get_opener_and_mode(is_gz, text=True)
with opener(dataset, mode, encoding=encoding) as dfile:
sample = dfile.read(sample_size)
if six.PY3 and not is_gz:
sample = sample.encode(encoding or 'utf-8')

ingestable_sample = sample.decode(encoding)
size_bytes = sys.getsizeof(ingestable_sample.encode('utf-8'))

size_bytes = sys.getsizeof(sample.encode('utf-8'))
if size_bytes < (sample_size * 0.75):
# if dataset is tiny, don't bother auto sampling.
ui.info('auto_sampler: total time seconds - {}'.format(time() - t0))
@@ -536,15 +534,15 @@

if six.PY3:
buf = io.StringIO()
buf.write(ingestable_sample)
buf.write(sample)
else:
buf = StringIO.StringIO()
buf.write(sample)
buf.write(sample.encode('utf-8'))
buf.seek(0)

file_lines, csv_lines = 0, 0
dialect = csv.get_dialect('dataset_dialect')
fd = Recoder(buf, encoding)
reader = csv.reader(fd, dialect=dialect, delimiter=dialect.delimiter)
reader = csv.reader(buf, dialect=dialect, delimiter=dialect.delimiter)
line_pos = []
for _ in buf:
file_lines += 1
@@ -559,7 +557,6 @@
# PRED-1240 there's no guarantee that we got _any_ fully formed lines.
# If so, the dataset is super wide, so we only send 10 rows at a time
return AUTO_SAMPLE_FALLBACK

try:
for _ in reader:
csv_lines += 1
@@ -596,3 +593,18 @@ def peek_row(dataset, delimiter, ui, fast_mode, encoding):
except StopIteration:
raise ValueError('Cannot peek first row from {}'.format(dataset))
return batch


def _get_opener_and_mode(is_gz, text=False):
if is_gz:
if six.PY2:
return (gzip_with_encoding, 'r' if text else 'rb')
else:
return (gzip.open, 'rt' if text else 'rb')
else:
mode = 'rt' if text else 'rb'
if six.PY2:
from io import open as io_open
return (io_open, mode)
else:
return (open, mode)
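
``investigate_encoding_and_dialect`` and ``auto_sampler`` now re-open the dataset in text mode with an explicit encoding instead of decoding bytes by hand. A rough Python 3 sketch of the opener selection above (the Python 2 branches are left out, and ``data.csv.gz`` is an illustrative path)::

    import gzip

    def get_opener_and_mode(is_gz, text=False):
        # Python 3 half of the _get_opener_and_mode helper above, simplified:
        # both gzip.open and the builtin open accept an encoding keyword
        # when used in text ('rt') mode.
        if is_gz:
            return gzip.open, ('rt' if text else 'rb')
        return open, ('rt' if text else 'rb')

    dataset = 'data.csv.gz'
    opener, mode = get_opener_and_mode(dataset.endswith('.gz'), text=True)
    with opener(dataset, mode, encoding='utf-8') as dfile:
        sample = dfile.read(1024)   # already-decoded str, ready for csv.Sniffer
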
16 changes: 16 additions & 0 deletions datarobot_batch_scoring/utils.py
@@ -1,9 +1,12 @@
import codecs
import csv
import getpass
import gzip
import io
import logging
import os
import sys
from contextlib import contextmanager
from collections import namedtuple
from functools import partial
from gzip import GzipFile
@@ -518,3 +521,16 @@ def state(self, status):

def state_name(self, s=None):
return self.state_names[s or self.state]


@contextmanager
def gzip_with_encoding(data, mode, encoding=None):
""" Decorator to support encoding for gzip in PY2
"""
if encoding is not None:
reader = codecs.getreader(encoding)
with gzip.open(data, mode) as f:
yield reader(f)
else:
with gzip.open(data, mode) as f:
yield f
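
On Python 2, ``gzip.open`` has no ``encoding`` argument, so the helper wraps the binary stream in a ``codecs`` reader. A small usage sketch (the file name and encoding are illustrative)::

    from datarobot_batch_scoring.utils import gzip_with_encoding

    # Read a gzip-compressed CSV in a non-ASCII encoding and get decoded
    # text back, the same way reader.py opens .gz datasets in text mode.
    with gzip_with_encoding('dataset.csv.gz', 'r', encoding='shift_jis') as dfile:
        header_line = dfile.readline()   # unicode string, not raw bytes
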
18 changes: 14 additions & 4 deletions datarobot_batch_scoring/writer.py
@@ -38,7 +38,9 @@ class RunContext(object):
def __init__(self, n_samples, out_file, pid, lid, keep_cols,
n_retry, delimiter, dataset, pred_name, ui, file_context,
fast_mode, encoding, skip_row_id, output_delimiter,
pred_threshold_name, pred_decision_name):
pred_threshold_name, pred_decision_name,
max_prediction_explanations,
):
self.n_samples = n_samples
self.out_file = out_file
self.project_id = pid
@@ -63,13 +65,16 @@ def __init__(self, n_samples, out_file, pid, lid, keep_cols,
self.writer_dialect = csv.get_dialect('writer_dialect')
self.scoring_succeeded = False # Removes shelves when True
self.is_open = False # Removes shelves when True
self.max_prediction_explanations = max_prediction_explanations

@classmethod
def create(cls, resume, n_samples, out_file, pid, lid,
keep_cols, n_retry,
delimiter, dataset, pred_name, ui,
fast_mode, encoding, skip_row_id, output_delimiter,
pred_threshold_name, pred_decision_name):
pred_threshold_name, pred_decision_name,
max_prediction_explanations,
):
"""Factory method for run contexts.
Either resume or start a new one.
@@ -85,7 +90,8 @@ def create(cls, resume, n_samples, out_file, pid, lid,
return ctx_class(n_samples, out_file, pid, lid, keep_cols, n_retry,
delimiter, dataset, pred_name, ui, file_context,
fast_mode, encoding, skip_row_id, output_delimiter,
pred_threshold_name, pred_decision_name)
pred_threshold_name, pred_decision_name,
max_prediction_explanations)

def __enter__(self):
assert(not self.is_open)
@@ -439,7 +445,11 @@ def process_response(self):
keep_cols=self.ctx.keep_cols,
skip_row_id=self.ctx.skip_row_id,
fast_mode=self.ctx.fast_mode,
delimiter=self.ctx.dialect.delimiter)
delimiter=self.ctx.dialect.delimiter,
max_prediction_explanations=(
self.ctx.max_prediction_explanations
)
)
except UnexpectedKeptColumnCount:
self._ui.fatal('Unexpected number of kept columns ' +
'retrieved. This can happen in ' +
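
The writer forwards the new ``max_prediction_explanations`` value to the response handler through keyword options, matching ``format_data(result, batch, **opts)`` in pred_api_v10.py. A stripped-down sketch of that pattern (the result payload and the limit value are placeholders)::

    def format_data(result, batch, **opts):
        # Handler side: read the limit from the keyword options; None
        # means no explanations were requested.
        max_explanations = opts.get('max_prediction_explanations')
        return len(result), max_explanations

    # Writer side: pass the value carried on the run context as a keyword.
    class FakeContext(object):
        max_prediction_explanations = 3

    ctx = FakeContext()
    print(format_data([{'rowId': 0}], batch=None,
                      max_prediction_explanations=ctx.max_prediction_explanations))
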
11 changes: 11 additions & 0 deletions tests/fixtures/10kDiabetes_mixed_explanations.csv
@@ -0,0 +1,11 @@
row_id,0,1,explanation_1_feature,explanation_1_strength,explanation_2_feature,explanation_2_strength,explanation_3_feature,explanation_3_strength,explanation_4_feature,explanation_4_strength,explanation_5_feature,explanation_5_strength
0,0.7374983461,0.2625016539,medical_specialty,-0.2230822974,number_diagnoses,0.2010719684,diag_1,-0.1882141621,number_inpatient,-0.1584939956,,
1,0.7534670614,0.2465329386,weight,0.4616938769,diag_2,-0.2817732676,payer_code,-0.1999593278,age,-0.1700208122,num_lab_procedures,-0.1699786261
2,0.673941752,0.326058248,discharge_disposition_id,0.3047681596,number_inpatient,0.2266360201,medical_specialty,-0.2049645033,,,,
3,0.884682017,0.115317983,number_diagnoses,-0.4503520826,diag_2,-0.2808908801,admission_source_id,-0.2398130772,payer_code,-0.2143588931,number_inpatient,-0.1707111743
4,0.7116849878,0.2883150122,medical_specialty,-0.1814823805,race,-0.1575075199,number_inpatient,-0.1568230769,admission_source_id,0.1354087199,,
5,0.5909996414,0.4090003586,,,,,,,,,,
6,0.8647944819,0.1352055181,discharge_disposition_id,-1.1295014348,medical_specialty,-0.2076687977,number_inpatient,-0.1461341051,payer_code,-0.1242433687,admission_type_id,0.0949499145
7,0.5144554583,0.4855445417,number_inpatient,0.2826505121,diag_1,-0.2274929185,number_emergency,0.2071603004,number_diagnoses,0.1620381118,discharge_disposition_id,0.1560600053
8,0.3746314486,0.6253685514,admission_type_id,0.5009928124,number_inpatient,0.2401329203,medical_specialty,-0.1651539909,admission_source_id,-0.1571784835,number_diagnoses,0.1377384773
9,0.7117755608,0.2882244392,medical_specialty,-0.3821848966,number_diagnoses,-0.273826215,diag_3,0.229366976,number_inpatient,-0.1604879998,,