This repository has been archived by the owner on Jan 15, 2024. It is now read-only.

[Numpy] Add "match_tokens_with_char_spans" + Enable downloading from S3 + Add Ubuntu test #1249

Merged (18 commits) on Jun 16, 2020
Changes from 2 commits
29 changes: 10 additions & 19 deletions scripts/question_answering/squad_utils.py
@@ -2,7 +2,6 @@
from typing import Optional, List
from collections import namedtuple
import itertools
import bisect
import re
import numpy as np
import numpy.ma as ma
@@ -12,6 +11,7 @@
import json
import string
from gluonnlp.data.tokenizers import BaseTokenizerWithVocab
from gluonnlp.utils.preprocessing import match_tokens_with_char_spans
from typing import Tuple
from mxnet.gluon.utils import download

@@ -389,34 +389,25 @@ def convert_squad_example_to_feature(example: SquadExample,
gt_span_start_pos, gt_span_end_pos = None, None
token_answer_mismatch = False
unreliable_span = False
np_offsets = np.array(offsets)
if is_training and not example.is_impossible:
assert example.start_position >= 0 and example.end_position >= 0
# From the offsets, we locate the first offset that contains start_pos and the last offset
# that contains end_pos, i.e.
# offsets[lower_idx][0] <= start_pos < offsets[lower_idx][1]
# offsets[upper_idx][0] < end_pos <= offsets[upper_idx][1]
# We convert the character-level offsets to token-level offsets
# Also, if the answer after tokenization + detokenization is not the same as the original
# answer,
offsets_lower = [offset[0] for offset in offsets]
offsets_upper = [offset[1] for offset in offsets]
# answer, we try to localize the answer text and do a rematch
candidates = [(example.start_position, example.end_position)]
all_possible_start_pos = {example.start_position}
find_all_candidates = False
lower_idx, upper_idx = None, None
first_lower_idx, first_upper_idx = None, None
while len(candidates) > 0:
start_position, end_position = candidates.pop()
if end_position > offsets_upper[-1] or start_position < offsets_lower[0]:
# Detect the out-of-boundary case
warnings.warn('The selected answer is not covered by the tokens! '
'Use the end_position. '
'qas_id={}, context_text={}, start_pos={}, end_pos={}, '
'offsets={}'.format(example.qas_id, context_text,
start_position, end_position, offsets))
end_position = min(offsets_upper[-1], end_position)
start_position = max(offsets_upper[0], start_position)
lower_idx = bisect.bisect(offsets_lower, start_position) - 1
upper_idx = bisect.bisect_left(offsets_upper, end_position)
# Match the token offsets
token_start_ends = match_tokens_with_char_spans(np_offsets,
np.array([[start_position,
end_position]]))
lower_idx = int(token_start_ends[0][0])
upper_idx = int(token_start_ends[0][1])
if not find_all_candidates:
first_lower_idx = lower_idx
first_upper_idx = upper_idx
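For context, this change in squad_utils.py swaps the two bisect lookups for a single vectorized call to the new helper, which also clamps out-of-boundary spans. A minimal sketch of the equivalence, using made-up offsets and answer positions rather than values from the PR:

import bisect
import numpy as np
from gluonnlp.utils.preprocessing import match_tokens_with_char_spans

# Hypothetical character offsets of four tokens and one answer span
offsets = [(0, 5), (6, 11), (12, 17), (18, 25)]
start_position, end_position = 7, 16

# Old approach: two binary searches over the start/end columns
offsets_lower = [offset[0] for offset in offsets]
offsets_upper = [offset[1] for offset in offsets]
lower_idx = bisect.bisect(offsets_lower, start_position) - 1   # -> 1
upper_idx = bisect.bisect_left(offsets_upper, end_position)    # -> 2

# New approach: one call that handles a whole batch of spans at once
token_start_ends = match_tokens_with_char_spans(
    np.array(offsets), np.array([[start_position, end_position]]))
assert (lower_idx, upper_idx) == tuple(int(x) for x in token_start_ends[0])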
2 changes: 1 addition & 1 deletion src/gluonnlp/utils/__init__.py
@@ -1,7 +1,7 @@
from . import config
from . import lazy_imports
from . import misc
from . import preprocessing
from . import registry
from . import testing
from .parameter import *
from .misc import *
15 changes: 14 additions & 1 deletion src/gluonnlp/utils/lazy_imports.py
@@ -23,7 +23,8 @@
'try_import_scipy',
'try_import_mwparserfromhell',
'try_import_fasttext',
'try_import_langid']
'try_import_langid',
'try_import_boto3']


def try_import_sentencepiece():
@@ -132,3 +133,15 @@ def try_import_langid():
raise ImportError('"langid" is not installed. You must install langid in order to use the'
' functionality. You may try to use `pip install langid`.')
return langid


def try_import_boto3():
try:
import boto3
except ImportError:
raise ImportError('"boto3" is not installed. To enable fast downloading in EC2. You should '
'install boto3 and correctly configure the S3. '
'See https://github.com/facebookresearch/fastText for more information. '
'If you are using EC2, downloading from s3:// will '
'be multiple times faster than using the traditional http/https URL.')
return boto3
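Like the other try_import_* helpers, the new function is meant to be called lazily at the point of use. A short usage sketch, with a placeholder bucket and key:

from gluonnlp.utils.lazy_imports import try_import_boto3

boto3 = try_import_boto3()   # raises a descriptive ImportError if boto3 is missing
s3 = boto3.resource('s3')
# e.g. check an object's size before downloading (placeholder bucket/key)
head = s3.meta.client.head_object(Bucket='my-bucket', Key='models/model.params')
print(head['ContentLength'])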
65 changes: 48 additions & 17 deletions src/gluonnlp/utils/misc.py
@@ -16,10 +16,15 @@
import tqdm
except ImportError:
tqdm = None
from .lazy_imports import try_import_boto3
from mxnet.gluon.utils import shape_is_known, replace_file
from collections import OrderedDict
import glob as _glob


S3_PREFIX = 's3://'


def glob(url, separator=','):
"""Return a list of paths matching a pathname pattern.

@@ -396,6 +401,15 @@ def download(url: str,
fname
The file path of the downloaded file.
"""
is_s3 = url.startswith(S3_PREFIX)
if is_s3:
boto3 = try_import_boto3()
s3 = boto3.resource('s3')
components = url[len(S3_PREFIX):].split('/')
if len(components) < 2:
raise ValueError('Invalid S3 url. Received url={}'.format(url))
s3_bucket_name = components[0]
s3_key = '/'.join(components[1:])
if path is None:
fname = url.split('/')[-1]
# Empty filenames are invalid
@@ -424,23 +438,40 @@
# pylint: disable=W0703
try:
print('Downloading {} from {}...'.format(fname, url))
r = requests.get(url, stream=True, verify=verify_ssl)
if r.status_code != 200:
raise RuntimeError('Failed downloading url {}'.format(url))
# create uuid for temporary files
random_uuid = str(uuid.uuid4())
total_size = int(r.headers.get('content-length', 0))
chunk_size = 1024
if tqdm is not None:
t = tqdm.tqdm(total=total_size, unit='iB', unit_scale=True)
with open('{}.{}'.format(fname, random_uuid), 'wb') as f:
for chunk in r.iter_content(chunk_size=chunk_size):
if chunk: # filter out keep-alive new chunks
if tqdm is not None:
t.update(len(chunk))
f.write(chunk)
if tqdm is not None:
t.close()
if is_s3:
response = s3.meta.client.head_object(Bucket=s3_bucket_name,
Key=s3_key)
total_size = int(response.get('ContentLength', 0))
random_uuid = str(uuid.uuid4())
tmp_path = '{}.{}'.format(fname, random_uuid)
if tqdm is not None:
def hook(t_obj):
def inner(bytes_amount):
t_obj.update(bytes_amount)
return inner
with tqdm.tqdm(total=total_size, unit='iB', unit_scale=True) as t:
s3.meta.client.download_file(s3_bucket_name, s3_key, tmp_path,
Callback=hook(t))
else:
s3.meta.client.download_file(s3_bucket_name, s3_key, tmp_path)
else:
r = requests.get(url, stream=True, verify=verify_ssl)
if r.status_code != 200:
raise RuntimeError('Failed downloading url {}'.format(url))
# create uuid for temporary files
random_uuid = str(uuid.uuid4())
total_size = int(r.headers.get('content-length', 0))
chunk_size = 1024
if tqdm is not None:
t = tqdm.tqdm(total=total_size, unit='iB', unit_scale=True)
with open('{}.{}'.format(fname, random_uuid), 'wb') as f:
for chunk in r.iter_content(chunk_size=chunk_size):
if chunk: # filter out keep-alive new chunks
if tqdm is not None:
t.update(len(chunk))
f.write(chunk)
if tqdm is not None:
t.close()
# if the target file already exists (created by another process)
# and has the same hash as the downloaded file,
# delete the temporary file
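With this change, download accepts s3:// URLs in addition to http(s) ones: the S3 branch requires boto3 and configured AWS credentials, while every other URL still goes through the requests-based path. A usage sketch with placeholder URLs:

from gluonnlp.utils.misc import download

# http(s) URLs keep using the requests-based code path
download('https://example.com/vocab.json', path='vocab.json')

# s3:// URLs are split into a bucket/key pair and fetched via boto3
download('s3://my-bucket/models/model.params', path='model.params')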
59 changes: 59 additions & 0 deletions src/gluonnlp/utils/preprocessing.py
@@ -51,3 +51,62 @@ def get_trimmed_lengths(lengths: List[int],
return trimmed_lengths
else:
return np.minimum(lengths, max_length)


def match_tokens_with_char_spans(token_offsets: np.ndarray,
spans: np.ndarray) -> np.ndarray:
"""Match the span offsets with the character-level offsets.

For each span, we perform the following:

1: Clip the span to the token boundaries

span[0] = max(span[0], token_offsets[0, 0])
span[1] = min(span[1], token_offsets[-1, 1])

2: Find start + end

We try to select the smallest number of tokens that covers the span, i.e.,
we find start and end such that tokens[start:end + 1] covers the span.

We will use the following algorithm:

For "start", we search for
token_offsets[start, 0] <= span[0] < token_offsets[start + 1, 0]

For "end", we search for:
token_offsets[end - 1, 1] < span[1] <= token_offsets[end, 1]

Parameters
----------
token_offsets
The offsets of the input tokens. Must be sorted.
That is, it will satisfy
1. token_offsets[i][0] <= token_offsets[i][1]
2. token_offsets[i][0] <= token_offsets[i + 1][0]
3. token_offsets[i][1] <= token_offsets[i + 1][1]
Shape (#num_tokens, 2)
spans
The character-level offsets (begin/end) of the selected spans.
Shape (#spans, 2)

Returns
-------
token_start_ends
The token-level starts and ends. The end index is inclusive, i.e., tokens[start:end + 1] covers the span.
Shape (#spans, 2)
"""
offsets_starts = token_offsets[:, 0]
offsets_ends = token_offsets[:, 1]
span_char_starts = spans[:, 0]
span_char_ends = spans[:, 1]

# Truncate the span
span_char_starts = np.maximum(offsets_starts[0], span_char_starts)
span_char_ends = np.minimum(offsets_ends[-1], span_char_ends)

# Search for valid start + end
span_token_starts = np.searchsorted(offsets_starts, span_char_starts, side='right') - 1
span_token_ends = np.searchsorted(offsets_ends, span_char_ends, side='left')
return np.concatenate((np.expand_dims(span_token_starts, axis=-1),
np.expand_dims(span_token_ends, axis=-1)), axis=-1)
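To make the inclusive-end convention of the returned indices concrete, here is a small sketch with invented tokens and offsets:

import numpy as np
from gluonnlp.utils.preprocessing import match_tokens_with_char_spans

# Invented tokenization of "GluonNLP is great" with character-level offsets
tokens = ['GluonNLP', 'is', 'great']
token_offsets = np.array([[0, 8], [9, 11], [12, 17]])

# Character span covering "is great"
span = np.array([[9, 17]])
start, end = match_tokens_with_char_spans(token_offsets, span)[0]
print(tokens[start:end + 1])   # ['is', 'great'] -- the end index is inclusive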
24 changes: 23 additions & 1 deletion tests/test_utils_preprocessing.py
@@ -1,6 +1,7 @@
import pytest
import numpy as np
from numpy.testing import assert_allclose
from gluonnlp.utils.preprocessing import get_trimmed_lengths
from gluonnlp.utils.preprocessing import get_trimmed_lengths, match_tokens_with_char_spans


def test_get_trimmed_lengths():
@@ -15,3 +16,24 @@ def test_get_trimmed_lengths():
max_length=max_length,
do_merge=do_merge)
assert_allclose(trimmed_lengths, np.array(gt_trimmed_lengths))


def test_match_tokens_with_char_spans():
token_offsets = np.array([(0, 1), (1, 2), (3, 4), (5, 6)])
spans = np.array([(0, 3), (4, 6)])
out = match_tokens_with_char_spans(token_offsets, spans)
assert_allclose(out, np.array([[0, 2],
[2, 3]]))

token_offsets = np.array([(5, 10), (10, 20), (20, 25), (26, 30)])
spans = np.array([(0, 3), (4, 6), (10, 30),
(22, 23), (15, 25),
(10, 35), (36, 38)])
out = match_tokens_with_char_spans(token_offsets, spans)
assert_allclose(out, np.array([[0, 0],
[0, 0],
[1, 3],
[2, 2],
[1, 2],
[1, 3],
[3, 3]]))
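The new cases can be run on their own with pytest, e.g. python -m pytest tests/test_utils_preprocessing.py -k match_tokens_with_char_spans.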