This repository has been archived by the owner on Jan 15, 2024. It is now read-only.

[Numpy] Add "match_tokens_with_char_spans" + Enable downloading from S3 + Add Ubuntu test #1249

Merged (18 commits) on Jun 16, 2020
Changes from 2 commits
29 changes: 10 additions & 19 deletions scripts/question_answering/squad_utils.py
@@ -2,7 +2,6 @@
from typing import Optional, List
from collections import namedtuple
import itertools
import bisect
import re
import numpy as np
import numpy.ma as ma
@@ -12,6 +11,7 @@
import json
import string
from gluonnlp.data.tokenizers import BaseTokenizerWithVocab
from gluonnlp.utils.preprocessing import match_tokens_with_char_spans
from typing import Tuple
from mxnet.gluon.utils import download

@@ -389,34 +389,25 @@ def convert_squad_example_to_feature(example: SquadExample,
gt_span_start_pos, gt_span_end_pos = None, None
token_answer_mismatch = False
unreliable_span = False
np_offsets = np.array(offsets)
if is_training and not example.is_impossible:
assert example.start_position >= 0 and example.end_position >= 0
# From the offsets, we locate the first offset that contains start_pos and the last offset
# that contains end_pos, i.e.
# offsets[lower_idx][0] <= start_pos < offsets[lower_idx][1]
# offsets[upper_idx][0] < end_pos <= offsets[upper_idx][1]
# We convert the character-level offsets to token-level offsets
# Also, if the answer after tokenization + detokenization is not the same as the original
# answer,
offsets_lower = [offset[0] for offset in offsets]
offsets_upper = [offset[1] for offset in offsets]
# answer, we try to localize the answer text and do a rematch
candidates = [(example.start_position, example.end_position)]
all_possible_start_pos = {example.start_position}
find_all_candidates = False
lower_idx, upper_idx = None, None
first_lower_idx, first_upper_idx = None, None
while len(candidates) > 0:
start_position, end_position = candidates.pop()
if end_position > offsets_upper[-1] or start_position < offsets_lower[0]:
# Detect the out-of-boundary case
warnings.warn('The selected answer is not covered by the tokens! '
'Use the end_position. '
'qas_id={}, context_text={}, start_pos={}, end_pos={}, '
'offsets={}'.format(example.qas_id, context_text,
start_position, end_position, offsets))
end_position = min(offsets_upper[-1], end_position)
start_position = max(offsets_upper[0], start_position)
lower_idx = bisect.bisect(offsets_lower, start_position) - 1
upper_idx = bisect.bisect_left(offsets_upper, end_position)
# Match the token offsets
token_start_ends = match_tokens_with_char_spans(np_offsets,
np.array([[start_position,
end_position]]))
lower_idx = int(token_start_ends[0][0])
upper_idx = int(token_start_ends[0][1])
if not find_all_candidates:
first_lower_idx = lower_idx
first_upper_idx = upper_idx
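For context, this change in squad_utils.py swaps the two bisect lookups for a single vectorized call to the new helper, which also clamps out-of-boundary spans. A minimal sketch of the equivalence, using made-up offsets and answer positions rather than values from the PR:

import bisect
import numpy as np
from gluonnlp.utils.preprocessing import match_tokens_with_char_spans

# Hypothetical character offsets of four tokens and one answer span
offsets = [(0, 5), (6, 11), (12, 17), (18, 25)]
start_position, end_position = 7, 16

# Old approach: two binary searches over the start/end columns
offsets_lower = [offset[0] for offset in offsets]
offsets_upper = [offset[1] for offset in offsets]
lower_idx = bisect.bisect(offsets_lower, start_position) - 1   # -> 1
upper_idx = bisect.bisect_left(offsets_upper, end_position)    # -> 2

# New approach: one call that handles a whole batch of spans at once
token_start_ends = match_tokens_with_char_spans(
    np.array(offsets), np.array([[start_position, end_position]]))
assert (lower_idx, upper_idx) == tuple(int(x) for x in token_start_ends[0])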
2 changes: 1 addition & 1 deletion src/gluonnlp/utils/__init__.py
@@ -1,7 +1,7 @@
from . import config
from . import lazy_imports
from . import misc
from . import preprocessing
from . import registry
from . import testing
from .parameter import *
from .misc import *
15 changes: 14 additions & 1 deletion src/gluonnlp/utils/lazy_imports.py
@@ -23,7 +23,8 @@
'try_import_scipy',
'try_import_mwparserfromhell',
'try_import_fasttext',
'try_import_langid']
'try_import_langid',
'try_import_boto3']


def try_import_sentencepiece():
@@ -132,3 +133,15 @@ def try_import_langid():
raise ImportError('"langid" is not installed. You must install langid in order to use the'
' functionality. You may try to use `pip install langid`.')
return langid


def try_import_boto3():
try:
import boto3
except ImportError:
raise ImportError('"boto3" is not installed. To enable fast downloading in EC2. You should '
'install boto3 and correctly configure the S3. '
'See https://github.com/facebookresearch/fastText for more information. '
'If you are using EC2, downloading from s3:// will '
'be multiple times faster than using the traditional http/https URL.')
return boto3
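Like the other try_import_* helpers, the new function is meant to be called lazily at the point of use. A short usage sketch, with a placeholder bucket and key:

from gluonnlp.utils.lazy_imports import try_import_boto3

boto3 = try_import_boto3()   # raises a descriptive ImportError if boto3 is missing
s3 = boto3.resource('s3')
# e.g. check an object's size before downloading (placeholder bucket/key)
head = s3.meta.client.head_object(Bucket='my-bucket', Key='models/model.params')
print(head['ContentLength'])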
65 changes: 48 additions & 17 deletions src/gluonnlp/utils/misc.py
@@ -16,10 +16,15 @@
import tqdm
except ImportError:
tqdm = None
from .lazy_imports import try_import_boto3
from mxnet.gluon.utils import shape_is_known, replace_file
from collections import OrderedDict
import glob as _glob


S3_PREFIX = 's3://'


def glob(url, separator=','):
"""Return a list of paths matching a pathname pattern.

@@ -396,6 +401,15 @@ def download(url: str,
fname
The file path of the downloaded file.
"""
is_s3 = url.startswith(S3_PREFIX)
if is_s3:
boto3 = try_import_boto3()
s3 = boto3.resource('s3')
components = url[len(S3_PREFIX):].split('/')
if len(components) < 2:
raise ValueError('Invalid S3 url. Received url={}'.format(url))
s3_bucket_name = components[0]
s3_key = '/'.join(components[1:])
if path is None:
fname = url.split('/')[-1]
# Empty filenames are invalid
@@ -424,23 +438,40 @@
# pylint: disable=W0703
try:
print('Downloading {} from {}...'.format(fname, url))
r = requests.get(url, stream=True, verify=verify_ssl)
if r.status_code != 200:
raise RuntimeError('Failed downloading url {}'.format(url))
# create uuid for temporary files
random_uuid = str(uuid.uuid4())
total_size = int(r.headers.get('content-length', 0))
chunk_size = 1024
if tqdm is not None:
t = tqdm.tqdm(total=total_size, unit='iB', unit_scale=True)
with open('{}.{}'.format(fname, random_uuid), 'wb') as f:
for chunk in r.iter_content(chunk_size=chunk_size):
if chunk: # filter out keep-alive new chunks
if tqdm is not None:
t.update(len(chunk))
f.write(chunk)
if tqdm is not None:
t.close()
if is_s3:
response = s3.meta.client.head_object(Bucket=s3_bucket_name,
Key=s3_key)
total_size = int(response.get('ContentLength', 0))
random_uuid = str(uuid.uuid4())
tmp_path = '{}.{}'.format(fname, random_uuid)
if tqdm is not None:
def hook(t_obj):
def inner(bytes_amount):
t_obj.update(bytes_amount)
return inner
with tqdm.tqdm(total=total_size, unit='iB', unit_scale=True) as t:
s3.meta.client.download_file(s3_bucket_name, s3_key, tmp_path,
Callback=hook(t))
else:
s3.meta.client.download_file(s3_bucket_name, s3_key, tmp_path)
else:
r = requests.get(url, stream=True, verify=verify_ssl)
if r.status_code != 200:
raise RuntimeError('Failed downloading url {}'.format(url))
# create uuid for temporary files
random_uuid = str(uuid.uuid4())
total_size = int(r.headers.get('content-length', 0))
chunk_size = 1024
if tqdm is not None:
t = tqdm.tqdm(total=total_size, unit='iB', unit_scale=True)
with open('{}.{}'.format(fname, random_uuid), 'wb') as f:
for chunk in r.iter_content(chunk_size=chunk_size):
if chunk: # filter out keep-alive new chunks
if tqdm is not None:
t.update(len(chunk))
f.write(chunk)
if tqdm is not None:
t.close()
# if the target file already exists (created by another process)
# and has the same hash as the downloaded file,
# delete the temporary file
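With this change, download accepts s3:// URLs in addition to http(s) ones: the S3 branch requires boto3 and configured AWS credentials, while every other URL still goes through the requests-based path. A usage sketch with placeholder URLs:

from gluonnlp.utils.misc import download

# http(s) URLs keep using the requests-based code path
download('https://example.com/vocab.json', path='vocab.json')

# s3:// URLs are split into a bucket/key pair and fetched via boto3
download('s3://my-bucket/models/model.params', path='model.params')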
59 changes: 59 additions & 0 deletions src/gluonnlp/utils/preprocessing.py
@@ -51,3 +51,62 @@ def get_trimmed_lengths(lengths: List[int],
return trimmed_lengths
else:
return np.minimum(lengths, max_length)


def match_tokens_with_char_spans(token_offsets: np.ndarray,
spans: np.ndarray) -> np.ndarray:
"""Match the span offsets with the character-level offsets.

For each span, we perform the following:

1: Clip the span to the token boundaries

span[0] = max(span[0], token_offsets[0, 0])
span[1] = min(span[1], token_offsets[-1, 1])

2: Find start + end

We try to select the smallest number of tokens that covers the span, i.e.,
we find start and end such that tokens[start:end + 1] covers the span.

We will use the following algorithm:

For "start", we search for
token_offsets[start, 0] <= span[0] < token_offsets[start + 1, 0]

For "end", we search for:
token_offsets[end - 1, 1] < span[1] <= token_offsets[end, 1]

Parameters
----------
token_offsets
The offsets of the input tokens. Must be sorted.
That is, it will satisfy
1. token_offsets[i][0] <= token_offsets[i][1]
2. token_offsets[i][0] <= token_offsets[i + 1][0]
3. token_offsets[i][1] <= token_offsets[i + 1][1]
Shape (#num_tokens, 2)
spans
The character-level offsets (begin/end) of the selected spans.
Shape (#spans, 2)

Returns
-------
token_start_ends
The token-level starts and ends. The end index is inclusive, i.e., tokens[start:end + 1] covers the span.
Shape (#spans, 2)
"""
offsets_starts = token_offsets[:, 0]
offsets_ends = token_offsets[:, 1]
span_char_starts = spans[:, 0]
span_char_ends = spans[:, 1]

# Truncate the span
span_char_starts = np.maximum(offsets_starts[0], span_char_starts)
span_char_ends = np.minimum(offsets_ends[-1], span_char_ends)

# Search for valid start + end
span_token_starts = np.searchsorted(offsets_starts, span_char_starts, side='right') - 1
span_token_ends = np.searchsorted(offsets_ends, span_char_ends, side='left')
return np.concatenate((np.expand_dims(span_token_starts, axis=-1),
np.expand_dims(span_token_ends, axis=-1)), axis=-1)
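To make the inclusive-end convention of the returned indices concrete, here is a small sketch with invented tokens and offsets:

import numpy as np
from gluonnlp.utils.preprocessing import match_tokens_with_char_spans

# Invented tokenization of "GluonNLP is great" with character-level offsets
tokens = ['GluonNLP', 'is', 'great']
token_offsets = np.array([[0, 8], [9, 11], [12, 17]])

# Character span covering "is great"
span = np.array([[9, 17]])
start, end = match_tokens_with_char_spans(token_offsets, span)[0]
print(tokens[start:end + 1])   # ['is', 'great'] -- the end index is inclusive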
24 changes: 23 additions & 1 deletion tests/test_utils_preprocessing.py
@@ -1,6 +1,7 @@
import pytest
import numpy as np
from numpy.testing import assert_allclose
from gluonnlp.utils.preprocessing import get_trimmed_lengths
from gluonnlp.utils.preprocessing import get_trimmed_lengths, match_tokens_with_char_spans


def test_get_trimmed_lengths():
@@ -15,3 +16,24 @@ def test_get_trimmed_lengths():
max_length=max_length,
do_merge=do_merge)
assert_allclose(trimmed_lengths, np.array(gt_trimmed_lengths))


def test_match_tokens_with_char_spans():
token_offsets = np.array([(0, 1), (1, 2), (3, 4), (5, 6)])
spans = np.array([(0, 3), (4, 6)])
out = match_tokens_with_char_spans(token_offsets, spans)
assert_allclose(out, np.array([[0, 2],
[2, 3]]))

token_offsets = np.array([(5, 10), (10, 20), (20, 25), (26, 30)])
spans = np.array([(0, 3), (4, 6), (10, 30),
(22, 23), (15, 25),
(10, 35), (36, 38)])
out = match_tokens_with_char_spans(token_offsets, spans)
assert_allclose(out, np.array([[0, 0],
[0, 0],
[1, 3],
[2, 2],
[1, 2],
[1, 3],
[3, 3]]))
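The new cases can be run on their own with pytest, e.g. python -m pytest tests/test_utils_preprocessing.py -k match_tokens_with_char_spans.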