From 28208021bbe8a442db59f8ca378936e28d84bc0c Mon Sep 17 00:00:00 2001 From: roll Date: Mon, 12 Sep 2016 09:42:09 +0300 Subject: [PATCH 1/4] added test to highlight make_valid_url bug --- tests/test_utilities.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_utilities.py b/tests/test_utilities.py index 89f567f64c..3b0239a1c8 100644 --- a/tests/test_utilities.py +++ b/tests/test_utilities.py @@ -31,3 +31,7 @@ def test_make_valid_url(self): 'data_url=http://data.defra.gov.uk/ops/government_procurement_card/over_£500_GPC_apr_2013.csv') assertion = '£' not in utilities.helpers.make_valid_url(url) self.assertTrue(assertion) + + def test_make_valid_url_dont_break_query(self): + url = 'http://next.openspending.org/fdp-adapter/convert?url=https%3A%2F%2Fraw.githubusercontent.com%2Fkravets-levko%2Fdata%2Fmaster%2Ftest.xlsx.csv' + self.assertEqual(utilities.helpers.make_valid_url(url), url) From 233d2159ba9290e8df34132e28e77b8bfb19585b Mon Sep 17 00:00:00 2001 From: roll Date: Mon, 12 Sep 2016 10:04:52 +0300 Subject: [PATCH 2/4] fixed make_valid_url --- goodtables/utilities/helpers.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/goodtables/utilities/helpers.py b/goodtables/utilities/helpers.py index a90aa0c20a..5dc3160cac 100644 --- a/goodtables/utilities/helpers.py +++ b/goodtables/utilities/helpers.py @@ -6,6 +6,7 @@ import os import io +import re import json import inspect import requests @@ -202,6 +203,7 @@ def validate_handler(handler, argcount=1): if not len(spec[0]) == argcount: raise exceptions.InvalidHandlerError + def make_valid_url(url): """Make sure url doesn't contain unsupported characters @@ -214,8 +216,12 @@ def make_valid_url(url): return (glue).join(quoted) scheme, netloc, path, query, fragment = compat.urlsplit(url) - quoted_path = compat.quote(path.encode('utf-8')) - quoted_query = compat.quote_plus(query.encode('utf-8')) - new_url_tuple = (scheme, netloc, quoted_path, quoted_query, fragment) + encoded_path = url_encode_non_ascii(path) + encoded_query = url_encode_non_ascii(query) + new_url_tuple = (scheme, netloc, encoded_path, encoded_query, fragment) quoted_url = compat.urlunsplit(new_url_tuple) return quoted_url + + +def url_encode_non_ascii(text): + return re.sub('[\x80-\xFF]', lambda c: '%%%02x' % ord(c.group(0)), text) From 4d2e10f7172662d43c7bb058e809c67afc86ece1 Mon Sep 17 00:00:00 2001 From: roll Date: Mon, 12 Sep 2016 10:20:54 +0300 Subject: [PATCH 3/4] added comment --- goodtables/utilities/helpers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/goodtables/utilities/helpers.py b/goodtables/utilities/helpers.py index 5dc3160cac..a2f1558359 100644 --- a/goodtables/utilities/helpers.py +++ b/goodtables/utilities/helpers.py @@ -224,4 +224,5 @@ def make_valid_url(url): def url_encode_non_ascii(text): + # http://stackoverflow.com/questions/4389572/how-to-fetch-a-non-ascii-url-with-python-urlopen return re.sub('[\x80-\xFF]', lambda c: '%%%02x' % ord(c.group(0)), text) From f072858cf19094038d1ec824ae7e3c3e18e807d5 Mon Sep 17 00:00:00 2001 From: roll Date: Mon, 12 Sep 2016 12:22:56 +0300 Subject: [PATCH 4/4] fixed make_valid_url encoding bug --- goodtables/utilities/helpers.py | 15 ++++++++++----- tests/test_utilities.py | 5 +++++ 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/goodtables/utilities/helpers.py b/goodtables/utilities/helpers.py index a2f1558359..962e4f8d54 100644 --- a/goodtables/utilities/helpers.py +++ b/goodtables/utilities/helpers.py @@ -216,13 +216,18 @@ def make_valid_url(url): return (glue).join(quoted) scheme, netloc, path, query, fragment = compat.urlsplit(url) - encoded_path = url_encode_non_ascii(path) - encoded_query = url_encode_non_ascii(query) - new_url_tuple = (scheme, netloc, encoded_path, encoded_query, fragment) + path = url_encode_non_ascii(path) + query = url_encode_non_ascii(query) + new_url_tuple = (scheme, netloc, path, query, fragment) quoted_url = compat.urlunsplit(new_url_tuple) return quoted_url -def url_encode_non_ascii(text): +def url_encode_non_ascii(element): # http://stackoverflow.com/questions/4389572/how-to-fetch-a-non-ascii-url-with-python-urlopen - return re.sub('[\x80-\xFF]', lambda c: '%%%02x' % ord(c.group(0)), text) + pattern = '[\x80-\xFF]'.encode('utf-8') + element = element.encode('utf-8') + replace = lambda char: ('%%%02x' % ord(char.group(0))).encode('utf-8') + element = re.sub(pattern, replace, element) + element = element.decode('utf-8') + return element diff --git a/tests/test_utilities.py b/tests/test_utilities.py index 3b0239a1c8..260f5eb420 100644 --- a/tests/test_utilities.py +++ b/tests/test_utilities.py @@ -32,6 +32,11 @@ def test_make_valid_url(self): assertion = '£' not in utilities.helpers.make_valid_url(url) self.assertTrue(assertion) + def test_make_valid_url_urlencode(self): + url = 'http://data.defra.gov.uk/ops/government_procurement_card/over_£500_GPC_apr_2013.csv' + valid_url = 'http://data.defra.gov.uk/ops/government_procurement_card/over_%c2%a3500_GPC_apr_2013.csv' + self.assertEqual(utilities.helpers.make_valid_url(url), valid_url) + def test_make_valid_url_dont_break_query(self): url = 'http://next.openspending.org/fdp-adapter/convert?url=https%3A%2F%2Fraw.githubusercontent.com%2Fkravets-levko%2Fdata%2Fmaster%2Ftest.xlsx.csv' self.assertEqual(utilities.helpers.make_valid_url(url), url)