def remove_blanks(s):
    """Return ``s`` with every whitespace character removed.

    A character counts as blank when its ``isspace()`` method returns
    true, so spaces, tabs, newlines and other whitespace are dropped
    wherever they occur in the string -- not only at the two ends.

    :param s: the string (or any iterable of one-character strings) to clean
    :returns: a new string made of the non-blank characters of ``s`` in
        their original order; an empty input yields an empty string
    """
    # A generator expression feeds str.join directly, so no intermediate
    # list has to be built just to be thrown away.
    return ''.join(c for c in s if not c.isspace())