Fix bug: remove surplus whitespace in URLs

Xuetangx serves some malformed HTML. For example, the page http://www.xuetangx.com/courses/course-v1:TsinghuaX+70250023X_2015_2+sp/courseware/bfbdec0177a7466d9b6bc48c011ee401/63a0945fa22d486fa4e6976802f1f037/ contains: <a href=" /c4x/TsinghuaX/70250023X/asset/LST0-1-0___________.zip">一个有趣应用-动画文件</a></pre><p></p></o1> Because the href value begins with a space, the URL extracted from it is 'http://www.xuetangx.com /c4x/TsinghuaX/70250023X/asset/LST0-1-0___________.zip', so the space has to be removed.
coursera-dl · Jan 19, 2017 · b8c2684 · b8c2684
1 parent 8c47c43
commit b8c2684
Show file tree

Hide file tree

Showing 2 changed files with 9 additions and 2 deletions.
diff --git a/edx_dl/parsing.py b/edx_dl/parsing.py
@@ -13,7 +13,7 @@
 from bs4 import BeautifulSoup as BeautifulSoup_
 
 from .common import Course, Section, SubSection, Unit, Video
-from .utils import get_page_contents
+from .utils import get_page_contents, remove_blanks
 
 # Force use of bs4 with html5lib
 BeautifulSoup = lambda page: BeautifulSoup_(page, 'html5lib')
@@ -188,7 +188,9 @@ def extract_resources_urls(self, text, BASE_URL, file_formats):
         youtube_links = re_youtube_links.findall(text)
         resources_urls += youtube_links
 
-        return resources_urls
+        # there may be some surplus blank characters extracted from the HTML;
+        # remove them
+        return list(map(remove_blanks, resources_urls))
 
     def extract_sections_from_html(self, page, BASE_URL):
         """

diff --git a/edx_dl/utils.py b/edx_dl/utils.py
@@ -139,3 +139,8 @@ def clean_filename(s, minimal_change=False):
     s = s.strip().replace(' ', '_')
     valid_chars = '-_.()%s%s' % (string.ascii_letters, string.digits)
     return ''.join(c for c in s if c in valid_chars)
+
+
+def remove_blanks(s):
+    """Remove all blank characters from a string."""
+    return ''.join(list(filter(lambda c: not c.isspace(), s)))