Skip to content

Commit

Permalink
fix bug: remove surplus blanks in url
Browse files Browse the repository at this point in the history
Xuetangx serves some malformed HTML, like this:

<a href=" /c4x/TsinghuaX/70250023X/asset/LST0-1-0___________.zip">一个有趣应用-动画文件</a></pre><p></p></o1>

in http://www.xuetangx.com/courses/course-v1:TsinghuaX+70250023X_2015_2+sp/courseware/bfbdec0177a7466d9b6bc48c011ee401/63a0945fa22d486fa4e6976802f1f037/

from which the extracted URL is 'http://www.xuetangx.com /c4x/TsinghuaX/70250023X/asset/LST0-1-0___________.zip',
so the embedded space has to be removed before the URL can be used.
  • Loading branch information
yiwenlu66 committed Jan 19, 2017
1 parent 8c47c43 commit b8c2684
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 2 deletions.
6 changes: 4 additions & 2 deletions edx_dl/parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from bs4 import BeautifulSoup as BeautifulSoup_

from .common import Course, Section, SubSection, Unit, Video
from .utils import get_page_contents
from .utils import get_page_contents, remove_blanks

# Force use of bs4 with html5lib
def BeautifulSoup(page):
    """Parse `page` with bs4, always passing 'html5lib' as the parser."""
    return BeautifulSoup_(page, 'html5lib')
Expand Down Expand Up @@ -188,7 +188,9 @@ def extract_resources_urls(self, text, BASE_URL, file_formats):
youtube_links = re_youtube_links.findall(text)
resources_urls += youtube_links

return resources_urls
# there may be some surplus blank characters extracted from the HTML;
# remove them
return list(map(remove_blanks, resources_urls))

def extract_sections_from_html(self, page, BASE_URL):
"""
Expand Down
5 changes: 5 additions & 0 deletions edx_dl/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,3 +139,8 @@ def clean_filename(s, minimal_change=False):
s = s.strip().replace(' ', '_')
valid_chars = '-_.()%s%s' % (string.ascii_letters, string.digits)
return ''.join(c for c in s if c in valid_chars)


def remove_blanks(s):
    """
    Return `s` with every blank character removed.

    A character is considered blank when `isspace()` is true for it, so
    spaces, tabs and newlines are dropped wherever they occur in the
    string (leading, trailing or in the middle), not only at the ends.

    >>> remove_blanks('http://example.com /a b.zip')
    'http://example.com/ab.zip'
    """
    # Feed join() directly from a generator expression; no intermediate
    # list or lambda is needed.
    return ''.join(c for c in s if not c.isspace())

0 comments on commit b8c2684

Please sign in to comment.