diff --git a/edx_dl/common.py b/edx_dl/common.py
index daeb5c7..1509d03 100644
--- a/edx_dl/common.py
+++ b/edx_dl/common.py
@@ -163,6 +163,18 @@ def __init__(self, videos, resources_urls):
         self.resources_urls = resources_urls
 
 
+class WebpageUnit(Unit):
+    """
+    Representation of a webpage unit in the course.
+    Used for non-video unit types: discussion, html, problem, etc.
+    """
+    def __init__(self, content):
+        """Store the raw HTML *content* of the unit page;
+        the videos and resources_urls lists stay empty."""
+        super().__init__([], [])
+        self.content = content
+
+
 class Video(object):
     """
     Representation of a single video.
diff --git a/edx_dl/edx_dl.py b/edx_dl/edx_dl.py
index f79837b..20d72ca 100644
--- a/edx_dl/edx_dl.py
+++ b/edx_dl/edx_dl.py
@@ -39,6 +39,7 @@
     YOUTUBE_DL_CMD,
     DEFAULT_CACHE_FILENAME,
     Unit,
+    WebpageUnit,
     Video,
     ExitCode,
     DEFAULT_FILE_FORMATS,
@@ -104,6 +105,7 @@
 }
 BASE_URL = OPENEDX_SITES['edx']['url']
 EDX_HOMEPAGE = BASE_URL + '/user_api/v1/account/login_session'
+EDX_LEARN_BASE_URL = 'https://learning.edx.org/course/'
 LOGIN_API = BASE_URL + '/login_ajax'
 DASHBOARD = BASE_URL + '/dashboard'
 COURSE_LIST_API = BASE_URL + '/api/learner_home/init'
@@ -118,6 +120,7 @@ def change_openedx_site(site_name):
     Changes the openedx website for the given one via the key
     """
     global BASE_URL
+    global EDX_LEARN_BASE_URL
    global EDX_HOMEPAGE
     global LOGIN_API
     global DASHBOARD
@@ -497,7 +500,7 @@ def extract_units_from_sequential_block(block, headers, file_formats):
     json_extractor = EdXJsonExtractor()
     resp_dict = get_page_contents_as_json(url, headers)
 
-    vertical_blocks = json_extractor.extract_vertical_blocks(resp_dict)
+    vertical_blocks = json_extractor.extract_vertical_blocks(resp_dict, EDX_LEARN_BASE_URL)
     all_units = []
     for block in vertical_blocks:
         units = extract_units_from_vertical_block(block, headers, file_formats)
@@ -939,6 +942,15 @@ def skip_or_download(downloads, headers, args, f=download_url):
             f(url, filename, headers, args)
 
 
+def skip_or_save(file_path, content):
+    """
+    Save *content* to *file_path* unless the file already exists.
+    """
+    if not os.path.exists(file_path):
+        with open(file_path, 'w', encoding='utf-8') as f:
+            f.write(content)
+
+
 def download_video(video, args, target_dir, filename_prefix, headers):
     if args.prefer_cdn_videos or video.video_youtube_url is None:
         mp4_downloads = _build_url_downloads(video.mp4_urls, target_dir,
@@ -980,6 +992,9 @@ def download_unit(unit, args, target_dir, filename_prefix, headers):
     res_downloads = _build_url_downloads(unit.resources_urls, target_dir,
                                          filename_prefix)
     skip_or_download(res_downloads, headers, args)
+    if isinstance(unit, WebpageUnit):
+        file_path = os.path.join(target_dir, filename_prefix + '.html')
+        skip_or_save(file_path, unit.content)
 
 
 def download(args, selections, all_units, headers):
diff --git a/edx_dl/parsing.py b/edx_dl/parsing.py
index 57c5feb..d31da88 100644
--- a/edx_dl/parsing.py
+++ b/edx_dl/parsing.py
@@ -11,7 +11,7 @@
 from six.moves import html_parser
 from bs4 import BeautifulSoup as BeautifulSoup_
 
-from .common import Course, Section, SubSection, Block, Unit, Video
+from .common import Course, Section, SubSection, Block, Unit, WebpageUnit, Video
 
 
 # Force use of bs4 with html.parser
@@ -139,7 +139,7 @@ def _make_children(info, ids):
             chapter_blocks.append(chapter_block)
         return chapter_blocks
 
-    def extract_vertical_blocks(self, deserialized_response):
+    def extract_vertical_blocks(self, deserialized_response, EDX_LEARN_BASE_URL):
         """
         Method to extract the vertical blocks from deserialized response
         """
@@ -148,7 +148,7 @@ def extract_vertical_blocks(self, deserialized_response):
         vertical_blocks = []
         for i, item in enumerate(items, 1):
             block_id = item['id']
-            url = "https://learning.edx.org/course/" + sequential_block_id + '/' + block_id
+            url = EDX_LEARN_BASE_URL + sequential_block_id + '/' + block_id
             block = Block(position=i,
                           block_id=block_id,
                           name=item['page_title'],
@@ -537,6 +537,12 @@ def extract_units_from_html(self, page, BASE_URL, file_formats):
             unit = self.extract_unit(unit_soup, BASE_URL, file_formats)
             if len(unit.videos) > 0 or len(unit.resources_urls) > 0:
                 units.append(unit)
+
+        # If the page contains any of these block types, save the whole page
+        content_types = ['discussion', 'html', 'poll', 'problem', 'survey']
+        block_types = re.findall(r'data-block-type="(.+?)"', page)
+        if any(t in content_types for t in block_types):
+            units.append(WebpageUnit(content=page))
         return units
 
     def extract_unit(self, unit_soup, BASE_URL, file_formats):
@@ -545,10 +551,7 @@ def extract_unit(self, unit_soup, BASE_URL, file_formats):
         xblock_list = unit_soup.find_all('div', 'xblock')
         for xblock in xblock_list:
             xblock_type = xblock['data-block-type']
-            if xblock_type == 'html':
-                urls = self.extract_resources_urls(xblock, BASE_URL, file_formats)
-                resources_urls.extend(urls)
-            elif xblock_type == 'video':
+            if xblock_type == 'video':
                 video_youtube_url = None
                 available_subs_url = None
                 sub_template_url = None
@@ -577,8 +580,12 @@ def extract_unit(self, unit_soup, BASE_URL, file_formats):
                                     available_subs_url=available_subs_url,
                                     sub_template_url=sub_template_url,
                                     mp4_urls=mp4_urls))
+            else:
+                urls = self.extract_resources_urls(xblock, BASE_URL, file_formats)
+                resources_urls.extend(urls)
         return Unit(videos=videos, resources_urls=resources_urls)
 
+
     def extract_resources_urls(self, soup, BASE_URL, file_formats):
         """
         Extract resources looking for references in the webpage and
@@ -603,6 +610,7 @@ def extract_resources_urls(self, soup, BASE_URL, file_formats):
 
         return resources_urls
 
+
 def get_page_extractor(url):
     """
     factory method for page extractors
diff --git a/requirements.txt b/requirements.txt
index 09ee1da..31388b5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,4 +3,4 @@ html5lib>=1.0.1
 six>=1.11.0
 youtube_dl>=2021.12.17
 requests>=2.18.4
-tqdm~=4.66.1
+tqdm>=4.66.1