Download HTML when the unit type is problem, discussion, survey, html, or poll

csyezheng · Oct 10, 2023 · c51b8d3 · c51b8d3
1 parent 05da242
commit c51b8d3
Show file tree

Hide file tree

Showing 4 changed files with 44 additions and 9 deletions.
diff --git a/edx_dl/common.py b/edx_dl/common.py
@@ -163,6 +163,18 @@ def __init__(self, videos, resources_urls):
         self.resources_urls = resources_urls
 
 
+class WebpageUnit(Unit):
+    """
+    Representation of a course unit that is kept as a web page.
+    Used for unit types such as discussion, html, problem, etc. (not video units).
+    """
+    def __init__(self, content):
+        """Store content; the parent initializer receives two empty lists.
+        """
+        super().__init__([], [])
+        self.content = content
+
+
 class Video(object):
     """
     Representation of a single video.

diff --git a/edx_dl/edx_dl.py b/edx_dl/edx_dl.py
@@ -39,6 +39,7 @@
     YOUTUBE_DL_CMD,
     DEFAULT_CACHE_FILENAME,
     Unit,
+    WebpageUnit,
     Video,
     ExitCode,
     DEFAULT_FILE_FORMATS,
@@ -104,6 +105,7 @@
 }
 BASE_URL = OPENEDX_SITES['edx']['url']
 EDX_HOMEPAGE = BASE_URL + '/user_api/v1/account/login_session'
+EDX_LEARN_BASE_URL = 'https://learning.edx.org/course/'  # trailing '/' required: block ids are appended directly
 LOGIN_API = BASE_URL + '/login_ajax'
 DASHBOARD = BASE_URL + '/dashboard'
 COURSE_LIST_API = BASE_URL + '/api/learner_home/init'
@@ -118,6 +120,7 @@ def change_openedx_site(site_name):
     Changes the openedx website for the given one via the key
     """
     global BASE_URL
+    global EDX_LEARN_BASE_URL
     global EDX_HOMEPAGE
     global LOGIN_API
     global DASHBOARD
@@ -497,7 +500,7 @@ def extract_units_from_sequential_block(block, headers, file_formats):
 
     json_extractor = EdXJsonExtractor()
     resp_dict = get_page_contents_as_json(url, headers)
-    vertical_blocks = json_extractor.extract_vertical_blocks(resp_dict)
+    vertical_blocks = json_extractor.extract_vertical_blocks(resp_dict, EDX_LEARN_BASE_URL)
     all_units = []
     for block in vertical_blocks:
         units = extract_units_from_vertical_block(block, headers, file_formats)
@@ -939,6 +942,15 @@ def skip_or_download(downloads, headers, args, f=download_url):
         f(url, filename, headers, args)
 
 
+def skip_or_save(file_path, content):
+    """
+    Write content (text) to file_path as UTF-8 unless file_path already exists.
+    """
+    if not os.path.exists(file_path):  # never overwrite an earlier save
+        with open(file_path, 'w', encoding='utf-8') as f:
+            f.write(content)
+
+
 def download_video(video, args, target_dir, filename_prefix, headers):
     if args.prefer_cdn_videos or video.video_youtube_url is None:
         mp4_downloads = _build_url_downloads(video.mp4_urls, target_dir,
@@ -980,6 +992,9 @@ def download_unit(unit, args, target_dir, filename_prefix, headers):
     res_downloads = _build_url_downloads(unit.resources_urls, target_dir,
                                          filename_prefix)
     skip_or_download(res_downloads, headers, args)
+    if isinstance(unit, WebpageUnit):
+        file_path = os.path.join(target_dir, filename_prefix + '.html')
+        skip_or_save(file_path, unit.content)
 
 
 def download(args, selections, all_units, headers):

diff --git a/edx_dl/parsing.py b/edx_dl/parsing.py
@@ -11,7 +11,7 @@
 from six.moves import html_parser
 from bs4 import BeautifulSoup as BeautifulSoup_
 
-from .common import Course, Section, SubSection, Block, Unit, Video
+from .common import Course, Section, SubSection, Block, Unit, WebpageUnit, Video
 
 
 # Force use of bs4 with html.parser
@@ -139,7 +139,7 @@ def _make_children(info, ids):
                     chapter_blocks.append(chapter_block)
         return chapter_blocks
 
-    def extract_vertical_blocks(self, deserialized_response):
+    def extract_vertical_blocks(self, deserialized_response, EDX_LEARN_BASE_URL):
         """
         Method to extract the vertical blocks from deserialized response
         """
@@ -148,7 +148,7 @@ def extract_vertical_blocks(self, deserialized_response):
         vertical_blocks = []
         for i, item in enumerate(items, 1):
             block_id = item['id']
-            url = "https://learning.edx.org/course/" + sequential_block_id + '/' + block_id
+            url = EDX_LEARN_BASE_URL + sequential_block_id + '/' + block_id
             block = Block(position=i,
                           block_id=block_id,
                           name=item['page_title'],
@@ -537,6 +537,12 @@ def extract_units_from_html(self, page, BASE_URL, file_formats):
             unit = self.extract_unit(unit_soup, BASE_URL, file_formats)
             if len(unit.videos) > 0 or len(unit.resources_urls) > 0:
                 units.append(unit)
+
+        # If a unit is of these types, download it as a web page
+        content_types = ['discussion', 'html', 'poll', 'problem', 'survey']
+        block_type = re.findall(r'data-block-type="(.+?)"', page)
+        if [x for x in block_type if x in content_types]:
+            units.append(WebpageUnit(content=page))
         return units
 
     def extract_unit(self, unit_soup, BASE_URL, file_formats):
@@ -545,10 +551,7 @@ def extract_unit(self, unit_soup, BASE_URL, file_formats):
         xblock_list = unit_soup.find_all('div', 'xblock')
         for xblock in xblock_list:
             xblock_type = xblock['data-block-type']
-            if xblock_type == 'html':
-                urls = self.extract_resources_urls(xblock, BASE_URL, file_formats)
-                resources_urls.extend(urls)
-            elif xblock_type == 'video':
+            if xblock_type == 'video':
                 video_youtube_url = None
                 available_subs_url = None
                 sub_template_url = None
@@ -577,8 +580,12 @@ def extract_unit(self, unit_soup, BASE_URL, file_formats):
                                     available_subs_url=available_subs_url,
                                     sub_template_url=sub_template_url,
                                     mp4_urls=mp4_urls))
+            else:
+                urls = self.extract_resources_urls(xblock, BASE_URL, file_formats)
+                resources_urls.extend(urls)
         return Unit(videos=videos, resources_urls=resources_urls)
 
+
     def extract_resources_urls(self, soup, BASE_URL, file_formats):
         """
         Extract resources looking for <a> references in the webpage and
@@ -603,6 +610,7 @@ def extract_resources_urls(self, soup, BASE_URL, file_formats):
 
         return resources_urls
 
+
 def get_page_extractor(url):
     """
     factory method for page extractors

diff --git a/requirements.txt b/requirements.txt
@@ -3,4 +3,4 @@ html5lib>=1.0.1
 six>=1.11.0
 youtube_dl>=2021.12.17
 requests>=2.18.4
-tqdm~=4.66.1
+tqdm>=4.66.1