Skip to content

Commit

Permalink
Download HTML when the unit type is problem, discussion, survey, html, or poll
Browse files Browse the repository at this point in the history
  • Loading branch information
csyezheng committed Oct 10, 2023
1 parent 05da242 commit c51b8d3
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 9 deletions.
12 changes: 12 additions & 0 deletions edx_dl/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,18 @@ def __init__(self, videos, resources_urls):
self.resources_urls = resources_urls


class WebpageUnit(Unit):
    """
    A course unit that is kept as a web page instead of as media.

    Intended for non-video unit types such as discussion, html and problem.
    """

    def __init__(self, content):
        """
        Initialise the base unit with no videos and no resource URLs, then
        keep ``content`` on the instance for later use.
        """
        # A webpage unit carries neither videos nor downloadable resources,
        # so the parent receives two fresh empty lists.
        super(WebpageUnit, self).__init__(list(), list())
        self.content = content


class Video(object):
"""
Representation of a single video.
Expand Down
17 changes: 16 additions & 1 deletion edx_dl/edx_dl.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
YOUTUBE_DL_CMD,
DEFAULT_CACHE_FILENAME,
Unit,
WebpageUnit,
Video,
ExitCode,
DEFAULT_FILE_FORMATS,
Expand Down Expand Up @@ -104,6 +105,7 @@
}
BASE_URL = OPENEDX_SITES['edx']['url']
EDX_HOMEPAGE = BASE_URL + '/user_api/v1/account/login_session'
# The trailing slash is required: block URLs are built by plain string
# concatenation (EDX_LEARN_BASE_URL + sequential_block_id + '/' + block_id),
# so without it the course path and the block id run together.
EDX_LEARN_BASE_URL = 'https://learning.edx.org/course/'
LOGIN_API = BASE_URL + '/login_ajax'
DASHBOARD = BASE_URL + '/dashboard'
COURSE_LIST_API = BASE_URL + '/api/learner_home/init'
Expand All @@ -118,6 +120,7 @@ def change_openedx_site(site_name):
Changes the openedx website for the given one via the key
"""
global BASE_URL
global EDX_LEARN_BASE_URL
global EDX_HOMEPAGE
global LOGIN_API
global DASHBOARD
Expand Down Expand Up @@ -497,7 +500,7 @@ def extract_units_from_sequential_block(block, headers, file_formats):

json_extractor = EdXJsonExtractor()
resp_dict = get_page_contents_as_json(url, headers)
vertical_blocks = json_extractor.extract_vertical_blocks(resp_dict)
vertical_blocks = json_extractor.extract_vertical_blocks(resp_dict, EDX_LEARN_BASE_URL)
all_units = []
for block in vertical_blocks:
units = extract_units_from_vertical_block(block, headers, file_formats)
Expand Down Expand Up @@ -939,6 +942,15 @@ def skip_or_download(downloads, headers, args, f=download_url):
f(url, filename, headers, args)


def skip_or_save(file_path, content):
    """
    Save ``content`` to ``file_path`` unless something already exists there.

    The file is written as UTF-8 text. If the path already exists, the call
    is a no-op and the existing file is left untouched.
    """
    # Exclusive-create mode makes the "does it exist?" check and the file
    # creation a single step, so an existing file can never be truncated by
    # a check-then-open race.
    try:
        out_file = open(file_path, 'x', encoding='utf-8')
    except FileExistsError:
        return
    with out_file:
        out_file.write(content)


def download_video(video, args, target_dir, filename_prefix, headers):
if args.prefer_cdn_videos or video.video_youtube_url is None:
mp4_downloads = _build_url_downloads(video.mp4_urls, target_dir,
Expand Down Expand Up @@ -980,6 +992,9 @@ def download_unit(unit, args, target_dir, filename_prefix, headers):
res_downloads = _build_url_downloads(unit.resources_urls, target_dir,
filename_prefix)
skip_or_download(res_downloads, headers, args)
if isinstance(unit, WebpageUnit):
file_path = os.path.join(target_dir, filename_prefix + '.html')
skip_or_save(file_path, unit.content)


def download(args, selections, all_units, headers):
Expand Down
22 changes: 15 additions & 7 deletions edx_dl/parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from six.moves import html_parser
from bs4 import BeautifulSoup as BeautifulSoup_

from .common import Course, Section, SubSection, Block, Unit, Video
from .common import Course, Section, SubSection, Block, Unit, WebpageUnit, Video


# Force use of bs4 with html.parser
Expand Down Expand Up @@ -139,7 +139,7 @@ def _make_children(info, ids):
chapter_blocks.append(chapter_block)
return chapter_blocks

def extract_vertical_blocks(self, deserialized_response):
def extract_vertical_blocks(self, deserialized_response, EDX_LEARN_BASE_URL):
"""
Method to extract the vertical blocks from deserialized response
"""
Expand All @@ -148,7 +148,7 @@ def extract_vertical_blocks(self, deserialized_response):
vertical_blocks = []
for i, item in enumerate(items, 1):
block_id = item['id']
url = "https://learning.edx.org/course/" + sequential_block_id + '/' + block_id
url = EDX_LEARN_BASE_URL + sequential_block_id + '/' + block_id
block = Block(position=i,
block_id=block_id,
name=item['page_title'],
Expand Down Expand Up @@ -537,6 +537,12 @@ def extract_units_from_html(self, page, BASE_URL, file_formats):
unit = self.extract_unit(unit_soup, BASE_URL, file_formats)
if len(unit.videos) > 0 or len(unit.resources_urls) > 0:
units.append(unit)

# If a unit is of these types, download it as a web page
content_types = ['discussion', 'html', 'poll', 'problem', 'survey']
block_type = re.findall(r'data-block-type="(.+?)"', page)
if [x for x in block_type if x in content_types]:
units.append(WebpageUnit(content=page))
return units

def extract_unit(self, unit_soup, BASE_URL, file_formats):
Expand All @@ -545,10 +551,7 @@ def extract_unit(self, unit_soup, BASE_URL, file_formats):
xblock_list = unit_soup.find_all('div', 'xblock')
for xblock in xblock_list:
xblock_type = xblock['data-block-type']
if xblock_type == 'html':
urls = self.extract_resources_urls(xblock, BASE_URL, file_formats)
resources_urls.extend(urls)
elif xblock_type == 'video':
if xblock_type == 'video':
video_youtube_url = None
available_subs_url = None
sub_template_url = None
Expand Down Expand Up @@ -577,8 +580,12 @@ def extract_unit(self, unit_soup, BASE_URL, file_formats):
available_subs_url=available_subs_url,
sub_template_url=sub_template_url,
mp4_urls=mp4_urls))
else:
urls = self.extract_resources_urls(xblock, BASE_URL, file_formats)
resources_urls.extend(urls)
return Unit(videos=videos, resources_urls=resources_urls)


def extract_resources_urls(self, soup, BASE_URL, file_formats):
"""
Extract resources looking for <a> references in the webpage and
Expand All @@ -603,6 +610,7 @@ def extract_resources_urls(self, soup, BASE_URL, file_formats):

return resources_urls


def get_page_extractor(url):
"""
factory method for page extractors
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ html5lib>=1.0.1
six>=1.11.0
youtube_dl>=2021.12.17
requests>=2.18.4
tqdm~=4.66.1
tqdm>=4.66.1

0 comments on commit c51b8d3

Please sign in to comment.