Skip to content

Commit

Permalink
Add support for xuetangx.com
Browse files Browse the repository at this point in the history
Xuetangx, based on OpenEdX, is the most popular MOOC platform in China. It would be good to support it :-)
  • Loading branch information
yiwenlu66 committed Jan 17, 2017
1 parent 8634a8e commit 6ff43be
Show file tree
Hide file tree
Showing 3 changed files with 195 additions and 17 deletions.
134 changes: 119 additions & 15 deletions edx_dl/edx_dl.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,14 @@
import pickle
import re
import sys
import math

from functools import partial
from multiprocessing.dummy import Pool as ThreadPool

from six.moves.http_cookiejar import CookieJar
from six.moves.urllib.error import HTTPError, URLError
from six.moves.urllib.parse import urlencode
from six.moves.urllib.parse import urlencode, quote
from six.moves.urllib.request import (
urlopen,
build_opener,
Expand Down Expand Up @@ -89,19 +90,25 @@
'bits':{
'url':'http://any-learn.bits-pilani.ac.in',
'courseware-selector': ('nav', {'aria-label': 'Course Navigation'}),
},
'xuetangx': {
'url': 'http://www.xuetangx.com',
'courseware-selector': None,
}
}
BASE_URL = OPENEDX_SITES['edx']['url']
SITE_NAME = 'edx'
BASE_URL = OPENEDX_SITES[SITE_NAME]['url']
EDX_HOMEPAGE = BASE_URL + '/login_ajax'
LOGIN_API = BASE_URL + '/login_ajax'
DASHBOARD = BASE_URL + '/dashboard'
COURSEWARE_SEL = OPENEDX_SITES['edx']['courseware-selector']
COURSEWARE_SEL = OPENEDX_SITES[SITE_NAME]['courseware-selector']


def change_openedx_site(site_name):
"""
Changes the openedx website for the given one via the key
"""
global SITE_NAME
global BASE_URL
global EDX_HOMEPAGE
global LOGIN_API
Expand All @@ -113,11 +120,15 @@ def change_openedx_site(site_name):
logging.error("OpenEdX platform should be one of: %s", ', '.join(sites))
sys.exit(ExitCode.UNKNOWN_PLATFORM)

BASE_URL = OPENEDX_SITES[site_name]['url']
SITE_NAME = site_name
BASE_URL = OPENEDX_SITES[SITE_NAME]['url']
EDX_HOMEPAGE = BASE_URL + '/login_ajax'
LOGIN_API = BASE_URL + '/login_ajax'
DASHBOARD = BASE_URL + '/dashboard'
COURSEWARE_SEL = OPENEDX_SITES[site_name]['courseware-selector']
if site_name == 'xuetangx':
DASHBOARD = BASE_URL + '/api/web/courses/mycourses?format=json'
else:
DASHBOARD = BASE_URL + '/dashboard'
COURSEWARE_SEL = OPENEDX_SITES[SITE_NAME]['courseware-selector']


def _display_courses(courses):
Expand All @@ -131,10 +142,67 @@ def _display_courses(courses):
logging.info(' %s', course.url)


def get_courses_info_xuetangx(url, headers):
    """
    Extracts the courses information from the dashboard.

    This function is re-implemented for http://www.xuetangx.com, because
    Xuetangx uses a REST API, which is quite different from other OpenEdX
    sites.

    :param url: dashboard URL, used only to select the right page extractor.
    :param headers: HTTP headers (including auth cookies) for API requests.
    :return: list of Course objects.
    """
    def fetch_and_parse(base_url, param):
        """
        Fetches the JSON API, and returns the total count, and a list of dicts
        for the results on the current page.
        :param base_url: the URL of the API.
        :param param: query parameters, represented by a list of tuples.
        :return: a (total, results) tuple; (0, []) on failure.
        """
        api_page_url = base_url + '?' + urlencode(param)
        page = get_page_contents(api_page_url, headers)
        try:
            d = json.loads(page)
            total = d['total']
            results = d['results']
        # ValueError also covers json.JSONDecodeError (its subclass on
        # Python 3) and is what the json module raises on Python 2,
        # where JSONDecodeError does not exist.
        except (ValueError, KeyError):
            total = 0
            results = []
        return total, results

    logging.info('Extracting course information from JSON API.')

    api_url = BASE_URL + '/api/web/courses/mycourses'
    query_params = [
        [('type', 'started'), ('format', 'json')],
        [('type', 'ended'), ('format', 'json')]
    ]
    # use default page size, and fetch multiple times, in case there is a hard
    # limit set by the API
    page_size = 10

    courses = []
    page_extractor = get_page_extractor(url)

    for param in query_params:
        total, results = fetch_and_parse(api_url, param)
        # ceil(total / page_size) in pure integer arithmetic: on Python 2
        # math.ceil returns a float, which range() rejects.
        page_count = (total + page_size - 1) // page_size
        for i in range(page_count):
            if i:
                # page needs to be re-fetched unless it is the first one
                new_param = param + [('offset', i * page_size)]
                _, results = fetch_and_parse(api_url, new_param)
            courses += page_extractor.extract_courses(results, BASE_URL)

    return courses


def get_courses_info(url, headers):
"""
Extracts the courses information from the dashboard.
"""
if SITE_NAME == 'xuetangx':
return get_courses_info_xuetangx(url, headers)

logging.info('Extracting course information from dashboard.')

page = get_page_contents(url, headers)
Expand Down Expand Up @@ -304,6 +372,14 @@ def parse_args():
default=False,
help='list available sections')

parser.add_argument('--quality',
dest='quality',
action='store',
choices={'high', 'standard'},
default='high',
help='quality of video to download; works for xuetangx'
' only')

parser.add_argument('--youtube-dl-options',
dest='youtube_dl_options',
action='store',
Expand Down Expand Up @@ -435,6 +511,9 @@ def extract_units(url, headers, file_formats):

page = get_page_contents(url, headers)
page_extractor = get_page_extractor(url)
set_headers = getattr(page_extractor, 'set_headers', None)
if callable(set_headers):
set_headers(headers)
units = page_extractor.extract_units_from_html(page, BASE_URL, file_formats)

return units
Expand Down Expand Up @@ -664,27 +743,42 @@ def _build_subtitles_downloads(video, target_dir, filename_prefix, headers):
return downloads


def _build_url_downloads(urls, target_dir, filename_prefix):
def _build_url_downloads(urls, target_dir, filename_prefix, args,
is_video=False):
"""
Builds a dict {url: filename} for the given urls
If it is a youtube url it uses the valid template for youtube-dl
otherwise just takes the name of the file from the url
"""
if SITE_NAME == 'xuetangx' and is_video and urls:
# take advantage of the fact that the URL of HQ videos are
# lexicographically larger on Xuetangx ('quality20' > 'quality10')
urls = [max(urls)] if args.quality == 'high' else [min(urls)]
downloads = {url:
_build_filename_from_url(url, target_dir, filename_prefix)
_build_filename_from_url(url, target_dir, filename_prefix,
is_video=is_video)
for url in urls}
return downloads


def _build_filename_from_url(url, target_dir, filename_prefix):
def _build_filename_from_url(url, target_dir, filename_prefix, is_video=False,
video_counter=[0]):
"""
Builds the appropriate filename for the given args
"""
# video file names in Xuetangx do not make sense;
# use a counter as a workaround
if is_video:
video_counter[0] += 1

if is_youtube_url(url):
filename_template = filename_prefix + "-%(title)s-%(id)s.%(ext)s"
filename = os.path.join(target_dir, filename_template)
else:
original_filename = url.rsplit('/', 1)[1]
if SITE_NAME == 'xuetangx' and is_video:
original_filename = 'video_%05d.mp4' % video_counter[0]
else:
original_filename = url.rsplit('/', 1)[1]
filename = os.path.join(target_dir,
filename_prefix + '-' + original_filename)

Expand All @@ -695,6 +789,8 @@ def download_url(url, filename, headers, args):
"""
Downloads the given url in filename.
"""
# resolve unicode issue
url = quote(url, safe=';/?:@&=+$,')

if is_youtube_url(url):
download_youtube_url(url, filename, headers, args)
Expand Down Expand Up @@ -770,13 +866,15 @@ def skip_or_download(downloads, headers, args, f=download_url):
def download_video(video, args, target_dir, filename_prefix, headers):
if args.prefer_cdn_videos or video.video_youtube_url is None:
mp4_downloads = _build_url_downloads(video.mp4_urls, target_dir,
filename_prefix)
filename_prefix, args,
is_video=True)
skip_or_download(mp4_downloads, headers, args)
else:
if video.video_youtube_url is not None:
youtube_downloads = _build_url_downloads([video.video_youtube_url],
target_dir,
filename_prefix)
filename_prefix,
is_video=True)
skip_or_download(youtube_downloads, headers, args)

# the behavior with subtitles is different, since the subtitles don't know
Expand Down Expand Up @@ -804,7 +902,7 @@ def download_unit(unit, args, target_dir, filename_prefix, headers):
download_video(video, args, target_dir, new_prefix, headers)

res_downloads = _build_url_downloads(unit.resources_urls, target_dir,
filename_prefix)
filename_prefix, args)
skip_or_download(res_downloads, headers, args)


Expand All @@ -818,13 +916,19 @@ def download(args, selections, all_units, headers):
# notice that we could iterate over all_units, but we prefer to do it over
# sections/subsections to add correct prefixes and show nicer information.

# courses on Xuetangx may contain chinese characters
preserve_non_ascii = (SITE_NAME == 'xuetangx')

for selected_course, selected_sections in selections.items():
coursename = directory_name(selected_course.name)
coursename = directory_name(selected_course.name,
minimal_change=preserve_non_ascii)
for selected_section in selected_sections:
section_dirname = "%02d-%s" % (selected_section.position,
selected_section.name)
target_dir = os.path.join(args.output_dir, coursename,
clean_filename(section_dirname))
clean_filename(section_dirname,
minimal_change=
preserve_non_ascii))
mkdir_p(target_dir)
counter = 0
for subsection in selected_section.subsections:
Expand Down
74 changes: 74 additions & 0 deletions edx_dl/parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,15 @@
"""
import re
import json
import logging

from datetime import timedelta, datetime

from six.moves import html_parser
from bs4 import BeautifulSoup as BeautifulSoup_

from .common import Course, Section, SubSection, Unit, Video
from .utils import get_page_contents

# Force use of bs4 with html5lib
BeautifulSoup = lambda page: BeautifulSoup_(page, 'html5lib')
Expand Down Expand Up @@ -342,6 +344,75 @@ def _make_subsections(section_soup):
return sections


class XuetangxPageExtractor(ClassicEdXPageExtractor):
    """
    Page extractor for http://www.xuetangx.com.

    Xuetangx serves its course list through a JSON REST API and hides the
    mp4 video URLs behind a "videoid2source" API, so course extraction and
    mp4 URL extraction are overridden here.
    """

    def __init__(self):
        # HTTP headers (cookies etc.) required by the video URL API;
        # populated via set_headers() before unit extraction.
        self.headers = None
        # Site base URL, remembered by extract_units_from_html() so that
        # extract_mp4_urls() can build API requests. Initialized here so
        # the guard in extract_mp4_urls() works even if set_headers() /
        # extract_units_from_html() were never called (previously this
        # attribute only existed after set_headers(), risking an
        # AttributeError, and set_headers() could clobber a valid value).
        self.base_url = None

    def set_headers(self, headers):
        """Sets the headers necessary for accessing the video URL API"""
        self.headers = headers

    def extract_courses(self, results, BASE_URL):
        """
        Extract courses from a list of dicts.

        Entries missing any required key are silently skipped.
        """
        courses = []

        for result in results:
            try:
                course_id = result['id']
                course_name = result['name']
                course_url = BASE_URL + result['info_link']
                # Xuetangx allows accessing materials for all archived courses,
                # so it's safe to mark all courses as 'Started'.
                course_state = 'Started'
            except KeyError:
                continue
            courses.append(Course(id=course_id,
                                  name=course_name,
                                  url=course_url,
                                  state=course_state))

        return courses

    def extract_units_from_html(self, page, BASE_URL, file_formats):
        # remember the base URL for later use by extract_mp4_urls()
        self.base_url = BASE_URL
        return ClassicEdXPageExtractor.extract_units_from_html(self, page,
                                                               BASE_URL,
                                                               file_formats)

    def extract_mp4_urls(self, text):
        """
        Looks for available links to the mp4 version of the videos
        """
        # Xuetangx does not provide the video URL directly in the page;
        # instead, a video id can be found in the page and translated into
        # an actual URL through the "videoid2source" API.
        m = re.search('(?<=data-ccsource=&#39;).+(?=&#39;)', text)
        if not m:
            return []

        video_id = m.group(0)
        if not self.base_url:
            logging.debug('Base URL unset; please set self.base_url before '
                          'calling extract_mp4_urls')
            return []
        video_src_url = self.base_url + '/videoid2source/' + video_id
        video_src_json = get_page_contents(video_src_url, self.headers)
        try:
            sources = json.loads(video_src_json)['sources']
        # ValueError covers json.JSONDecodeError on Python 3 and is what
        # the json module raises on Python 2.
        except (ValueError, KeyError):
            return []

        mp4_urls = []
        for quality in sources:
            if sources[quality]:
                mp4_urls.append(sources[quality][0])
        return mp4_urls


def get_page_extractor(url):
"""
factory method for page extractors
Expand All @@ -350,6 +421,9 @@ def get_page_extractor(url):
'https://lagunita.stanford.edu'):
return CurrentEdXPageExtractor()

if 'xuetangx.com' in url:
return XuetangxPageExtractor()

return ClassicEdXPageExtractor()


Expand Down
4 changes: 2 additions & 2 deletions edx_dl/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,11 @@ def execute_command(cmd, args):
raise e


def directory_name(initial_name):
def directory_name(initial_name, minimal_change=False):
"""
Transform the name of a directory into an ascii version
"""
result = clean_filename(initial_name)
result = clean_filename(initial_name, minimal_change=minimal_change)
return result if result != "" else "course_folder"


Expand Down

0 comments on commit 6ff43be

Please sign in to comment.