Skip to content

Commit

Permalink
Add support for xuetangx.com
Browse files Browse the repository at this point in the history
Xuetangx, based on OpenEdX, is the most popular MOOC platform in China. It would be good to support it :-)
  • Loading branch information
yiwenlu66 committed Jan 17, 2017
1 parent 8634a8e commit 6ff43be
Show file tree
Hide file tree
Showing 3 changed files with 195 additions and 17 deletions.
134 changes: 119 additions & 15 deletions edx_dl/edx_dl.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,14 @@
import pickle
import re
import sys
import math

from functools import partial
from multiprocessing.dummy import Pool as ThreadPool

from six.moves.http_cookiejar import CookieJar
from six.moves.urllib.error import HTTPError, URLError
from six.moves.urllib.parse import urlencode
from six.moves.urllib.parse import urlencode, quote
from six.moves.urllib.request import (
urlopen,
build_opener,
Expand Down Expand Up @@ -89,19 +90,25 @@
'bits':{
'url':'http://any-learn.bits-pilani.ac.in',
'courseware-selector': ('nav', {'aria-label': 'Course Navigation'}),
},
'xuetangx': {
'url': 'http://www.xuetangx.com',
'courseware-selector': None,
}
}
BASE_URL = OPENEDX_SITES['edx']['url']
SITE_NAME = 'edx'
BASE_URL = OPENEDX_SITES[SITE_NAME]['url']
EDX_HOMEPAGE = BASE_URL + '/login_ajax'
LOGIN_API = BASE_URL + '/login_ajax'
DASHBOARD = BASE_URL + '/dashboard'
COURSEWARE_SEL = OPENEDX_SITES['edx']['courseware-selector']
COURSEWARE_SEL = OPENEDX_SITES[SITE_NAME]['courseware-selector']


def change_openedx_site(site_name):
"""
Changes the openedx website for the given one via the key
"""
global SITE_NAME
global BASE_URL
global EDX_HOMEPAGE
global LOGIN_API
Expand All @@ -113,11 +120,15 @@ def change_openedx_site(site_name):
logging.error("OpenEdX platform should be one of: %s", ', '.join(sites))
sys.exit(ExitCode.UNKNOWN_PLATFORM)

BASE_URL = OPENEDX_SITES[site_name]['url']
SITE_NAME = site_name
BASE_URL = OPENEDX_SITES[SITE_NAME]['url']
EDX_HOMEPAGE = BASE_URL + '/login_ajax'
LOGIN_API = BASE_URL + '/login_ajax'
DASHBOARD = BASE_URL + '/dashboard'
COURSEWARE_SEL = OPENEDX_SITES[site_name]['courseware-selector']
if site_name == 'xuetangx':
DASHBOARD = BASE_URL + '/api/web/courses/mycourses?format=json'
else:
DASHBOARD = BASE_URL + '/dashboard'
COURSEWARE_SEL = OPENEDX_SITES[SITE_NAME]['courseware-selector']


def _display_courses(courses):
Expand All @@ -131,10 +142,67 @@ def _display_courses(courses):
logging.info(' %s', course.url)


def get_courses_info_xuetangx(url, headers):
    """
    Extracts the courses information from the dashboard.

    This function is re-implemented for http://www.xuetangx.com, because
    Xuetangx uses a REST API, which is quite different from other OpenEdX
    sites.

    :param url: dashboard URL, used only to select the right page extractor.
    :param headers: HTTP headers (including auth cookies) for API requests.
    :return: list of Course objects.
    """
    def fetch_and_parse(base_url, param):
        """
        Fetches the JSON API, and returns the total count, and a list of dicts
        for the results on the current page.
        :param base_url: the URL of the API.
        :param param: query parameters, represented by a list of tuples.
        :return: a (total, results) tuple; (0, []) on failure.
        """
        api_page_url = base_url + '?' + urlencode(param)
        page = get_page_contents(api_page_url, headers)
        try:
            d = json.loads(page)
            total = d['total']
            results = d['results']
        # ValueError also covers json.JSONDecodeError (its subclass on
        # Python 3) and is what the json module raises on Python 2,
        # where JSONDecodeError does not exist.
        except (ValueError, KeyError):
            total = 0
            results = []
        return total, results

    logging.info('Extracting course information from JSON API.')

    api_url = BASE_URL + '/api/web/courses/mycourses'
    query_params = [
        [('type', 'started'), ('format', 'json')],
        [('type', 'ended'), ('format', 'json')]
    ]
    # use default page size, and fetch multiple times, in case there is a hard
    # limit set by the API
    page_size = 10

    courses = []
    page_extractor = get_page_extractor(url)

    for param in query_params:
        total, results = fetch_and_parse(api_url, param)
        # ceil(total / page_size) in pure integer arithmetic: on Python 2
        # math.ceil returns a float, which range() rejects.
        page_count = (total + page_size - 1) // page_size
        for i in range(page_count):
            if i:
                # page needs to be re-fetched unless it is the first one
                new_param = param + [('offset', i * page_size)]
                _, results = fetch_and_parse(api_url, new_param)
            courses += page_extractor.extract_courses(results, BASE_URL)

    return courses


def get_courses_info(url, headers):
"""
Extracts the courses information from the dashboard.
"""
if SITE_NAME == 'xuetangx':
return get_courses_info_xuetangx(url, headers)

logging.info('Extracting course information from dashboard.')

page = get_page_contents(url, headers)
Expand Down Expand Up @@ -304,6 +372,14 @@ def parse_args():
default=False,
help='list available sections')

parser.add_argument('--quality',
dest='quality',
action='store',
choices={'high', 'standard'},
default='high',
help='quality of video to download; works for xuetangx'
' only')

parser.add_argument('--youtube-dl-options',
dest='youtube_dl_options',
action='store',
Expand Down Expand Up @@ -435,6 +511,9 @@ def extract_units(url, headers, file_formats):

page = get_page_contents(url, headers)
page_extractor = get_page_extractor(url)
set_headers = getattr(page_extractor, 'set_headers', None)
if callable(set_headers):
set_headers(headers)
units = page_extractor.extract_units_from_html(page, BASE_URL, file_formats)

return units
Expand Down Expand Up @@ -664,27 +743,42 @@ def _build_subtitles_downloads(video, target_dir, filename_prefix, headers):
return downloads


def _build_url_downloads(urls, target_dir, filename_prefix):
def _build_url_downloads(urls, target_dir, filename_prefix, args,
is_video=False):
"""
Builds a dict {url: filename} for the given urls
If it is a youtube url it uses the valid template for youtube-dl
otherwise just takes the name of the file from the url
"""
if SITE_NAME == 'xuetangx' and is_video and urls:
# take advantage of the fact that the URL of HQ videos are
# lexicographically larger on Xuetangx ('quality20' > 'quality10')
urls = [max(urls)] if args.quality == 'high' else [min(urls)]
downloads = {url:
_build_filename_from_url(url, target_dir, filename_prefix)
_build_filename_from_url(url, target_dir, filename_prefix,
is_video=is_video)
for url in urls}
return downloads


def _build_filename_from_url(url, target_dir, filename_prefix):
def _build_filename_from_url(url, target_dir, filename_prefix, is_video=False,
video_counter=[0]):
"""
Builds the appropriate filename for the given args
"""
# video file names in Xuetangx do not make sense;
# use a counter as a workaround
if is_video:
video_counter[0] += 1

if is_youtube_url(url):
filename_template = filename_prefix + "-%(title)s-%(id)s.%(ext)s"
filename = os.path.join(target_dir, filename_template)
else:
original_filename = url.rsplit('/', 1)[1]
if SITE_NAME == 'xuetangx' and is_video:
original_filename = 'video_%05d.mp4' % video_counter[0]
else:
original_filename = url.rsplit('/', 1)[1]
filename = os.path.join(target_dir,
filename_prefix + '-' + original_filename)

Expand All @@ -695,6 +789,8 @@ def download_url(url, filename, headers, args):
"""
Downloads the given url in filename.
"""
# resolve unicode issue
url = quote(url, safe=';/?:@&=+$,')

if is_youtube_url(url):
download_youtube_url(url, filename, headers, args)
Expand Down Expand Up @@ -770,13 +866,15 @@ def skip_or_download(downloads, headers, args, f=download_url):
def download_video(video, args, target_dir, filename_prefix, headers):
if args.prefer_cdn_videos or video.video_youtube_url is None:
mp4_downloads = _build_url_downloads(video.mp4_urls, target_dir,
filename_prefix)
filename_prefix, args,
is_video=True)
skip_or_download(mp4_downloads, headers, args)
else:
if video.video_youtube_url is not None:
youtube_downloads = _build_url_downloads([video.video_youtube_url],
target_dir,
filename_prefix)
filename_prefix,
is_video=True)
skip_or_download(youtube_downloads, headers, args)

# the behavior with subtitles is different, since the subtitles don't know
Expand Down Expand Up @@ -804,7 +902,7 @@ def download_unit(unit, args, target_dir, filename_prefix, headers):
download_video(video, args, target_dir, new_prefix, headers)

res_downloads = _build_url_downloads(unit.resources_urls, target_dir,
filename_prefix)
filename_prefix, args)
skip_or_download(res_downloads, headers, args)


Expand All @@ -818,13 +916,19 @@ def download(args, selections, all_units, headers):
# notice that we could iterate over all_units, but we prefer to do it over
# sections/subsections to add correct prefixes and show nicer information.

# courses on Xuetangx may contain chinese characters
preserve_non_ascii = (SITE_NAME == 'xuetangx')

for selected_course, selected_sections in selections.items():
coursename = directory_name(selected_course.name)
coursename = directory_name(selected_course.name,
minimal_change=preserve_non_ascii)
for selected_section in selected_sections:
section_dirname = "%02d-%s" % (selected_section.position,
selected_section.name)
target_dir = os.path.join(args.output_dir, coursename,
clean_filename(section_dirname))
clean_filename(section_dirname,
minimal_change=
preserve_non_ascii))
mkdir_p(target_dir)
counter = 0
for subsection in selected_section.subsections:
Expand Down
74 changes: 74 additions & 0 deletions edx_dl/parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,15 @@
"""
import re
import json
import logging

from datetime import timedelta, datetime

from six.moves import html_parser
from bs4 import BeautifulSoup as BeautifulSoup_

from .common import Course, Section, SubSection, Unit, Video
from .utils import get_page_contents

# Force use of bs4 with html5lib
BeautifulSoup = lambda page: BeautifulSoup_(page, 'html5lib')
Expand Down Expand Up @@ -342,6 +344,75 @@ def _make_subsections(section_soup):
return sections


class XuetangxPageExtractor(ClassicEdXPageExtractor):
    """
    Page extractor for http://www.xuetangx.com.

    Xuetangx serves its course list through a JSON REST API and hides the
    mp4 video URLs behind a "videoid2source" API, so course extraction and
    mp4 URL extraction are overridden here.
    """

    def __init__(self):
        # HTTP headers (cookies etc.) required by the video URL API;
        # populated via set_headers() before unit extraction.
        self.headers = None
        # Site base URL, remembered by extract_units_from_html() so that
        # extract_mp4_urls() can build API requests. Initialized here so
        # the guard in extract_mp4_urls() works even if set_headers() /
        # extract_units_from_html() were never called (previously this
        # attribute only existed after set_headers(), risking an
        # AttributeError, and set_headers() could clobber a valid value).
        self.base_url = None

    def set_headers(self, headers):
        """Sets the headers necessary for accessing the video URL API"""
        self.headers = headers

    def extract_courses(self, results, BASE_URL):
        """
        Extract courses from a list of dicts.

        Entries missing any required key are silently skipped.
        """
        courses = []

        for result in results:
            try:
                course_id = result['id']
                course_name = result['name']
                course_url = BASE_URL + result['info_link']
                # Xuetangx allows accessing materials for all archived courses,
                # so it's safe to mark all courses as 'Started'.
                course_state = 'Started'
            except KeyError:
                continue
            courses.append(Course(id=course_id,
                                  name=course_name,
                                  url=course_url,
                                  state=course_state))

        return courses

    def extract_units_from_html(self, page, BASE_URL, file_formats):
        # remember the base URL for later use by extract_mp4_urls()
        self.base_url = BASE_URL
        return ClassicEdXPageExtractor.extract_units_from_html(self, page,
                                                               BASE_URL,
                                                               file_formats)

    def extract_mp4_urls(self, text):
        """
        Looks for available links to the mp4 version of the videos
        """
        # Xuetangx does not provide the video URL directly in the page;
        # instead, a video id can be found in the page and translated into
        # an actual URL through the "videoid2source" API.
        m = re.search('(?<=data-ccsource=&#39;).+(?=&#39;)', text)
        if not m:
            return []

        video_id = m.group(0)
        if not self.base_url:
            logging.debug('Base URL unset; please set self.base_url before '
                          'calling extract_mp4_urls')
            return []
        video_src_url = self.base_url + '/videoid2source/' + video_id
        video_src_json = get_page_contents(video_src_url, self.headers)
        try:
            sources = json.loads(video_src_json)['sources']
        # ValueError covers json.JSONDecodeError on Python 3 and is what
        # the json module raises on Python 2.
        except (ValueError, KeyError):
            return []

        mp4_urls = []
        for quality in sources:
            if sources[quality]:
                mp4_urls.append(sources[quality][0])
        return mp4_urls


def get_page_extractor(url):
"""
factory method for page extractors
Expand All @@ -350,6 +421,9 @@ def get_page_extractor(url):
'https://lagunita.stanford.edu'):
return CurrentEdXPageExtractor()

if 'xuetangx.com' in url:
return XuetangxPageExtractor()

return ClassicEdXPageExtractor()


Expand Down
4 changes: 2 additions & 2 deletions edx_dl/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,11 @@ def execute_command(cmd, args):
raise e


def directory_name(initial_name):
def directory_name(initial_name, minimal_change=False):
"""
Transform the name of a directory into an ascii version
"""
result = clean_filename(initial_name)
result = clean_filename(initial_name, minimal_change=minimal_change)
return result if result != "" else "course_folder"


Expand Down

0 comments on commit 6ff43be

Please sign in to comment.