ed-cooper · ed-cooper · May 12, 2019 · May 11, 2019 · May 11, 2019 · May 12, 2019
diff --git a/README.md b/README.md
@@ -14,6 +14,7 @@ Then, open it in your editor of your choice, and set the following 3 variables:
 username = "Your username"        # The username you would usually use for My Manchester
 password = "Your password"        # The accompanying password
 base_dir = "~/Documents/Lectures" # Where to download files to
+concurrent_downloads = 4          # How many podcasts to download at the same time (Increases speed)
 ```
 
 Then, install the packages listed in [requirements.txt](requirements.txt).

diff --git a/requirements.txt b/requirements.txt
@@ -1,12 +1,3 @@
 beautifulsoup4==4.7.1
-certifi==2019.3.9
-chardet==3.0.4
-entrypoints==0.3
 flake8==3.7.7
-idna==2.8
-mccabe==0.6.1
-pycodestyle==2.5.0
-pyflakes==2.1.1
 requests==2.21.0
-soupsieve==1.9.1
-urllib3==1.24.3
diff --git a/run.py b/run.py
@@ -1,3 +1,4 @@
+import concurrent.futures
 import os
 import requests
 import settings
@@ -61,72 +62,93 @@
     print("Could not get video service: service responded with status code", get_video_service_base.status_code)
     sys.exit(4)
 
-# Status code valid, parse HTML
-get_video_service_base_soup = BeautifulSoup(get_video_service_base.content, features="html.parser")
-first = True
-for course_li in get_video_service_base_soup.find("nav", {"id": "sidebar-nav"}).ul.contents[3].find_all("li", {
-        "class": "series"}):
-    # For each course
-    print("-" * (21 + len(course_li.a.string)))
-    print("Getting podcasts for", course_li.a.string)
-    print("-" * (21 + len(course_li.a.string)))
-    get_video_service_course = session.get("https://video.manchester.ac.uk" + course_li.a["href"])
+# Status code valid
+
+
+# Downloads a podcast using the href and a target location.
+# Log messages use the name to identify which podcast download they relate to.
+def download_podcast(name, podcast_link, download_path):
+    print("Downloading podcast", name)
+
+    # Get podcast webpage
+    get_video_service_podcast_page = session.get("https://video.manchester.ac.uk" + podcast_link)
 
     # Check status code valid
-    if get_video_service_course.status_code != 200:
-        print("Could not get podcasts for", course_li.a.string, "- Service responded with status code",
-              get_video_service_course.status_code)
-        continue
+    if get_video_service_podcast_page.status_code != 200:
+        print("Could not get podcast webpage for", name, "- Service responded with status code",
+              get_video_service_podcast_page.status_code)
+        return
 
-    # Success code valid, create directory for podcasts
-    course_dir = os.path.expanduser(os.path.join(settings.base_dir, "".join(
-        c for c in course_li.a.string if c in VALID_FILE_CHARS)))
-    os.makedirs(course_dir, exist_ok=True)
+    # Status code valid, parse HTML
+    getVideoServicePodcastPageSoup = BeautifulSoup(get_video_service_podcast_page.content,
+                                                   features="html.parser")
+    podcast_src = "https://video.manchester.ac.uk" + \
+                  getVideoServicePodcastPageSoup.find("video", id="video").source["src"]
 
-    # Parse HTML
-    get_video_service_course_soup = BeautifulSoup(get_video_service_course.content, features="html.parser")
-    podcasts = get_video_service_course_soup.find("nav", {"id": "sidebar-nav"}).ul.contents[5].find_all("li", {
-        "class": "episode"})
-    podcast_no = len(podcasts) + 1
-    for podcast_li in podcasts:
-        # For each podcast
-        podcast_no -= 1
-
-        # Check podcast not already downloaded
-        download_path = os.path.expanduser(os.path.join(course_dir, f"{podcast_no:02d} - " + podcast_li.a.string +
-                                                        ".mp4"))
-        if os.path.isfile(download_path):
-            print("Skipping podcast", podcast_li.a.string, "(already exists)")
-            continue
+    # Get podcast
+    get_video_service_podcast = session.get(podcast_src, stream=True)
 
-        # Podcast not yet downloaded
-        print("Getting podcast", podcast_li.a.string)
+    # Check status code valid
+    if get_video_service_podcast.status_code != 200:
+        print("Could not get podcast for", name, "- Service responded with status code",
+              get_video_service_podcast.status_code)
+        return
 
-        # Get podcast webpage
-        get_video_service_podcast_page = session.get("https://video.manchester.ac.uk" + podcast_li.a["href"])
+    # Write to file
+    with open(download_path, "wb") as f:
+        get_video_service_podcast.raw.decode_content = True
+        shutil.copyfileobj(get_video_service_podcast.raw, f)
 
-        # Check status code valid
-        if get_video_service_podcast_page.status_code != 200:
-            print("Could not get podcast webpage for", podcast_li.a.string, "- Service responded with status code",
-                  get_video_service_podcast_page.status_code)
-            continue
+    print("Downloaded podcast", name)
 
-        # Status code valid, parse HTML
-        getVideoServicePodcastPageSoup = BeautifulSoup(get_video_service_podcast_page.content,
-                                                       features="html.parser")
-        podcast_src = "https://video.manchester.ac.uk" + \
-                      getVideoServicePodcastPageSoup.find("video", id="video").source["src"]
 
-        # Get podcast
-        get_video_service_podcast = session.get(podcast_src, stream=True)
+with concurrent.futures.ThreadPoolExecutor(max_workers=settings.concurrent_downloads) as executor:
+    futures = []
+
+    # Parse HTML
+    get_video_service_base_soup = BeautifulSoup(get_video_service_base.content, features="html.parser")
+
+    for course_li in get_video_service_base_soup.find("nav", {"id": "sidebar-nav"}).ul.contents[3].find_all("li", {
+            "class": "series"}):
+        # For each course
+        print("-" * (21 + len(course_li.a.string)))
+        print("Getting podcasts for", course_li.a.string)
+        print("-" * (21 + len(course_li.a.string)))
+        get_video_service_course = session.get("https://video.manchester.ac.uk" + course_li.a["href"])
 
         # Check status code valid
-        if get_video_service_podcast.status_code != 200:
-            print("Could not get podcast for", podcast_li.a.string, "- Service responded with status code",
-                  get_video_service_podcast_page.status_code)
+        if get_video_service_course.status_code != 200:
+            print("Could not get podcasts for", course_li.a.string, "- Service responded with status code",
+                  get_video_service_course.status_code)
             continue
 
-        # Write to file
-        with open(download_path, "wb") as f:
-            get_video_service_podcast.raw.decode_content = True
-            shutil.copyfileobj(get_video_service_podcast.raw, f)
+        # Status code valid, create directory for podcasts
+        course_dir = os.path.expanduser(os.path.join(settings.base_dir, "".join(
+            c for c in course_li.a.string if c in VALID_FILE_CHARS)))
+        os.makedirs(course_dir, exist_ok=True)
+
+        # Parse HTML
+        get_video_service_course_soup = BeautifulSoup(get_video_service_course.content, features="html.parser")
+        podcasts = get_video_service_course_soup.find("nav", {"id": "sidebar-nav"}).ul.contents[5].find_all("li", {
+            "class": "episode"})
+        podcast_no = len(podcasts) + 1
+        for podcast_li in podcasts:
+            # For each podcast
+            podcast_no -= 1
+
+            # Check podcast not already downloaded
+            download_path = os.path.expanduser(os.path.join(course_dir, f"{podcast_no:02d} - " + podcast_li.a.string +
+                                                            ".mp4"))
+            if os.path.isfile(download_path):
+                print("Skipping podcast", podcast_li.a.string, "(already exists)")
+                continue
+
+            # Podcast not yet downloaded
+            print("Queuing podcast", podcast_li.a.string)
+
+            # Queue podcast for downloading
+            futures.append(executor.submit(download_podcast, podcast_li.a.string, podcast_li.a["href"], download_path))
+
+    # Wait for all queued podcasts to download
+    for idx, future in enumerate(concurrent.futures.as_completed(futures)):
+        res = future.result()  # This will also raise any exceptions
diff --git a/settings-template.py b/settings-template.py
@@ -1,3 +1,4 @@
 username = "Your username"
 password = "Your password"
 base_dir = "~/Documents/Lectures"
+concurrent_downloads = 4