Skip to content
This repository has been archived by the owner on Dec 26, 2023. It is now read-only.

Add concurrent downloading #1

Merged
merged 12 commits into from
May 12, 2019
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ Then, open it in your editor of your choice, and set the following 3 variables:
username = "Your username" # The username you would usually use for My Manchester
password = "Your password" # The accompanying password
base_dir = "~/Documents/Lectures" # Where to download files to
concurrent_downloads = 4 # How many podcasts to download at the same time (Increases speed)
```

Then, install the packages listed in [requirements.txt](requirements.txt).
Expand Down
9 changes: 0 additions & 9 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,12 +1,3 @@
beautifulsoup4==4.7.1
certifi==2019.3.9
chardet==3.0.4
entrypoints==0.3
flake8==3.7.7
idna==2.8
mccabe==0.6.1
pycodestyle==2.5.0
pyflakes==2.1.1
requests==2.21.0
soupsieve==1.9.1
urllib3==1.24.3
134 changes: 78 additions & 56 deletions run.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import concurrent.futures
import os
import requests
import settings
Expand Down Expand Up @@ -61,72 +62,93 @@
print("Could not get video service: service responded with status code", get_video_service_base.status_code)
sys.exit(4)

# Status code valid, parse HTML
get_video_service_base_soup = BeautifulSoup(get_video_service_base.content, features="html.parser")
first = True
for course_li in get_video_service_base_soup.find("nav", {"id": "sidebar-nav"}).ul.contents[3].find_all("li", {
"class": "series"}):
# For each course
print("-" * (21 + len(course_li.a.string)))
print("Getting podcasts for", course_li.a.string)
print("-" * (21 + len(course_li.a.string)))
get_video_service_course = session.get("https://video.manchester.ac.uk" + course_li.a["href"])
# Status code valid


# Downloads a podcast using the href and a target location.
# Logging messages use the name to identify which podcast download request they relate to.
def download_podcast(name, podcast_link, download_path):
csnewman marked this conversation as resolved.
Show resolved Hide resolved
print("Downloading podcast", name)

# Get podcast webpage
get_video_service_podcast_page = session.get("https://video.manchester.ac.uk" + podcast_link)

# Check status code valid
if get_video_service_course.status_code != 200:
print("Could not get podcasts for", course_li.a.string, "- Service responded with status code",
get_video_service_course.status_code)
continue
if get_video_service_podcast_page.status_code != 200:
print("Could not get podcast webpage for", name, "- Service responded with status code",
get_video_service_podcast_page.status_code)
return

# Success code valid, create directory for podcasts
course_dir = os.path.expanduser(os.path.join(settings.base_dir, "".join(
c for c in course_li.a.string if c in VALID_FILE_CHARS)))
os.makedirs(course_dir, exist_ok=True)
# Status code valid, parse HTML
getVideoServicePodcastPageSoup = BeautifulSoup(get_video_service_podcast_page.content,
features="html.parser")
podcast_src = "https://video.manchester.ac.uk" + \
getVideoServicePodcastPageSoup.find("video", id="video").source["src"]

# Parse HTML
get_video_service_course_soup = BeautifulSoup(get_video_service_course.content, features="html.parser")
podcasts = get_video_service_course_soup.find("nav", {"id": "sidebar-nav"}).ul.contents[5].find_all("li", {
"class": "episode"})
podcast_no = len(podcasts) + 1
for podcast_li in podcasts:
# For each podcast
podcast_no -= 1

# Check podcast not already downloaded
download_path = os.path.expanduser(os.path.join(course_dir, f"{podcast_no:02d} - " + podcast_li.a.string +
".mp4"))
if os.path.isfile(download_path):
print("Skipping podcast", podcast_li.a.string, "(already exists)")
continue
# Get podcast
get_video_service_podcast = session.get(podcast_src, stream=True)

# Podcast not yet downloaded
print("Getting podcast", podcast_li.a.string)
# Check status code valid
if get_video_service_podcast.status_code != 200:
print("Could not get podcast for", name, "- Service responded with status code",
get_video_service_podcast.status_code)
return

# Get podcast webpage
get_video_service_podcast_page = session.get("https://video.manchester.ac.uk" + podcast_li.a["href"])
# Write to file
with open(download_path, "wb") as f:
get_video_service_podcast.raw.decode_content = True
shutil.copyfileobj(get_video_service_podcast.raw, f)

# Check status code valid
if get_video_service_podcast_page.status_code != 200:
print("Could not get podcast webpage for", podcast_li.a.string, "- Service responded with status code",
get_video_service_podcast_page.status_code)
continue
print("Downloaded podcast", name)

# Status code valid, parse HTML
getVideoServicePodcastPageSoup = BeautifulSoup(get_video_service_podcast_page.content,
features="html.parser")
podcast_src = "https://video.manchester.ac.uk" + \
getVideoServicePodcastPageSoup.find("video", id="video").source["src"]

# Get podcast
get_video_service_podcast = session.get(podcast_src, stream=True)
with concurrent.futures.ThreadPoolExecutor(max_workers=settings.concurrent_downloads) as executor:
futures = []

# Parse HTML
get_video_service_base_soup = BeautifulSoup(get_video_service_base.content, features="html.parser")

for course_li in get_video_service_base_soup.find("nav", {"id": "sidebar-nav"}).ul.contents[3].find_all("li", {
"class": "series"}):
# For each course
print("-" * (21 + len(course_li.a.string)))
print("Getting podcasts for", course_li.a.string)
print("-" * (21 + len(course_li.a.string)))
get_video_service_course = session.get("https://video.manchester.ac.uk" + course_li.a["href"])

# Check status code valid
if get_video_service_podcast.status_code != 200:
print("Could not get podcast for", podcast_li.a.string, "- Service responded with status code",
get_video_service_podcast_page.status_code)
if get_video_service_course.status_code != 200:
print("Could not get podcasts for", course_li.a.string, "- Service responded with status code",
get_video_service_course.status_code)
continue

# Write to file
with open(download_path, "wb") as f:
get_video_service_podcast.raw.decode_content = True
shutil.copyfileobj(get_video_service_podcast.raw, f)
# Status code valid, create directory for podcasts
course_dir = os.path.expanduser(os.path.join(settings.base_dir, "".join(
c for c in course_li.a.string if c in VALID_FILE_CHARS)))
os.makedirs(course_dir, exist_ok=True)

# Parse HTML
get_video_service_course_soup = BeautifulSoup(get_video_service_course.content, features="html.parser")
podcasts = get_video_service_course_soup.find("nav", {"id": "sidebar-nav"}).ul.contents[5].find_all("li", {
"class": "episode"})
podcast_no = len(podcasts) + 1
for podcast_li in podcasts:
# For each podcast
podcast_no -= 1

# Check podcast not already downloaded
download_path = os.path.expanduser(os.path.join(course_dir, f"{podcast_no:02d} - " + podcast_li.a.string +
".mp4"))
if os.path.isfile(download_path):
print("Skipping podcast", podcast_li.a.string, "(already exists)")
continue

# Podcast not yet downloaded
print("Queuing podcast", podcast_li.a.string)

# Queue podcast for downloading
futures.append(executor.submit(download_podcast, podcast_li.a.string, podcast_li.a["href"], download_path))

# Wait for all queued podcasts to download
for idx, future in enumerate(concurrent.futures.as_completed(futures)):
res = future.result() # This will also raise any exceptions
1 change: 1 addition & 0 deletions settings-template.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
username = "Your username"
password = "Your password"
base_dir = "~/Documents/Lectures"
concurrent_downloads = 4