- chromedriver: https://sites.google.com/a/chromium.org/chromedriver/ / https://chromedriver.chromium.org/
- chromium command line switches: https://peter.sh/experiments/chromium-command-line-switches/#net-log-capture-mode
    - important flags: '--log-net-log', '--net-log-capture-mode' / '--enable-logging --v=1'
    - How to capture a NetLog dump: https://www.chromium.org/for-testers/providing-network-details

- canvas tutorial: https://towardsdatascience.com/controlling-the-web-with-python-6fceb22c5f08

In [1]:
# basic webdriver imports
from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
# imports for waiting
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import os

from bs4 import BeautifulSoup

import json
import requests
import re
from tqdm import tqdm

import time


# get credentials
# note: os.getenv returns None for a variable that is not set
CANVAS_USERNAME = os.getenv('CANVAS_USERNAME')
CANVAS_PASSWORD = os.getenv('CANVAS_PASSWORD')

# set paths
# note: './../data' is resolved against the current working directory when this cell runs
DATA_PATH = os.path.abspath('./../data')

LOG_PATH = os.path.join(DATA_PATH, 'tmp/net_log.json')  # chrome net log (see the '--log-net-log' switch)
DRIVER_PATH = os.path.join(DATA_PATH, 'drivers/chromedriver.exe')  # chromedriver binary
VIDEO_PATH = os.path.join(DATA_PATH, 'videos')  # folder that downloaded lectures are written to

# number of seconds the WebDriverWait calls wait for a page element before giving up
DEFAULT_TIMEOUT = 15

In [2]:
def generate_driver():
    '''
    build and start a chrome driver that records its network activity to LOG_PATH
    
    return: the started selenium chrome driver
    '''
    
    # note: the '--log-net-log' switch is *vital* to this project: it is what records the network activity
    net_log_switch = '--log-net-log={}'.format(LOG_PATH)
    
    options = ChromeOptions()
    options.add_argument(net_log_switch)
    
    # launch chrome through the chromedriver binary found at DRIVER_PATH
    return webdriver.Chrome(executable_path=DRIVER_PATH, options=options)


def setup_and_login(default_2FA=True):
    '''
    start a configured driver, log in to Canvas and wait until 2FA has been passed
    
    default_2FA (bool) : if true, automatically click the first 2FA button ('call') inside the Duo iframe.
                         if false, the user must select and trigger the 2FA method manually
    
    return: the authenticated driver (returned once an element with id 'dashboard' is present)
    
    raises: ValueError if CANVAS_USERNAME / CANVAS_PASSWORD are not set
            any selenium error (e.g. a wait timing out) is re-raised after the driver has been quit
    '''
    
    # fail early with a readable message instead of an obscure error from send_keys(None)
    if CANVAS_USERNAME is None or CANVAS_PASSWORD is None:
        raise ValueError('the CANVAS_USERNAME and CANVAS_PASSWORD environment variables must both be set')
    
    # step 0: start a configured driver
    driver = generate_driver()
    
    try:
        # step 0.5: open the base canvas url --> triggers a login screen
        driver.get('https://canvas.harvard.edu/')
        
        #-------------------------------------------------------
        #-------------------- step 1: login --------------------
        #-------------------------------------------------------
        
        # the 'username' element indicates the login page has loaded (the wait returns the element itself)
        username_box = WebDriverWait(driver, DEFAULT_TIMEOUT).until(
            EC.presence_of_element_located((By.ID, 'username')))
        username_box.send_keys(CANVAS_USERNAME)
        
        # input password
        driver.find_element_by_id('password').send_keys(CANVAS_PASSWORD)
        
        # click submit
        driver.find_element_by_id('submitLogin').click()
        
        #--------------------------------------------------------------
        #-------------------- step 2: get past 2FA --------------------
        #--------------------------------------------------------------
        
        # select the default 2FA method
        # note: if this is not used, the user must select and trigger the 2FA manually
        if default_2FA:
            # wait for 2FA iframe to load and switch to it
            WebDriverWait(driver, DEFAULT_TIMEOUT).until(EC.frame_to_be_available_and_switch_to_it('duo_iframe'))
            
            # click the 'call' button
            driver.find_element_by_css_selector('.positive.auth-button').click()
            
            # step back out of the iframe so the dashboard wait below searches the top-level page
            # (the same context that is searched when default_2FA is false)
            driver.switch_to.default_content()
        
        # wait for dashboard to load
        print('INFO: if 2FA is not completed within 120s, this program will exit automatically')
        WebDriverWait(driver, 120).until(EC.presence_of_element_located((By.ID, 'dashboard')))
    except Exception:
        # the caller never receives the driver on failure, so close chrome here instead of leaking it
        driver.quit()
        raise
    
    # return the authenticated driver object
    return driver

In [3]:
def get_player_page_source(driver, lecture_URL, player=None):
    '''
    load lecture_URL in an authenticated driver, switch into the 'tool_content' iframe and hand back the page source
    once the list of videos is present
    
    driver      : an authenticated driver
    lecture_URL : url of the page to load
    player      : 'matterhorn', 'panopto' or None. if None, the player is guessed from the page's
                  <meta name="description"> tag
    
    return: player_page_source, player_name
    
    raises: ValueError for an unsupported player, Exception if the player cannot be auto detected
    '''
    
    known_players = ('matterhorn', 'panopto')
    
    # an explicitly selected player has to be one we know how to handle
    if not (player is None or player in known_players):
        raise ValueError(f'invalid player selected. player "{player}" is not in ("matterhorn", "panopto")')
    
    driver.get(lecture_URL)
    
    # the videos live inside an iframe: wait for it and step into it
    WebDriverWait(driver, DEFAULT_TIMEOUT).until(EC.frame_to_be_available_and_switch_to_it('tool_content'))
    
    # no player given --> guess it from the description meta tag
    # note: this relies on exact description strings, so it breaks as soon as the HTML changes
    if player is None:
        description_to_player = {
            'HUDCE Publication Listing': 'matterhorn',
            'Capture, manage, and search all your video content.': 'panopto',
        }
        
        soup = BeautifulSoup(driver.page_source)
        description = soup.find('head').find('meta', attrs={'name': 'description'})['content']
        
        if description not in description_to_player:
            raise Exception('looks like the HTML changed and player auto detection is broken')
        player = description_to_player[description]
    
    # css selector of one video entry, per player
    video_selector = {
        'matterhorn': '.item.ng-scope',
        'panopto': '.thumbnail-row.draggable',
    }[player]
    
    # block until the first video entry shows up inside the frame
    WebDriverWait(driver, DEFAULT_TIMEOUT).until(EC.presence_of_element_located((By.CSS_SELECTOR, video_selector)))
    
    return driver.page_source, player


def extract_lecture_links(player_page_source, player):
    '''
    parse the html of a player page into a 'lecture title: lecture url' dict
    
    player_page_source : html of the player page
    player             : 'matterhorn' or 'panopto' (any other value yields an empty dict)
    
    return: dict mapping each (stripped) lecture title to its link
    '''
    
    soup = BeautifulSoup(player_page_source)
    links_by_title = {}
    
    if player == 'panopto':
        # every video is a row of the details table. the title anchor carries both the title and the link
        rows = soup.find('table', 'details-table').find_all('tr', 'thumbnail-row draggable')
        
        for row in rows:
            anchor = row.find('a', 'detail-title')
            
            lecture_title = anchor.text.strip()
            links_by_title[lecture_title] = anchor['href']
    
    elif player == 'matterhorn':
        # every video is an 'item' div inside the items container. title and link sit in separate tags
        items = soup.find('div', 'items-container ng-scope').find_all('div', 'item ng-scope')
        
        for item in items:
            title_tag = item.find('div', 'publication-title auto-launch')
            link_tag = item.find('a', 'live-event item-link')
            
            lecture_title = title_tag.text.strip()
            # the href has no scheme, so 'https:' is put in front of it
            lecture_link = 'https:' + link_tag['href']
            links_by_title[lecture_title] = lecture_link
    
    return links_by_title

In [4]:
def open_lecture_links(driver, lecture_to_url, player):
    '''
    given a driver, a lecture_to_url dict, and a player: open each link in lecture_to_url
    
    this allows the driver to track the network activity generated from each lecture page
    
    driver         : an authenticated driver. it is *always* quit before this function returns or raises
                     (except when the player is rejected up front)
    lecture_to_url : 'lecture title: lecture url' dict. only the urls are used
    player         : 'matterhorn' or 'panopto'. selects which element signals that a page has loaded
    
    raises: ValueError for an unsupported player
    '''
    
    # make sure the player is valid (a real check rather than an assert, which is skipped under 'python -O')
    if player not in ('panopto', 'matterhorn'):
        raise ValueError(f'invalid player selected. player "{player}" is not in ("matterhorn", "panopto")')
    
    try:
        # open each link
        for link in lecture_to_url.values():
            # go to the link
            driver.get(link)
            
            # wait for the page to fully load
            if player == 'matterhorn':
                # wait for the play button to appear
                WebDriverWait(driver, DEFAULT_TIMEOUT).until(EC.presence_of_element_located(
                    (By.ID, 'paella_plugin_PlayButtonOnScreen')))
            else:
                # wait for the loading image to appear
                WebDriverWait(driver, DEFAULT_TIMEOUT).until(
                    EC.presence_of_element_located((By.ID, 'loadingMessage')))
                # wait for the loading image to disappear (finished loading)
                WebDriverWait(driver, DEFAULT_TIMEOUT).until(
                    EC.invisibility_of_element_located((By.ID, 'loadingMessage')))
    finally:
        # we are done with driver so we can 'quit' it. doing this in 'finally' keeps a failed page load from
        # leaving a chrome process behind
        driver.quit()

In [5]:
def extract_m3u8s_from_netlog(log_path=None):
    '''
    extract all .m3u8 links from a chrome network log
    
    log_path (str) : path of the net log to read. defaults to LOG_PATH
    
    return: list (log order, duplicates kept) of every event url that contains '.m3u8' and whose
            'network_isolation_key' is the matterhorn or the panopto host
    
    raises: json.JSONDecodeError if the log cannot be parsed, even after patching its end
    '''
    
    if log_path is None:
        log_path = LOG_PATH
    
    # read the json file to a string
    with open(log_path, 'r', encoding='utf-8') as log_file:
        json_str = log_file.read()
    
    # try to parse the json string normally
    try:
        json_data = json.loads(json_str)
    except json.JSONDecodeError:
        # try to load the data by patching the end of the file: drop the last two characters and close the
        # event list and the top-level object by hand
        print('INFO: in JSON parse exception. trying to patch file...')
        try:
            json_data = json.loads(json_str[:-2] + ']}')
        except json.JSONDecodeError:
            print('looks like you got unlucky... (maybe a buffer not flusing?). try running this again!')
            raise
        print('INFO: file patched')
    # NOTE: the reason you often have to patch the net log is because calling driver.quit() will kill chrome without
    # writing the closing tags on the net log
    
    lecture_hosts = ('https://matterhorn.dce.harvard.edu', 'https://harvard.hosted.panopto.com')
    
    all_lecture_m3u8s = []
    for event in json_data['events']:
        # events without params carry no url at all
        params = event.get('params', {})
        
        if params.get('network_isolation_key') not in lecture_hosts:
            continue
        
        # not every event of a lecture host has a url, so do not index it blindly
        url = params.get('url')
        if isinstance(url, str) and '.m3u8' in url:
            all_lecture_m3u8s.append(url)
    
    return all_lecture_m3u8s

In [6]:
# matterhorn_URL = 'https://canvas.harvard.edu/courses/69559/external_tools/22940'
# panopto_URL = 'https://canvas.harvard.edu/courses/69902/external_tools/61579'

# # do setup
# driver = setup_and_login()

# # get sources
# matterhorn_source, matterhorn_player = get_player_page_source(driver, matterhorn_URL)
# panopto_source, panopto_player = get_player_page_source(driver, panopto_URL)

# # open videos
# matterhorn_video_dict = extract_lecture_links(matterhorn_source, player=matterhorn_player)
# panopto_video_dict = extract_lecture_links(panopto_source, player=panopto_player)

# matterhorn_video_dict, panopto_video_dict


import pickle

# open_lecture_links(driver, matterhorn_video_dict, matterhorn_player)

# all_lecture_m3u8s = extract_m3u8s_from_netlog()

# with open('./matterhorn_m3u8s.pkl', 'wb') as f:
#     pickle.dump(all_lecture_m3u8s, f)


# open_lecture_links(driver, panopto_video_dict, panopto_player)

# all_lecture_m3u8s = extract_m3u8s_from_netlog()

# with open('./panopto_m3u8s.pkl', 'wb') as f:
#     pickle.dump(all_lecture_m3u8s, f)

# pick which pickled m3u8 list to reload
# note: the second assignment overwrites the first, so the panopto list is the one loaded below.
#       comment out the second line to load the matterhorn list instead
fname = './matterhorn_m3u8s.pkl'
fname = './panopto_m3u8s.pkl'

# reload a list that was pickled by one of the (commented out) dump snippets above
with open(fname, 'rb') as f:
    all_lecture_m3u8s = pickle.load(f)

all_lecture_m3u8s

['https://d2y36twrtb17ty.cloudfront.net/sessions/763e3a97-b8a6-4dc7-b2ce-ab9d002a8a98/976d6033-d105-4bf8-aa4e-ab9d002a8aa2-22445718-807a-49b6-85d0-ab9d00399842.hls/master.m3u8?InvocationID=c06033b4-3089-ea11-a9de-0a8e213f0382&tid=00000000-0000-0000-0000-000000000000&StreamID=20acd79f-e292-4aec-951e-aae0c2cd9dd6&ServerName=harvard.hosted.panopto.com',
 'https://d2y36twrtb17ty.cloudfront.net/sessions/763e3a97-b8a6-4dc7-b2ce-ab9d002a8a98/fd928321-ac38-4b5e-987a-32807b61ca3e.screen.hls/master.m3u8?InvocationID=c06033b4-3089-ea11-a9de-0a8e213f0382&tid=00000000-0000-0000-0000-000000000000&StreamID=fd928321-ac38-4b5e-987a-32807b61ca3e&ServerName=harvard.hosted.panopto.com',
 'https://d2y36twrtb17ty.cloudfront.net/sessions/763e3a97-b8a6-4dc7-b2ce-ab9d002a8a98/976d6033-d105-4bf8-aa4e-ab9d002a8aa2-22445718-807a-49b6-85d0-ab9d00399842.hls/757757/index.m3u8',
 'https://d2y36twrtb17ty.cloudfront.net/sessions/763e3a97-b8a6-4dc7-b2ce-ab9d002a8a98/fd928321-ac38-4b5e-987a-32807b61ca3e.screen.hls/273468

In [7]:
def get_title_to_m3u8s(lecture_to_url, all_lecture_m3u8s, player):
    '''
    group the m3u8 links found in the net log by lecture title
    
    lecture_to_url    : 'lecture title: lecture page url' dict (extracted from HTML). the lecture id is whatever
                        follows the first 'id=' in the page url
    all_lecture_m3u8s : list of m3u8 urls (from the net log)
    player            : 'matterhorn' or 'panopto' (any other value returns None)
    
    terminology, for an m3u8 url split on '/':
        id1 = piece 4 (the base id shared by all m3u8s of one lecture)
        id2 = the first 36 characters of piece 5
    
    in matterhorn the lecture id *is* id1, so the m3u8s can be looked up directly
    in panopto the lecture id is an id2, so it first has to be translated to its id1
    
    return: 'lecture title: list of m3u8 urls' dict
    '''
    
    # title --> lecture id
    lecture_ids = {}
    for title, page_url in lecture_to_url.items():
        lecture_ids[title] = page_url.split('id=')[1]
    
    # id1 --> every m3u8 that carries that id1 (one-to-many)
    m3u8s_by_id1 = {}
    for link in all_lecture_m3u8s:
        m3u8s_by_id1.setdefault(link.split('/')[4], []).append(link)
    
    if player == 'matterhorn':
        result = {}
        for title, lecture_id in lecture_ids.items():
            result[title] = m3u8s_by_id1[lecture_id]
        return result
    
    if player == 'panopto':
        # id2 --> id1 (many-to-one. the same pair may be written several times, which is harmless)
        id1_by_id2 = {}
        for link in all_lecture_m3u8s:
            pieces = link.split('/')
            id1_by_id2[pieces[5][:36]] = pieces[4]
        
        result = {}
        for title, lecture_id in lecture_ids.items():
            result[title] = m3u8s_by_id1[id1_by_id2[lecture_id]]
        return result


# title_to_m3u8s = get_title_to_m3u8s(panopto_video_dict, all_lecture_m3u8s, 'panopto')
# title_to_m3u8s = get_title_to_m3u8s(matterhorn_video_dict, all_lecture_m3u8s, 'matterhorn')

In [8]:
# regex of the url prefix that a variant playlist is resolved against, per player
_M3U8_BASE_PATTERNS = {
    'matterhorn': r'https://dvgni8clk4vbh.cloudfront.net/engage-player/[\w-]*/',
    'panopto': r'https://d2y36twrtb17ty.cloudfront.net/sessions/[\w-]*/[.\w-]*/',
}


def _pick_max_resolution_variant(master_lines):
    '''
    given the lines of a master playlist, return the uri line of the variant with the most pixels
    
    return: the line following the winning '#EXT-X-STREAM-INF' tag, or None if no variant has a 'RESOLUTION=' tag
    '''
    
    # resolution --> uri. a later variant with the same resolution replaces an earlier one
    uri_by_resolution = {}
    
    # pair every line with the line after it: the uri of a variant is stored right below its tag line
    for tag_line, uri_line in zip(master_lines, master_lines[1:]):
        if not tag_line.startswith('#EXT-X-STREAM-INF'):
            continue
        
        # only trust a '<num>x<num>' that actually sits behind 'RESOLUTION=' (variants without one cannot be ranked)
        found = re.search(r'RESOLUTION=(\d+)x(\d+)', tag_line)
        if found is None:
            continue
        
        uri_by_resolution[(int(found.group(1)), int(found.group(2)))] = uri_line
    
    if not uri_by_resolution:
        return None
    
    # on a tie in pixel count the variant listed first wins
    best_resolution = max(uri_by_resolution, key=lambda size: size[0] * size[1])
    return uri_by_resolution[best_resolution]


def get_title_to_download_links(title_to_m3u8s, player, request_timeout=30):
    '''
    extract final download links from list of possible m3u8 files
    
    every m3u8 url is fetched. urls whose content is not a 'master' playlist (no '#EXT-X-STREAM-INF') are ignored.
    for each master playlist the variant with the highest resolution is selected and turned into one download link:
        - matterhorn: the variant playlist is fetched too and the link of the .mp4 it references is stored
        - panopto:    the url of the variant playlist itself is stored
    
    title_to_m3u8s  : 'lecture title: list of m3u8 urls' dict
    player          : 'matterhorn' or 'panopto'
    request_timeout : seconds to wait on each http request before giving up
    
    return: 'lecture title: list of download links' dict (one link per usable master playlist)
    
    raises: ValueError for an unsupported player, for an m3u8 url that does not match the expected base url of
            the player, or for a matterhorn variant playlist without an mp4 link
    '''
    
    if player not in _M3U8_BASE_PATTERNS:
        raise ValueError(f'invalid player selected. player "{player}" is not in ("matterhorn", "panopto")')
    base_re = _M3U8_BASE_PATTERNS[player]
    
    title_to_best_m3u8 = {}
    
    for title, m3u8_list in tqdm(title_to_m3u8s.items()):
        max_resolution_m3u8s = []
        
        for m3u8 in m3u8_list:
            # get the full content
            m3u8_content = requests.get(m3u8, timeout=request_timeout).content.decode()
            
            # only 'master' files (files with links to other files) are of interest
            if '#EXT-X-STREAM-INF' not in m3u8_content:
                continue
            
            # find the m3u8 variant with the max resolution
            m3u8_extension = _pick_max_resolution_variant(m3u8_content.splitlines())
            if m3u8_extension is None:
                continue
            
            # extract the base from the m3u8 link
            base_match = re.search(base_re, m3u8)
            if base_match is None:
                raise ValueError(f'm3u8 url "{m3u8}" does not match the expected {player} base url')
            base_m3u8 = base_match.group(0)
            
            if player == 'matterhorn':
                # the extension is written relative to the parent folder: [3:] strips the leading '../'
                full_m3u8 = base_m3u8 + m3u8_extension[3:]
                variant_content = requests.get(full_m3u8, timeout=request_timeout).content.decode()
                
                # extract the mp4 link from the m3u8 content (also relative to the parent folder)
                mp4_match = re.search(r'\.\./.*\.mp4', variant_content)
                if mp4_match is None:
                    raise ValueError(f'no mp4 link found in "{full_m3u8}"')
                
                # add the mp4 link to the list
                max_resolution_m3u8s.append(base_m3u8 + mp4_match.group(0)[3:])
            else:
                # panopto: the variant playlist is the download link
                max_resolution_m3u8s.append(base_m3u8 + m3u8_extension)
        
        # add max_resolution_m3u8 list to the main dict
        title_to_best_m3u8[title] = max_resolution_m3u8s
    
    return title_to_best_m3u8

# download_links = get_title_to_download_links(title_to_m3u8s, 'panopto')

In [18]:
def clean_base_file_name(base_file_name):
    '''
    return base_file_name with every one of the characters \\ / : * ? " < > | swapped for an underscore
    '''
    swap_table = str.maketrans({char: '_' for char in '\\/:*?"<>|'})
    return base_file_name.translate(swap_table)

def _write_until_deadline(out_file, chunks, timeout_max):
    '''
    write each chunk to out_file, stopping early (with a message) once more than timeout_max seconds have elapsed
    '''
    
    start_time = time.time()
    for chunk in chunks:
        out_file.write(chunk)
        
        # break if over timeout_max
        time_delta = time.time() - start_time
        if time_delta > timeout_max:
            print('broke from loop after {} seconds'.format(time_delta))
            break


def _fetch_bytes(url):
    '''
    GET url and return the raw body, raising on an http error status
    '''
    
    response = requests.get(url)
    response.raise_for_status()
    return response.content


def download_lecture(url, player, base_file_name, mp4_path=None, timeout_max=None):
    '''
    download a single lecture video to disk
    
    url            : matterhorn: link of the mp4 file. panopto: link of an 'index.m3u8' playlist listing .ts files
    player         : 'matterhorn' or 'panopto'
    base_file_name : name used to build the output path (ignored when mp4_path is given)
    mp4_path       : file to write to. defaults to '<VIDEO_PATH>/<cleaned base_file_name>.mp4'
    timeout_max    : max seconds to spend writing. defaults to 60min. when exceeded, the download stops early
                     and the partial file is kept
    
    note: for panopto the .ts files are written back to back into the output file
    
    raises: ValueError for an unsupported player
            requests.HTTPError if a request answers with an error status (so no error page is saved as video)
    '''
    
    if player not in ('matterhorn', 'panopto'):
        raise ValueError(f'invalid player selected. player "{player}" is not in ("matterhorn", "panopto")')
    
    # if mp4_path is unset, set it using VIDEO_PATH and base_file_name
    if mp4_path is None:
        mp4_path = os.path.join(VIDEO_PATH, clean_base_file_name(base_file_name) + '.mp4')
    
    # set a hard cap of 60min (*WAY* more time then needed) for a single download if no time is specified
    if timeout_max is None:
        timeout_max = 60*60
    
    if player == 'matterhorn':
        # stream the file in 1MiB chunks. the 'with' releases the connection even if we stop early
        with requests.get(url, stream=True) as stream:
            stream.raise_for_status()
            
            with open(mp4_path, 'wb') as mp4:
                _write_until_deadline(mp4, tqdm(stream.iter_content(chunk_size=1048576)), timeout_max)
    
    else:
        m3u8_content = _fetch_bytes(url).decode()
        
        # extract a ts list from the m3u8 content (each .ts line replaces 'index.m3u8' in the playlist url)
        ts_list = [url.replace('index.m3u8', line) for line in m3u8_content.splitlines() if line.endswith('.ts')]
        
        # download the video by looping over ts files (fetched lazily, one per written chunk)
        with open(mp4_path, 'wb') as mp4:
            ts_chunks = (_fetch_bytes(ts_url) for ts_url in tqdm(ts_list))
            _write_until_deadline(mp4, ts_chunks, timeout_max)

# url = download_links['3-5 Part 3: Regulation of Digestion'][0]
# download_lecture(url, 'panopto', '3-5', timeout_max=10)

In [19]:
# NOTE: not used in luigi!

# def download_all_videos(master_URL, timeout_max=None):
#     # do setup
#     driver = setup_and_login()
    
#     # get sources
#     player_page_source, player_type = get_player_page_source(driver, master_URL)
    
#     # get video dict
#     lecture_to_url = extract_lecture_links(player_page_source, player=player_type)
    
#     # open all links
#     open_lecture_links(driver, lecture_to_url, player=player_type)
#     # extract data from network
#     all_lecture_m3u8s = extract_m3u8s_from_netlog()
    
#     # organize extracted data
#     title_to_m3u8s = get_title_to_m3u8s(lecture_to_url, all_lecture_m3u8s, player=player_type)
    
#     # find final download links
#     title_to_best_m3u8 = get_title_to_download_links(title_to_m3u8s, player=player_type)
    
#     # download all videos
#     for title, urls in title_to_best_m3u8.items():
#         for url_num in range(len(urls)):
#             full_title = title + ' - perspective' + str(url_num)
#             download_lecture(urls[url_num], player=player_type, base_file_name=full_title, timeout_max=timeout_max)


# download_all_videos(matterhorn_URL, timeout_max=3)

# Luigi Starts!

In [20]:
# folder holding one pickled cache file per course (see SaveLectureData.output)
CACHE_PATH = os.path.join(DATA_PATH, 'tmp/luigi_cache')

# S3 prefix that UploadLecture writes the lecture videos under
S3_ROOT = 's3://etrilling-cscie29/recorded_lectures'

In [21]:
from luigi import Task, Parameter, BoolParameter, build
from luigi.task import WrapperTask
from luigi.local_target import LocalTarget
from luigi.contrib.s3 import S3Target
import luigi

import pickle


class SaveLectureData(Task):
    '''
    luigi task: resolve the download links of every lecture behind master_URL and pickle them to a cache file
    
    the cache holds a dict with two keys:
        'title_to_best_m3u8' : 'lecture title: list of download links' dict
        'player_type'        : the player the links belong to
    '''
    
    master_URL = Parameter()
    
    # NOTE: this task has no requires(): it does not depend on any other task
    
    def output(self):
        # piece 4 of the '/'-split url serves as the class id, so there is one cache file per class
        # (meaning this task will only re-run on new courses)
        class_id = self.master_URL.split('/')[4]
        cache_file = os.path.join(CACHE_PATH, class_id + '.pkl')
        
        # Nop format: the target is opened without luigi's text handling, as needed for pickle
        return LocalTarget(cache_file, format=luigi.format.Nop)
    
    def run(self):
        # log in and load the page that lists the lectures
        browser = setup_and_login()
        page_source, player_type = get_player_page_source(browser, self.master_URL)
        
        # title --> lecture page url
        lecture_to_url = extract_lecture_links(page_source, player=player_type)
        
        # visit every lecture page so that its network traffic lands in the net log, then mine the log
        open_lecture_links(browser, lecture_to_url, player=player_type)
        all_lecture_m3u8s = extract_m3u8s_from_netlog()
        
        # title --> all m3u8s --> final download links
        title_to_m3u8s = get_title_to_m3u8s(lecture_to_url, all_lecture_m3u8s, player=player_type)
        title_to_best_m3u8 = get_title_to_download_links(title_to_m3u8s, player=player_type)
        
        # store everything the download tasks need
        with self.output().open('w') as cache:
            pickle.dump({'title_to_best_m3u8': title_to_best_m3u8, 'player_type': player_type}, cache)


# one example course page per player type
matterhorn_URL = 'https://canvas.harvard.edu/courses/69559/external_tools/22940'
panopto_URL = 'https://canvas.harvard.edu/courses/69902/external_tools/61579'

# build the link cache of each course with luigi's local scheduler
# note: these calls execute as soon as the cell runs
build([SaveLectureData(master_URL=matterhorn_URL)], local_scheduler=True)
build([SaveLectureData(master_URL=panopto_URL)], local_scheduler=True)

DEBUG: Checking if SaveLectureData(master_URL=https://canvas.harvard.edu/courses/69559/external_tools/22940) is complete
INFO: Informed scheduler that task   SaveLectureData_https___canvas_h_885a7e0fc5   has status   DONE
INFO: Done scheduling tasks
INFO: Running Worker with 1 processes
DEBUG: Asking scheduler for work...
DEBUG: Done
DEBUG: There are no more tasks to run at this time
INFO: Worker Worker(salt=435534365, workers=1, host=ET-RB-2019, username=Elliot Trilling, pid=35360) was stopped. Shutting down Keep-Alive thread
INFO: 
===== Luigi Execution Summary =====

Scheduled 1 tasks of which:
* 1 complete ones were encountered:
    - 1 SaveLectureData(master_URL=https://canvas.harvard.edu/courses/69559/external_tools/22940)

Did not run any tasks
This progress looks :) because there were no failed tasks or missing dependencies

===== Luigi Execution Summary =====

DEBUG: Checking if SaveLectureData(master_URL=https://canvas.harvard.edu/courses/69902/external_tools/61579) is comple

True

In [22]:
class DownloadLecture(Task):
    '''
    luigi task: download one lecture video to '<VIDEO_PATH>/<cleaned base_file_name>.mp4'
    
    base_file_name : name of the video file (without extension). cleaned with clean_base_file_name
    url            : download link, passed on to download_lecture
    player         : player the link belongs to, passed on to download_lecture
    timeout_max    : passed on to download_lecture (None lets download_lecture pick its default)
    '''
    
    base_file_name = Parameter()
    url = Parameter()
    player = Parameter()
    timeout_max = Parameter(default=None)
    
    # NOTE: this task has no requires(): it does not depend on any other task
    
    def output(self):
        video_file = os.path.join(VIDEO_PATH, clean_base_file_name(self.base_file_name) + '.mp4')
        return LocalTarget(video_file, format=luigi.format.Nop)
    
    def run(self):
        video_target = self.output()
        
        # the download is written to the temporary path handed out by the target, not to the final path
        with video_target.temporary_path() as download_path:
            download_lecture(
                url=self.url,
                player=self.player,
                # download_lecture ignores base_file_name whenever mp4_path is given
                base_file_name='THIS_IS_NOT_USED_HERE',
                mp4_path=download_path,
                timeout_max=self.timeout_max,
            )


# with open(CACHE_PATH, 'rb') as cache:
#     data = pickle.load(cache)
# title_to_best_m3u8 = data['title_to_best_m3u8']
# player_type = data['player_type']
# task = DownloadLecture(url=title_to_best_m3u8['Lecture 8'][0],
#                        player=player_type,
#                        base_file_name='test1',
#                        timeout_max=5)
# build([task], local_scheduler=True)

In [23]:
class UploadLecture(Task):
    '''
    luigi task: copy one downloaded lecture video to '<S3_ROOT>/<cleaned base_file_name>.mp4'
    
    requires the matching DownloadLecture task, so the lecture is downloaded first if needed
    
    base_file_name : name of the video file (without extension)
    url            : passed on to DownloadLecture
    player         : passed on to DownloadLecture
    timeout_max    : passed on to DownloadLecture
    '''
    
    base_file_name = Parameter()
    url = Parameter(default='')
    player = Parameter(default='')
    timeout_max = Parameter(default=None)
    
    def requires(self):
        return DownloadLecture(
            base_file_name=self.base_file_name,
            url=self.url,
            player=self.player,
            timeout_max=self.timeout_max,
        )
    
    def output(self):
        s3_key = S3_ROOT + '/' + clean_base_file_name(self.base_file_name) + '.mp4'
        return S3Target(s3_key, format=luigi.format.Nop)
    
    def run(self):
        # self.input() is the output target of the DownloadLecture task returned by requires()
        with self.input().open('r') as local_video:
            with self.output().open('w') as s3_video:
                # note: the whole video is read into memory before it is written out
                s3_video.write(local_video.read())

# build([UploadLecture(base_file_name='test1')], local_scheduler=True)

In [25]:
class ProcessAllLectures(WrapperTask):
    '''
    abstract wrapper task: requires one 'Process' task per lecture video of master_URL
    
    master_URL  : course page, passed on to SaveLectureData
    is_test_run : if true (the default!), every Process task gets timeout_max=1. otherwise timeout_max=None
    
    subclasses set 'Process' to the task class to create for each video. it is called with the keywords
    base_file_name, url, player and timeout_max
    '''
    
    master_URL = Parameter()
    is_test_run = BoolParameter(default=True)
    
    Process = NotImplemented
    
    def requires(self):
        # the link data has to exist before anything else
        link_data_task = SaveLectureData(master_URL=self.master_URL)
        yield link_data_task
        
        # NOTE(review): the cache is opened right after the yield above, inside requires(). this presumably
        # only works if the SaveLectureData output already exists when requires() is evaluated - confirm
        with link_data_task.output().open('r') as cache:
            data = pickle.load(cache)
        title_to_best_m3u8 = data['title_to_best_m3u8']
        player_type = data['player_type']
        
        # test runs cut every video off after 1 second
        per_video_timeout = 1 if self.is_test_run else None
        
        # one task per download link. a lecture can have several links ('perspectives')
        tasks = []
        for title, urls in title_to_best_m3u8.items():
            for url_num, url in enumerate(urls):
                tasks.append(self.Process(base_file_name=title + ' - perspective' + str(url_num),
                                          url=url,
                                          player=player_type,
                                          timeout_max=per_video_timeout))
        
        yield tasks

class DownloadAllLectures(ProcessAllLectures):
    '''
    download all lectures
    
    concrete ProcessAllLectures: one DownloadLecture task is required per lecture video
    '''
    Process = DownloadLecture

class UploadAllLectures(ProcessAllLectures):
    '''
    run an UploadLecture task for every lecture, putting each one on S3
    (a lecture that has not been downloaded yet gets downloaded first)
    '''
    Process = UploadLecture


# alternative run using matterhorn_URL as the master URL (currently disabled):
# build([DownloadAllLectures(master_URL=matterhorn_URL)], local_scheduler=True)
# build([UploadAllLectures(master_URL=matterhorn_URL)], local_scheduler=True)

# run using panopto_URL as the master URL: download every lecture, then upload every lecture
# note: is_test_run is left at its default (True), so each lecture task is created with timeout_max=1
build([DownloadAllLectures(master_URL=panopto_URL)], local_scheduler=True)
build([UploadAllLectures(master_URL=panopto_URL)], local_scheduler=True)

DEBUG: Checking if DownloadAllLectures(master_URL=https://canvas.harvard.edu/courses/69902/external_tools/61579, is_test_run=True) is complete
INFO: Informed scheduler that task   DownloadAllLectures_True_https___canvas_h_a2350e0143   has status   DONE
INFO: Done scheduling tasks
INFO: Running Worker with 1 processes
DEBUG: Asking scheduler for work...
DEBUG: Done
DEBUG: There are no more tasks to run at this time
INFO: Worker Worker(salt=734768845, workers=1, host=ET-RB-2019, username=Elliot Trilling, pid=35360) was stopped. Shutting down Keep-Alive thread
INFO: 
===== Luigi Execution Summary =====

Scheduled 1 tasks of which:
* 1 complete ones were encountered:
    - 1 DownloadAllLectures(master_URL=https://canvas.harvard.edu/courses/69902/external_tools/61579, is_test_run=True)

Did not run any tasks
This progress looks :) because there were no failed tasks or missing dependencies

===== Luigi Execution Summary =====

DEBUG: Checking if UploadAllLectures(master_URL=https://canvas.ha

DEBUG: Checking if UploadLecture(base_file_name=Lecture 3-4 Salt, Water Balance and Nitrogen Excretion - perspective0, url=https://d2y36twrtb17ty.cloudfront.net/sessions/f7bb075e-2d5a-4c34-ad6b-ab910110d537/7ec0e815-c2b8-4635-a419-ab910110d540-cff29732-04b0-4ab7-9c44-ab9101352630.hls/747517/index.m3u8, player=panopto, timeout_max=1) is complete
DEBUG: no credentials provided, delegating credentials resolution to boto3
DEBUG: Path s3://etrilling-cscie29/recorded_lectures/Lecture 3-4 Salt, Water Balance and Nitrogen Excretion - perspective0.mp4 does not exist
DEBUG: Checking if UploadLecture(base_file_name=2-5 The Immune System - perspective0, url=https://d2y36twrtb17ty.cloudfront.net/sessions/54f432e8-8b75-44bf-b484-ab8d0129c7d5/9cca05c1-2ab9-4a6f-b64f-ab8d0129c7de-79bac0d3-5692-47f6-bc05-ab8d015a3b16.hls/757606/index.m3u8, player=panopto, timeout_max=1) is complete
DEBUG: no credentials provided, delegating credentials resolution to boto3
DEBUG: Path s3://etrilling-cscie29/recorded_lec

DEBUG: no credentials provided, delegating credentials resolution to boto3
DEBUG: Path s3://etrilling-cscie29/recorded_lectures/BIOS E-1B on 2_10_2020 (Mon) - perspective1.mp4 does not exist
DEBUG: Checking if UploadLecture(base_file_name=BIOS E-1B on 2/5/2020 (Wed) - perspective0, url=https://d2y36twrtb17ty.cloudfront.net/sessions/fb6f92c0-10a1-4032-9340-ab3e0148c979/f8a33b58-eb94-4b21-ac3b-ab3e0148c97e-987d41ea-a90d-4a58-b38b-ab59002f3a23.hls/1500000/index.m3u8, player=panopto, timeout_max=1) is complete
DEBUG: no credentials provided, delegating credentials resolution to boto3
DEBUG: Path s3://etrilling-cscie29/recorded_lectures/BIOS E-1B on 2_5_2020 (Wed) - perspective0.mp4 does not exist
DEBUG: Checking if UploadLecture(base_file_name=BIOS E-1B on 2/5/2020 (Wed) - perspective1, url=https://d2y36twrtb17ty.cloudfront.net/sessions/fb6f92c0-10a1-4032-9340-ab3e0148c979/03bc5b3b-27fa-4f35-a03b-ab5900099d1d.object.hls/1500000/index.m3u8, player=panopto, timeout_max=1) is complete
DEBUG: 

INFO: Informed scheduler that task   UploadLecture_BIOS_E_1B_on_2_1_panopto_1_480b619ae7   has status   PENDING
INFO: Informed scheduler that task   DownloadLecture_BIOS_E_1B_on_2_1_panopto_1_480b619ae7   has status   DONE
DEBUG: Checking if DownloadLecture(base_file_name=BIOS E-1B on 3/2/2020 (Mon) - perspective1, url=https://d2y36twrtb17ty.cloudfront.net/sessions/8a09e9dd-bcd6-402c-936f-ab3e0148ca64/9f2106a8-1749-46e6-ae44-ab730009a2d5.object.hls/1500000/index.m3u8, player=panopto, timeout_max=1) is complete
INFO: Informed scheduler that task   UploadLecture_BIOS_E_1B_on_3_2_panopto_1_f3c192838a   has status   PENDING
INFO: Informed scheduler that task   DownloadLecture_BIOS_E_1B_on_3_2_panopto_1_f3c192838a   has status   DONE
DEBUG: Checking if DownloadLecture(base_file_name=BIOS E-1B on 3/2/2020 (Mon) - perspective0, url=https://d2y36twrtb17ty.cloudfront.net/sessions/8a09e9dd-bcd6-402c-936f-ab3e0148ca64/58bdeb37-0c2e-4d52-a5d8-ab3e0148ca69-ccd92781-1ce0-44bf-a8d6-ab7f011b2cdf.hls/1

INFO: Informed scheduler that task   DownloadLecture_Study_tip__Flow__panopto_1_01ae7bea9f   has status   DONE
DEBUG: Checking if DownloadLecture(base_file_name=Lecture 3-2 Animal Reproduction - perspective0, url=https://d2y36twrtb17ty.cloudfront.net/sessions/4bf6c3b9-e21d-4d4f-b2a4-ab94013d5146/6035ffce-03d9-4d16-b67b-ab94013d514a-7f84acfd-2ebd-4e8a-a441-ab940165da9d.hls/747521/index.m3u8, player=panopto, timeout_max=1) is complete
INFO: Informed scheduler that task   UploadLecture_Lecture_3_2_Anim_panopto_1_568e3993eb   has status   PENDING
INFO: Informed scheduler that task   DownloadLecture_Lecture_3_2_Anim_panopto_1_568e3993eb   has status   DONE
DEBUG: Checking if DownloadLecture(base_file_name=3-1 Animal Hormones Part I - perspective0, url=https://d2y36twrtb17ty.cloudfront.net/sessions/b4d172f0-88f2-4b98-bae8-ab9a00ee0702/94ea1878-b0c7-4690-b9b2-ab9a00ee070a-3751cb04-666d-4e91-94f5-ab9a01000102.hls/757633/index.m3u8, player=panopto, timeout_max=1) is complete
INFO: Informed sche

DEBUG: 1 running tasks, waiting for next task to finish
INFO: Informed scheduler that task   UploadLecture_BIOS_E_1B_on_1_2_panopto_1_1a5d45bed1   has status   DONE
DEBUG: Asking scheduler for work...
DEBUG: Pending tasks: 35
INFO: [pid 35360] Worker Worker(salt=977192388, workers=1, host=ET-RB-2019, username=Elliot Trilling, pid=35360) running   UploadLecture(base_file_name=BIOS E-1B on 2/3/2020 (Mon) - perspective0, url=https://d2y36twrtb17ty.cloudfront.net/sessions/3b111f79-1be4-48c8-af47-ab3e0148c958/1faa1e41-69c9-48dc-9de6-ab3e0148c961-7f1bb226-114d-4c0a-a648-ab57002ec378.hls/1500000/index.m3u8, player=panopto, timeout_max=1)
DEBUG: no credentials provided, delegating credentials resolution to boto3
INFO: [pid 35360] Worker Worker(salt=977192388, workers=1, host=ET-RB-2019, username=Elliot Trilling, pid=35360) done      UploadLecture(base_file_name=BIOS E-1B on 2/3/2020 (Mon) - perspective0, url=https://d2y36twrtb17ty.cloudfront.net/sessions/3b111f79-1be4-48c8-af47-ab3e0148c958/1f

DEBUG: no credentials provided, delegating credentials resolution to boto3
INFO: [pid 35360] Worker Worker(salt=977192388, workers=1, host=ET-RB-2019, username=Elliot Trilling, pid=35360) done      UploadLecture(base_file_name=BIOS E-1B on 3/4/2020 (Wed) - perspective1, url=https://d2y36twrtb17ty.cloudfront.net/sessions/d39d7e84-187d-43f6-a497-ab3e0148ca81/42669805-46e8-440e-a555-ab7500099e0d.object.hls/1500000/index.m3u8, player=panopto, timeout_max=1)
DEBUG: 1 running tasks, waiting for next task to finish
INFO: Informed scheduler that task   UploadLecture_BIOS_E_1B_on_3_4_panopto_1_6ef8f6edab   has status   DONE
DEBUG: Asking scheduler for work...
DEBUG: Pending tasks: 27
INFO: [pid 35360] Worker Worker(salt=977192388, workers=1, host=ET-RB-2019, username=Elliot Trilling, pid=35360) running   UploadLecture(base_file_name=BIOS E-1B on 3/4/2020 (Wed) - perspective0, url=https://d2y36twrtb17ty.cloudfront.net/sessions/d39d7e84-187d-43f6-a497-ab3e0148ca81/e9f50b48-f1bf-48eb-8a8b-ab3e0148

DEBUG: 1 running tasks, waiting for next task to finish
INFO: Informed scheduler that task   UploadLecture_Lecture_3_2_Anim_panopto_1_568e3993eb   has status   DONE
DEBUG: Asking scheduler for work...
DEBUG: Pending tasks: 20
INFO: [pid 35360] Worker Worker(salt=977192388, workers=1, host=ET-RB-2019, username=Elliot Trilling, pid=35360) running   UploadLecture(base_file_name=3-1 Animal Hormones Part II - perspective0, url=https://d2y36twrtb17ty.cloudfront.net/sessions/bcb7bc7e-8c7c-416a-a5fd-ab9a0101c3f2/7c1267a5-5e87-4f70-bbf2-ab9a0101c3fb-bb96bd31-e2f6-4146-9dac-aba300e194be.hls/757766/index.m3u8, player=panopto, timeout_max=1)
DEBUG: no credentials provided, delegating credentials resolution to boto3
INFO: [pid 35360] Worker Worker(salt=977192388, workers=1, host=ET-RB-2019, username=Elliot Trilling, pid=35360) done      UploadLecture(base_file_name=3-1 Animal Hormones Part II - perspective0, url=https://d2y36twrtb17ty.cloudfront.net/sessions/bcb7bc7e-8c7c-416a-a5fd-ab9a0101c3f2/7c1

DEBUG: no credentials provided, delegating credentials resolution to boto3
INFO: [pid 35360] Worker Worker(salt=977192388, workers=1, host=ET-RB-2019, username=Elliot Trilling, pid=35360) done      UploadLecture(base_file_name=BIOS E-1B on 2/3/2020 (Mon) - perspective1, url=https://d2y36twrtb17ty.cloudfront.net/sessions/3b111f79-1be4-48c8-af47-ab3e0148c958/7678567f-1345-4e71-bb0a-ab5700099d93.object.hls/1500000/index.m3u8, player=panopto, timeout_max=1)
DEBUG: 1 running tasks, waiting for next task to finish
INFO: Informed scheduler that task   UploadLecture_BIOS_E_1B_on_2_3_panopto_1_b91c4e6ae0   has status   DONE
DEBUG: Asking scheduler for work...
DEBUG: Pending tasks: 12
INFO: [pid 35360] Worker Worker(salt=977192388, workers=1, host=ET-RB-2019, username=Elliot Trilling, pid=35360) running   UploadLecture(base_file_name=BIOS E-1B on 2/5/2020 (Wed) - perspective1, url=https://d2y36twrtb17ty.cloudfront.net/sessions/fb6f92c0-10a1-4032-9340-ab3e0148c979/03bc5b3b-27fa-4f35-a03b-ab590009

DEBUG: 1 running tasks, waiting for next task to finish
INFO: Informed scheduler that task   UploadLecture_2_4_Sensory_Syst_panopto_1_5745af3736   has status   DONE
DEBUG: Asking scheduler for work...
DEBUG: Pending tasks: 5
INFO: [pid 35360] Worker Worker(salt=977192388, workers=1, host=ET-RB-2019, username=Elliot Trilling, pid=35360) running   UploadLecture(base_file_name=2-5 The Immune System - perspective0, url=https://d2y36twrtb17ty.cloudfront.net/sessions/54f432e8-8b75-44bf-b484-ab8d0129c7d5/9cca05c1-2ab9-4a6f-b64f-ab8d0129c7de-79bac0d3-5692-47f6-bc05-ab8d015a3b16.hls/757606/index.m3u8, player=panopto, timeout_max=1)
DEBUG: no credentials provided, delegating credentials resolution to boto3
INFO: [pid 35360] Worker Worker(salt=977192388, workers=1, host=ET-RB-2019, username=Elliot Trilling, pid=35360) done      UploadLecture(base_file_name=2-5 The Immune System - perspective0, url=https://d2y36twrtb17ty.cloudfront.net/sessions/54f432e8-8b75-44bf-b484-ab8d0129c7d5/9cca05c1-2ab9-4a

True