In [1]:
from scrape import *

DATA_PATH = os.path.abspath('./../data/data.plk')

matterhorn_URL = "https://canvas.harvard.edu/courses/69559/external_tools/22940"
panopto_URL = "https://canvas.harvard.edu/courses/69902/external_tools/61579"

In [2]:
from luigi import ExternalTask, Parameter, Task
from luigi.local_target import LocalTarget
import luigi

import pickle


class DownloadMetadata(Task):
    master_URL = Parameter()
    
#     def requires():
#         pass

    def output(self):
        return LocalTarget(DATA_PATH, format=luigi.format.Nop)
    
    def run(self):
        # do setup
        driver = setup_and_login()

        # get sources
        player_psource, player_type = get_player_psource(driver, self.master_URL)

        # get video dict
        video_dict = extract_video_links(player_psource, player=player_type)

        open_video_links(driver, video_dict, player=player_type, URL=self.master_URL) # pass in original url

        all_video_urls = get_video_urls(LOG_PATH)

        title_to_urls = get_title_to_urls(video_dict, all_video_urls, player=player_type)

        if player_type == 'matterhorn':
            title_to_urls = get_title_to_urls2(title_to_urls)
        
        data = {'title_to_urls': title_to_urls, 'player_type': player_type}
        
        with self.output().open('w') as f:
            pickle.dump(data, f)

In [3]:
from luigi import build

build([DownloadMetadata(master_URL=matterhorn_URL)], local_scheduler=True)

DEBUG: Checking if DownloadMetadata(master_URL=https://canvas.harvard.edu/courses/69559/external_tools/22940) is complete
INFO: Informed scheduler that task   DownloadMetadata_https___canvas_h_885a7e0fc5   has status   DONE
INFO: Done scheduling tasks
INFO: Running Worker with 1 processes
DEBUG: Asking scheduler for work...
DEBUG: Done
DEBUG: There are no more tasks to run at this time
INFO: Worker Worker(salt=749664962, workers=1, host=ET-RB-2019, username=Elliot Trilling, pid=15340) was stopped. Shutting down Keep-Alive thread
INFO: 
===== Luigi Execution Summary =====

Scheduled 1 tasks of which:
* 1 complete ones were encountered:
    - 1 DownloadMetadata(master_URL=https://canvas.harvard.edu/courses/69559/external_tools/22940)

Did not run any tasks
This progress looks :) because there were no failed tasks or missing dependencies

===== Luigi Execution Summary =====



True

In [4]:
with open(DATA_PATH, 'rb') as f:
    data = pickle.load(f)

title_to_urls = data['title_to_urls']
player_type = data['player_type']

# title_to_urls

In [5]:
def download_video_luigi(URL, player, open_fd, max_time=None):
    if max_time is None:
        max_time = 60*20 # setting a hard cap of 20min for a single download
    
    if player == 'matterhorn':
        assert(URL[-3:] == 'mp4') # make sure I didn't pass in bad URL
        
        stream = requests.get(URL, stream=True)
    
        start_time = time.time()
        for chunk in tqdm(stream.iter_content(chunk_size=1048576)):
            open_fd.write(chunk)

            time_delta = time.time() - start_time
            if time_delta > max_time:
                print('broke from loop after {} seconds'.format(time_delta))
                break
    
    if player == 'panopto':
        # note: I got some inspiration from this tiny project (https://github.com/onesafe/m3u8_to_mp4)
        
        assert(URL[-4:] == 'm3u8') # make sure I didn't pass in bad URL
        
        # get content of m3u8 file
        m3u8_content = requests.get(URL).content.decode()
        
        # build a list of ts files
        ts_list = []
        for line in m3u8_content.splitlines():
            if line.endswith('.ts'):
                ts_list.append(line)
        
        # download the video by looping over ts files
        start_time = time.time()

        for ts in tqdm(ts_list):
            ts_url = URL.replace('index.m3u8', ts)
            open_fd.write(requests.get(ts_url).content)

            # break if over max_time
            time_delta = time.time() - start_time
            if time_delta > max_time:
                print('broke from loop after {} seconds'.format(time_delta))
                break

In [6]:
class DownloadVideo(Task):
    download_video_params = Parameter()
    
#     def requires():
#         pass
    
    def output(self):
        path = os.path.join(VIDEO_PATH, self.download_video_params['video_name'])
        return LocalTarget(path, format=luigi.format.Nop)
    
    def run(self):
        URL = self.download_video_params['URL']
        player = self.download_video_params['player']
        max_time = self.download_video_params['max_time']
        
        with self.output().open('w') as open_fd:
            download_video_luigi(URL, player, open_fd, max_time)



params = {'URL': title_to_urls['Lecture 8'][0], 'player': player_type, 'video_name': 'test1.mp4', 'max_time': 5}
build([DownloadVideo(download_video_params=params)], local_scheduler=True)


# def foo(URL, player, video_name, max_time):
#     print("cat")
# foo(**params)

DEBUG: Not all parameter values are hashable so instance isn't coming from the cache
DEBUG: Checking if DownloadVideo(download_video_params={'URL': 'https://dvgni8clk4vbh.cloudfront.net/engage-player/fac8c27f-f09a-3526-3342-71c1e3000b6d/568df8a7-78e7-4214-b84d-2f4fc634fb5e/1584592093_segment_3_presenter_high_30fps.mp4', 'player': 'matterhorn', 'video_name': 'test1.mp4', 'max_time': 5}) is complete
INFO: Informed scheduler that task   DownloadVideo___URL____https___f1ad395a60   has status   DONE
INFO: Done scheduling tasks
INFO: Running Worker with 1 processes
DEBUG: Asking scheduler for work...
DEBUG: Done
DEBUG: There are no more tasks to run at this time
INFO: Worker Worker(salt=082861258, workers=1, host=ET-RB-2019, username=Elliot Trilling, pid=15340) was stopped. Shutting down Keep-Alive thread
INFO: 
===== Luigi Execution Summary =====

Scheduled 1 tasks of which:
* 1 complete ones were encountered:
    - 1 DownloadVideo(...)

Did not run any tasks
This progress looks :) because 

True

In [9]:
from luigi.task import WrapperTask

class DownloadAllVideos(WrapperTask):
    master_URL = Parameter()
    
    def requires(self):
        # fist we need to make sure we have the data
        data_task = DownloadMetadata(master_URL=self.master_URL)
        yield data_task
        
        # now we can download all the videos
        with data_task.output().open('r') as f:
            data = pickle.load(f)
        title_to_urls = data['title_to_urls']
        player_type = data['player_type']
        
        download_tasks = []
        for title, urls in title_to_urls.items():
            for url_num in range(len(urls)):
                full_title = title + ' - ' + str(url_num) + ".mp4"
                
                # TODO: update "max_time" at some point
                params = {'URL': urls[url_num], 'player': player_type, 'video_name': full_title, 'max_time': 5}
                task = DownloadVideo(download_video_params=params)
                download_tasks.append(task)

        yield download_tasks

build([DownloadAllVideos(master_URL=matterhorn_URL)], local_scheduler=True)

DEBUG: Checking if DownloadAllVideos(master_URL=https://canvas.harvard.edu/courses/69559/external_tools/22940) is complete
DEBUG: Not all parameter values are hashable so instance isn't coming from the cache
DEBUG: Not all parameter values are hashable so instance isn't coming from the cache
DEBUG: Not all parameter values are hashable so instance isn't coming from the cache
DEBUG: Not all parameter values are hashable so instance isn't coming from the cache
DEBUG: Not all parameter values are hashable so instance isn't coming from the cache
DEBUG: Not all parameter values are hashable so instance isn't coming from the cache
DEBUG: Not all parameter values are hashable so instance isn't coming from the cache
DEBUG: Not all parameter values are hashable so instance isn't coming from the cache
DEBUG: Not all parameter values are hashable so instance isn't coming from the cache
DEBUG: Not all parameter values are hashable so instance isn't coming from the cache
DEBUG: Not all parameter va

True