In [None]:
# STEP 2: Lesson retriever

Scrapes the content from each lesson page and creates subdirectories as necessary.

In [1]:
import json
import os
import re

from bs4 import BeautifulSoup
from html2text import html2text
import requests

from ricecooker.utils.caching import CacheForeverHeuristic, FileCache, CacheControlAdapter

from mitblossoms_chef import scraping_step, MitBlossomsVideoLessonResource, _build_json_tree

# Directory that holds the data files read by this notebook
# (web_resource_tree.json is loaded from here).
DATA_DIR = 'chefdata'

In [2]:
# Load the web resource tree stored as JSON inside DATA_DIR.
# The encoding is given explicitly so that reading does not depend on the
# platform's locale default (JSON text is UTF-8 by specification).
with open(os.path.join(DATA_DIR, 'web_resource_tree.json'), encoding='utf-8') as json_file:
    web_resource_tree = json.load(json_file)

# Pick a single lesson node to experiment with in the cells below:
# root -> first child -> its 4th child -> that node's 11th child.
# NOTE(review): these indices are hard-coded; if the saved tree changes they
# will select a different node or raise IndexError.
vid_rcrs_data = web_resource_tree['children'][0]['children'][3]['children'][10]
vid_rcrs = MitBlossomsVideoLessonResource(vid_rcrs_data)

### Testing MitBlossomsVideoLessonResource object attributes

In [3]:
# Call get_additional_resources_zip() on the sample lesson; per the recorded
# output below it returns the path of a temporary .zip file.
vid_rcrs.get_additional_resources_zip()


'/var/folders/k3/r74jr38d56v717n39fd073f80000gn/T/tmpitthj8qz.zip'

In [4]:
# List the sample lesson's transcripts; per the recorded output below each
# entry is a dict with 'file_name', 'file_url' and 'title' keys.
vid_rcrs.get_transcripts()

[{'file_name': 'averages-transcript.pdf',
  'file_url': 'https://blossoms.mit.edu/sites/default/files/video/transcript/averages-transcript.pdf',
  'title': 'Written Transcript of this video lesson in English'}]

In [5]:
# vid_rcrs.get_teacher_resources()
# List the sample lesson's videos; per the recorded output below the result is
# a list of (language variant, URL) tuples with protocol-relative ('//...') URLs.
vid_rcrs.get_video_urls()

[('English',
  '//d1baxxa0joomi3.cloudfront.net/d0ef3356da0c8f47afdff5bbe4694ebf/basic.mp4'),
 ('English-Arabic Subtitles',
  '//d1baxxa0joomi3.cloudfront.net/e76d7a9d506205b7ce54b36d40580de6/basic.mp4'),
 ('Arabic Voice-over',
  '//d1baxxa0joomi3.cloudfront.net/69ff39b896b52fc2acaa4a3b338a5f44/basic.mp4'),
 ('Urdu Voice-over',
  '//d1baxxa0joomi3.cloudfront.net/c4d13d85e317c5f6c7e590883b5cd9e7/basic.mp4'),
 ('English-Portuguese Subtitles',
  '//d1baxxa0joomi3.cloudfront.net/d9bf794c1a3da508737aa134916d15bc/basic.mp4'),
 ('Mandarin Voice-over',
  '//d1baxxa0joomi3.cloudfront.net/538b06d6924db6fe854328c49b1d3087/basic.mp4')]

In [6]:
# Alternative video listing; per the recorded output below it yields the same
# language variants paired with https://blossoms.mit.edu/download/<id> links.
vid_rcrs.get_video_urls_alt()

[('English', 'https://blossoms.mit.edu/download/6904'),
 ('English-Arabic Subtitles', 'https://blossoms.mit.edu/download/6893'),
 ('Arabic Voice-over', 'https://blossoms.mit.edu/download/6896'),
 ('Urdu Voice-over', 'https://blossoms.mit.edu/download/6898'),
 ('English-Portuguese Subtitles', 'https://blossoms.mit.edu/download/6901'),
 ('Mandarin Voice-over', 'https://blossoms.mit.edu/download/7980')]

In [7]:
# Look up the video URL for a single language.
# NOTE(review): the recorded output below is the URL that get_video_urls()
# lists for 'English-Portuguese Subtitles', not the one listed for plain
# 'English'. This suggests the lookup matches more than the exact variant
# name -- confirm against the implementation of get_video_url_for_lang.
vid_rcrs.get_video_url_for_lang('English')

'//d1baxxa0joomi3.cloudfront.net/d9bf794c1a3da508737aa134916d15bc/basic.mp4'

In [3]:
# Explore all possible `lang_variant`s
# Module-level accumulators that process_lesson_resource() fills in while the
# resource tree is walked. Re-run this cell before walking the tree again so
# that results from a previous walk are not mixed in.
# Every language-variant label seen in any lesson's video list:
all_lang_variants = set()
# One dict per lesson whose two video listings disagree on language variants:
all_video_link_differences = []

def process_lesson_resource(lesson_node, current_lang):
    """
    Helper function for debugging and exploring the content of one lesson.

    Wraps `lesson_node` in a MitBlossomsVideoLessonResource and then:
      1. adds every language-variant label of the lesson to the module-level
         set `all_lang_variants`;
      2. appends a dict to the module-level list `all_video_link_differences`
         when the variant labels returned by `get_video_urls()` and
         `get_video_urls_alt()` are not the same;
      3. prints a message when there is no video for `current_lang`, and when
         the lesson has no transcripts;
      4. calls `get_additional_resources_zip()`.

    Args:
        lesson_node: lesson node taken from the web resource tree; it is
            passed unchanged to MitBlossomsVideoLessonResource.
        current_lang: language name passed to `get_video_url_for_lang`.

    Returns:
        None. Results are reported through prints and the two module-level
        accumulators.
    """
    video = MitBlossomsVideoLessonResource(lesson_node)

    # Query the video listing once and reuse it for both checks below.
    lang_variant_url_tuples = video.get_video_urls()

    # CONTENT INFO 1: compile a list of all language variants
    iframe_variants = {lang_variant for lang_variant, _url in lang_variant_url_tuples}
    all_lang_variants.update(iframe_variants)

    # CONTENT INFO 2: record lessons whose two video listings do not offer
    # the same set of language variants.
    download_variants = {lang_variant for lang_variant, _url in video.get_video_urls_alt()}
    if iframe_variants != download_variants:
        all_video_link_differences.append({
            'iframe_only': iframe_variants - download_variants,
            'download_only': download_variants - iframe_variants,
            'video_url': video.url,
        })

    # Report lessons that have no video in the language currently being walked.
    video_url = video.get_video_url_for_lang(current_lang)
    if not video_url:
        print("NO video for", current_lang, "in", video.url)

    # Report lessons without any transcript (an empty or missing result).
    transcripts = video.get_transcripts()
    if not transcripts:
        print("No transcript for", video.url)

    # Exercise the additional-resources zip step too; its return value is not
    # used here.
    video.get_additional_resources_zip()


def walk_resource_tree(web_resource_tree):
    """
    Traverse `web_resource_tree` and print out useful info (used for debugging).

    The tree is read as: root -> language nodes -> topic nodes -> children
    that are either lesson nodes or topic clusters holding lesson nodes.
    Every lesson node is handed to process_lesson_resource() together with the
    language of the branch it was found in, and its title is printed afterwards.

    Args:
        web_resource_tree: dict with a 'children' list of language nodes.

    Raises:
        ValueError: if a child of a topic node has a '__class__' other than
            'MitBlossomsTopicCluster' or 'MitBlossomsVideoLessonResource'.
    """
    for lang_node in web_resource_tree['children']:
        current_lang = lang_node['lang']
        print("Processing language", current_lang)

        for topic_node in lang_node['children']:
            print(" "*4, "Processing Topic", topic_node['title'])

            for node in topic_node['children']:
                node_class = node['__class__']

                if node_class == 'MitBlossomsTopicCluster':
                    # A cluster groups several lessons one level deeper.
                    print(" "*8, 'Processing TopicCluster:', node['title'])
                    for lesson_node in node['children']:
                        process_lesson_resource(lesson_node, current_lang)
                        print(" "*12, lesson_node['title'])

                elif node_class == 'MitBlossomsVideoLessonResource':
                    # A lesson attached directly to the topic.
                    process_lesson_resource(node, current_lang)
                    print(" "*8, node['title'])

                else:
                    # Name the offending class so the bad node can be located.
                    raise ValueError('Unknown node type encountered: %r' % (node_class,))
            print()
        print()

# Walk the tree loaded from web_resource_tree.json; this prints the progress
# log below and fills all_lang_variants / all_video_link_differences.
walk_resource_tree(web_resource_tree)


Processing language Arabic
     Processing Topic Biology
         Processing TopicCluster: DNA
             Classifying Animals by Appearance Versus DNA Sequence
         Processing TopicCluster: Evolution
             Classifying Animals by Appearance Versus DNA Sequence
         Processing TopicCluster: Health
             Discovering Medicines, Using Robots and Computers
             The Disease of Our Time: Diabetes
         Processing TopicCluster: Environmental Sustainability
             How Does Biotechnology Help Clean Up The Environment?
             Plants and Environmental Resources
         Processing TopicCluster: Botany
             Plants and Environmental Resources
             Roots, Shoots, and Wood
             Yeasts, Molds and Mushrooms
         Methods for Protein Purification
         The Construction of Proteins
         The Olive Propagation Problem
         The Respiratory System of Birds
         Wind and Sand

     Processing Topic Chemistry
         Proces

         How Hot Is Hot? Heat versus Temperature
         How Much Small Is Small?
No transcript for https://blossoms.mit.edu/videos/lessons/there_connection_between_water_desalination_and_making_pickles
         Is There a Connection between Water Desalination and Making Pickles?
         Plastics and Covalent Chemical Bonds
         Recognizing Chemical Reactions
         Save Our Kingdom (in relation to the chemistry topic - Conservation of Mass)
         Why Are the Sky Blue and the Leaves Green?
         Why Neutralize? Impact on Health and the Environment

     Processing Topic Engineering
         Processing TopicCluster: Computer Programming
             Building Cryptosystems
             From Psychology to Logic: Learning Computer Programming in the Kitchen
             Is There A Connection Between Computer Network Topologies And A Malaysian Wedding?
             Out for Shopping: Understanding Linear Data Structures
             The Magic Picture: Steganography in Bitmap Fi

In [None]:
# Run scraping_step() (imported from mitblossoms_chef); the return value is
# assigned to `_` and not used, so the cell displays no result.
# NOTE(review): in IPython, `_` normally holds the last cell output; assigning
# to it here shadows that -- confirm this is intended.
_ = scraping_step()