In [1]:
import requests 
from bs4 import BeautifulSoup 
import json
import urllib.request
import csv
import shutil
from google.colab import files
from tqdm import tqdm
from functools import partial
import os

In [2]:

# Progress-bar factory: tqdm pre-bound with position=0 and leave=True so every
# loop in this notebook builds its bar with the same settings.
bar = partial(tqdm, position=0, leave=True)

# Global helper for dumping scraped structures in a readable form.
def print_json(js):
    """Pretty-print ``js`` to stdout as JSON indented by two spaces."""
    rendered = json.dumps(js, indent=2)
    print(rendered)

def _link_display_name(site_root, link_text, url_link):
    """Pick the name a scraped link is stored under.

    Links whose visible text is generic ("(solutions)", or "Solutions" /
    "Questions" pointing into an exam folder) are renamed to the linked
    file's name without its extension; every other link keeps its text.
    """
    import posixpath

    file_stem = posixpath.splitext(posixpath.basename(url_link))[0]
    if link_text == "(solutions)":
        return file_stem
    if link_text in ("Solutions", "Questions"):
        exam_prefixes = tuple(
            f"{site_root}/assets/exams/{exam}/" for exam in ("mt1", "mt2", "final")
        )
        if url_link.startswith(exam_prefixes):
            return file_stem
    return link_text


def get_links(base_url, endpoint_to_scrape, keyword_in_href):
    """Scrape one page of the site and collect links whose href contains a keyword.

    Args:
        base_url: Site root, e.g. "https://cs161.org" (a trailing "/" is tolerated).
        endpoint_to_scrape: Path appended to ``base_url``; "" scrapes the root page.
            For "/resources.html" the links are searched inside ``<ul>`` elements,
            otherwise inside ``<td>`` elements.
        keyword_in_href: Substring that a link's ``href`` must contain.

    Returns:
        A list of single-entry dicts ``{display_name: absolute_url}`` in page order.
        The list is also pretty-printed to stdout.

    Raises:
        requests.HTTPError: If the page request returns an error status.
    """
    from urllib.parse import urljoin

    print("\nGetting Resource Links ...\n")
    site_root = base_url.rstrip("/")
    # An empty endpoint simply yields the root URL, so no special case is needed.
    r = requests.get(f'{site_root}{endpoint_to_scrape}', timeout=30)
    # Fail loudly instead of silently scraping an error page.
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html5lib') # If this line causes an error, run 'pip install html5lib' or install html5lib

    container_tag = 'ul' if endpoint_to_scrape == "/resources.html" else 'td'
    containers = soup.find_all(container_tag)

    # Quote the attribute value so keywords containing characters that are not
    # valid in a bare CSS identifier still form a valid selector.
    quoted_keyword = keyword_in_href.replace('\\', '\\\\').replace('"', '\\"')
    selector = f'a[href*="{quoted_keyword}"]'

    linkList = []
    for container in bar(containers):
        for link in container.select(selector):
            # urljoin resolves site-relative hrefs against the root and leaves
            # absolute (or protocol-relative) hrefs pointing at their own host.
            url_link = urljoin(f'{site_root}/', link['href'])
            link_name = _link_display_name(site_root, link.text, url_link)
            linkList.append({link_name: url_link})
    print_json(linkList)
    return linkList

def download_file_to_colab_dir(download_url, filename, timeout=60):
    """Download ``download_url`` and store it as ``<filename>.pdf``.

    Args:
        download_url: URL to fetch.
        filename: Destination path WITHOUT extension; ".pdf" is appended.
        timeout: Seconds to wait on blocking network operations before giving up.

    Both the response and the output file are closed even if the transfer
    fails part-way, and the payload is streamed in chunks rather than being
    read into memory in one piece.
    """
    with urllib.request.urlopen(download_url, timeout=timeout) as response, \
            open(filename + ".pdf", 'wb') as out_file:
        shutil.copyfileobj(response, out_file)

def save_files_in_dir(csv_file_name, saved_files_dir, linkList):
    """Download every PDF link and append all links to a CSV file.

    Args:
        csv_file_name: CSV file to append ``[name, url]`` rows to.
        saved_files_dir: Mapping with the keys 'exams', 'lectures', 'notes',
            'sections' and 'base_dir', each giving a directory prefix (ending
            in a path separator) that downloaded files are written under.
        linkList: List of ``{name: url}`` dicts, as produced by ``get_links``.

    A link is downloaded only when its URL *path* ends in ".pdf". The target
    directory is chosen from the directory part of that path, so neither the
    host name nor the file name can accidentally select a category. Every
    link, PDF or not, gets a CSV row with its original name and URL.
    """
    import posixpath
    from urllib.parse import urlparse

    print("\nSaving files to Colab dir ...\n")
    # Checked in this order; the first category found in the URL's folder wins.
    categories = ('exams', 'lectures', 'notes', 'sections')
    with open(csv_file_name, 'a', newline='') as csvfile:
        link_writer = csv.writer(csvfile, delimiter=',')
        for item in bar(linkList):
            for link_name, url in item.items():
                url_path = urlparse(url).path
                if url_path.lower().endswith(".pdf"):
                    url_folder = posixpath.dirname(url_path)
                    category = next(
                        (name for name in categories if name in url_folder),
                        'base_dir',
                    )
                    # Link text may contain "/", which would otherwise be read
                    # as a (non-existent) sub-directory when opening the file.
                    safe_name = link_name.replace("/", "_")
                    download_file_to_colab_dir(url, f'{saved_files_dir[category]}{safe_name}')
                link_writer.writerow([link_name, url])

def archive_dir(archive_name, dir_name, archive_type):
    """Archive the contents of ``dir_name`` and return the archive's path.

    Args:
        archive_name: Archive path WITHOUT the format extension. A trailing
            path separator is tolerated.
        dir_name: Directory whose contents are archived.
        archive_type: Any format name accepted by ``shutil.make_archive``
            (e.g. "zip", "gztar").

    Returns:
        The path of the archive file that was written.
    """
    print(f"\nArchiving files to {archive_type} ...\n")
    # Normalise first: with a trailing separator the extension would otherwise
    # be appended after the slash, producing a hidden "<dir>/.zip" inside the
    # very directory that is being archived.
    archive_base = os.path.abspath(archive_name)
    return shutil.make_archive(archive_base, archive_type, dir_name)

def download_file_from_colab_dir(dir_name):
    """Pass ``dir_name`` to ``google.colab.files.download``.

    NOTE(review): despite the parameter name, the caller in this notebook
    passes the path of an archive file rather than a directory.
    """
    files.download(dir_name)

In [3]:
def complete_search_save_process(base_url, colab_dir, archive_type):
    """Scrape the course site, save its files and links, then archive and download them.

    Args:
        base_url: Root URL of the site to scrape.
        colab_dir: Working directory; a trailing separator is optional.
            "Files/" (with Sections/Lectures/Exams/Notes sub-folders) and
            "Links/" are created beneath it.
        archive_type: Archive format handed to ``archive_dir`` (e.g. "zip").

    The function can be re-run safely: directories that already exist are
    reused instead of raising ``FileExistsError``.
    """
    # Guarantee a trailing separator so the f-string paths below are well formed.
    colab_dir = os.path.join(colab_dir, '')
    colab_dir_files = f'{colab_dir}Files/'
    colab_dir_links = f'{colab_dir}Links/'
    csv_file_name = f'{colab_dir_links}CS 161 Links.csv'
    colab_dir_files_subdirs_dict = {
        'sections': f'{colab_dir_files}Sections/',
        'exams': f'{colab_dir_files}Exams/',
        'lectures': f'{colab_dir_files}Lectures/',
        'notes': f'{colab_dir_files}Notes/',
        'base_dir': colab_dir_files,
    }

    # makedirs also creates colab_dir and Files/ on the way down.
    for directory in (*colab_dir_files_subdirs_dict.values(), colab_dir_links):
        os.makedirs(directory, exist_ok=True)

    # (endpoint, href keyword) pairs, scraped and saved in this order.
    scrape_targets = (
        ("", "asset"),
        ("", "proj"),
        ("/resources.html", "asset"),
    )
    for endpoint, keyword in scrape_targets:
        links = get_links(base_url, endpoint, keyword)
        save_files_in_dir(csv_file_name, colab_dir_files_subdirs_dict, links)

    # The archive sits next to the working directory, so its base name must not
    # carry the trailing separator.
    archive_base = os.path.normpath(colab_dir)
    archive_path = archive_dir(archive_base, colab_dir, archive_type)
    if not archive_path:
        # archive_dir is not guaranteed to report where the archive was
        # written; fall back to the conventional "<base>.<type>" name.
        archive_path = f'{archive_base}.{archive_type}'
    download_file_from_colab_dir(archive_path)

In [4]:
# Run the whole pipeline: scrape cs161.org, store files and links under
# /content/CS 161/, then zip that directory and trigger the download.
complete_search_save_process("https://cs161.org", "/content/CS 161/", "zip")

100%|██████████| 147/147 [00:00<00:00, 13317.55it/s]
  0%|          | 0/82 [00:00<?, ?it/s]


Getting Resource Links ...

[
  {
    "Introduction": "https://cs161.org/assets/lectures/lec01.pdf"
  },
  {
    "Memory Safety notes, section 1": "https://cs161.org/assets/notes/memory-safety.pdf"
  },
  {
    "C, x86, and GDB cheatsheet": "https://cs161.org/assets/projects/1/cheatsheet.pdf"
  },
  {
    "Security Principles": "https://cs161.org/assets/lectures/lec02.pdf"
  },
  {
    "Security Principles notes": "https://cs161.org/assets/notes/security-principles.pdf"
  },
  {
    "Project 1 released": "https://cs161.org/assets/projects/1/project1-spec.pdf"
  },
  {
    "61C Review, Security Principles": "https://cs161.org/assets/sections/01-questions.pdf"
  },
  {
    "01-solutions": "https://cs161.org/assets/sections/01-solutions.pdf"
  },
  {
    "Buffer Overflows": "https://cs161.org/assets/lectures/lec03.pdf"
  },
  {
    "Memory Safety notes, section 2": "https://cs161.org/assets/notes/memory-safety.pdf"
  },
  {
    "Buffer Overflow Defenses": "https://cs161.org/assets/lectur

100%|██████████| 82/82 [00:25<00:00,  3.25it/s]
100%|██████████| 147/147 [00:00<00:00, 12207.95it/s]


Getting Resource Links ...




 12%|█▎        | 1/8 [00:00<00:01,  6.29it/s]

[
  {
    "C, x86, and GDB cheatsheet": "https://cs161.org/assets/projects/1/cheatsheet.pdf"
  },
  {
    "Project 1 released": "https://cs161.org/assets/projects/1/project1-spec.pdf"
  },
  {
    "Project 1 due (11:59pm PT)": "https://cs161.org/assets/projects/1/project1-spec.pdf"
  },
  {
    "Project 2 released": "https://cs161.org/proj2"
  },
  {
    "Project 2 design doc draft due (11:59pm PT)": "https://cs161.org/proj2"
  },
  {
    "Project 3 released": "https://cs161.org/proj3"
  },
  {
    "Project 2 due (11:59pm PT)": "https://cs161.org/proj2"
  },
  {
    "Project 3 due (11:59pm PT)": "https://cs161.org/proj3"
  }
]

Saving files to Colab dir ...



100%|██████████| 8/8 [00:00<00:00, 10.78it/s]
100%|██████████| 5/5 [00:00<00:00, 1775.29it/s]
  0%|          | 0/64 [00:00<?, ?it/s]


Getting Resource Links ...

[
  {
    "Security principles": "https://cs161.org/assets/notes/security-principles.pdf"
  },
  {
    "Memory Safety": "https://cs161.org/assets/notes/memory-safety.pdf"
  },
  {
    "Cryptography": "https://cs161.org/assets/notes/cryptography.pdf"
  },
  {
    "Web": "https://cs161.org/assets/notes/web.pdf"
  },
  {
    "Networking": "https://cs161.org/assets/notes/networking.pdf"
  },
  {
    "sp21mt": "https://cs161.org/assets/exams/mt1/sp21mt.pdf"
  },
  {
    "sp21mtsolutions": "https://cs161.org/assets/exams/mt1/sp21mtsolutions.pdf"
  },
  {
    "fa20mt": "https://cs161.org/assets/exams/mt1/fa20mt.pdf"
  },
  {
    "fa20mtsolutions": "https://cs161.org/assets/exams/mt1/fa20mtsolutions.pdf"
  },
  {
    "su20mt": "https://cs161.org/assets/exams/mt1/su20mt.pdf"
  },
  {
    "su20mtsolutions": "https://cs161.org/assets/exams/mt1/su20mtsolutions.pdf"
  },
  {
    "sp20mt1": "https://cs161.org/assets/exams/mt1/sp20mt1.pdf"
  },
  {
    "sp20mt1solutions":

100%|██████████| 64/64 [00:10<00:00,  5.96it/s]



Archiving files to zip ...



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

0it [00:00, ?it/s]
