In [3]:
import json
import requests
import re
import os
import threading

from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed

from utils.cookies import load_cookie_dict, load_cookie_list
from utils.request_scraping import get_course_sections, get_course_tuples, scrape_modules_page, scrape_module_entry
from utils.CLI import print_progress_bar
from utils.API_scraping import get_course_files, sizeof_fmt
from utils.loading_driver import load_driver
from utils.selenium_scraping import scrape_lecture_links, scrape_transcripts

In [6]:

# Load session cookies.
# Both files must be JSON cookie exports taken directly from a logged-in browser session.
# I used the "Get cookies.txt LOCALLY" Chrome extension and saved the export as JSON,
# but I can't vouch for its security.

canvas_cookie_file = "canvas-cookies.json"
video_cookie_file = "video-cookies.json"

# Each file is loaded twice, once through each loader. Later cells pass the list form to
# load_driver and the dict form to the request-based helpers.
cookies, request_cookies = (
  load_cookie_list(canvas_cookie_file),
  load_cookie_dict(canvas_cookie_file),
)
video_cookies, request_video_cookies = (
  load_cookie_list(video_cookie_file),
  load_cookie_dict(video_cookie_file),
)

In [7]:

# Drop Canvas entries that are not actual courses: a real course name starts with
# a three-letter subject code, a space, and a three-digit number (e.g. "ECS 170").
course_tuples = get_course_tuples(request_cookies)
pattern = r'^[A-Z]{3} \d{3}'
filtered_tuples = []
for tup in course_tuples:
  if re.match(pattern, tup[1]):
    filtered_tuples.append(tup)
print(f"So far, I've taken: {len(filtered_tuples)} courses. Wow! (I think there's a few that have unpublished their canvas page)")

# NOTE(review): the course id is taken as the last six characters of the first tuple
# element; this presumably relies on every Canvas course id being six digits long.
initial_course_dict = []
for tup in filtered_tuples:
  initial_course_dict.append({"coursename": tup[1], "course-id": tup[0][-6:]})
print(json.dumps(initial_course_dict, indent=2))

So far, I've taken: 47 courses. Wow! (I think there's a few that have unpublished their canvas page)
[
  {
    "coursename": "CHE 002A (C) WQ 2023",
    "course-id": "741155"
  },
  {
    "coursename": "CMN 152V 001 WQ 2025",
    "course-id": "964273"
  },
  {
    "coursename": "ECS 154A 001 WQ 2025",
    "course-id": "966323"
  },
  {
    "coursename": "ECS 170 001 SQ 2025",
    "course-id": "984669"
  },
  {
    "coursename": "ECS 189G 001 SQ 2025",
    "course-id": "993297"
  },
  {
    "coursename": "NPB 163 001 SQ 2025",
    "course-id": "982204"
  },
  {
    "coursename": "NPB 173 001 SQ 2025",
    "course-id": "989072"
  },
  {
    "coursename": "PSC 120 001 SQ 2025",
    "course-id": "977100"
  },
  {
    "coursename": "BIS 002A WQ 2024",
    "course-id": "858748"
  },
  {
    "coursename": "CGS 001/PHI 010 SQ 2023",
    "course-id": "798431"
  },
  {
    "coursename": "CHI 065 A01-A02 SQ 2023",
    "course-id": "800304"
  },
  {
    "coursename": "EAE 001 001 FQ 2022",
    "co

In [8]:

# We need to find what the structure of each course is
# Do they have Files? Modules? Media Gallery?
def section_worker(course_dict, request_cookies):
  """Probe one course page and record which sections it exposes.

  Args:
    course_dict: dict with at least a "course-id" key; it is copied, not mutated.
    request_cookies: cookies forwarded to get_course_sections.

  Returns:
    A copy of course_dict with two extra keys:
      "homepage": first element returned by get_course_sections, or None on error.
      "sections": {"F": bool, "M": bool, "MG": bool} for Files / Modules /
                  Media Gallery, or an empty dict when the lookup failed.
  """
  result = course_dict.copy()
  course_id = result["course-id"]
  section_info = get_course_sections("https://canvas.ucdavis.edu/courses/" + course_id, request_cookies)

  if None not in section_info:
    # presumably section_info is (homepage label, collection of section names) -- TODO confirm
    available = section_info[1]
    labels = (("F", 'Files'), ("M", 'Modules'), ("MG", 'Media Gallery'))
    result["homepage"] = section_info[0]
    result["sections"] = {flag: label in available for flag, label in labels}
    return result

  # A None anywhere in the result marks a failed lookup; keep the course but flag it.
  result["homepage"] = None
  result["sections"] = {}
  print(f"Error with {course_id}: {section_info}")
  return result

# Probe every course concurrently; results are collected in completion order,
# so master_course_dict is not guaranteed to follow initial_course_dict's order.
master_course_dict = []
with ThreadPoolExecutor(max_workers=10) as executor:
  course_futures = []
  for course_dict in initial_course_dict:
    course_futures.append(executor.submit(section_worker, course_dict, request_cookies))

  finished = 0
  for course_future in as_completed(course_futures):
    master_course_dict.append(course_future.result())
    finished += 1
    print_progress_bar(finished, len(initial_course_dict))

print(json.dumps(master_course_dict, indent=2))


 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.0% 
[
  {
    "coursename": "BIS 002A WQ 2024",
    "course-id": "858748",
    "homepage": "Home",
    "sections": {
      "F": true,
      "M": false,
      "MG": true
    }
  },
  {
    "coursename": "CGS 001/PHI 010 SQ 2023",
    "course-id": "798431",
    "homepage": "Home",
    "sections": {
      "F": true,
      "M": true,
      "MG": false
    }
  },
  {
    "coursename": "NPB 173 001 SQ 2025",
    "course-id": "989072",
    "homepage": "Syllabus",
    "sections": {
      "F": true,
      "M": false,
      "MG": true
    }
  },
  {
    "coursename": "ECS 170 001 SQ 2025",
    "course-id": "984669",
    "homepage": "Home",
    "sections": {
      "F": true,
      "M": false,
      "MG": true
    }
  },
  {
    "coursename": "CHE 002A (C) WQ 2023",
    "course-id": "741155",
    "homepage": "Home",
    "sections": {
      "F": false,
      "M": true,
      "MG": false
    }

In [9]:
# Shared lock held around print calls made while worker threads are running,
# so that their console output is written one message at a time.
print_lock = threading.Lock()

# Next, we want to use the section info to scrape data.
# We will do 2 passes: the first for files and the second for transcripts.

# File scraping sweep
def course_processor_worker(course_dict, request_cookies):
  """Collect the downloadable files of one course.

  The Files section is preferred; the Modules page is scraped only when the
  course has no Files section. Courses with neither yield an empty list.

  Args:
    course_dict: dict with "course-id", "coursename" and "sections" keys, as
      produced by section_worker.
    request_cookies: cookies forwarded to the scraping helpers.

  Returns:
    (coursename, files). For the Modules path each entry is a
    (name, size, URL) tuple; the Files path returns whatever get_course_files
    produces (presumably the same shape -- TODO confirm).
  """
  baseURL = "https://canvas.ucdavis.edu/courses/" + course_dict["course-id"]
  files = []

  # section_worker stores an empty "sections" dict for courses it could not
  # probe, so the flags are read with .get(): a missing flag means "section
  # absent" rather than a KeyError that would abort the whole sweep.
  sections = course_dict["sections"]

  if sections.get("F"):  # the Files page exists
    # Searching for the empty string returns the whole file listing, so no
    # recursive folder walk is needed.
    files = get_course_files(course_dict['course-id'], request_cookies)

  elif sections.get("M"):
    # List of tuples of the form (name, link)
    resource_tuple_list = scrape_modules_page(baseURL + "/modules", request_cookies)

    # Resolve the individual module entries in parallel.
    with ThreadPoolExecutor(max_workers=10) as executor:
      resource_urls = [f"https://canvas.ucdavis.edu{rt[1]}" for rt in resource_tuple_list]
      module_futures = [executor.submit(scrape_module_entry, url, request_cookies) for url in resource_urls]
      # Futures are consumed in submission order so each result lines up with
      # the resource it was created for.
      for resource, future in zip(resource_tuple_list, module_futures):
        try:
          res = future.result()
          files.append((resource[0], res[0], res[1]))
        except Exception as e:
          # Best effort: report the failed entry and keep the rest.
          with print_lock:
            print(f"Error processing: {resource[0]}: {str(e)}")

  return (course_dict['coursename'], files)



# Run the file sweep over every course; files_dict maps coursename -> file list.
files_dict = {}
with ThreadPoolExecutor(max_workers=5) as outer_executor:
  futures = [
    outer_executor.submit(course_processor_worker, course_dict, request_cookies)
    for course_dict in master_course_dict
  ]

  completed = 0
  for future in as_completed(futures):
    coursename, course_files = future.result()
    files_dict[coursename] = course_files
    completed += 1
    # Workers print under the same lock, so take it before redrawing the bar.
    with print_lock:
      print_progress_bar(completed, len(master_course_dict))
print(json.dumps(files_dict, indent=2))

Error processing: ENG4_HW3 Solution.pdf: ENG 004 A01-A04 FQ 2022--------------------------------------| 61.7% 
Error processing: In class quiz 2 solution_F2022.pdf: ENG 004 A01-A04 FQ 2022
Error processing: HW8 solution_F2022.pdf: ENG 004 A01-A04 FQ 2022
Error processing: Project requirements_Fall2022.pdf: ENG 004 A01-A04 FQ 2022
 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.0% 
{
  "NPB 173 001 SQ 2025": [
    [
      "NPB173_lecture1_2025.pdf",
      "2.1MB",
      "https://canvas.ucdavis.edu/files/27274322/download?download_frd=1"
    ],
    [
      "NPB173_lecture2_2025.pdf",
      "2.9MB",
      "https://canvas.ucdavis.edu/files/27281704/download?download_frd=1"
    ],
    [
      "NPB173_lecture3_2025.pdf",
      "11.8MB",
      "https://canvas.ucdavis.edu/files/27324438/download?download_frd=1"
    ]
  ],
  "ECS 170 001 SQ 2025": [
    [
      "1.Class-intro.pdf",
      "2.4MB",
      "https://canvas.ucdavis.edu/files

In [10]:
file_path = "meta-downloads/course-files.json"

# Create the output directory if it is missing, so a fresh checkout does not
# lose the scrape results to a FileNotFoundError at the very last step.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

with open(file_path, 'w') as json_file:
  json.dump(files_dict, json_file)


In [11]:

def parse_size(size_str):
  """Convert a human-readable size such as "2.1MB" into a number of bytes.

  Units are binary multiples (1KB = 1024B). Matching is case-insensitive and
  tolerates whitespace between the number and the unit ("2.1 MB").

  Args:
    size_str: size text, or a falsy value (None, "") when the size is unknown.

  Returns:
    The size in bytes as a float, or 0 when size_str is empty or cannot be
    parsed (including malformed numbers such as "1.2.3MB").
  """
  if not size_str:
    return 0
  size_str = size_str.strip().upper()
  # The number must be a well-formed decimal; the previous "[\d.]+" also matched
  # strings like "1.2.3" or "..", which then crashed in float().
  match = re.match(r"(\d+(?:\.\d*)?|\.\d+)\s*([KMGT]?B)", size_str)
  if not match:
    return 0
  size, unit = match.groups()
  units = {"B": 1, "KB": 1024, "MB": 1024**2, "GB": 1024**3, "TB": 1024**4}
  return float(size) * units[unit]

# Leading course code including any letter suffix, e.g. "MAT 022AL" or "ECS 189G".
# A fixed 8-character slice would cut "MAT 022AL" down to "MAT 022A", which is a
# different course, and would leave a trailing "/" on cross-listed names.
course_code_pattern = re.compile(r"[A-Z]{3} \d{3}[A-Z]*")

total_size = 0
zero_resources = []
for class_name, file_list in files_dict.items():
  if len(file_list) == 0:
    code_match = course_code_pattern.match(class_name)
    # Fall back to the old 8-character label for names that do not start with a course code.
    zero_resources.append(code_match.group(0) if code_match else class_name[:8].strip())
  print(f"{class_name}: {len(file_list)}")
  # Each file entry carries its human-readable size in position 1.
  total_size += sum(parse_size(s[1]) for s in file_list)

print("\n-----")
print(f"Total file download would be: {sizeof_fmt(total_size)}")
print(f"Classes where no resources where found, {len(zero_resources)}: {zero_resources}")

NPB 173 001 SQ 2025: 3
ECS 170 001 SQ 2025: 5
ECS 189G 001 SQ 2025: 0
ECS 154A 001 WQ 2025: 0
PSC 120 001 SQ 2025: 0
NPB 163 001 SQ 2025: 14
BIS 002A WQ 2024: 100
CGS 001/PHI 010 SQ 2023: 90
ECH 080 001 FQ 2022: 15
CMN 152V 001 WQ 2025: 0
ECS 036B A01 FQ 2023: 4
ECS 050 001 FQ 2024: 0
ECS 020 A01 WQ 2024: 74
ECS 032A A01-A06 WQ 2023: 80
ECS 036C A01 WQ 2024: 55
EAE 001 001 FQ 2022: 2
ECN 001A C01 FQ 2023: 46
ECS 124 001 WQ 2025: 50
ECS 122A A01-A06 FQ 2024: 46
ECS 174 001 FQ 2024: 15
ECS 132 001 FQ 2024: 69
ENL 003 028 FQ 2022: 25
ECS 098F 001 FQ 2024: 34
HIS 017A A01-A09 WQ 2024: 34
CHI 065 A01-A02 SQ 2023: 9
MAT 021B B01-B07 WQ 2023: 0
LIN 177 A01-A04 SQ 2024: 19
LIN 175 A01-A02 WQ 2025: 66
MAT 022A 002 SQ 2024: 24
MAT 021C B01-B08 SQ 2023: 56
MAT 107 001 SQ 2024: 100
NPB 100 A01-A06 SQ 2024: 53
PHI 012 A01-A06 SQ 2023: 34
MAT 022AL 001-007 SQ 2024: 1
PHI 112 A01-A04 FQ 2023: 10
CHE 002A (C) WQ 2023: 62
PSC 001Y B01-B13 FQ 2022: 0
PSC 100Y A01-A13 WQ 2024: 0
PSC 130 A01-A04 WQ 2024: 

In [8]:

# test_lecture_class = {'coursename': 'BIS 002A WQ 2024', 'course-id': '858748', 'homepage': 'Home', 'sections': {'F': True, 'M': False, 'MG': True}}
# print(test_lecture_class)

# transcripts = []

# driver = load_driver(cookies)
# url = f"https://canvas.ucdavis.edu/courses/{test_lecture_class['course-id']}"
# print(url)
# media_links = scrape_lecture_links(driver, url)
# lecture_list = list(media_links)

# if len(lecture_list) == 0:
#   print(f"{test_lecture_class['coursename']} no lecs")
#   print(test_lecture_class['coursename'], transcripts)
#   exit()

# lecture_tuples = []
# for i, lecture in enumerate(lecture_list):
#   inner_driver = load_driver(video_cookies)
#   lecture_url = f"https://aggievideo.canvas.ucdavis.edu{lecture}"
#   # print(lecture_url)
#   lecture_tuple = scrape_transcripts(inner_driver, lecture_url)
#   lecture_tuples.append(lecture_tuple)
#   print(f"{i} Done!")


# print(test_lecture_class['coursename'], lecture_tuples)



In [9]:

# Now, time to get lecture transcripts!

def transcript_worker(lecture_url):
  """Scrape the transcript information of a single lecture page.

  Args:
    lecture_url: lecture path, appended to the aggievideo host to build the
      full URL.

  Returns:
    Whatever scrape_transcripts returns for the page (the caller expects a
    (name, download_url) pair, or a falsy value when nothing was found).
  """
  inner_driver = load_driver(video_cookies)
  try:
    full_url = f"https://aggievideo.canvas.ucdavis.edu{lecture_url}"
    return scrape_transcripts(inner_driver, full_url)
  finally:
    # Always shut the browser down, even when scraping raises; otherwise each
    # failed lecture would leave a driver process behind.
    inner_driver.quit()

def transcript_links_worker(course_dict):
  """Collect transcript download links for every recorded lecture of a course.

  Args:
    course_dict: dict with "course-id" and "coursename" keys.

  Returns:
    (coursename, transcripts), where transcripts is a list of
    (name, lecture_url, download_url) tuples in completion order. Lectures
    whose scrape fails or returns no usable result are skipped.
  """
  transcripts = []
  label = course_dict['coursename'][:8].strip()

  driver = load_driver(cookies)
  try:
    url = f"https://canvas.ucdavis.edu/courses/{course_dict['course-id']}"
    # Materialize the links before the driver is closed.
    lecture_list = list(scrape_lecture_links(driver, url))
  finally:
    # Release the browser even if link scraping raises.
    driver.quit()

  with print_lock:
    print(f"{label}: {len(lecture_list)} lecs")

  if len(lecture_list) == 0:
    return (course_dict['coursename'], transcripts)

  with ThreadPoolExecutor(max_workers=3) as inner_executor:
    # Map each future back to its lecture URL so results can be labelled
    # even though they complete out of order.
    future_to_url = { inner_executor.submit(transcript_worker, lec): lec for lec in lecture_list }

    for j, future in enumerate(as_completed(future_to_url)):
      original_url = future_to_url[future]
      try:
        result = future.result()
      except Exception as e:
        # Best effort, like the file sweep: one broken lecture must not
        # discard the transcripts already collected for this course.
        result = None
        with print_lock:
          print(f"Error processing: {original_url}: {str(e)}")
      if result and len(result) == 2:
        name, download_url = result
        transcripts.append((name, original_url, download_url))
      with print_lock:
        print(f"{label}: {j+1}/{len(future_to_url)}")

  return (course_dict['coursename'], transcripts)

lecture_links = {}
# Only courses with a Media Gallery have recorded lectures. The flag is read with
# .get() because section_worker stores an empty "sections" dict for courses it
# failed to probe, and indexing ['MG'] on those would raise KeyError.
recorded_lecture_classes = [course_dict for course_dict in master_course_dict if course_dict['sections'].get('MG')]

with ThreadPoolExecutor(max_workers=2) as outer_executor:
  future_transcripts = [outer_executor.submit(transcript_links_worker, course) for course in recorded_lecture_classes]

  for i, future in enumerate(as_completed(future_transcripts)):
    classname, transcripts_triplet = future.result()
    lecture_links[classname] = transcripts_triplet
    # Workers print under the same lock, so take it before redrawing the bar.
    with print_lock:
      print_progress_bar(i + 1, len(recorded_lecture_classes), prefix=" "*20)



NPB 173: 2 lecs
BIS 002A: 30 lecs
BIS 002A: 1/30
NPB 173: 1/2
BIS 002A: 2/30
NPB 173: 2/2
BIS 002A: 3/30       |██--------------------------------------------------------------------------------------------------| 2.8% 
BIS 002A: 4/30
BIS 002A: 5/30
ECS 170: 0 lecs
BIS 002A: 6/30       |█████-----------------------------------------------------------------------------------------------| 5.6% 
BIS 002A: 7/30
BIS 002A: 8/30
ECS 154A: 30 lecs
BIS 002A: 9/30
BIS 002A: 10/30
BIS 002A: 11/30
ECS 154A: 1/30
ECS 154A: 2/30
ECS 154A: 3/30
BIS 002A: 12/30
BIS 002A: 13/30
BIS 002A: 14/30
ECS 154A: 4/30
ECS 154A: 5/30
ECS 154A: 6/30
BIS 002A: 15/30
BIS 002A: 16/30
BIS 002A: 17/30
ECS 154A: 7/30
BIS 002A: 18/30
ECS 154A: 8/30
ECS 154A: 9/30
BIS 002A: 19/30
ECS 154A: 10/30
BIS 002A: 20/30
BIS 002A: 21/30
BIS 002A: 22/30
ECS 154A: 11/30
BIS 002A: 23/30
ECS 154A: 12/30
ECS 154A: 13/30
BIS 002A: 24/30
BIS 002A: 25/30
ECS 154A: 14/30
ECS 154A: 15/30
BIS 002A: 26/30
ECS 154A: 16/30
ECS 154A: 17/30
BIS 00

In [10]:
file_path = "meta-downloads/lecture-links.json"

# Create the output directory if it is missing, so the long transcript scrape
# is not lost to a FileNotFoundError when saving.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

with open(file_path, 'w') as json_file:
  json.dump(lecture_links, json_file)

