## Scraping the UIC academic calendar

`https://catalog.uic.edu/ucat/academic-calendar/`

In [35]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [36]:
# Fetch the calendar page and store the raw bytes on disk, so the parsing
# cell can work from the saved copy.
with urlopen("https://catalog.uic.edu/ucat/academic-calendar/") as response, \
        open("calendar.html","wb") as html_out:
    html_out.write(response.read())

In [37]:
# Parse the saved copy of the page (opened as bytes) into a BeautifulSoup tree.
with open("calendar.html","rb") as html_file:
    soup = BeautifulSoup(html_file, "html.parser")

In [38]:
# Display the parsed document to get a first look at the page's markup.
soup

<!DOCTYPE html>

<html dir="ltr" lang="en" xml:lang="en">
<head>
<title>Academic Calendar &lt; University of Illinois Chicago</title>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<link href="/search/opensearch.xml" rel="search" title="Catalog" type="application/opensearchdescription+xml"/>
<meta content="width=device-width, initial-scale=1.0, minimum-scale=1.0" name="viewport"/>
<link href="/favicon.ico" rel="shortcut icon"/>
<link href="/css/reset.css" rel="stylesheet" type="text/css"/>
<link href="/css/courseleaf.css" rel="stylesheet" type="text/css"/>
<link href="/fonts/font-awesome/font-awesome.min.css" rel="stylesheet" type="text/css"/>
<link href="/css/screen.css" media="screen" rel="stylesheet" type="text/css"/>
<link href="/css/handheld.css" media="only screen and (max-width: 767px)" rel="stylesheet" type="text/css"/>
<link href="/css/print.css" media="print" rel="stylesheet" type="text/css"/>
<script src="/js/jquery.js" type="text/javascript"></script>


In [39]:
# Collect every <table> element found in the parsed page.
tables = soup.find_all("table")

In [40]:
# How many tables did the page contain?
len(tables)

6

In [43]:
# Keep only the tables from index 3 onward; the earlier ones are skipped.
# NOTE(review): the hard-coded 3 depends on the page layout at download time
# (6 tables were found) -- re-check it whenever the page is re-downloaded.
good_tables = tables[3:]  # Drop the 2021-2022 academic year when they marked everything up differently (?!?!?!)

In [45]:
# Print the text of every row in the retained tables to see how the
# headings and events are laid out.
for table in good_tables:
    for table_row in table.find_all("tr"):
        print(table_row.text)


Date
Event

Fall Semester 2022


August 22, M
Instruction begins. 

September 5, M
Labor Day holiday. No classes.

October 14, F
Eight-week Part of Term A ends.

October 17, M
Eight-week Part of Term B begins.

November 8, T
Election Day holiday. No classes.

November 24–25, Th–F
Thanksgiving holiday. No classes.

December 2, F
Instruction ends.

December 5–9, M–F
Final examinations.

December 14, W
Instructor grading deadline for 16-week courses (5 p.m.)

December 19, M
Grades available via my.UIC.edu

Spring Semester 2023


January 9, M
Instruction begins.

January 16, M
Martin Luther King Jr. Day. No Classes.

March 3, F
Eight-week Part of Term A ends.

March 6, M
Eight-week Part of Term B begins.

March 20-24, M-F
Spring vacation. No classes.

April 28, F
Instruction ends.

May 1-5, M-F
Final examinations.

May 10, W
Instructor grading deadline for 16-week courses (5 p.m.)

May 15, M
Grades available via my.UIC.edu

Summer Sessions 2023


Summer Session 1 (4-Week) 


May 15, M
Ins

In [None]:
# Output format: I want each event on the calendar to become a dictionary
# like this one:
{
    "year": 2023,
    "term": "spring",  # or "fall", "summer", "summer1", "summer2" (the values the parsing loop assigns)
    "date": "April 28, F",
    "event": "Instruction ends."
}

In [46]:
def is_term_heading(row):
    """Return True if a schedule-table row looks like a year/term heading.

    ASSUMPTION: Any row of a schedule table that has its first <td>
    containing the word "summer", "spring", or "fall" (in any letter case)
    is a heading indicating a new year/term, e.g. "Fall Semester 2022" or
    "Summer Session 1 (4-Week)".

    Parameters
    ----------
    row : object
        A table row exposing ``.td``, which is either the row's first <td>
        (an object with a ``.text`` string) or ``None`` when the row has
        no <td> at all.

    Returns
    -------
    bool
        True for term headings, False for every other row (including rows
        without any <td>).
    """
    first_td = row.td
    # Identity check on purpose: `== None` would be routed through the
    # element's own __eq__ instead of simply asking "is there a cell?".
    if first_td is None:
        return False
    cell_text = first_td.text.lower()
    return any(word in cell_text for word in ("summer", "fall", "spring"))

In [53]:
# Walk every row of the retained tables and turn each event row into a
# dictionary with keys "year", "term", "date" and "event".
schedule_events = []

# Start from a known state: without this, an event row that shows up before
# the first term heading would raise NameError on a fresh kernel, or silently
# reuse the values left over from a previous run of this cell.
year = None
term = None

for t in good_tables:
    for row in t.find_all("tr"):
        if is_term_heading(row):
            # Strip first so stray leading whitespace/newlines in the row text
            # cannot defeat the startswith() test below.
            s = row.text.strip().lower()
            fields = s.split()
            try:
                # If this works, then we're seeing something like "Fall Semester 2022"
                year = int(fields[-1]) # e.g. 2022
                term = fields[0] # e.g. "fall" (s is already lowercase)
            except ValueError:
                # We're seeing something like "Summer Session 1 (4-Week)".
                # These sub-headings carry no year, so the year of the most
                # recent dated heading stays in effect.
                if s.startswith("summer session 1"):
                    term = "summer1"
                else:
                    # Every other heading without a trailing year lands here.
                    term = "summer2"
            print(year,term)
        else:
            heading = row.find("th")
            if heading is not None:
                continue # skip all rows that give column headings
            #e.g. "December 8, F.  Instruction Ends."
            # record this schedule item
            cells = row.find_all("td")
            if len(cells) < 2:
                # Not a date/event pair (e.g. an empty or single-cell row);
                # indexing cells[1] below would raise IndexError.
                continue
            schedule_events.append(
                {
                    "year": year,
                    "term": term,
                    "date": cells[0].text, # content of the first td
                    "event": cells[1].text, # content of the second td
                }
            )

2022 fall
2023 spring
2023 summer
2023 summer1
2023 summer2
2023 fall
2024 spring
2024 summer
2024 summer1
2024 summer2
2024 fall
2025 spring
2025 summer
2025 summer1
2025 summer2


In [54]:
# Inspect the list of event dictionaries built by the parsing loop.
schedule_events

[{'year': 2022,
  'term': 'fall',
  'date': 'August 22, M',
  'event': 'Instruction begins. '},
 {'year': 2022,
  'term': 'fall',
  'date': 'September 5, M',
  'event': 'Labor Day holiday. No classes.'},
 {'year': 2022,
  'term': 'fall',
  'date': 'October 14, F',
  'event': 'Eight-week Part of Term A ends.'},
 {'year': 2022,
  'term': 'fall',
  'date': 'October 17, M',
  'event': 'Eight-week Part of Term B begins.'},
 {'year': 2022,
  'term': 'fall',
  'date': 'November 8, T',
  'event': 'Election Day holiday. No classes.'},
 {'year': 2022,
  'term': 'fall',
  'date': 'November 24–25, Th–F',
  'event': 'Thanksgiving holiday. No classes.'},
 {'year': 2022,
  'term': 'fall',
  'date': 'December 2, F',
  'event': 'Instruction ends.'},
 {'year': 2022,
  'term': 'fall',
  'date': 'December 5–9, M–F',
  'event': 'Final examinations.'},
 {'year': 2022,
  'term': 'fall',
  'date': 'December 14, W',
  'event': 'Instructor grading deadline for 16-week courses (5 p.m.)'},
 {'year': 2022,
  'term

In [57]:
# Write as a CSV
import csv

# The encoding is given explicitly because the scraped dates contain non-ASCII
# characters (en dashes, e.g. "November 24–25, Th–F"); without it the bytes
# written would depend on the platform's locale encoding.
# newline="" lets the csv module control line endings itself.
with open("calendar.csv","wt",newline="",encoding="utf-8") as fp:
    writer = csv.DictWriter(fp,fieldnames=["year","term","date","event"])
    writer.writeheader()
    # One CSV row per event dictionary, in list order.
    writer.writerows(schedule_events)

In [59]:
# Write as JSON (nice because it preserves data types)
import json

# Serialize the whole list of event dictionaries into a single JSON document.
with open("calendar.json","w") as json_file:
    json.dump(schedule_events, json_file)