Install ipykernel beforehand
Import the dependencies needed to run this .ipynb notebook: pandas, beautifulsoup4, and requests.

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
from datetime import datetime, timedelta

Obtain all the HTML code from the page using the requests library.

In [2]:
# Use a GET request at the URL to obtain the HTML code.
url = 'https://www.ccny.cuny.edu/registrar/fall'
# A timeout keeps the cell from hanging indefinitely if the server never responds.
r = requests.get(url, timeout=30)
# Fail fast on an HTTP error (4xx/5xx) instead of parsing an error page as calendar data.
r.raise_for_status()
html_doc = r.text
# Hand html_doc to BeautifulSoup to parse later on. The parser is named explicitly so the
# resulting tree does not depend on which optional parsers happen to be installed
# (and BeautifulSoup does not warn about a guessed parser).
soup = BeautifulSoup(html_doc, 'html.parser')

Collect every `<tr>` element using BeautifulSoup and drop the first one, since it is the header row of `<th>` cells and holds no calendar data.

In [3]:
# Gather every table row on the page, then skip the first one (the header row).
table_rows = soup.find_all('tr')
rows = table_rows[1:]

Initialize a dictionary of lists to collect the data; it is converted to a DataFrame at the end.

In [4]:
# One independent, initially empty list per output column, in display order.
data = {column: [] for column in ('date', 'day of the week', 'text')}

Iterate through all `<tr>` with a for loop and break down text.

In [5]:
def _parse_single_date(text, default_year):
    """Parse one date such as "January 8, 2022" or "September 9" (year falls back to default_year)."""
    try:
        # A regular date like January 8, 2022 with an explicit year
        return datetime.strptime(text, "%B %d, %Y")
    except ValueError:
        # No year specified: prepend the default year so strptime gets a complete date
        return datetime.strptime(f"{default_year} {text}", "%Y %B %d")


def processDate(stringDate, default_year=2021):
    """Convert a calendar date string into datetime object(s).

    Accepted forms:
      * "January 8, 2022"           -> single datetime
      * "September 9"               -> single datetime in ``default_year``
      * "September 9 - 15"          -> (start, end) within one month
      * "September 29 - October 3"  -> (start, end) across months
    The range separator may be a hyphen, en dash or em dash, with or without
    surrounding spaces.

    Parameters:
        stringDate: the date text to parse.
        default_year: year assumed when the text does not include one (2021 by default).

    Returns:
        A ``datetime`` for a single date, or a ``(startDate, endDate)`` tuple for a range.

    Raises:
        ValueError: if the text matches none of the accepted forms.
    """
    # Treat en/em dashes like a plain hyphen so "September 9 – 15" is recognised as a range
    normalized = stringDate.replace('\u2013', '-').replace('\u2014', '-').strip()

    # If a date like September 9 - 15
    if '-' in normalized:
        start_text, end_text = (part.strip() for part in normalized.split('-', 1))
        startDate = _parse_single_date(start_text, default_year)
        # An end part that starts with a digit is a bare day ("15"), so it shares the
        # start's month; otherwise it already names its own month ("October 3").
        if end_text[:1].isdigit():
            end_text = f"{startDate.strftime('%B')} {end_text}"
        endDate = _parse_single_date(end_text, startDate.year)
        # A year-less range such as "December 28 - January 3" wraps into the next year.
        # If the end carries an explicit year this re-parse returns the same value.
        if endDate < startDate:
            endDate = _parse_single_date(end_text, startDate.year + 1)
        return startDate, endDate

    return _parse_single_date(normalized, default_year)

# Process each row with BeautifulSoup
# Start from empty columns so that re-running this cell does not append duplicate entries to `data`.
for column in data.values():
    column.clear()
# Maps each date already stored to its position in the `data` lists,
# so an existing date is found directly instead of rescanning the list for every day.
index_by_date = {}

for row in rows:
    tds = row.find_all('td')
    # Rows without the three cells read below (e.g. a header row made of <th> cells) hold no entry; skip them.
    if len(tds) < 3:
        continue
    # We only care about the date and text and not day of week because we can use internal library to convert.
    stringDate = tds[0].get_text(strip=True)
    text = tds[2].get_text(strip=True)
    # Convert string date to a datetime object based on our defined function
    date_range = processDate(stringDate)

    # If it is tuple, needs to be unpacked
    if isinstance(date_range, tuple):
        startDate, endDate = date_range
    # Not a tuple, so make start and end equal so the while loop below iterates only once
    else:
        startDate = endDate = date_range

    current_date = startDate
    while current_date <= endDate:
        index = index_by_date.get(current_date)
        # If we already inserted data for this date, extend that entry rather than adding a new row
        if index is not None:
            # Entries for the same date are separated by ';' (added only if not already present)
            if not data['text'][index].endswith(';'):
                data['text'][index] += ';'
            data['text'][index] += text
        # Normal insertion when date does not already exist
        else:
            index_by_date[current_date] = len(data['date'])
            # Day of the week is derived from the date itself rather than read from the page
            dow = current_date.strftime('%A')
            data['day of the week'].append(dow)
            data['date'].append(current_date)
            data['text'].append(text)
        # Advance by one day until the end of the range has been processed
        current_date += timedelta(days=1)



Convert to a DataFrame, adjust the styling, and display it.

In [6]:
# Lift the column-width cap so the long text column is shown in full
pd.set_option('display.max_colwidth', None)
# Build the table from the collected columns and use the date as the index
calendar_dataframe = pd.DataFrame(data).set_index('date')
# Table attributes that wrap text onto the next line when it does not fit
wrap_attributes = 'style="word-wrap: break-word; white-space: normal;"'
calendar = calendar_dataframe.style.set_table_attributes(wrap_attributes)

display(calendar)

Unnamed: 0_level_0,day of the week,text
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-08-01 00:00:00,Sunday,Application for degree for January and February 2022 begins
2021-08-18 00:00:00,Wednesday,Last day to apply for Study Abroad
2021-08-24 00:00:00,Tuesday,Last day of Registration;Last day to file ePermit for the Fall 2021;Last day to drop classes for 100% tuition refund;
2021-08-25 00:00:00,Wednesday,Start of Fall Term;Classes begin;Initial Registration Appeals begin;Change of program period; late fees apply
2021-08-26 00:00:00,Thursday,Change of program period; late fees apply;Last day for Independent Study
2021-08-27 00:00:00,Friday,Change of program period; late fees apply
2021-08-28 00:00:00,Saturday,Change of program period; late fees apply;First day of Saturday Classes
2021-08-29 00:00:00,Sunday,Change of program period; late fees apply
2021-08-30 00:00:00,Monday,Change of program period; late fees apply
2021-08-31 00:00:00,Tuesday,Change of program period; late fees apply;Last day to add a class to an existing enrollment;Last day for 75% tuition refund;Financial Aid Certification Enrollment Status date;Last day to apply for Audit option;Last day to drop without the grade of 'WD';Initial Registration Appeals end;
