In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, date, timedelta

In [8]:
# Download the registrar's Fall calendar page and parse it into a soup object.
url = "https://www.ccny.cuny.edu/registrar/fall"

# Without a timeout, requests can wait on an unresponsive server indefinitely.
r = requests.get(url, timeout=30)

# Stop here on a 4xx/5xx response instead of parsing an error page and
# failing later when the calendar table cannot be found.
r.raise_for_status()

soup = BeautifulSoup(r.text, 'html.parser')

# Indented copy of the markup, kept only for manual inspection of the page.
pretty_html = soup.prettify()

In [9]:
# Debugging aid: uncomment to dump the prettified HTML and inspect the page structure.
# print(pretty_html)

In [10]:
# The info we care about (date, day of week, and text) all lives in a tbody element.
# There's only one tbody element in the page, so .find('tbody') (first match) is enough.
calendar = soup.find('tbody')

# .find() returns None when nothing matches. Fail here with a clear message
# rather than with an AttributeError on None in the cell that iterates the rows.
if calendar is None:
    raise RuntimeError(
        "No <tbody> element found in the downloaded page; "
        "the page layout may have changed or the request returned unexpected content."
    )

In [12]:
# Each <tr> (table row) of the calendar is one entry; its <td> cells are the columns:
#   1st <td> : date, formatted "Month DD" (or a range such as "Month DD - DD")
#   2nd <td> : day of week
#   3rd <td> : explanation/text of the entry
#
# Dates that carry no year belong to DEFAULT_YEAR. A date that already includes
# its year (the page has a single 2022 entry, "Month DD, 2022") is used as written.
DEFAULT_YEAR = 2021
DATE_FORMAT = "%B %d, %Y"


def _parse_calendar_date(label, default_year=DEFAULT_YEAR):
    """Parse one calendar date label into a ``datetime.date``.

    Parameters
    ----------
    label : str
        Either "Month DD, YYYY" or "Month DD".
    default_year : int
        Year appended when ``label`` does not already parse with a year.

    Returns
    -------
    datetime.date

    Raises
    ------
    ValueError
        If the label matches neither form.
    """
    label = label.strip()
    try:
        # Label already carries an explicit year.
        return datetime.strptime(label, DATE_FORMAT).date()
    except ValueError:
        # No (parsable) year in the label: assume the default year.
        return datetime.strptime(f"{label}, {default_year}", DATE_FORMAT).date()


def _expand_date_range(label, default_year=DEFAULT_YEAR):
    """Expand a range label into the list of every date it covers (inclusive).

    Accepts "Month DD - DD" as well as "Month DD - Month DD", with or without
    spaces around the dash. When the end part is only a day number, the month
    of the start part is reused.

    Returns
    -------
    list of datetime.date
        Empty when the parsed end date falls before the start date.
    """
    # Split on the first dash only and strip, so "25-27" and "25 - 27" both work.
    start_part, end_part = (part.strip() for part in label.split("-", 1))

    if end_part[:1].isdigit():
        # End is just a day number: borrow the month name from the start part.
        end_part = f"{start_part.split(' ')[0]} {end_part}"

    first_day = _parse_calendar_date(start_part, default_year)
    last_day = _parse_calendar_date(end_part, default_year)

    span = (last_day - first_day).days
    return [first_day + timedelta(days=offset) for offset in range(span + 1)]


# .find_all gives the list of all <tr> elements to iterate through.
rows = calendar.find_all('tr')

# List of dictionaries (one per calendar day) used later to build the dataframe.
data = []

for row in rows:
    cols = row.find_all('td')

    # Rows without the three expected cells (e.g. header or spacer rows)
    # carry no calendar entry, so skip them instead of raising IndexError.
    if len(cols) < 3:
        continue

    date_str, dow_str, text_str = (col.text.strip() for col in cols[:3])

    # Collapse newlines, tabs and repeated spaces in the description into
    # single spaces so the text reads as one clean line.
    text_str = " ".join(text_str.split())

    # Two cases: the date is a single day, or a range (marked by "-").
    if "-" in date_str:
        # Range: add one record per day, with the weekday computed from the
        # date itself because the scraped day-of-week column describes the range.
        for day in _expand_date_range(date_str):
            data.append({
                "date": day,
                "dow": day.strftime('%A'),
                "text": text_str
            })
    else:
        # Single day: keep the day of week exactly as published on the page.
        # The parsed value is deliberately not stored in a variable named
        # `date`, which would overwrite the `date` class imported from datetime.
        data.append({
            "date": _parse_calendar_date(date_str),
            "dow": dow_str,
            "text": text_str
        })

In [13]:
# `data` (list of dictionaries) has been populated, so the dataframe can be built.
# Index by date and sort chronologically. A stable sort (mergesort) is requested
# so that entries sharing the same date keep the order in which they were
# scraped; the default quicksort does not guarantee the order of ties.
df = (
    pd.DataFrame(data)
    .set_index('date')
    .sort_index(kind='mergesort')
)
df.head(58)

Unnamed: 0_level_0,dow,text
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-08-01,Sunday,Application for degree for January and Februar...
2021-08-18,Wednesday,Last day to apply for Study Abroad
2021-08-24,Tuesday,Last day of Registration; Last day to file ...
2021-08-25,Wednesday,Start of Fall Term; Classes begin; Initi...
2021-08-25,Wednesday,Change of program period; late fees apply
2021-08-26,Thursday,Change of program period; late fees apply
2021-08-26,Thursday,Last day for Independent Study
2021-08-27,Friday,Change of program period; late fees apply
2021-08-28,Saturday,Change of program period; late fees apply
2021-08-28,Saturday,First day of Saturday Classes
