In [191]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [192]:
url = "https://www.ccny.cuny.edu/registrar/fall"
# Bound the wait so a stalled server cannot hang the notebook, and fail loudly
# on HTTP errors instead of parsing an error page as if it were the calendar.
response = requests.get(url, timeout=30)
response.raise_for_status()


In [193]:
# Parse the downloaded HTML with the standard-library parser backend.
soup = BeautifulSoup(response.text, "html.parser")
# First <table> element of the page (raises IndexError if the page has none).
table = soup.find_all("table")[0]
# Every <tr> in the whole document, not only those inside `table`.
# NOTE(review): confirm all tables on the page belong to the calendar; if only
# the first one does, this should be `table.find_all("tr")`.
rows = soup.find_all("tr")


In [194]:
# Column-oriented accumulator: one independent, initially empty list per column.
data = {column: [] for column in ("date", "day", "text")}


In [195]:
def format_date_range(s: str, default_year: int = 2021):
    """Convert a calendar date or date range into ISO ``(start, end)`` strings.

    Accepted shapes (whitespace and comma placement are flexible):

    * ``"August 10"`` or ``"January 01, 2022"`` - a single date
    * ``"August 1-10"`` or ``"August 1 - 10, 2022"`` - a range within one month
    * ``"August 30 - September 2"`` - a range that crosses a month boundary

    En and em dashes are treated like a plain hyphen.

    Args:
        s: Raw date text.
        default_year: Year to use when the text does not contain one.

    Returns:
        A ``(start, end)`` tuple of ``YYYY-MM-DD`` strings. Both elements are
        equal when ``s`` is a single date.

    Raises:
        ValueError: If a range does not have the ``<month> <day>`` shape on its
            left side or ``[<month>] <day>`` on its right side, or if pandas
            cannot parse the resulting date text.
    """

    def _split_year(fragment: str):
        """Split ``fragment`` into tokens and a trailing 4-digit year (or None)."""
        tokens = fragment.replace(",", " ").split()
        if tokens and len(tokens[-1]) == 4 and tokens[-1].isdigit():
            return tokens[:-1], int(tokens[-1])
        return tokens, None

    # Normalise typographic dashes and collapse every run of whitespace so the
    # parsing below only has to deal with single spaces and "-".
    s = " ".join(s.replace("\u2013", "-").replace("\u2014", "-").split())

    # Single date: append the default year only when the text has no comma,
    # i.e. no ", <year>" suffix.
    if "-" not in s:
        if "," not in s:
            s += f", {default_year}"
        iso = pd.to_datetime(s).strftime("%Y-%m-%d")
        return iso, iso

    # Range: split on the first dash; each side may carry its own year.
    left, right = s.split("-", 1)
    start_tokens, start_year = _split_year(left)
    end_tokens, end_year = _split_year(right)

    if len(start_tokens) != 2 or len(end_tokens) not in (1, 2):
        raise ValueError(f"Unrecognised date range: {s!r}")

    start_month, day_start = start_tokens
    if len(end_tokens) == 2:
        # "August 30 - September 2": the end names its own month.
        end_month, day_end = end_tokens
    else:
        # "August 1-10": the end day shares the start month.
        end_month, day_end = start_month, end_tokens[0]

    # A year written on one side applies to the other side when that side has
    # none; with no explicit year at all, fall back to ``default_year``.
    start_year = start_year or end_year or default_year
    end_year = end_year or start_year

    start = pd.to_datetime(f"{start_month} {day_start}, {start_year}")
    end = pd.to_datetime(f"{end_month} {day_end}, {end_year}")

    return start.strftime("%Y-%m-%d"), end.strftime("%Y-%m-%d")


# Testing --> each call returns a (start, end) tuple
for sample in ("August 10", "August 1-10", "January 01, 2022"):
    print(format_date_range(sample))

('2021-08-10', '2021-08-10')
('2021-08-01', '2021-08-10')
('2022-01-01', '2022-01-01')


In [196]:
def clean_day(s: str) -> str:
    """Return the first day name found in a weekday cell.

    The text is split on any whitespace (spaces, tabs, newlines) and the first
    token is kept, so ``"Wednesday - Friday"`` gives ``"Wednesday"``. If that
    token is itself a dash-joined range such as ``"Saturday-Sunday"`` (hyphen,
    en dash or em dash), only the part before the dash is returned.

    Args:
        s: Raw cell text.

    Returns:
        The first day token, or ``""`` when ``s`` is empty or only whitespace.
    """
    tokens = s.split()
    if not tokens:
        # Empty cell: return an empty label instead of raising IndexError.
        return ""
    first = tokens[0]
    head = first.replace("\u2013", "-").replace("\u2014", "-").split("-", 1)[0]
    # A token that starts with a dash has no leading part; keep it unchanged.
    return head or first
for sample in ("Wednesday - Friday", "Friday - Wednesday", "Saturday"):
    print(clean_day(sample))

Wednesday
Friday
Saturday


In [197]:
def clean_text(s: str) -> str:
    """Collapse all whitespace in ``s`` into single spaces.

    Newlines, tabs and repeated spaces act as one separator; leading and
    trailing whitespace is dropped.
    """
    words = s.strip().split()
    return " ".join(words)

sample_text = (
    'Last day of Registration '
    'Last day to file ePermit for the Fall 2021 '
    'Last day to drop classes for 100% tuition refund;'
)
print(clean_text(sample_text))


Last day of Registration Last day to file ePermit for the Fall 2021 Last day to drop classes for 100% tuition refund;


In [198]:
# Rebuild the columns from scratch so re-running this cell never appends
# duplicate rows to an already filled `data`.
data = {"date": [], "day": [], "text": []}

for tr in rows:
    tds = tr.find_all("td")
    # Only rows with at least three <td> cells (date, day, description) are used.
    if len(tds) < 3:
        continue

    # Parse all three cells before appending anything, so a failure on one
    # cell cannot leave the columns with different lengths.
    start, end = format_date_range(tds[0].get_text())
    day = clean_day(tds[1].get_text())
    text = clean_text(tds[2].get_text())

    # Only the start date is stored; for a multi-day entry the end date is
    # recorded in the description instead.
    if start != end:
        text += f' Continuous through {end}'
        print(text)

    data["date"].append(start)
    data["day"].append(day)
    data["text"].append(text)

# Sanity check: the three columns must have the same length.
print(len(data["date"]), len(data["day"]), len(data["text"]))


36 36 36


In [199]:
# Assemble the frame, index it by the "date" column and sort by that index.
df = pd.DataFrame(data).set_index("date").sort_index()
df

Unnamed: 0_level_0,day,text
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-08-01,Sunday,Application for degree for January and Februar...
2021-08-18,Wednesday,Last day to apply for Study Abroad
2021-08-24,Tuesday,Last day of Registration; Last day to file ePe...
2021-08-25,Wednesday,Start of Fall Term; Classes begin; Initial Reg...
2021-08-25,Wednesday,Change of program period; late fees apply
2021-08-26,Thursday,Last day for Independent Study
2021-08-28,Saturday,First day of Saturday Classes
2021-08-31,Tuesday,Last day to add a class to an existing enrollm...
2021-09-01,Wednesday,Verification of Enrollment rosters available t...
2021-09-03,Friday,No classes scheduled
