# CCNY Calendar Scraper

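This notebook scrapes the CCNY Fall 2021 academic calendar from the registrar's website (https://www.ccny.cuny.edu/registrar/fall) and loads it into a pandas DataFrame indexed by date.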


In [135]:

import requests
from bs4 import BeautifulSoup
import pandas as pd


In [136]:

# Download the registrar's Fall calendar page as raw HTML.
url = "https://www.ccny.cuny.edu/registrar/fall"
# requests has no default timeout; without one this cell can hang forever
# if the server accepts the connection but never answers.
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx response instead of silently handing an error
# page to the parser in the next cell.
response.raise_for_status()
html_content = response.text


In [137]:
# Parse the downloaded HTML and collect the table rows to scrape.
# (The previous bare `soup.prettify()` call was dropped: it re-serialised the
# whole document and discarded the result, so it displayed nothing.)
soup = BeautifulSoup(html_content, 'html.parser')
# First <table> on the page; raises IndexError if the page has no table.
table = soup.find_all("table")[0]
# NOTE(review): rows are gathered from the whole document, not only from
# `table`. If the page ever holds tables other than the calendar, this should
# probably become `table.find_all("tr")` - confirm against the live markup.
rows = soup.find_all("tr")


In [138]:
# Column-oriented accumulator for the scraped calendar: one independent,
# initially empty list per output column (date, day of week, event text).
data = {column_name: [] for column_name in ("date", "dow", "text")}


In [139]:
# Walk every scraped <tr> and record its date, day-of-week and event text
# in the matching `data` columns.

# Empty the columns first so that re-running this cell rebuilds them from
# scratch instead of appending the same rows a second time.
for column_name in ("date", "dow", "text"):
    data[column_name].clear()

for tr in rows:
    tds = tr.find_all("td")
    # Only rows with at least three <td> cells are calendar entries here;
    # anything shorter is skipped.
    if len(tds) < 3:
        continue

    # strip=True trims surrounding whitespace from each text fragment.
    date_text = tds[0].get_text(strip=True)
    dow_text = tds[1].get_text(strip=True)
    event_text = tds[2].get_text(strip=True)

    data["date"].append(date_text)
    data["dow"].append(dow_text)
    data["text"].append(event_text)


In [140]:


# Report how many calendar rows were scraped and preview each column.
# When nothing was collected, print the cells of the first <tr> instead to
# help debug the page structure.
collected_count = len(data['date'])
print(f"Number of rows collected: {collected_count}")
if collected_count == 0:
    print("No data collected - checking table structure...")
    print(f"Number of rows found: {len(rows)}")
    if rows:
        print("First row cells:")
        for position, td in enumerate(rows[0].find_all("td")):
            print(f"  Cell {position}: {td.get_text(strip=True)}")
else:
    first_dates = data['date'][:3]
    first_dows = data['dow'][:3]
    first_texts = data['text'][:3]
    print(f"First few dates: {first_dates}")
    print(f"First few dow: {first_dows}")
    print(f"First few text: {first_texts}")

Number of rows collected: 36
First few dates: ['August 01', 'August 18', 'August 24']
First few dow: ['Sunday', 'Wednesday', 'Tuesday']
First few text: ['Application for degree for January and February 2022 begins', 'Last day to apply for Study Abroad', 'Last day of Registration;Last day to file ePermit for the Fall 2021;Last day to drop classes for 100% tuition refund;']


In [141]:
# Build the calendar DataFrame indexed by date.
calendar_raw = pd.DataFrame(data)
# For entries written as a "start - end" range keep only the part before
# ' - ', then append the year so the string can be parsed as a full date.
first_day = calendar_raw['date'].str.split(' - ').str[0] + ', 2021'
# format='mixed' lets pandas infer the format of each date string individually.
df = (
    calendar_raw
    .assign(date=pd.to_datetime(first_day, format='mixed'))
    .set_index('date')
)


In [142]:
# Sanity-check the frame built above: its dimensions, the columns left after
# moving the date into the index, and the class of that index.
frame_shape = df.shape
column_names = df.columns.tolist()
index_class = type(df.index)
print(f"DataFrame shape: {frame_shape}")
print(f"DataFrame columns: {column_names}")
print(f"DataFrame index type: {index_class}")

DataFrame shape: (36, 2)
DataFrame columns: ['dow', 'text']
DataFrame index type: <class 'pandas.core.indexes.datetimes.DatetimeIndex'>
