In [3]:
# William Ng
# When using JupyterLab, it's important to install your kernel and libraries properly so you won't run into errors.
# anaconda tutorial
# conda create --name MyEnvName -c conda-forge python   (optionally pin a version, e.g. python=3.11)
# conda activate MyEnvName
# conda install -c conda-forge jupyterlab
# python -m ipykernel install --user --name=MyEnvName
# jupyter lab

In [4]:
# initialize and import required dependencies and library
import pandas as pd
from bs4 import BeautifulSoup

In [5]:
from datetime import datetime, timedelta
import requests

In [6]:
# Download the CCNY registrar's Fall calendar page.
url = "https://www.ccny.cuny.edu/registrar/fall"
# Without a timeout, requests waits indefinitely if the server stalls.
r = requests.get(url, timeout=30)
# Stop here on a 4xx/5xx response instead of parsing an error page below.
r.raise_for_status()

# Parse the response body with Python's built-in HTML parser.
soup = BeautifulSoup(r.text, 'html.parser')
# Pretty-printed copy of the markup, kept for manual inspection.
gethtml = soup.prettify()
# print(gethtml)

In [7]:
# Take the first <table> element on the page; the following cells read the
# calendar rows out of it.
calender = soup.find('table')
# find() returns None when nothing matches. Fail here with a clear message
# rather than with an AttributeError in the next cell.
if calender is None:
    raise ValueError("No <table> element found on the page; its layout may have changed.")

In [8]:
# Each table row is a <tr> element; find_all returns all of them as a list
# that the parsing cell below walks through.
rows = calender.find_all(name='tr')

In [25]:
# Year assumed for any date cell that does not spell out its own year.
DEFAULT_YEAR = 2021
# Layout of a fully written-out date, e.g. "January 1, 2022".
DATE_FORMAT = "%B %d, %Y"


def _parse_calendar_date(label, fallback_year, fallback_month=None):
    """Turn one date label into a ``datetime.date``.

    Accepts "January 1, 2022" (year included), "August 25" (no year), and,
    when ``fallback_month`` is given, a day-only label such as "31" or
    "31, 2021" (the tail of a range like "August 25 - 31").

    Args:
        label: Date text; surrounding whitespace is ignored.
        fallback_year: Year used when ``label`` does not contain one.
        fallback_month: Month name prepended when ``label`` starts with a digit.

    Returns:
        The parsed ``datetime.date``.

    Raises:
        ValueError: if the label fits neither "Month day, year" nor "Month day".
    """
    label = label.strip()
    if fallback_month is not None and label[:1].isdigit():
        label = fallback_month + ' ' + label
    try:
        # Prefer the label's own year when it has one.
        return datetime.strptime(label, DATE_FORMAT).date()
    except ValueError:
        return datetime.strptime(label + ", " + str(fallback_year), DATE_FORMAT).date()


def _calendar_entries(date_label, dow_label, description, fallback_year):
    """Build the record(s) for one table row.

    A single-date label produces one record that keeps the table's own
    day-of-week text. A range label ("August 25 - 31", "August 30 - September 2",
    with or without spaces around the dash) produces one record per day, both
    ends included, with the day of the week computed from each date.

    Args:
        date_label: Text of the row's date cell.
        dow_label: Text of the row's day-of-week cell.
        description: Cleaned text of the row's description cell.
        fallback_year: Year used for dates that do not contain one.

    Returns:
        A list of dicts with the keys "date", "dow" and "text".

    Raises:
        ValueError: if a date cannot be parsed or a range ends before it starts.
    """
    # Treat en/em dashes exactly like a plain hyphen.
    normalized = date_label.replace('\u2013', '-').replace('\u2014', '-')
    start_text, dash, end_text = normalized.partition('-')
    first_day = _parse_calendar_date(start_text, fallback_year)

    if not dash:
        return [{"date": first_day, "dow": dow_label, "text": description}]

    # A day-only end ("31") borrows the month named at the start of the range.
    start_month = start_text.split()[0]
    last_day = _parse_calendar_date(end_text, first_day.year, start_month)
    if last_day < first_day:
        # An end without its own year that lands before the start is read as
        # wrapping into the next year (e.g. "December 27 - January 2"). An end
        # with an explicit year is unaffected by this retry.
        last_day = _parse_calendar_date(end_text, first_day.year + 1, start_month)
    if last_day < first_day:
        raise ValueError("Date range ends before it starts: " + repr(date_label))

    span_days = (last_day - first_day).days
    return [
        {"date": day, "dow": day.strftime('%A'), "text": description}
        for day in (first_day + timedelta(days=offset) for offset in range(span_days + 1))
    ]


# Records collected here become the rows of the DataFrame built later.
calenderData = []
for row in rows:
    cols = row.find_all('td')
    # A row needs date / day-of-week / description cells; anything shorter
    # (e.g. a header row made of <th> cells) is skipped.
    if len(cols) < 3:
        continue

    date_str = cols[0].text.strip()
    dow_str = cols[1].text.strip()
    text_str = cols[2].text.strip()

    # Replace embedded newlines and tabs in the description with spaces.
    text_str = text_str.replace('\n', ' ').replace('\t', ' ').strip()

    calenderData.extend(_calendar_entries(date_str, dow_str, text_str, DEFAULT_YEAR))

In [33]:
# Build the calendar DataFrame from the collected records, use the 'date'
# column as the index, and order the rows by that index.
df = pd.DataFrame(calenderData).set_index('date').sort_index()

In [34]:
# First rows of the calendar; a bare last expression gets the notebook's rich table display.
df.head()

                  dow                                               text
date                                                                    
2021-08-01     Sunday  Application for degree for January and Februar...
2021-08-18  Wednesday                 Last day to apply for Study Abroad
2021-08-24    Tuesday  Last day of Registration;    Last day to file ...
2021-08-25  Wednesday  Start of Fall Term;    Classes begin;    Initi...
2021-08-25  Wednesday          Change of program period; late fees apply


In [35]:
# Last rows of the calendar; a bare last expression gets the notebook's rich table display.
df.tail()

                 dow                                               text
date                                                                   
2021-12-25  Saturday                                     College Closed
2021-12-27    Monday                                     College Closed
2021-12-28   Tuesday      Final Grade Submission Deadline for Fall 2021
2021-12-31    Friday                                     College Closed
2022-01-01  Saturday  College Closed;    Fall 2021 Degree Conferral ...


In [36]:
# Inspect the index (the 'date' values); the notebook shows the last expression without print().
df.index

Index([2021-08-01, 2021-08-18, 2021-08-24, 2021-08-25, 2021-08-25, 2021-08-26,
       2021-08-26, 2021-08-27, 2021-08-28, 2021-08-28, 2021-08-29, 2021-08-30,
       2021-08-31, 2021-08-31, 2021-09-01, 2021-09-03, 2021-09-04, 2021-09-05,
       2021-09-06, 2021-09-06, 2021-09-07, 2021-09-08, 2021-09-09, 2021-09-14,
       2021-09-15, 2021-09-15, 2021-09-16, 2021-09-23, 2021-09-24, 2021-10-01,
       2021-10-08, 2021-10-11, 2021-11-01, 2021-11-02, 2021-11-04, 2021-11-06,
       2021-11-23, 2021-11-25, 2021-11-26, 2021-11-27, 2021-11-28, 2021-12-11,
       2021-12-13, 2021-12-14, 2021-12-15, 2021-12-16, 2021-12-17, 2021-12-18,
       2021-12-19, 2021-12-20, 2021-12-21, 2021-12-21, 2021-12-24, 2021-12-25,
       2021-12-27, 2021-12-28, 2021-12-31, 2022-01-01],
      dtype='object', name='date')


In [37]:
# Inspect the column labels; the notebook shows the last expression without print().
df.columns

Index(['dow', 'text'], dtype='object')


In [38]:
# info() writes its summary to stdout itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 58 entries, 2021-08-01 to 2022-01-01
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   dow     58 non-null     object
 1   text    58 non-null     object
dtypes: object(2)
memory usage: 1.4+ KB
None


In [39]:
# Show a bounded preview instead of dumping the whole frame.
df.head(10)

Unnamed: 0_level_0,dow,text
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-08-01,Sunday,Application for degree for January and Februar...
2021-08-18,Wednesday,Last day to apply for Study Abroad
2021-08-24,Tuesday,Last day of Registration; Last day to file ...
2021-08-25,Wednesday,Start of Fall Term; Classes begin; Initi...
2021-08-25,Wednesday,Change of program period; late fees apply
2021-08-26,Thursday,Change of program period; late fees apply
2021-08-26,Thursday,Last day for Independent Study
2021-08-27,Friday,Change of program period; late fees apply
2021-08-28,Saturday,Change of program period; late fees apply
2021-08-28,Saturday,First day of Saturday Classes


In [40]:
# Write the calendar (index included) to a CSV file in the working directory.
df.to_csv(path_or_buf='cuny_fall_2021_calendar.csv')