In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import logging
import re

logging.basicConfig(level=logging.WARNING)

def fetch_page(url):
    """Fetches the webpage content from the given URL."""
    response = requests.get(url)
    if response.status_code == 200:
        return response.content
    else:
        logging.error(f"Failed to fetch page. Status code: {response.status_code}")
        return None

def extract_dates_and_events(soup):
    """Extracts dates and corresponding events from <p> tags."""
    p_tags = soup.find_all('p')
    date_event_map = {}

    current_date = None
    range_start_date = None

    for p in p_tags:
        text = p.get_text(strip=True)

        if not text:
            continue

        date_match = re.match(r'(\w+ \d{1,2}(?: - \d{1,2})?)', text)
        if date_match:
            if range_start_date:
                # If there's an ongoing date range, finalize it
                range_end_date = date_match.group(1)
                if range_start_date:
                    full_range = f"{range_start_date} - {range_end_date}"
                    range_start = datetime.strptime(range_start_date + " 2021", '%B %d %Y')
                    range_end = datetime.strptime(range_end_date + " 2021", '%B %d %Y')
                    day_of_week_start = range_start.strftime('%A')
                    if full_range not in date_event_map:
                        date_event_map[full_range] = {
                            'dow': f"{day_of_week_start} - {range_end.strftime('%A')}",
                            'events': []
                        }
                    date_event_map[full_range]['events'].append(text)
                range_start_date = None

            current_date = date_match.group(1)
            if " - " in current_date:
                range_start_date = current_date.split(' - ')[0]
                date_event_map[current_date] = {
                    'dow': '',
                    'events': []
                }
            else:
                month_day = datetime.strptime(current_date, '%B %d')
                year = 2021 if month_day.month >= 7 else 2022
                date = datetime.strptime(current_date + f" {year}", '%B %d %Y')
                date_event_map[current_date] = {
                    'date_obj': date,
                    'dow': date.strftime('%A'),
                    'events': []
                }
        elif current_date:
            if current_date in date_event_map:
                cleaned_text = re.sub(r'^[A-Za-z]+;\s*', '', text)  
                cleaned_text = re.sub(rf"{date_event_map[current_date]['dow']}\s*-*\s*", '', cleaned_text)
                date_event_map[current_date]['events'].append(cleaned_text)

    # Finalize the last range if exists
    if range_start_date and current_date:
        full_range = f"{range_start_date} - {current_date}"
        if full_range not in date_event_map:
            range_start = datetime.strptime(range_start_date + " 2021", '%B %d %Y')
            range_end = datetime.strptime(current_date + " 2021", '%B %d %Y')
            date_event_map[full_range] = {
                'dow': f"{range_start.strftime('%A')} - {range_end.strftime('%A')}",
                'events': []
            }

    return date_event_map

def create_dataframe(date_event_map):
    """Creates a DataFrame from extracted dates and events."""
    records = []

    for date, info in date_event_map.items():
        events = "; ".join(info['events'])  
        formatted_date = info['date_obj'].strftime('%Y-%m-%d') if 'date_obj' in info else None
        records.append({
            'date': formatted_date,  
            'dow': info['dow'],
            'text': events 
        })
    
    df = pd.DataFrame(records)
    df = df.dropna(subset=['date'])  
    df['date'] = pd.to_datetime(df['date'])
    df.set_index('date', inplace=True)
    
    return df

def main():
    url = 'https://www.ccny.cuny.edu/registrar/fall'
    page_content = fetch_page(url)
    
    if page_content:
        soup = BeautifulSoup(page_content, 'html.parser')
        date_event_map = extract_dates_and_events(soup)
        df = create_dataframe(date_event_map)

        print(df)
        df.to_csv('cuny_fall_2021_2022_calendar.csv', index=True)

if __name__ == "__main__":
    main()


                  dow                                               text
date                                                                    
2021-08-01     Sunday  ; Application for degree for January and Febru...
2021-08-18  Wednesday               ; Last day to apply for Study Abroad
2021-08-24    Tuesday  ; Last day of Registration;Last day to file eP...
2021-08-25  Wednesday  ; Start of Fall Term;Classes begin;Initial Reg...
2021-08-26   Thursday                   ; Last day for Independent Study
2021-08-28   Saturday                             ; First day of Classes
2021-08-31    Tuesday  ; Last day to add a class to an existing enrol...
2021-09-01  Wednesday  ; Verification of Enrollment rosters available...
2021-09-09   Thursday                                                   
2021-09-14    Tuesday  ; Last day for 25% tuition refund;Census date;...
2021-09-15  Wednesday  ; Assignment of 'WN' grades for non-attendance...
2021-09-23   Thursday  ; Last day to submit proof o