In [10]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
from datetime import date, timedelta

QUERIED_TEXT = 'madison-square-garden'

# Prep dataframe for all applicable date ranges for which there will be total_{QUERIED_TEXT} count
queried_start_date = pd.to_datetime('2020-01-01')
queried_current_date = date.today()
queried_end_date = pd.to_datetime('2023-06-01')

queried_past_date_range = pd.date_range(start=queried_start_date, end=queried_current_date - timedelta(days=1))
queried_past_date_df = pd.DataFrame({'date': queried_past_date_range})
queried_past_date_df[f'{QUERIED_TEXT}_event_occurred'] = 0
queried_past_date_df = queried_past_date_df[::-1]
queried_past_date_df.reset_index(inplace=True, drop=True)
# print(queried_past_date_df)

queried_future_date_range = pd.date_range(start=queried_current_date, end=queried_end_date)
queried_future_date_df = pd.DataFrame({'date': queried_future_date_range})
queried_future_date_df[f'{QUERIED_TEXT}_event_occurred'] = 0
# print(queried_future_date_df)

# Set the base URL
past_base_url = 'https://donyc.com/venues/{}/past_events?page={}'
future_base_url = 'https://donyc.com/venues/{}?page={}'

# # Verify that file exists
csv_filename = f'donyc_{QUERIED_TEXT}.csv'
# Prepare indexing variables
page_num = 1
logic_complete = False


# Reminder, in the past loop, the "newer" Dates in code have lower date values
while True:
    url = past_base_url.format(QUERIED_TEXT, page_num)
    print(f'{url}')
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    event_cards = soup.select('div[class^="ds-listing event-card"]')

    for card in event_cards:
        href = card.find('a')['href']
        year, month, day = [int(x) for x in href.split('/')[2:5] if x.isdigit()]
        event_date = pd.to_datetime(date(year, month, day))
        print(event_date)

        if event_date < queried_start_date.date():
            logic_complete = True
            break

        queried_past_date_df.loc[queried_past_date_df['date'] == event_date, f'{QUERIED_TEXT}_event_occurred'] = 1
        pd.concat([queried_past_date_df, queried_future_date_df]).sort_values('date', ascending=True).to_csv(csv_filename, index=False)

    if logic_complete:
        break

    # one page is complete, run the next page
    print(page_num)
    page_num += 1

# Reminder, in the future loop, the "newer" Dates in code have higher date values
while True:
    url = future_base_url.format(QUERIED_TEXT, page_num)
    print(f'{url}')
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    event_cards = soup.select('div[class^="ds-listing event-card"]')

    for card in event_cards:
        href = card.find('a')['href']
        year, month, day = [int(x) for x in href.split('/')[2:5] if x.isdigit()]
        event_date = pd.to_datetime(date(year, month, day))
        print(event_date)
        
        if event_date > queried_end_date.date():
            logic_complete = True
            break

        queried_future_date_df.loc[queried_future_date_df['date'] == event_date, f'{QUERIED_TEXT}_event_occurred'] = 1
        pd.concat([queried_past_date_df, queried_future_date_df]).sort_values('date', ascending=True).to_csv(csv_filename, index=False)

    if logic_complete:
        break

    # one page is complete, run the next page
    print(page_num)
    page_num += 1


https://donyc.com/venues/madison-square-garden/past_events?page=1
2023-03-02 00:00:00
2023-03-01 00:00:00
2023-02-27 00:00:00
2023-02-26 00:00:00
2023-02-25 00:00:00
2023-02-24 00:00:00
2023-02-21 00:00:00
2023-02-20 00:00:00
2023-02-18 00:00:00
2023-02-14 00:00:00
2023-02-13 00:00:00
2023-02-11 00:00:00
2023-02-10 00:00:00
2023-02-09 00:00:00
2023-02-08 00:00:00
2023-02-06 00:00:00
2023-02-05 00:00:00
2023-02-04 00:00:00
2023-02-02 00:00:00
2023-01-31 00:00:00
2023-01-29 00:00:00
2023-01-28 00:00:00


  if event_date < queried_start_date.date():


2023-01-27 00:00:00
2023-01-24 00:00:00
2023-01-23 00:00:00
1
https://donyc.com/venues/madison-square-garden/past_events?page=2
2023-01-20 00:00:00
2023-01-19 00:00:00
2023-01-18 00:00:00
2023-01-16 00:00:00
2023-01-15 00:00:00
2023-01-13 00:00:00
2023-01-12 00:00:00
2023-01-11 00:00:00
2023-01-10 00:00:00
2023-01-09 00:00:00
2023-01-08 00:00:00
2023-01-07 00:00:00
2023-01-06 00:00:00
2023-01-04 00:00:00
2023-01-03 00:00:00
2023-01-02 00:00:00
2022-12-31 00:00:00
2022-12-30 00:00:00
2022-12-29 00:00:00
2022-12-28 00:00:00
2022-12-27 00:00:00
2022-12-26 00:00:00
2022-12-26 00:00:00
2022-12-25 00:00:00
2022-12-23 00:00:00
2
https://donyc.com/venues/madison-square-garden/past_events?page=3
2022-12-22 00:00:00
2022-12-21 00:00:00
2022-12-20 00:00:00
2022-12-19 00:00:00
2022-12-17 00:00:00
2022-12-16 00:00:00
2022-12-15 00:00:00
2022-12-14 00:00:00
2022-12-13 00:00:00
2022-12-12 00:00:00
2022-12-11 00:00:00
2022-12-10 00:00:00
2022-12-09 00:00:00
2022-12-07 00:00:00
2022-12-06 00:00:00
2022

KeyboardInterrupt: 