In [18]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os

QUERIED_TEXT = 'events'

# Prep dataframe for all applicable date ranges for which there will be total_{QUERIED_TEXT} count
queried_start_date = pd.to_datetime('2020-01-01') #, format='%Y%m%d')
queried_end_date = pd.to_datetime('2023-06-01') #, format='%Y%m%d')
queried_date_range = pd.date_range(start=queried_start_date, end=queried_end_date)
queried_date_range_df = pd.DataFrame({'date': queried_date_range})
queried_date_range_df[f'total_{QUERIED_TEXT}'] = None

# Set the base URL
base_url = 'https://donyc.com/events/{}/{}/{}?page={}'

# Verify that file exists
csv_filename = f'donyc_{QUERIED_TEXT}.csv'
if os.path.isfile(csv_filename):
    queried_date_range_df = pd.read_csv(csv_filename, parse_dates=['date'])

# Loop through each date in the date range
for i, row in queried_date_range_df.iterrows():

    date = row['date']

    # Check if the 'total_{QUERIED_TEXT}' column is not NaN, if yes then break out of the loop
    if not pd.isna(row[f'total_{QUERIED_TEXT}']):
        print(f'{date} is already populated')
        continue

    # Format the URL with the year, month, day, and page number
    year = date.year
    month = date.month
    day = date.day
    page_num = 1

    # Initialize the count to 0
    count = 0

    # Loop through each page of {QUERIED_TEXT} for the current date
    while True:
        # Make a request to the current page
        url = base_url.format(year, month, day, page_num)
        response = requests.get(url)

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all div elements with a class attribute starting with 'ds-listing event-card'
        event_cards = soup.select('div[class^="ds-listing event-card"]')

        # If no event cards are found, break out of the loop
        if not event_cards:
            break

        # Loop through each event card that matches the specified classes
        for card in event_cards:
            # Find the anchor tag with an href attribute that starts with '/{QUERIED_TEXT}/2023/3/1' and the specified classes
            anchor = card.find('a', href=lambda href: href and href.startswith('/events/{}/{}/{}'.format(year, month, day)), class_='ds-listing-event-title url summary')
            if anchor:
                count += 1

        # Increment the page number and update the URL
        print(page_num)
        page_num += 1

    # Set the count for the current date in the 'total_{QUERIED_TEXT}' column of the DataFrame
    queried_date_range_df.loc[i, f'total_{QUERIED_TEXT}'] = count
    queried_date_range_df.to_csv(csv_filename, index=False)
    print(queried_date_range_df.loc[i])

2020-01-01 00:00:00 is already populated
2020-01-02 00:00:00 is already populated
2020-01-03 00:00:00 is already populated
2020-01-04 00:00:00 is already populated
2020-01-05 00:00:00 is already populated
2020-01-06 00:00:00 is already populated
2020-01-07 00:00:00 is already populated
2020-01-08 00:00:00 is already populated
2020-01-09 00:00:00 is already populated
2020-01-10 00:00:00 is already populated
2020-01-11 00:00:00 is already populated
2020-01-12 00:00:00 is already populated
2020-01-13 00:00:00 is already populated
2020-01-14 00:00:00 is already populated
2020-01-15 00:00:00 is already populated
2020-01-16 00:00:00 is already populated
2020-01-17 00:00:00 is already populated
2020-01-18 00:00:00 is already populated
2020-01-19 00:00:00 is already populated
2020-01-20 00:00:00 is already populated
2020-01-21 00:00:00 is already populated
2020-01-22 00:00:00 is already populated
2020-01-23 00:00:00 is already populated
2020-01-24 00:00:00 is already populated
2020-01-25 00:00

In [43]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os

QUERIED_TEXT = 'music'

# Prepare dataframe for all applicable date ranges for which there will be total_{QUERIED_TEXT} count
queried_start_date = pd.to_datetime('2020-01-01')
queried_end_date = pd.to_datetime('2023-06-01')
queried_date_range = pd.date_range(start=queried_start_date, end=queried_end_date)
queried_date_range_df = pd.DataFrame({'date': queried_date_range})
queried_date_range_df[f'total_{QUERIED_TEXT}'] = None

# Set the base URL
base_url = 'https://donyc.com/events/{}/{}/{}/{}?page={}'

# Verify that file exists
csv_filename = f'donyc_{QUERIED_TEXT}.csv'
if os.path.isfile(csv_filename):
    queried_date_range_df = pd.read_csv(csv_filename, parse_dates=['date'])

# Loop through each date in the date range
for i, row in queried_date_range_df.iterrows():

    continue_for = False

    date = row['date']

    # Check if the 'total_{QUERIED_TEXT}' column is not NaN, if yes then break out of the loop
    if not pd.isna(row[f'total_{QUERIED_TEXT}']):
        print(f'{date} is already populated')
        continue

    # Format the URL with the year, month, day, and page number
    year = date.year
    month = date.month
    day = date.day
    page_num = 1

    # Initialize the count to 0
    count = 0

    # Loop through each page of {QUERIED_TEXT} for the current date
    while True:

        url = base_url.format(QUERIED_TEXT, year, month, day, page_num)

        # Make a request to the current page
        response = requests.get(url)

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all div elements with a class attribute starting with 'ds-listing event-card'
        event_cards = soup.select('div[class^="ds-listing event-card"]')

        # If no event cards are found, break out of the loop
        if not event_cards:
            break

        # Loop through each event card that matches the specified classes
        for card in event_cards:
            # Find the anchor tag with an href attribute that starts with '/events/{}/{}/{}/' and the specified classes
            anchor = card.find('a', href=lambda href: href and href.startswith('/events/{}/{}/{}/'.format(year, month, day)), class_='ds-listing-event-title url summary')
            if anchor:
                count += 1
            else:
                continue_for = True
                break

        # Increment the page number and update the URL
        print(page_num)
        page_num += 1

        if continue_for:
            break

    # Set the count for the current date in the 'total_{QUERIED_TEXT}' column of the DataFrame
    queried_date_range_df.loc[i, f'total_{QUERIED_TEXT}'] = count
    queried_date_range_df.to_csv(csv_filename, index=False)
    print(queried_date_range_df.loc[i])


2020-01-01 00:00:00 is already populated
2020-01-02 00:00:00 is already populated
2020-01-03 00:00:00 is already populated
2020-01-04 00:00:00 is already populated
2020-01-05 00:00:00 is already populated
2020-01-06 00:00:00 is already populated
2020-01-07 00:00:00 is already populated
2020-01-08 00:00:00 is already populated
2020-01-09 00:00:00 is already populated
2020-01-10 00:00:00 is already populated
2020-01-11 00:00:00 is already populated
2020-01-12 00:00:00 is already populated
2020-01-13 00:00:00 is already populated
2020-01-14 00:00:00 is already populated
2020-01-15 00:00:00 is already populated
2020-01-16 00:00:00 is already populated
2020-01-17 00:00:00 is already populated
2020-01-18 00:00:00 is already populated
2020-01-19 00:00:00 is already populated
2020-01-20 00:00:00 is already populated
2020-01-21 00:00:00 is already populated
2020-01-22 00:00:00 is already populated
2020-01-23 00:00:00 is already populated
2020-01-24 00:00:00 is already populated
2020-01-25 00:00

Madison Square Garden Scraper

In [44]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
from datetime import date

QUERIED_TEXT = 'madison-square-garden'

# Prep dataframe for all applicable date ranges for which there will be total_{QUERIED_TEXT} count
queried_start_date = pd.to_datetime('2020-01-01') #, format='%Y%m%d')
queried_current_date = date.today()
queried_end_date = pd.to_datetime('2023-06-01') #, format='%Y%m%d')

queried_past_date_range = pd.date_range(start=queried_start_date, end=queried_current_date)
queried_past_date_df = pd.DataFrame({'date': queried_past_date_range})
queried_past_date_df[f'total_{QUERIED_TEXT}_events'] = None
queried_past_date_df = queried_past_date_df[::-1]
queried_past_date_df.reset_index(inplace=True, drop=True)
queried_past_date_df

queried_future_date_range = pd.date_range(start=queried_current_date, end=queried_end_date)
queried_future_date_df = pd.DataFrame({'date': queried_future_date_range})
queried_future_date_df[f'total_{QUERIED_TEXT}_events'] = None
queried_future_date_df

# Set the base URL
past_base_url = 'https://donyc.com/venues/{}/past_events?page={}'
future_base_url = 'https://donyc.com/venues/{}?page={}'

# Verify that file exists
csv_filename = f'donyc_{QUERIED_TEXT}.csv'
if os.path.isfile(csv_filename):
    venue_dates_df = pd.read_csv(csv_filename, parse_dates=['date'])
    queried_past_date_df = venue_dates_df[venue_dates_df['date'].date() < queried_current_date][::-1]
    # queried_past_date_df = venue_dates_df[venue_dates_df['date'] < queried_current_date].sort_values('date', ascending=False)
    queried_past_date_df.reset_index(inplace=True, drop=True)
    queried_future_date_df = venue_dates_df[venue_dates_df['date'] >= queried_current_date]


In [51]:

# Prepare indexing variables
increment_date = True
increment_url = True
page_num = 1
count = 0
queried_past_date_df_index = 0
logic_complete = False

# Set date info
_date = queried_past_date_df.loc[queried_past_date_df_index, 'date']
year = _date.year
month = _date.month
day = _date.day

# Reminder, in the past loop, the "newer"Dates in code have lower date values
while True:
    # Set key to complete loop

        # if increment_date:
    # # Set date info
    # _date = queried_past_date_df.loc[queried_past_date_df_index, 'date']
    # year = _date.year
    # month = _date.month
    # day = _date.day
        # reset_date = False

        # if increment_url:
    # Set card info
    url = past_base_url.format(QUERIED_TEXT, page_num)
    print(f'{url}')
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    event_cards = soup.select('div[class^="ds-listing event-card"]')
        # reset_url = False

        # While True, loop through every event_cards
        # while True:

    for card in event_cards:
        href = card.find('a')['href']
        year, month, day = [int(x) for x in href.split('/')[2:5] if x.isdigit()]
        event_date = date(year, month, day)
        print(f'card date {event_date}')
        print(f'{_date} of count {count}')

        # If the newest card date value is lower than queried_start_date then initiate emergency brakes
        if event_date < queried_start_date.date():
            logic_complete = True
            break

        # If event_date matches index date, then increment count
        if event_date == _date.date():
            count += 1
    
        # Technically bottom two logics can be combined as !=
        # If event_date is less than index date, add a new count entry for _date and move on
        elif event_date < _date.date():
            queried_past_date_df[queried_past_date_df_index, f'total_{QUERIED_TEXT}_events'] = count

            # While inefficient to sort every time, it allows for pause and resumption of downloads effortlessly
            pd.concat([queried_past_date_df, queried_future_date_df]).sort_values('date', ascending=True).to_csv(csv_filename, index=False)
            
            queried_past_date_df_index += 1
            count = 0

            _date = queried_past_date_df.loc[queried_past_date_df_index, 'date']
            year = _date.year
            month = _date.month
            day = _date.day

        # If event_date is greater than index date, add a new 0 entry for _date and move on
        elif event_date > _date.date():
            queried_past_date_df[queried_past_date_df_index, f'total_{QUERIED_TEXT}_events'] = count
            
            # While inefficient to sort every time, it allows for pause and resumption of downloads effortlessly
            pd.concat([queried_past_date_df, queried_future_date_df]).sort_values('date', ascending=True).to_csv(csv_filename, index=False)
            
            queried_past_date_df_index += 1
            count = 0

            _date = queried_past_date_df.loc[queried_past_date_df_index, 'date']
            year = _date.year
            month = _date.month
            day = _date.day

    if logic_complete:
        break

    # one page is complete, run the next page
    print(page_num)
    page_num += 1

card date 2023-03-02
2023-03-03 00:00:00 of count 0
card date 2023-03-01
2023-03-02 00:00:00 of count 0
card date 2023-02-27
2023-03-01 00:00:00 of count 0
card date 2023-02-26
2023-02-28 00:00:00 of count 0
card date 2023-02-25
2023-02-27 00:00:00 of count 0
card date 2023-02-24
2023-02-26 00:00:00 of count 0
card date 2023-02-21
2023-02-25 00:00:00 of count 0
card date 2023-02-20
2023-02-24 00:00:00 of count 0
card date 2023-02-18
2023-02-23 00:00:00 of count 0
card date 2023-02-14
2023-02-22 00:00:00 of count 0
card date 2023-02-13
2023-02-21 00:00:00 of count 0
card date 2023-02-11
2023-02-20 00:00:00 of count 0
card date 2023-02-10
2023-02-19 00:00:00 of count 0
card date 2023-02-09
2023-02-18 00:00:00 of count 0
card date 2023-02-08
2023-02-17 00:00:00 of count 0
card date 2023-02-06
2023-02-16 00:00:00 of count 0
card date 2023-02-05
2023-02-15 00:00:00 of count 0
card date 2023-02-04
2023-02-14 00:00:00 of count 0
card date 2023-02-02
2023-02-13 00:00:00 of count 0
card date 20