# All Events

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os

QUERIED_TEXT = 'events'

# Prep dataframe for all applicable date ranges for which there will be total_{QUERIED_TEXT} count
queried_start_date = pd.to_datetime('2020-01-01') #, format='%Y%m%d')
queried_end_date = pd.to_datetime('2023-06-01') #, format='%Y%m%d')
queried_date_range = pd.date_range(start=queried_start_date, end=queried_end_date)
queried_date_range_df = pd.DataFrame({'date': queried_date_range})
queried_date_range_df[f'total_{QUERIED_TEXT}'] = None

# Set the base URL
base_url = 'https://donyc.com/events/{}/{}/{}?page={}'

# Verify that file exists
csv_filename = f'donyc_{QUERIED_TEXT}.csv'
if os.path.isfile(csv_filename):
    queried_date_range_df = pd.read_csv(csv_filename, parse_dates=['date'])

# Loop through each date in the date range
for i, row in queried_date_range_df.iterrows():

    date = row['date']

    # Check if the 'total_{QUERIED_TEXT}' column is not NaN, if yes then break out of the loop
    if not pd.isna(row[f'total_{QUERIED_TEXT}']):
        print(f'{date} is already populated')
        continue

    # Format the URL with the year, month, day, and page number
    year = date.year
    month = date.month
    day = date.day
    page_num = 1

    # Initialize the count to 0
    count = 0

    # Loop through each page of {QUERIED_TEXT} for the current date
    while True:
        # Make a request to the current page
        url = base_url.format(year, month, day, page_num)
        response = requests.get(url)

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all div elements with a class attribute starting with 'ds-listing event-card'
        event_cards = soup.select('div[class^="ds-listing event-card"]')

        # If no event cards are found, break out of the loop
        if not event_cards:
            break

        # Loop through each event card that matches the specified classes
        for card in event_cards:
            # Find the anchor tag with an href attribute that starts with '/{QUERIED_TEXT}/2023/3/1' and the specified classes
            anchor = card.find('a', href=lambda href: href and href.startswith('/events/{}/{}/{}'.format(year, month, day)), class_='ds-listing-event-title url summary')
            if anchor:
                count += 1

        # Increment the page number and update the URL
        print(page_num)
        page_num += 1

    # Set the count for the current date in the 'total_{QUERIED_TEXT}' column of the DataFrame
    queried_date_range_df.loc[i, f'total_{QUERIED_TEXT}'] = count
    queried_date_range_df.to_csv(csv_filename, index=False)
    print(queried_date_range_df.loc[i])

# Music

### QUERIED_TEXT can be updated to comedy, film-screenings, default

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os

QUERIED_TEXT = 'music'

# Prepare dataframe for all applicable date ranges for which there will be total_{QUERIED_TEXT} count
queried_start_date = pd.to_datetime('2020-01-01')
queried_end_date = pd.to_datetime('2023-06-01')
queried_date_range = pd.date_range(start=queried_start_date, end=queried_end_date)
queried_date_range_df = pd.DataFrame({'date': queried_date_range})
queried_date_range_df[f'total_{QUERIED_TEXT}'] = None

# Set the base URL
base_url = 'https://donyc.com/events/{}/{}/{}/{}?page={}'

# Verify that file exists
csv_filename = f'donyc_{QUERIED_TEXT}.csv'
if os.path.isfile(csv_filename):
    queried_date_range_df = pd.read_csv(csv_filename, parse_dates=['date'])

# Loop through each date in the date range
for i, row in queried_date_range_df.iterrows():

    continue_for = False

    date = row['date']

    # Check if the 'total_{QUERIED_TEXT}' column is not NaN, if yes then break out of the loop
    if not pd.isna(row[f'total_{QUERIED_TEXT}']):
        print(f'{date} is already populated')
        continue

    # Format the URL with the year, month, day, and page number
    year = date.year
    month = date.month
    day = date.day
    page_num = 1

    # Initialize the count to 0
    count = 0

    # Loop through each page of {QUERIED_TEXT} for the current date
    while True:

        url = base_url.format(QUERIED_TEXT, year, month, day, page_num)

        # Make a request to the current page
        response = requests.get(url)

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all div elements with a class attribute starting with 'ds-listing event-card'
        event_cards = soup.select('div[class^="ds-listing event-card"]')

        # If no event cards are found, break out of the loop
        if not event_cards:
            break

        # Loop through each event card that matches the specified classes
        for card in event_cards:
            # Find the anchor tag with an href attribute that starts with '/events/{}/{}/{}/' and the specified classes
            anchor = card.find('a', href=lambda href: href and href.startswith('/events/{}/{}/{}/'.format(year, month, day)), class_='ds-listing-event-title url summary')
            if anchor:
                count += 1
            else:
                continue_for = True
                break

        # Increment the page number and update the URL
        print(page_num)
        page_num += 1

        if continue_for:
            break

    # Set the count for the current date in the 'total_{QUERIED_TEXT}' column of the DataFrame
    queried_date_range_df.loc[i, f'total_{QUERIED_TEXT}'] = count
    queried_date_range_df.to_csv(csv_filename, index=False)
    print(queried_date_range_df.loc[i])


# Madison Square Garden Scraper
### QUERIED_TEXT = 'madison-square-garden' can be changed with other venues

In [56]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
from datetime import date, timedelta

QUERIED_TEXT = 'madison-square-garden'
csv_filename = f'donyc_{QUERIED_TEXT}.csv'

# Prep dataframe for all applicable date ranges for which there will be total_{QUERIED_TEXT} count
queried_start_date = pd.to_datetime('2020-01-01')
queried_current_date = date.today()
queried_end_date = pd.to_datetime('2023-06-01')

queried_past_date_range = pd.date_range(start=queried_start_date, end=queried_current_date - timedelta(days=1))
queried_past_date_df = pd.DataFrame({'date': queried_past_date_range})
queried_past_date_df[f'{QUERIED_TEXT}_event_occurred'] = 0
queried_past_date_df = queried_past_date_df[::-1]
queried_past_date_df.reset_index(inplace=True, drop=True)

queried_future_date_range = pd.date_range(start=queried_current_date, end=queried_end_date)
queried_future_date_df = pd.DataFrame({'date': queried_future_date_range})
queried_future_date_df[f'{QUERIED_TEXT}_event_occurred'] = 0

# Set the base URL
past_base_url = 'https://donyc.com/venues/{}/past_events?page={}'
future_base_url = 'https://donyc.com/venues/{}?page={}'

# Reminder, in the past loop, the "newer" Dates in code have lower date values
page_num = 1
logic_complete = False
while True:
    url = past_base_url.format(QUERIED_TEXT, page_num)
    print(f'{url}')
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    event_cards = soup.select('div[class^="ds-listing event-card"]')

    for card in event_cards:
        href = card.find('a')['href']
        year, month, day = [int(x) for x in href.split('/')[2:5] if x.isdigit()]
        event_date = pd.to_datetime(date(year, month, day))
        print(event_date)

        if event_date < queried_start_date.date():
            logic_complete = True
            break

        queried_past_date_df.loc[queried_past_date_df['date'] == event_date, f'{QUERIED_TEXT}_event_occurred'] = 1
        pd.concat([queried_past_date_df, queried_future_date_df]).sort_values('date', ascending=True).to_csv(csv_filename, index=False)

    if logic_complete:
        break

    # one page is complete, run the next page
    print(page_num)
    page_num += 1

# Reminder, in the future loop, the "newer" Dates in code have higher date values
page_num = 1
logic_complete = False
while True:
    url = future_base_url.format(QUERIED_TEXT, page_num)
    print(f'{url}')
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    event_cards = soup.select('div[class^="ds-listing event-card"]')

    for card in event_cards:
        href = card.find('a')['href']
        year, month, day = [int(x) for x in href.split('/')[2:5] if x.isdigit()]
        event_date = pd.to_datetime(date(year, month, day))
        print(event_date)
        
        if event_date > queried_end_date.date():
            logic_complete = True
            break

        queried_future_date_df.loc[queried_future_date_df['date'] == event_date, f'{QUERIED_TEXT}_event_occurred'] = 1
        pd.concat([queried_past_date_df, queried_future_date_df]).sort_values('date', ascending=True).to_csv(csv_filename, index=False)

    if logic_complete:
        break

    # one page is complete, run the next page
    print(page_num)
    page_num += 1


https://donyc.com/venues/madison-square-garden/past_events?page=1
2023-03-02 00:00:00
2023-03-01 00:00:00
2023-02-27 00:00:00
2023-02-26 00:00:00
2023-02-25 00:00:00
2023-02-24 00:00:00
2023-02-21 00:00:00
2023-02-20 00:00:00
2023-02-18 00:00:00
2023-02-14 00:00:00
2023-02-13 00:00:00
2023-02-11 00:00:00
2023-02-10 00:00:00
2023-02-09 00:00:00
2023-02-08 00:00:00
2023-02-06 00:00:00
2023-02-05 00:00:00
2023-02-04 00:00:00
2023-02-02 00:00:00
2023-01-31 00:00:00
2023-01-29 00:00:00


  if event_date < queried_start_date.date():


2023-01-28 00:00:00
2023-01-27 00:00:00
2023-01-24 00:00:00
2023-01-23 00:00:00
1
https://donyc.com/venues/madison-square-garden/past_events?page=2
2023-01-20 00:00:00
2023-01-19 00:00:00
2023-01-18 00:00:00
2023-01-16 00:00:00
2023-01-15 00:00:00
2023-01-13 00:00:00
2023-01-12 00:00:00
2023-01-11 00:00:00
2023-01-10 00:00:00
2023-01-09 00:00:00
2023-01-08 00:00:00
2023-01-07 00:00:00
2023-01-06 00:00:00
2023-01-04 00:00:00
2023-01-03 00:00:00
2023-01-02 00:00:00
2022-12-31 00:00:00
2022-12-30 00:00:00
2022-12-29 00:00:00
2022-12-28 00:00:00
2022-12-27 00:00:00
2022-12-26 00:00:00
2022-12-26 00:00:00
2022-12-25 00:00:00
2022-12-23 00:00:00
2
https://donyc.com/venues/madison-square-garden/past_events?page=3
2022-12-22 00:00:00
2022-12-21 00:00:00
2022-12-20 00:00:00
2022-12-19 00:00:00
2022-12-17 00:00:00
2022-12-16 00:00:00
2022-12-15 00:00:00
2022-12-14 00:00:00
2022-12-13 00:00:00
2022-12-12 00:00:00
2022-12-11 00:00:00
2022-12-10 00:00:00
2022-12-09 00:00:00
2022-12-07 00:00:00
2022

  if event_date > queried_end_date.date():


2023-03-29 00:00:00
1
https://donyc.com/venues/madison-square-garden?page=2
2023-04-01 00:00:00
2023-04-02 00:00:00
2023-04-05 00:00:00
2023-04-09 00:00:00
2023-04-10 00:00:00
2023-04-13 00:00:00
2023-04-14 00:00:00
2023-04-15 00:00:00
2023-04-22 00:00:00
2023-04-28 00:00:00
2023-05-05 00:00:00
2023-05-06 00:00:00
2023-05-09 00:00:00
2023-05-10 00:00:00
2023-05-19 00:00:00
2023-05-28 00:00:00
2023-05-30 00:00:00
2023-05-31 00:00:00
2023-06-02 00:00:00
