### Scraping Exhibition Information from the MET's Website

In [None]:
# Import packages (requests and beautifulsoup4 must already be installed)

import requests
from bs4 import BeautifulSoup
import csv
import time

In [2]:
# Origin of the museum's website; root-relative hrefs scraped from its pages
# are appended to this to form absolute URLs.
base_url = "https://www.metmuseum.org"
# Listing page for past exhibitions, built on top of the site origin.
exhibitions_base_url = base_url + "/exhibitions/past"

In [3]:
# Exhibition years to scrape: 2015 through 2024 inclusive (a 10-year window
# chosen for the sake of the test). range() excludes its stop value, hence
# the explicit "+ 1" on the final year.
years = range(2015, 2024 + 1)

In [4]:
# Scrape the past-exhibitions listing for each year in `years` and write one CSV
# row per exhibition card: year, title, duration text, detail-page URL, and the
# description text pulled from that detail page.

_REQUEST_TIMEOUT = 30  # seconds; requests waits indefinitely when no timeout is given
_DETAIL_DELAY = 1  # seconds to pause after each detail-page request

# Elements removed from the main content container before its text is collected.
_UNWANTED_SELECTORS = (
    "audio",  # <audio> tags
    "iframe",  # <iframe> tags
    "div.image_imageWrapper__uRT0B",  # image wrappers
    "div.audio-player_transcriptSection__AdfMG",  # transcript sections
)


def _fetch_soup(url):
    """Download *url* and return it parsed with BeautifulSoup, or None on failure.

    Connection errors, timeouts and HTTP error statuses are reported with
    print() and converted to None, so a single bad page cannot abort the run
    and an error page is never parsed as if it were real content.
    """
    try:
        response = requests.get(url, timeout=_REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Skipping {url}: {exc}")
        return None
    return BeautifulSoup(response.text, "html.parser")


def _detail_url_for(card):
    """Return the detail-page URL linked from *card*, or None if it has no link."""
    link_tag = card.find("a", class_="redundant-link_redundantlink__b5TFR")
    href = link_tag.get("href") if link_tag else None
    if not href:
        return None
    # Root-relative links need the site origin; anything else is used as given.
    return base_url + href if href.startswith("/") else href


def _extract_description(detail_soup):
    """Return the description text found in *detail_soup*.

    Two page layouts are tried in order: the paragraphs inside the
    "content-split_main__DCqV5" container (after stripping media and
    transcript elements), then the text of every "rich-text" div. When
    neither yields any text, "No Description Found" is returned.
    """
    main_div = detail_soup.find("div", class_="content-split_main__DCqV5")
    if main_div:
        for selector in _UNWANTED_SELECTORS:
            for unwanted in main_div.select(selector):
                unwanted.decompose()
        text = " ".join(p.get_text(strip=True) for p in main_div.find_all("p"))
        if text.strip():
            return text
        # No usable paragraph text: fall through and try the second layout.

    rich_text_divs = detail_soup.find_all("div", class_="rich-text")
    text = " ".join(rt.get_text(separator=" ", strip=True) for rt in rich_text_divs)
    return text if text.strip() else "No Description Found"


with open("metexhibitions_2015-2024.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Year", "Title", "Duration", "URL", "Description"])

    for year in years:
        listing_soup = _fetch_soup(f"{exhibitions_base_url}?year={year}")
        if listing_soup is None:
            continue  # failure already reported; move on to the next year

        cards = listing_soup.find_all("article", class_="exhibition-card_exhibitionCard__I9gVC")

        for card in cards:
            # Extract title
            title_div = card.find("div", class_="exhibition-card_title__cZvmM")
            title = title_div.get_text(strip=True) if title_div else "No Title"

            # Extract the exhibition duration
            meta_div = card.find("div", class_="exhibition-card_meta__T_lev")
            meta = meta_div.get_text(strip=True) if meta_div else "No Duration Info"

            # Follow the card's link to the detail page to get the description
            detail_url = _detail_url_for(card)
            if detail_url is None:
                detail_url = "No URL"
                description = "No Link, so no description"
            else:
                detail_soup = _fetch_soup(detail_url)
                if detail_soup is None:
                    # The page could not be retrieved; keep the row with the
                    # same placeholder used when a page has no description.
                    description = "No Description Found"
                else:
                    description = _extract_description(detail_soup)
                # Pause between detail requests to avoid hammering the server.
                time.sleep(_DETAIL_DELAY)

            writer.writerow([year, title, meta, detail_url, description])