# Scraping Data 

In [1]:
from bs4 import BeautifulSoup
import requests
import json
from pprint import pprint

In [2]:
link = "https://miamiuniversityartmuseum.omeka.net/items/browse?page=2"

In [3]:
pages = 248

In [4]:
for page in range(1, pages + 1):
    if page == 1:
        browse = "https://miamiuniversityartmuseum.omeka.net/items/browse"
    else:
        i = page - 1
browse = f"https://miamiuniversityartmuseum.omeka.net/items/browse&page={i}"

In [5]:
response = requests.get(browse)
if response.status_code != 200:
    print(f"Failed to retrieve data from the URL. Status code: {response.status_code}")
    exit()

soup = BeautifulSoup(response.text, 'html.parser')

In [6]:
link_on_pages = []
for link in soup.find_all('a'):

    href = link.get('href')
    if href and href.startswith('/items/show/'):
        #print(href)
        link_on_pages.append(href)

#print(link_on_pages)

In [7]:
soups = []

for link in link_on_pages:
    full_link = "https://miamiuniversityartmuseum.omeka.net" + link
    #print(full_link)
    response = requests.get(full_link)
    if response.status_code != 200:
        print(f"Failed to retrieve data from the URL. Status code: {response.status_code}")
        exit()

    soup = BeautifulSoup(response.text, 'html.parser')
    soups.append(soup)

In [8]:
def extract_data_from_soup(soup):
    data = {}

    # Extract fields using a loop
    fields = {
        'dublin-core-title': 'Title',
        'dublin-core-identifier': 'Identifier',
        'dublin-core-subject': 'Subject',
        'dublin-core-description': 'Description',
        'dublin-core-creator': 'Creator',
        'dublin-core-format': 'Format',
        'dublin-core-date': 'Date',
        'dublin-core-medium': 'Medium',
        'physical-object-item-type-metadata-donor': 'Donor',
        'item-citation': 'Citation'
    }

    for field_id, field_name in fields.items():
        element = soup.find('div', {'id': field_id})
        if element:
            data[field_name] = element.find('div', {'class': 'element-text'}).get_text(strip=True)

    # Extract tags
    data['Tags'] = [tag.get_text(strip=True) for tag in soup.find_all('a', {'rel': 'tag'})]

    # Extract image URL
    '''
    image_element = soup.find('div', {'id': 'item-images'})
    if image_element:
        data['Image URL'] = image_element.find('a')['href']
    '''

    # Extract collection link
    collection_element = soup.find('div', {'id': 'collection'})
    if collection_element:
        data['Collection Link'] = collection_element.find('a')['href']

    return data
    #print(f"{key}: {value}\n")


In [9]:
import json

# Initialize a list to store all extracted data
all_data = []

# Extract data from each soup and append to the list
for soup in soups:
    data = extract_data_from_soup(soup)
    all_data.append(data)

# Save the extracted data to a JSON file
with open("data/extracted_data.json", "w", encoding="utf-8") as json_file:
    json.dump(all_data, json_file, ensure_ascii=False, indent=4)

print("Data saved to data/extracted_data.json")

Data saved to data/extracted_data.json


In [10]:
def scrape_omeka():
    # Initialize a list to store all extracted data
    all_data = []

    # Extract data from each soup and append to the list
    for soup in soups:
        data = extract_data_from_soup(soup)
        all_data.append(data)

    # Save the extracted data to a JSON file
    with open("data/extracted_data.json", "w", encoding="utf-8") as json_file:
        json.dump(all_data, json_file, ensure_ascii=False, indent=4)

    print("Data saved to data/extracted_data.json")
if __name__ == "__main__":
    scrape_omeka()

Data saved to data/extracted_data.json


In [15]:
from bs4 import BeautifulSoup
import requests
import json
import os

def generate_page_urls(base_url, total_pages):
    urls = []
    for i in range(0, total_pages + 1):
        if i == 1:
            urls.append(base_url)
        else:
            urls.append(base_url + "?page=" + str(i))
    return urls

def get_item_links(page_url):
    response = requests.get(page_url)
    if response.status_code != 200:
        print("Failed to retrieve", page_url, "Status code:", response.status_code)
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    links = []
    for a in soup.find_all('a', href=True):
        if a['href'].startswith('/items/show/'):
            full_url = "https://miamiuniversityartmuseum.omeka.net" + a['href']
            links.append(full_url)
    return links

def fetch_soup(url):
    response = requests.get(url)
    if response.status_code != 200:
        print("Failed to retrieve", url, "Status code:", response.status_code)
        return None
    return BeautifulSoup(response.text, 'html.parser')

def extract_data_from_soup(soup):
    data = {}
    fields = {
        'dublin-core-title': 'Title',
        'dublin-core-identifier': 'Identifier',
        'dublin-core-subject': 'Subject',
        'dublin-core-description': 'Description',
        'dublin-core-creator': 'Creator',
        'dublin-core-format': 'Format',
        'dublin-core-date': 'Date',
        'dublin-core-medium': 'Medium',
        'physical-object-item-type-metadata-donor': 'Donor',
        'item-citation': 'Citation'
    }

    for field_id, field_name in fields.items():
        element = soup.find('div', {'id': field_id})
        if element:
            text_element = element.find('div', {'class': 'element-text'})
            if text_element:
                data[field_name] = text_element.get_text(strip=True)

    tags = []
    for tag in soup.find_all('a', {'rel': 'tag'}):
        tags.append(tag.get_text(strip=True))
    data['Tags'] = tags

    collection_element = soup.find('div', {'id': 'collection'})
    if collection_element:
        link = collection_element.find('a')
        if link:
            data['Collection Link'] = link['href']

    return data

def append_to_json(data, output_path):
    if os.path.exists(output_path):
        with open(output_path, 'r', encoding='utf-8') as f:
            existing_data = json.load(f)
    else:
        existing_data = []

    existing_data.extend(data)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(existing_data, f, ensure_ascii=False, indent=4)

    print("Appended", len(data), "records to", output_path)

def scrape_omeka(base_url, total_pages, output_path):
    page_urls = generate_page_urls(base_url, total_pages)

    for page_url in page_urls:
        page_data = []
        item_links = get_item_links(page_url)
        for item_url in item_links:
            soup = fetch_soup(item_url)
            if soup:
                data = extract_data_from_soup(soup)
                page_data.append(data)

        append_to_json(page_data, output_path)
        print("Processed page:", page_url)

def main():
    base_url = "https://miamiuniversityartmuseum.omeka.net/items/browse"
    total_pages = 2
    output_path = "data/extracted_data.json"
    scrape_omeka(base_url, total_pages, output_path)

if __name__ == "__main__":
    main()


Appended 10 records to data/extracted_data.json
Processed page: https://miamiuniversityartmuseum.omeka.net/items/browse?page=0
Appended 10 records to data/extracted_data.json
Processed page: https://miamiuniversityartmuseum.omeka.net/items/browse
Appended 10 records to data/extracted_data.json
Processed page: https://miamiuniversityartmuseum.omeka.net/items/browse?page=2
