# In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import csv 
import time
import urllib.request # The urllib.request module defines functions and classes which help in opening URLs (mostly HTTP)
import re
from urllib.request import urlopen


# Default values stamped onto every harvested record.
code = '08a-01'
accessRights = 'Public'
accrualMethod = 'HTML'
dateAccessioned = time.strftime('%Y-%m-%d')  # date of this run, YYYY-MM-DD (local time)
language = 'eng'
isPartOf = '08a-01'
# NOTE: the identifier previously ended with a stray space, which would be
# written verbatim into the "Member Of" column and break exact-match lookups.
memberOf = 'ba5cc745-21c5-4ae9-954b-72dd8db6815a'
provider = 'Pennsylvania Spatial Data Access (PASDA)'
# Placeholders that are not populated yet:
# resourceClass = ''
# resourceType = ''
# dateRange = ''


# ... [Rest of your imports and definitions, removed for brevity] ...


# Start with the main search page (the keyword is left blank in the query).
resURL = 'https://www.pasda.psu.edu/uci/SearchResults.aspx?Keyword=+'
# Use a context manager so the HTTP response is closed as soon as the body
# has been read, instead of leaking the connection.
with urllib.request.urlopen(resURL) as search_response:
    page = search_response.read()
soup = BeautifulSoup(page, 'html.parser')

# Identify landing page URLs inside <h3> tags; the hrefs are relative, so
# prefix them with the site base to get absolute URLs.
landing_page_links = soup.select('h3 a[href^="DataSummary.aspx?dataset="]')
landing_pages = ['https://www.pasda.psu.edu/uci/' + link['href'] for link in landing_page_links]

# For testing the code on just the first 5
landing_pages = landing_pages[:5]  # Keep only the first 5 landing pages


# Scrape every landing page into one record (dict) per dataset and build the
# DataFrame from those records.
#
# The earlier version also appended a positional list to `data`, which is not
# defined in this section, and that list had 13 values for 19 column names, so
# the DataFrame constructor could not succeed. Keying each record by its
# column name keeps values and headers aligned.

# Column order of the resulting DataFrame / CSV.
df_columns = [
    'Information', 'ID', 'Title', 'Temporal Coverage', 'Date Range',
    'Publisher', 'Provider', 'Language', 'Description', 'Resource Class',
    'Resource Type', 'HTML', 'Download', 'Code', 'Is Part Of', 'Member Of',
    'Access Rights', 'Accrual Method', 'Date Accessioned',
]

all_data = []

for url in landing_pages:
    # A timeout keeps one unresponsive page from hanging the whole harvest.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")

    landingPage = url
    # The dataset number is whatever follows the last "=" in the URL.
    iden = 'pasda-' + landingPage.rsplit("=", 1)[1]

    # Extract metadata fields (each is looked up by its element id).
    title = soup.find(attrs={'id': 'Label1'}).text.strip()
    date = soup.find(attrs={'id': 'Label2'}).text.strip()
    publisher = soup.find(attrs={'id': 'Label3'}).text.strip()
    description = soup.find(attrs={'id': 'Label14'}).text.strip()

    metadataLink = soup.find('a', href=True, string='Metadata')
    downloadLink = soup.find('a', href=True, string='Download')

    # The metadata href is relative to the site; a page without a metadata
    # link still fails here, as before.
    metadata = "https://www.pasda.psu.edu/uci/" + metadataLink['href']
    # The download link is optional: an explicit None check replaces the bare
    # `except:` that previously hid every kind of error.
    download = downloadLink['href'] if downloadLink is not None else ''

    # ... [Rest of the data extraction code, similar to what you have] ...

    record = {
        "Information": landingPage,
        "ID": iden,
        "Title": title,
        "Temporal Coverage": date,
        "Date Range": '',  # not extracted in this section
        "Publisher": publisher,
        "Provider": provider,
        "Language": language,
        "Description": description,
        "Resource Class": '',  # not extracted in this section
        "Resource Type": '',  # not extracted in this section
        "HTML": metadata,
        "Download": download,
        "Code": code,
        "Is Part Of": isPartOf,
        "Member Of": memberOf,
        "Access Rights": accessRights,
        "Accrual Method": accrualMethod,
        "Date Accessioned": dateAccessioned,
    }

    # Append the dictionary to the list
    all_data.append(record)

# Passing `columns` fixes the column order and still yields the expected
# headers when no landing pages were found.
df = pd.DataFrame(all_data, columns=df_columns)



# In [None]:
# Part 3: Extracting bounding boxes

# Labels of the four coordinates, in the order they are written to the
# bounding box string (west,south,east,north).
_BBOX_LABELS = (
    'West_Bounding_Coordinate:',
    'South_Bounding_Coordinate:',
    'East_Bounding_Coordinate:',
    'North_Bounding_Coordinate:',
)


def _bounding_coordinate(metadata_soup, label):
    """Return the text that follows the <i> tag whose string is *label*.

    Returns '' when the label is absent or is not directly followed by a
    text node, which is what the previous bare ``except:`` blocks produced.
    """
    label_tag = metadata_soup.find('i', string=label)
    value = label_tag.next_sibling if label_tag is not None else None
    # Only a text node (a str subclass) can be stripped; a missing sibling or
    # a nested tag counts as "no value".
    return value.strip() if isinstance(value, str) else ''


bounding_boxes = {}
for metadata_url in df['HTML']:
    if metadata_url in bounding_boxes:
        # Same metadata page already processed; do not fetch it again.
        continue

    try:
        response = requests.get(metadata_url, timeout=30)
    except requests.RequestException:
        # The "missing" marker used to sit in an unreachable except branch;
        # it now records pages that could not be fetched, so one bad URL does
        # not abort the whole run.
        bounding_boxes[metadata_url] = "missing"
        continue

    soup = BeautifulSoup(response.content, "html.parser")
    bbox = ','.join(_bounding_coordinate(soup, label) for label in _BBOX_LABELS)
    bounding_boxes[metadata_url] = bbox

df['Bounding Box'] = df['HTML'].map(bounding_boxes)

# Date-stamp the output file name so repeated runs do not overwrite each other
# across days.
actionDate = time.strftime('%Y%m%d')
df.to_csv(f'output_{actionDate}.csv', index=False)
print('#### Job done ####')