## Import Modules

In [7]:
import csv
import os
import re
import time
import urllib.request
from urllib.request import urlopen

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Part 1: Obtain a list of dataset pages and query the discovery metadata

We use PASDA's search feature to return a page (https://www.pasda.psu.edu/uci/SearchResults.aspx?Keyword=+) that lists all of the active dataset landing pages with some metadata. Then, we use the Beautiful Soup module to query this page and harvest the following values:

- Title
- Date Issued
- Publisher
- Description
- Metadata file link
- Download link

### MANUAL STEP!!

1. Open "https://www.pasda.psu.edu/uci/SearchResults.aspx?Keyword=+" in a browser
2. Download the page
3. Save the file as "pasda-search.html" in the same directory as this notebook

In [8]:
### Read the downloaded file into a pandas dataframe

In [9]:
file_path = 'pasda-search.html'  # Modify this to the correct path to your downloaded HTML file
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()

soup = BeautifulSoup(html_content, 'html.parser')

# Relative hrefs scraped from the search page are resolved against this prefix.
base_url = "https://www.pasda.psu.edu/uci/"


def _row_link(container, label):
    """Return the href of the <a> inside `container` whose text is exactly `label`.

    Returns '' when no such link exists, so one incomplete listing does not
    abort the whole harvest.
    """
    anchor = container.find('a', href=True, string=label)
    return anchor['href'] if anchor is not None else ''


data = []

# Every dataset title is an <a> inside <td><h3> that points at its landing page.
dataset_entries = soup.select('td > h3 > a[href^="DataSummary.aspx?dataset="]')

for entry in dataset_entries:
    publisher = entry.find_next("td").text.strip()
    date = entry.find_previous("td").find_previous("td").text.strip()
    title = entry.text.strip()
    description = entry.find_next("span", id=lambda x: x and x.startswith('DataGrid1_Label3_')).text.strip()

    # The Metadata / Download links are looked up in the element two levels
    # above the title link (a -> h3 -> td).
    cell = entry.parent.parent
    metadataFile = _row_link(cell, 'Metadata')
    # Leave the link empty rather than pointing at the bare site root when a
    # listing has no metadata document.
    metadataLink = base_url + metadataFile if metadataFile else ''
    download = _row_link(cell, 'Download')

    # obtain full landing page and create ID
    landing_page = base_url + entry['href']  # Landing page URL
    iden = 'pasda-' + landing_page.rsplit("=", 1)[1]

    data.append([publisher, date, title, description, metadataFile, metadataLink, download, landing_page, iden])

# Convert to pandas dataframe
df = pd.DataFrame(data, columns=['Creator', 'Date Issued', 'Alternative Title', 'Description', 'Metadata File', 'HTML', 'Download', 'Information', 'ID'])
    

In [None]:
# optional: check the results
df

# Save the harvested listing to a CSV whose name carries today's date (YYYYMMDD).
actionDate = time.strftime('%Y%m%d')
listing_csv = f'pasda-aardvark_{actionDate}.csv'
df.to_csv(listing_csv, index=False)
print('#### Job done ####')

## Part 2: Download the supplemental metadata

Context: Most of the records have supplemental metadata in ISO 19139 or FGDC format. The link to this document is found in the 'HTML' column. Although these files are created as XMLs, the link is a rendered HTML.

There is additional information in these files that we want to scrape, including bounding boxes and geometry type.

We will start by downloading the metadata files - this will save time and reduce the load on PASDA's servers because this part of the recipe may need to be run multiple times after troubleshooting.

### Download the metadata files to a folder called "metadata_files"

In [11]:
# Create a directory named 'metadata_files' to store the downloaded files.
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
download_folder = 'metadata_files'
os.makedirs(download_folder, exist_ok=True)

In [12]:
import requests

def download_file(url, folder, timeout=30):
    """
    Download a file given its URL and store it in the specified folder.

    Parameters
    ----------
    url : str
        Address to fetch. The text after the final "/" becomes the file name.
    folder : str
        Existing directory that receives the file.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot hang the notebook.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems, timeouts, or an HTTP error status. Nothing is
        written for an error status, so an error page is never saved as if it
        were metadata.
    """
    # Get the filename from the URL
    filename = url.split("/")[-1]
    destination = os.path.join(folder, filename)

    # The context manager releases the streamed connection even if writing fails.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()

        # Handle the response's content in chunks (useful for large files)
        with open(destination, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)

In [None]:
# Fetch every page listed in the 'HTML' column. A failure is reported and the
# loop carries on with the next link.
for metadata_url in df['HTML']:
    try:
        download_file(metadata_url, download_folder)
        print(f"Downloaded {metadata_url}")
    except Exception as err:
        print(f"Error downloading {metadata_url}. Reason: {err}")
        

## Part 3: Query the downloaded files

In [19]:
# Constants: field labels searched for in the downloaded metadata pages.

# Bounding-box edges
(WEST_BOUNDING, SOUTH_BOUNDING, EAST_BOUNDING, NORTH_BOUNDING) = (
    'West_Bounding_Coordinate:',
    'South_Bounding_Coordinate:',
    'East_Bounding_Coordinate:',
    'North_Bounding_Coordinate:',
)

# Spatial reference method
DIRECT_SPATIAL = 'Direct_Spatial_Reference_Method:'

# Keyword lists
THEME_KEYWORD, PLACE_KEYWORD = 'Theme_Keyword:', 'Place_Keyword:'

# Extract bounding box from a metadata file
def extract_bbox(soup):
    """Return the bounding box of a parsed metadata page as "west,south,east,north".

    Each coordinate is the text that immediately follows the <i> element whose
    string equals the matching *_BOUNDING label. A coordinate falls back to a
    hard-coded default when the label is missing, when it is not followed by
    text, or when that text is blank.
    NOTE(review): the defaults look like an approximate extent of
    Pennsylvania - confirm.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed metadata page.

    Returns
    -------
    str
        Four comma-separated coordinate strings.
    """
    def _coordinate(label, fallback):
        # soup.find returns None for a missing label; getattr keeps that case
        # on the fallback path without needing an exception.
        tag = soup.find('i', string=label)
        value = getattr(tag, 'next_sibling', None)
        # Only text nodes (str subclasses) carry a coordinate. A sibling that
        # is another element is treated as "no value" instead of raising.
        if isinstance(value, str):
            value = value.strip()
            if value:
                return value
        return fallback

    west = _coordinate(WEST_BOUNDING, '-80.52')
    south = _coordinate(SOUTH_BOUNDING, '39.72')
    east = _coordinate(EAST_BOUNDING, '-74.69')
    north = _coordinate(NORTH_BOUNDING, '42.51')

    return f"{west},{south},{east},{north}"

# Extract spatial reference method from a metadata file
def extract_spatial_ref(soup):
    """Return the resource type named by the spatial reference method label.

    The value is the text that follows the <i> element whose string equals
    DIRECT_SPATIAL, with ' data' appended (for example 'Raster data').

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed metadata page.

    Returns
    -------
    str
        '<method> data', or '' when the label is missing, is not followed by
        text, or the text is blank.
    """
    tag = soup.find('i', string=DIRECT_SPATIAL)
    method = getattr(tag, 'next_sibling', None)
    # A sibling that is another element has no usable .strip(); treat it as a
    # missing value instead of letting a TypeError escape.
    if isinstance(method, str) and method.strip():
        return method.strip() + ' data'
    return ''

# Extract keywords from a metadata file
def extract_keywords(soup, keyword_type):
    """Collect every keyword of one kind from a parsed metadata page.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed metadata page.
    keyword_type : str
        Label to search for, e.g. THEME_KEYWORD or PLACE_KEYWORD.

    Returns
    -------
    str
        The keywords joined with "|", in document order. Returns "" when none
        are found.
    """
    keywords = []
    for tag in soup.find_all('i', string=keyword_type):
        text = tag.next_sibling
        # Skip a label that is not followed by usable text, so one malformed
        # entry no longer discards every keyword in the file.
        if isinstance(text, str) and text.strip():
            keywords.append(text.strip())
    return "|".join(keywords)


In [None]:
df.reset_index(drop=True, inplace=True)
metadata_folder = "metadata_files"

# Every lookup is keyed by metadata file name and joined back with .map().
# Keying by file (rather than appending to positional lists) keeps keywords on
# the correct row when a file fails to parse: that row simply gets NaN.
bounding_boxes = {}
spatial_reference_methods = {}
theme_keywords_by_file = {}
place_keywords_by_file = {}

for idx, metadata_file in enumerate(df['Metadata File']):
    try:
        file_path = os.path.join(metadata_folder, metadata_file)
        with open(file_path, 'r', encoding='utf-8') as file:
            file_content = file.read()
        soup = BeautifulSoup(file_content, "html.parser")

        print(f"Processing metadata file {idx + 1} of {len(df)}: {metadata_file}")  # Print progress

        # Extract everything first so a failure part-way through leaves no
        # partial entry for this file.
        bbox = extract_bbox(soup)
        res_type = extract_spatial_ref(soup)
        theme_keywords = extract_keywords(soup, THEME_KEYWORD)
        place_keywords = extract_keywords(soup, PLACE_KEYWORD)

        bounding_boxes[metadata_file] = bbox
        spatial_reference_methods[metadata_file] = res_type
        theme_keywords_by_file[metadata_file] = theme_keywords
        place_keywords_by_file[metadata_file] = place_keywords

    except Exception as e:
        print(f"Error processing metadata file {metadata_file}: {e}")  # Print error message

df['Bounding Box'] = df['Metadata File'].map(bounding_boxes)
df['Resource Type'] = df['Metadata File'].map(spatial_reference_methods)
# Anything that is not explicitly raster is classed as a dataset.
df['Resource Class'] = np.where(df['Resource Type'] == 'Raster data', 'Imagery', 'Datasets')
df['Keyword'] = df['Metadata File'].map(theme_keywords_by_file)
df['Spatial Coverage'] = df['Metadata File'].map(place_keywords_by_file)

In [None]:
# optional: check the results
df

## Part 4: add default and calculated values

In [None]:
def date_range_formatter(date_issued):
    """Turn a date string into a "YYYY-YYYY" range.

    Every run of four digits in `date_issued` is treated as a year. One year
    gives "Y-Y", two years give "Y1-Y2", and any other count returns the input
    unchanged.
    """
    found_years = re.findall(r'(\d{4})', date_issued)

    # Zero years, or three and more: nothing sensible to build a range from.
    if len(found_years) not in (1, 2):
        return date_issued

    # With a single year the first and last items are the same element.
    first_year, last_year = found_years[0], found_years[-1]
    return f"{first_year}-{last_year}"

# Derive 'Date Range' from each row's 'Date Issued' string.
df['Date Range'] = df['Date Issued'].apply(date_range_formatter)

In [23]:
# Append default values

# Constant columns shared by every record, added in this order.
default_values = {
    'Code': '08a-01',
    'Access Rights': 'Public',
    'Accrual Method': 'HTML',
    'Date Accessioned': time.strftime('%Y-%m-%d'),
    'Language': 'eng',
    'Is Part Of': '08a-01',
    'Member Of': 'ba5cc745-21c5-4ae9-954b-72dd8db6815a',
    'Provider': 'Pennsylvania Spatial Data Access (PASDA)',
}
for column_name, value in default_values.items():
    df[column_name] = value

# The identifier is a copy of the landing-page URL held in 'Information'.
df['Identifier'] = df['Information']
df['Format'] = 'File'

### Clean up the titles

Capitalize the first letter of the "Alternative Title" (the rest of the title keeps its original casing).
Check for counties in the title and reformat accordingly.
If no county is found, check for cities in the title and reformat accordingly.
If neither a county nor a city is found, check for "PA" or "Pennsylvania" and replace it with "[Pennsylvania]".
Capture the content in brackets, remove it from its original position, and append it to the end of the title.
Perform some specific cleanup transformations afterwards.
Append the value from 'Date Issued' to the end of the title, surrounded by curly brackets.

In [None]:
# County names. transform_title and transform_publisher look for each one
# followed by " County" (e.g. "Adams County").
counties_in_pennsylvania = [
    'Adams', 'Allegheny', 'Armstrong', 'Beaver', 'Bedford', 'Berks',
    'Blair', 'Bradford', 'Bucks', 'Butler', 'Cambria', 'Cameron',
    'Carbon', 'Centre', 'Chester', 'Clarion', 'Clearfield', 'Clinton',
    'Columbia', 'Crawford', 'Cumberland', 'Dauphin', 'Delaware', 'Elk',
    'Erie', 'Fayette', 'Forest', 'Franklin', 'Fulton', 'Greene',
    'Huntingdon', 'Indiana', 'Jefferson', 'Juniata', 'Lackawanna',
    'Lancaster', 'Lawrence', 'Lebanon', 'Lehigh', 'Luzerne', 'Lycoming',
    'McKean', 'Mercer', 'Mifflin', 'Monroe', 'Montgomery', 'Montour',
    'Northampton', 'Northumberland', 'Perry', 'Philadelphia', 'Pike',
    'Potter', 'Schuylkill', 'Snyder', 'Somerset', 'Sullivan', 'Susquehanna',
    'Tioga', 'Union', 'Venango', 'Warren', 'Washington', 'Wayne',
    'Westmoreland', 'Wyoming', 'York'
]

# City names, consulted only when no county matched. Several names also appear
# in the county list above (e.g. Butler, Chester, Erie, Lancaster, York), and
# the county check runs first.
cities_in_pennsylvania = [
    'Allentown', 'Altoona', 'Bethlehem', 'Butler', 'Carlisle', 'Chambersburg', 
    'Chester', 'DuBois', 'Easton', 'Erie', 'Greensburg', 'Hanover', 
    'Harrisburg', 'Hazleton', 'Hermitage', 'Johnstown', 'Lancaster', 
    'Latrobe', 'Lebanon', 'Lock Haven', 'Lower Merion', 'McKeesport', 
    'Meadville', 'Monroeville', 'Nanticoke', 'New Castle', 'Norristown', 
    'Philadelphia', 'Phoenixville', 'Pittsburgh', 'Pottstown', 'Pottsville', 
    'Reading', 'Scranton', 'Sharon', 'State College', 'Uniontown', 
    'Warren', 'Washington', 'West Chester', 'Wilkes-Barre', 'Williamsport', 'York'
]


def transform_title(row, counties=None, cities=None):
    """Build the display title for one record.

    Steps, in order:
      1. Replace the first "<county> County" with "[Pennsylvania--<county> County]".
         If no county matches, replace the first whole-word city name with
         "[Pennsylvania--<city>]". If neither matches, replace the first "PA" or
         "Pennsylvania" with "[Pennsylvania]". All matching is case-insensitive.
      2. Move the first bracketed group to the end of the title.
      3. Drop a dangling "For", "For The" or "For The City Of" left in front of
         the bracket, and any stray dash at the start or before the bracket.
      4. Capitalize the first character and append " {<Date Issued>}".

    Parameters
    ----------
    row : mapping
        Must provide 'Alternative Title' and 'Date Issued' (a DataFrame row or
        a plain dict both work).
    counties, cities : iterable of str, optional
        Names to search for. Default to the notebook-level
        `counties_in_pennsylvania` and `cities_in_pennsylvania`.

    Returns
    -------
    str
        The reformatted title. A missing or empty title yields just "{<date>}".

    Notes
    -----
    City matching is whole-word but case-insensitive, so a common word that is
    also a city name (e.g. "Reading") is treated as a place.
    """
    if counties is None:
        counties = counties_in_pennsylvania
    if cities is None:
        cities = cities_in_pennsylvania

    alt_title = row['Alternative Title']
    if not isinstance(alt_title, str):
        alt_title = ''

    # Search for a county first, then a city, then fall back to the state.
    for county in counties:
        pattern = rf"\b{re.escape(county)} County\b"
        if re.search(pattern, alt_title, re.I):
            replacement = f"[Pennsylvania--{county} County]"
            # A callable replacement is inserted literally (no backslash processing).
            alt_title = re.sub(pattern, lambda _m: replacement, alt_title, count=1, flags=re.I)
            break
    else:
        for city in cities:
            # Raw string: in a normal string "\b" is a backspace character,
            # not a regex word boundary, so the pattern would never match.
            pattern = rf"\b{re.escape(city)}\b"
            if re.search(pattern, alt_title, re.I):
                replacement = f"[Pennsylvania--{city}]"
                alt_title = re.sub(pattern, lambda _m: replacement, alt_title, count=1, flags=re.I)
                break
        else:
            alt_title = re.sub(r"\b(PA|Pennsylvania)\b", "[Pennsylvania]", alt_title, flags=re.I, count=1)

    # Capture content in brackets
    bracket_content = re.findall(r'\[(.*?)\]', alt_title)

    if bracket_content:
        # Remove bracketed content from original position
        alt_title = re.sub(r'\[.*?\]', '', alt_title).strip()

        # Append bracketed content to the end of the title
        alt_title = f"{alt_title} [{bracket_content[0]}]"

    # Cleanup phrases post-transformation using case-insensitive matching
    alt_title = re.sub(r"For \[", "[", alt_title, flags=re.I)
    alt_title = re.sub(r"For The \[", "[", alt_title, flags=re.I)
    alt_title = re.sub(r"For The City Of \[", "[", alt_title, flags=re.I)

    # Remove unwanted dashes at the beginning or just before a bracket
    alt_title = re.sub(r"^\s*-\s*|\s*-\s*(?=\[)", "", alt_title)

    # Pulling a bracket out of the middle of a title leaves a double space behind.
    alt_title = re.sub(r"\s{2,}", " ", alt_title).strip()

    # Make sure first letter is capitalized (guarded: the title may be empty)
    if alt_title:
        alt_title = alt_title[0].capitalize() + alt_title[1:]

    # Append the value from 'Date Issued' surrounded by curly brackets
    date_issued = row['Date Issued']
    return f"{alt_title} {{{date_issued}}}".strip()

# Row-wise apply: transform_title reads both 'Alternative Title' and 'Date Issued'.
df['Title'] = df.apply(transform_title, axis=1)
    


In [25]:
def transform_publisher(publisher):
    """Normalise a publisher name.

    Resolution order:
      1. An exact match in the fixed agency table returns its mapped name.
      2. A publisher containing "<county> County" becomes "Pennsylvania--<county> County".
      3. A publisher containing "City of <city>", or equal to a bare city name,
         becomes "Pennsylvania--<city>".
      4. Anything else is returned unchanged.
    """
    # Fixed renames for agencies whose listing name differs from the preferred form.
    direct_names = {
        "U S Geological Survey": "Geological Survey (U.S.)",
        "U S Fish and Wildlife Service": "U.S. Fish and Wildlife Service",
        "U S Environmental Protection Agency": "United States. Environmental Protection Agency",
        "U S Department of Agriculture": "United States. Department of Agriculture",
        "U S Census Bureau": "U.S. Census Bureau"
    }

    mapped_name = direct_names.get(publisher)
    if mapped_name is not None:
        return mapped_name

    # Counties take precedence over cities.
    for county_name in counties_in_pennsylvania:
        if f"{county_name} County" in publisher:
            return f"Pennsylvania--{county_name} County"

    for city_name in cities_in_pennsylvania:
        if f"City of {city_name}" in publisher or publisher == city_name:
            return f"Pennsylvania--{city_name}"

    # No rule applied: keep the publisher exactly as given.
    return publisher

# Overwrite 'Creator' with the normalised publisher name for every row.
df['Creator'] = df['Creator'].apply(transform_publisher)

In [26]:
# Define the conversion mappings from old values to new values.
# Keys MUST be lower-case: convert_and_join lower-cases each keyword before the
# lookup, so a key containing capitals can never match.
subject_sm_mapping = {
    "farming": "Agriculture",
    "farmin": "Agriculture",
    "biota": "Biology",
    "boundaries": "Boundaries",
    "climatologymeteorologyatmosphere": "Climate",
    "economy": "Economy",
    "elevation": "Elevation",
    "elevation data": "Elevation",
    "environment": "Environment",
    "environmental": "Environment",
    "society; climatologymeteorologyatmosphere": "Events",
    "geoscientificinformation": "Geology",
    "health": "Health",
    # A value may hold several themes separated by "|".
    "imagerybasemapsearthcover": "Imagery|Land Cover",
    "inlandwaters": "Inland Waters",
    "location": "Location",
    "intelligencemilitary": "Military",
    "oceans": "Oceans",
    "planningcadastre": "Property",
    "planning": "Property",
    "parcel": "Property",
    "zoning": "Property",
    "society": "Society",
    "structure": "Structure",
    "transportation": "Transportation",
    "utilitiescommunication": "Utilities"

    # Add more key-value pairs for other conversions as needed
}


# Function to apply the mapping and join the values back together
def convert_and_join(row):
    """Translate a row's "|"-separated keywords into themes.

    Each keyword is lower-cased and looked up in `subject_sm_mapping`.
    Keywords without a mapping are dropped. The mapped themes are joined with
    "|" in keyword order. A missing (NaN/None) 'Keyword' gives ''.
    """
    raw_keywords = row['Keyword']

    # Nothing to split when the cell is missing.
    if pd.isna(raw_keywords):
        return ''

    themes = [
        subject_sm_mapping[keyword.lower()]
        for keyword in raw_keywords.split('|')
        if keyword.lower() in subject_sm_mapping
    ]
    return '|'.join(themes)

# Translate every row's keywords into themes to create the new "Theme" column
df['Theme'] = df.apply(convert_and_join, axis=1)

# Collapse repeated themes within each row, keeping first-seen order
# (dict keys are unique and remember insertion order).
df['Theme'] = df['Theme'].str.split('|').apply(lambda themes: '|'.join(dict.fromkeys(themes)))

In [27]:
# Final column layout for the export, grouped by purpose.
# Add more columns as needed in the desired order.
desired_order = [
    # descriptive
    'Title', 'Alternative Title', 'Description', 'Language', 'Format',
    # responsibility
    'Creator', 'Provider',
    # classification
    'Resource Class', 'Resource Type', 'Theme', 'Keyword',
    # temporal
    'Date Issued', 'Date Range',
    # spatial
    'Spatial Coverage', 'Bounding Box',
    # relations and links
    'Member Of', 'Download', 'HTML', 'Information',
    # identification and access
    'ID', 'Identifier', 'Access Rights',
]

# reindex keeps only the listed columns, in this order. Columns not listed are
# dropped, and a listed column that does not exist is created empty.
df = df.reindex(columns=desired_order)

In [None]:
# Write the finished table to a CSV whose name carries today's date (YYYYMMDD).
actionDate = time.strftime('%Y%m%d')
output_csv = f'output_{actionDate}.csv'
df.to_csv(output_csv, index=False)
print('#### Job done ####')