## Import Modules

In [26]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [27]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import csv 
import time
import urllib.request
from urllib.request import urlopen
import numpy as np
import os
import re
import json

import sys
sys.path.append('../../')  # Add the parent directory to the path

from modules.spatial_coverage_transformer import apply_transformations


## Part 1: Obtain a list of dataset pages and query the discovery metadata

We use PASDA's search feature to return a page (https://www.pasda.psu.edu/uci/SearchResults.aspx?Keyword=+) that lists all of the active dataset landing pages with some metadata. Then, we use the Beautiful Soup module to parse this page and harvest the following values:

- Title
- Date Issued
- Publisher
- Description
- Metadata file link
- Download link

### MANUAL STEP!!

1. Open https://www.pasda.psu.edu/uci/SearchResults.aspx?Keyword=+ in a browser
2. Download the page
3. Save the file as "pasda-search.html" in the same directory as this notebook

### Read the downloaded file into a pandas dataframe

In [3]:
file_path = 'pasda-search.html'  # Modify this to the correct path to your downloaded HTML file

# Relative links on the search page are resolved against this base URL
PASDA_BASE_URL = "https://www.pasda.psu.edu/uci/"

with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()

soup = BeautifulSoup(html_content, 'html.parser')

data = []

# Each dataset is identified by its title link (<td><h3><a href="DataSummary.aspx?dataset=...">)
dataset_entries = soup.select('td > h3 > a[href^="DataSummary.aspx?dataset="]')

for entry in dataset_entries:
    # The selector guarantees entry -> <h3> -> <td>; that cell also holds the
    # "Metadata" and "Download" links belonging to this dataset.
    entry_cell = entry.parent.parent

    # Publisher and date are read from neighbouring <td> cells, so these two
    # lookups depend on the column order of the search results table.
    publisher = entry.find_next("td").text.strip()
    date = entry.find_previous("td").find_previous("td").text.strip()
    title = entry.text.strip()
    description = entry.find_next("span", id=lambda x: x and x.startswith('DataGrid1_Label3_')).text.strip()

    # A missing Metadata link still fails loudly here (as before): later parts
    # of the recipe rely on every row having a metadata file name.
    metadataFile = entry_cell.find('a', href=True, string='Metadata')['href']
    metadataLink = PASDA_BASE_URL + metadataFile

    # The Download link is optional; record an empty string when it is absent
    # instead of hiding every possible error behind a bare `except`.
    download_link = entry_cell.find('a', href=True, string='Download')
    download = download_link['href'] if download_link is not None else ''

    # obtain full landing page and create ID from the value after the last "="
    landing_page = PASDA_BASE_URL + entry['href']  # Landing page URL
    iden = 'pasda-' + landing_page.rsplit("=", 1)[1]

    data.append([publisher, date, title, description, metadataFile, metadataLink, download, landing_page, iden])


# Convert to pandas dataframe
df = pd.DataFrame(data, columns=['Creator', 'Date Issued', 'Alternative Title', 'Description', 'Metadata File', 'HTML', 'Download', 'Information', 'ID'])
    

In [4]:
# Save a dated snapshot of the scraped table
actionDate = time.strftime('%Y%m%d')
df.to_csv(f'pasda-aardvark_{actionDate}.csv', index=False)
print('#### Job done ####')

# optional: check the results
# (Jupyter only displays the LAST expression of a cell, so this must come last)
df.head()

#### Job done ####


## Part 2: Download the supplemental metadata

Context: Most of the records have supplemental metadata in ISO 19139 or FGDC format. The link to this document is found in the "HTML" column. Although these files are created as XML, the link points to a rendered HTML page.

There is additional information in these files that we want to scrape, including bounding boxes and geometry type.

We will start by downloading the metadata files - this will save time and reduce the load on PASDA's servers because this part of the recipe may need to be run multiple times after troubleshooting.

### Download the metadata files to a folder called "metadata_files"

In [None]:
# Create a directory named 'metadata_files' to store the downloaded files.
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
download_folder = 'metadata_files'
os.makedirs(download_folder, exist_ok=True)

In [None]:
import requests

def download_file(url, folder, timeout=60):
    """
    Download a file given its URL and store it in the specified folder.

    The local file name is the last "/"-separated segment of the URL
    (including any query string), so later steps can find the file under
    the same name that appears in the link.

    Parameters:
        url: Address of the file to download.
        folder: Existing directory in which to store the file.
        timeout: Seconds to wait for the server before giving up, so one
            stalled request cannot hang the whole run.

    Raises:
        requests.exceptions.RequestException: on connection problems,
            timeouts, or an HTTP error status (4xx/5xx). Raising on error
            statuses prevents an error page from being saved as if it were
            the metadata document.
    """
    # Get the filename from the URL
    filename = url.split("/")[-1]

    # The context manager releases the connection even if writing fails
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()

        # Handle the response's content in chunks (useful for large files)
        with open(os.path.join(folder, filename), 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)

In [None]:
# Several datasets can share one metadata document, so the 'HTML' column
# contains repeated URLs. Fetch each distinct URL only once to save time and
# reduce the load on PASDA's servers.
for url in df['HTML'].unique():
    try:
        download_file(url, download_folder)
        print(f"Downloaded {url}")
    except Exception as e:
        # Best effort: report the failure and continue with the remaining files
        print(f"Error downloading {url}. Reason: {e}")
        

## Part 3: Query the downloaded files

In [5]:
# Constants: text of the <i> label that precedes each value in the rendered metadata page
WEST_BOUNDING = 'West_Bounding_Coordinate:'
SOUTH_BOUNDING = 'South_Bounding_Coordinate:'
EAST_BOUNDING = 'East_Bounding_Coordinate:'
NORTH_BOUNDING = 'North_Bounding_Coordinate:'
DIRECT_SPATIAL = 'Direct_Spatial_Reference_Method:'
THEME_KEYWORD = 'Theme_Keyword:'
PLACE_KEYWORD = 'Place_Keyword:'

# Fallback coordinates used when a bounding value cannot be read.
# NOTE(review): these look like the statewide extent of Pennsylvania - confirm.
DEFAULT_WEST = '-80.52'
DEFAULT_SOUTH = '39.72'
DEFAULT_EAST = '-74.69'
DEFAULT_NORTH = '42.51'


def _labelled_text(soup, label, default):
    """Return the stripped text that directly follows the <i> element whose
    string is `label`, or `default` when the label is missing or is not
    followed by a text node."""
    label_tag = soup.find('i', string=label)
    sibling = label_tag.next_sibling if label_tag is not None else None
    # Only text nodes (str subclasses) can be stripped. Calling .strip() on a
    # sibling *tag* would raise TypeError rather than AttributeError, because
    # BeautifulSoup treats unknown tag attributes as a child-tag lookup.
    if isinstance(sibling, str):
        return sibling.strip()
    return default


# Extract bounding box from a metadata file
def extract_bbox(soup):
    """Build a "west,south,east,north" string from a parsed metadata page.

    Parameters:
        soup: Parsed metadata document exposing
            `find('i', string=...)` (a BeautifulSoup object in this notebook).

    Returns:
        The four coordinate strings joined by commas. Any coordinate that
        cannot be read is replaced by its DEFAULT_* fallback.
    """
    west = _labelled_text(soup, WEST_BOUNDING, DEFAULT_WEST)
    south = _labelled_text(soup, SOUTH_BOUNDING, DEFAULT_SOUTH)
    east = _labelled_text(soup, EAST_BOUNDING, DEFAULT_EAST)
    north = _labelled_text(soup, NORTH_BOUNDING, DEFAULT_NORTH)

    return f"{west},{south},{east},{north}"

# Extract spatial reference method from a metadata file
def extract_spatial_ref(soup):
    """Return the direct spatial reference method followed by ' data'.

    Parameters:
        soup: Parsed metadata document exposing `find('i', string=...)`.

    Returns:
        For example 'Raster data' when the text after the DIRECT_SPATIAL label
        is 'Raster'. Returns '' when the label is missing, is not followed by
        a text node, or the text is blank (so a bare ' data' is never
        produced).
    """
    label_tag = soup.find('i', string=DIRECT_SPATIAL)
    sibling = label_tag.next_sibling if label_tag is not None else None

    # A non-text sibling (e.g. another tag) has no usable value; calling
    # .strip() on it would raise TypeError, which the original did not catch.
    if not isinstance(sibling, str):
        return ''

    method = sibling.strip()
    return f"{method} data" if method else ''

# Extract keywords from a metadata file
def extract_keywords(soup, keyword_type):
    """Collect every keyword of one type from a parsed metadata page.

    Parameters:
        soup: Parsed metadata document exposing `find_all('i', string=...)`.
        keyword_type: Label text that precedes each keyword
            (e.g. 'Theme_Keyword:' or 'Place_Keyword:').

    Returns:
        The stripped keyword texts joined with "|", in document order, or ""
        when no keyword of that type is present.
    """
    labels = soup.find_all('i', string=keyword_type)
    siblings = (label.next_sibling for label in labels)

    # Skip labels that are not followed by a text node instead of discarding
    # the whole list: one malformed entry should not cost the valid keywords.
    return "|".join(value.strip() for value in siblings if isinstance(value, str))


In [7]:
# A clean 0..n-1 index is required: the keyword Series built below are joined
# back onto df by index label.
df = df.reset_index(drop=True)
metadata_folder = "metadata_files"
bounding_boxes = {}
spatial_reference_methods = {}
theme_keywords_list = []
place_keywords_list = []

# (theme, place) keywords per successfully parsed file, so a metadata document
# shared by several datasets is read and parsed only once.
keywords_by_file = {}

for idx, metadata_file in enumerate(df['Metadata File']):
    print(f"Processing metadata file {idx + 1} of {len(df)}: {metadata_file}")  # Print progress

    if metadata_file not in keywords_by_file:
        try:
            file_path = os.path.join(metadata_folder, metadata_file)
            with open(file_path, 'r', encoding='utf-8') as file:
                soup = BeautifulSoup(file.read(), "html.parser")

            # Extract everything first so a failure leaves no partial results
            bbox = extract_bbox(soup)
            res_type = extract_spatial_ref(soup)
            theme_keywords = extract_keywords(soup, THEME_KEYWORD)
            place_keywords = extract_keywords(soup, PLACE_KEYWORD)

            bounding_boxes[metadata_file] = bbox
            spatial_reference_methods[metadata_file] = res_type
            keywords_by_file[metadata_file] = (theme_keywords, place_keywords)

        except Exception as e:
            print(f"Error processing metadata file {metadata_file}: {e}")  # Print error message

    # Append exactly one entry per row, even when the file could not be
    # processed. Skipping failed rows would shorten the lists and shift every
    # later keyword onto the wrong dataset.
    theme_keywords, place_keywords = keywords_by_file.get(metadata_file, ('', ''))
    theme_keywords_list.append(theme_keywords)
    place_keywords_list.append(place_keywords)

assert len(theme_keywords_list) == len(place_keywords_list) == len(df)

# Convert lists to Pandas Series, indexed like df
theme_keywords_series = pd.Series(theme_keywords_list, index=df.index, dtype=object)
place_keywords_series = pd.Series(place_keywords_list, index=df.index, dtype=object)


Processing metadata file 1 of 2038: FullMetadataDisplay.aspx?file=AdamsCounty_AgSecurityAreas202401.xml
Processing metadata file 2 of 2038: FullMetadataDisplay.aspx?file=AdamsCounty_LandConservancyEasements202401.xml
Processing metadata file 3 of 2038: FullMetadataDisplay.aspx?file=AdamsCounty_MtJoyTwpPreservedFarms202401.xml
Processing metadata file 4 of 2038: FullMetadataDisplay.aspx?file=AdamsCounty_MunicipalBoundary202401.xml
Processing metadata file 5 of 2038: FullMetadataDisplay.aspx?file=AdamsCounty_Parcels202401.xml
Processing metadata file 6 of 2038: FullMetadataDisplay.aspx?file=AdamsCounty_PreservedFarms202401.xml
Processing metadata file 7 of 2038: FullMetadataDisplay.aspx?file=AdamsCounty_Roads202401.xml
Processing metadata file 8 of 2038: FullMetadataDisplay.aspx?file=NWPAGlacialDeposits2018.xml
Processing metadata file 9 of 2038: FullMetadataDisplay.aspx?file=AlleghenyCollege_GlacialDeposits1979.xml
Processing metadata file 10 of 2038: FullMetadataDisplay.aspx?file=Alleg

Processing metadata file 84 of 2038: FullMetadataDisplay.aspx?file=batEveningBat.xml
Processing metadata file 85 of 2038: FullMetadataDisplay.aspx?file=batHoary.xml
Processing metadata file 86 of 2038: FullMetadataDisplay.aspx?file=batIndianaMyotis.xml
Processing metadata file 87 of 2038: FullMetadataDisplay.aspx?file=batLittleBrownMyotis.xml
Processing metadata file 88 of 2038: FullMetadataDisplay.aspx?file=batNorthernMyotis.xml
Processing metadata file 89 of 2038: FullMetadataDisplay.aspx?file=batRafinesquBigeared.xml
Processing metadata file 90 of 2038: FullMetadataDisplay.aspx?file=batSeminole.xml
Processing metadata file 91 of 2038: FullMetadataDisplay.aspx?file=batSilverhaired.xml
Processing metadata file 92 of 2038: FullMetadataDisplay.aspx?file=batSoutheasternMyotis.xml
Processing metadata file 93 of 2038: FullMetadataDisplay.aspx?file=batTownsendBigeared.xml
Processing metadata file 94 of 2038: FullMetadataDisplay.aspx?file=BedfordCountyParcels202307.xml
Processing metadata fi

Processing metadata file 169 of 2038: FullMetadataDisplay.aspx?file=CentreCounty_VotingLocations202306.xml
Processing metadata file 170 of 2038: FullMetadataDisplay.aspx?file=CentreCounty_WaterLines202306.xml
Processing metadata file 171 of 2038: FullMetadataDisplay.aspx?file=CentreCounty_WaterTreatmentPlants202306.xml
Processing metadata file 172 of 2038: FullMetadataDisplay.aspx?file=pennpilot_centrecounty.xml
Processing metadata file 173 of 2038: FullMetadataDisplay.aspx?file=pennpilot_centrecounty.xml
Processing metadata file 174 of 2038: FullMetadataDisplay.aspx?file=pennpilot_centrecounty.xml
Processing metadata file 175 of 2038: FullMetadataDisplay.aspx?file=CentreCounty_Tunnels202306.xml
Processing metadata file 176 of 2038: FullMetadataDisplay.aspx?file=Agriculture_Most_Effective_Basins_2020.xml
Processing metadata file 177 of 2038: FullMetadataDisplay.aspx?file=Chesapeake_Bay_LU-LC_metadata_2022-Edition.xml
Processing metadata file 178 of 2038: FullMetadataDisplay.aspx?file=C

Processing metadata file 254 of 2038: FullMetadataDisplay.aspx?file=PhiladelphiaImagery2015.xml
Processing metadata file 255 of 2038: FullMetadataDisplay.aspx?file=PhiladelphiaImagery2015.xml
Processing metadata file 256 of 2038: FullMetadataDisplay.aspx?file=PhiladelphiaImagery2016_1m.xml
Processing metadata file 257 of 2038: FullMetadataDisplay.aspx?file=PhiladelphiaImagery2016_1m.xml
Processing metadata file 258 of 2038: FullMetadataDisplay.aspx?file=PhiladelphiaImagery2016_3in.xml
Processing metadata file 259 of 2038: FullMetadataDisplay.aspx?file=PhiladelphiaImagery2016_1m.xml
Processing metadata file 260 of 2038: FullMetadataDisplay.aspx?file=PhiladelphiaImagery2017.xml
Processing metadata file 261 of 2038: FullMetadataDisplay.aspx?file=PhiladelphiaImagery2017.xml
Processing metadata file 262 of 2038: FullMetadataDisplay.aspx?file=PhiladelphiaImagery2017.xml
Processing metadata file 263 of 2038: FullMetadataDisplay.aspx?file=PhiladelphiaImagery2017.xml
Processing metadata file 26

Processing metadata file 338 of 2038: FullMetadataDisplay.aspx?file=PhillyFPC_PPR_Rec_Facilities_point.xml
Processing metadata file 339 of 2038: FullMetadataDisplay.aspx?file=PhillyPlanning_Zoning_SteepSlopeProtectArea_r.xml
Processing metadata file 340 of 2038: FullMetadataDisplay.aspx?file=PhillyPlanning_City_Limits.xml
Processing metadata file 341 of 2038: FullMetadataDisplay.aspx?file=PhillyPlanning_Commercial_Corridors.xml
Processing metadata file 342 of 2038: FullMetadataDisplay.aspx?file=PhillyPlanning_CompleteStreetsTypesStndrds2020.xml
Processing metadata file 343 of 2038: FullMetadataDisplay.aspx?file=PhillyPlanning_Council_Districts_2016.xml
Processing metadata file 344 of 2038: FullMetadataDisplay.aspx?file=PhillyPlanning_Existing_Trails.xml
Processing metadata file 345 of 2038: FullMetadataDisplay.aspx?file=PhillyPlanning_Land_Use.xml
Processing metadata file 346 of 2038: FullMetadataDisplay.aspx?file=PhillyPlanning_Neighborhoods.xml
Processing metadata file 347 of 2038: F

Processing metadata file 419 of 2038: FullMetadataDisplay.aspx?file=PhillyStreets_Traffic_Districts_arc.xml
Processing metadata file 420 of 2038: FullMetadataDisplay.aspx?file=PhillyStreets_Traf_PM_Dist.xml
Processing metadata file 421 of 2038: FullMetadataDisplay.aspx?file=PhillyStreets_Traf_PM_Dist_arc.xml
Processing metadata file 422 of 2038: FullMetadataDisplay.aspx?file=PhillyStreets_WasteBaskets_Big_Belly.xml
Processing metadata file 423 of 2038: FullMetadataDisplay.aspx?file=PhillyStreets_WasteBaskets_Wire.xml
Processing metadata file 424 of 2038: FullMetadataDisplay.aspx?file=PhillyStreets_Zipcodes_Arc.xml
Processing metadata file 425 of 2038: FullMetadataDisplay.aspx?file=PhillyStreets_Zipcodes_Poly.xml
Processing metadata file 426 of 2038: FullMetadataDisplay.aspx?file=philadelphiatopographiccontours1996.xml
Processing metadata file 427 of 2038: FullMetadataDisplay.aspx?file=PhiladelphiaTransparcels201201.xml
Processing metadata file 428 of 2038: FullMetadataDisplay.aspx?file

Processing metadata file 497 of 2038: FullMetadataDisplay.aspx?file=PAMAP_Breaklines.xml
Processing metadata file 498 of 2038: FullMetadataDisplay.aspx?file=PAMAP_DEM.xml
Processing metadata file 499 of 2038: FullMetadataDisplay.aspx?file=PAMAP_BuildingPoint2007.xml
Processing metadata file 500 of 2038: FullMetadataDisplay.aspx?file=PAMAP_BuildingPolygon2007.xml
Processing metadata file 501 of 2038: FullMetadataDisplay.aspx?file=PAMAP_cycle1.xml
Processing metadata file 502 of 2038: FullMetadataDisplay.aspx?file=PAMAP_cycle2.xml
Processing metadata file 503 of 2038: FullMetadataDisplay.aspx?file=PAMAP_CountyMosaics.xml
Processing metadata file 504 of 2038: FullMetadataDisplay.aspx?file=PAMAP_CountyMosaicsCycle2.xml
Processing metadata file 505 of 2038: FullMetadataDisplay.aspx?file=PAMAP_HydrographyLine2007.xml
Processing metadata file 506 of 2038: FullMetadataDisplay.aspx?file=PAMAP_HydrographyPoly2007.xml
Processing metadata file 507 of 2038: FullMetadataDisplay.aspx?file=pamap_lidar

Processing metadata file 617 of 2038: FullMetadataDisplay.aspx?file=FEMA_NFHL_24_MD_20170620.xml
Processing metadata file 618 of 2038: FullMetadataDisplay.aspx?file=FEMA_NFHL_36_NY_20170614.xml
Processing metadata file 619 of 2038: FullMetadataDisplay.aspx?file=FEMA_NFHL_36_NY_20170614.xml
Processing metadata file 620 of 2038: FullMetadataDisplay.aspx?file=FEMA_NFHL_39_OH_20170802.xml
Processing metadata file 621 of 2038: FullMetadataDisplay.aspx?file=NFHL_42_20231001.xml
Processing metadata file 622 of 2038: FullMetadataDisplay.aspx?file=FEMA_dfirm_AllPA.xml
Processing metadata file 623 of 2038: FullMetadataDisplay.aspx?file=FEMA_NFHL_51_VA_20170805.xml
Processing metadata file 624 of 2038: FullMetadataDisplay.aspx?file=FEMA_NFHL_54_WV_20161030.xml
Processing metadata file 625 of 2038: FullMetadataDisplay.aspx?file=FEMA_PA_Historic_Flood_Maps.xml
Processing metadata file 626 of 2038: FullMetadataDisplay.aspx?file=HAZUS_Hpr_Baltimore_City_MD.xml
Processing metadata file 627 of 2038: Fu

Processing metadata file 713 of 2038: FullMetadataDisplay.aspx?file=IndianaCountyCenterlines202311.xml
Processing metadata file 714 of 2038: FullMetadataDisplay.aspx?file=IndianaCountyMajorRoads202311.xml
Processing metadata file 715 of 2038: FullMetadataDisplay.aspx?file=IndianaCountyLakes202311.xml
Processing metadata file 716 of 2038: FullMetadataDisplay.aspx?file=IndianaCountyMunicipalBoundaries202311.xml
Processing metadata file 717 of 2038: FullMetadataDisplay.aspx?file=IndianaCountyParcels202311.xml
Processing metadata file 718 of 2038: FullMetadataDisplay.aspx?file=IndianaCountyRailroads202311.xml
Processing metadata file 719 of 2038: FullMetadataDisplay.aspx?file=juniatamunicipal200801.xml
Processing metadata file 720 of 2038: FullMetadataDisplay.aspx?file=JuniataCountyParcels200801.xml
Processing metadata file 721 of 2038: FullMetadataDisplay.aspx?file=KeepPennsylvaniaBeautifulIllegalDumpSurveys.xml
Processing metadata file 722 of 2038: FullMetadataDisplay.aspx?file=KeepPenns

Processing metadata file 794 of 2038: FullMetadataDisplay.aspx?file=MontgomeryCounty_ElectricServiceAreas202207.xml
Processing metadata file 795 of 2038: FullMetadataDisplay.aspx?file=MontgomeryCounty_EMSAmbulanceDistricts202207.xml
Processing metadata file 796 of 2038: FullMetadataDisplay.aspx?file=MontgomeryCounty_EMSAmbulanceStations202207.xml
Processing metadata file 797 of 2038: FullMetadataDisplay.aspx?file=MontgomeryCounty_FireDistricts202207.xml
Processing metadata file 798 of 2038: FullMetadataDisplay.aspx?file=MontgomeryCounty_FireStations202207.xml
Processing metadata file 799 of 2038: FullMetadataDisplay.aspx?file=MontgomeryCounty_HistoricalAttraction202207.xml
Processing metadata file 800 of 2038: FullMetadataDisplay.aspx?file=MontgomeryCounty_Hospitals202207.xml
Processing metadata file 801 of 2038: FullMetadataDisplay.aspx?file=MontgomeryCounty_Libraries202207.xml
Processing metadata file 802 of 2038: FullMetadataDisplay.aspx?file=MontgomeryCounty_MunicipalBoundaries2022

Processing metadata file 874 of 2038: FullMetadataDisplay.aspx?file=ExplorePAtrails_Trails201805.xml
Processing metadata file 875 of 2038: FullMetadataDisplay.aspx?file=ExplorePAtrails_WP201703.xml
Processing metadata file 876 of 2038: FullMetadataDisplay.aspx?file=ExplorePAtrails_WaterTrails201703.xml
Processing metadata file 877 of 2038: FullMetadataDisplay.aspx?file=DCNR_BOF_Bndry_SFM201703.xml
Processing metadata file 878 of 2038: FullMetadataDisplay.aspx?file=FragMap_20121231Results.xml
Processing metadata file 879 of 2038: FullMetadataDisplay.aspx?file=FragMap_Pre2008Results.xml
Processing metadata file 880 of 2038: FullMetadataDisplay.aspx?file=DCNR_Road_Gas_Export_Dec_31_2012.xml
Processing metadata file 881 of 2038: FullMetadataDisplay.aspx?file=DCNR_Pads_Export_Dec_31_2012.xml
Processing metadata file 882 of 2038: FullMetadataDisplay.aspx?file=DCNR_LOC_Export_Dec_31_2012.xml
Processing metadata file 883 of 2038: FullMetadataDisplay.aspx?file=Draft_DCNR_OG_Ownership_Dec_31_201

Processing metadata file 957 of 2038: FullMetadataDisplay.aspx?file=MineMaps.xml
Processing metadata file 958 of 2038: FullMetadataDisplay.aspx?file=MineMaps.xml
Processing metadata file 959 of 2038: FullMetadataDisplay.aspx?file=MS4_Municipalities201201.xml
Processing metadata file 960 of 2038: FullMetadataDisplay.aspx?file=PublicWaterSupply2024_01.xml
Processing metadata file 961 of 2038: FullMetadataDisplay.aspx?file=RadiationFacilities2024_01.xml
Processing metadata file 962 of 2038: FullMetadataDisplay.aspx?file=ResidualWasteOperations2024_01.xml
Processing metadata file 963 of 2038: FullMetadataDisplay.aspx?file=StorageTankLocations_Active2024_03.xml
Processing metadata file 964 of 2038: FullMetadataDisplay.aspx?file=StorageTankLocations_Inactive2024_03.xml
Processing metadata file 965 of 2038: FullMetadataDisplay.aspx?file=Stormwater167_2024_01.xml
Processing metadata file 966 of 2038: FullMetadataDisplay.aspx?file=STREAMRELEAF2006_10.xml
Processing metadata file 967 of 2038: Fu

Processing metadata file 1050 of 2038: FullMetadataDisplay.aspx?file=PAhazus.xml
Processing metadata file 1051 of 2038: FullMetadataDisplay.aspx?file=PAhazus.xml
Processing metadata file 1052 of 2038: FullMetadataDisplay.aspx?file=PAhazus.xml
Processing metadata file 1053 of 2038: FullMetadataDisplay.aspx?file=PAhazus.xml
Processing metadata file 1054 of 2038: FullMetadataDisplay.aspx?file=PAhazus.xml
Processing metadata file 1055 of 2038: FullMetadataDisplay.aspx?file=PAhazus.xml
Processing metadata file 1056 of 2038: FullMetadataDisplay.aspx?file=PAhazus.xml
Processing metadata file 1057 of 2038: FullMetadataDisplay.aspx?file=PAhazus.xml
Processing metadata file 1058 of 2038: FullMetadataDisplay.aspx?file=PAhazus.xml
Processing metadata file 1059 of 2038: FullMetadataDisplay.aspx?file=PAhazus.xml
Processing metadata file 1060 of 2038: FullMetadataDisplay.aspx?file=paivanflood2004.xml
Processing metadata file 1061 of 2038: FullMetadataDisplay.aspx?file=Access202403.xml
Processing meta

Processing metadata file 1142 of 2038: FullMetadataDisplay.aspx?file=SomersetCountyHouseNumbers202203.xml
Processing metadata file 1143 of 2038: FullMetadataDisplay.aspx?file=SomersetCountyCenterlines202203.xml
Processing metadata file 1144 of 2038: FullMetadataDisplay.aspx?file=SEPTAGISHighspeedLines_201207.xml
Processing metadata file 1145 of 2038: FullMetadataDisplay.aspx?file=SEPTAGISHighspeedStations_201207.xml
Processing metadata file 1146 of 2038: FullMetadataDisplay.aspx?file=SEPTAGISRegionalRailLines_201207.xml
Processing metadata file 1147 of 2038: FullMetadataDisplay.aspx?file=SEPTAGISRegionalRailStations_2016.xml
Processing metadata file 1148 of 2038: FullMetadataDisplay.aspx?file=SEPTARoutesSpring2016.xml
Processing metadata file 1149 of 2038: FullMetadataDisplay.aspx?file=SEPTAStopsByLineSpring2016.xml
Processing metadata file 1150 of 2038: FullMetadataDisplay.aspx?file=SeptaTransitRoutesSummer2014.xml
Processing metadata file 1151 of 2038: FullMetadataDisplay.aspx?file=S

Processing metadata file 1228 of 2038: FullMetadataDisplay.aspx?file=Lake_Erie_Watershed_LiDAR2012.xml
Processing metadata file 1229 of 2038: FullMetadataDisplay.aspx?file=Lake_Erie_Watershed_Imagery2012.xml
Processing metadata file 1230 of 2038: FullMetadataDisplay.aspx?file=Lake_Erie_Watershed_Imagery2012.xml
Processing metadata file 1231 of 2038: FullMetadataDisplay.aspx?file=Lake_Erie_Watershed_Imagery2015.xml
Processing metadata file 1232 of 2038: FullMetadataDisplay.aspx?file=Lake_Erie_Watershed_Imagery2015.xml
Processing metadata file 1233 of 2038: FullMetadataDisplay.aspx?file=Lake_Erie_Watershed_2012_TileIndex.xml
Processing metadata file 1234 of 2038: FullMetadataDisplay.aspx?file=Lake_Erie_Watershed_Imagery2015.xml
Processing metadata file 1235 of 2038: FullMetadataDisplay.aspx?file=Lake_Erie_Watershed_Imagery2015.xml
Processing metadata file 1236 of 2038: FullMetadataDisplay.aspx?file=Lake_Erie_Watershed_Imagery2015.xml
Processing metadata file 1237 of 2038: FullMetadataDis

Processing metadata file 1319 of 2038: FullMetadataDisplay.aspx?file=gapherps30.xml
Processing metadata file 1320 of 2038: FullMetadataDisplay.aspx?file=gapherps90.xml
Processing metadata file 1321 of 2038: FullMetadataDisplay.aspx?file=gapmammals30.xml
Processing metadata file 1322 of 2038: FullMetadataDisplay.aspx?file=gapmammals90.xml
Processing metadata file 1323 of 2038: FullMetadataDisplay.aspx?file=gapstewardship1999.xml
Processing metadata file 1324 of 2038: FullMetadataDisplay.aspx?file=PA_GeomorphonLandformMaps2021.xml
Processing metadata file 1325 of 2038: FullMetadataDisplay.aspx?file=painactiverailroads.xml
Processing metadata file 1326 of 2038: FullMetadataDisplay.aspx?file=Riparia_HUC10_2011.xml
Processing metadata file 1327 of 2038: FullMetadataDisplay.aspx?file=Riparia_HUC12_2011.xml
Processing metadata file 1328 of 2038: FullMetadataDisplay.aspx?file=Riparia_HUC8_LC_Change.xml
Processing metadata file 1329 of 2038: FullMetadataDisplay.aspx?file=palanduse2000.xml
Proce

Processing metadata file 1407 of 2038: FullMetadataDisplay.aspx?file=tl_PennsylvaniaEconomicCensusCounty2009.xml
Processing metadata file 1408 of 2038: FullMetadataDisplay.aspx?file=tl_PennsylvaniaEconomicCensusPlace2009.xml
Processing metadata file 1409 of 2038: FullMetadataDisplay.aspx?file=tl_Pennsylvania3Digit2009.xml
Processing metadata file 1410 of 2038: FullMetadataDisplay.aspx?file=tl_Pennsylvania5Digit2009.xml
Processing metadata file 1411 of 2038: FullMetadataDisplay.aspx?file=tl_PennsylvaniaCongressionalDistricts2009.xml
Processing metadata file 1412 of 2038: FullMetadataDisplay.aspx?file=tl_PennsylvaniaBlockCurrent2009.xml
Processing metadata file 1413 of 2038: FullMetadataDisplay.aspx?file=tl_PennsylvaniaCurrentBlockStatebased2009.xml
Processing metadata file 1414 of 2038: FullMetadataDisplay.aspx?file=tl_PennsylvaniaCurrentCombinedStatisticalArea2009.xml
Processing metadata file 1415 of 2038: FullMetadataDisplay.aspx?file=tl_PennsylvaniaCurrentCounty2009.xml
Processing me

Processing metadata file 1493 of 2038: FullMetadataDisplay.aspx?file=PA_Soils2022.xml
Processing metadata file 1494 of 2038: FullMetadataDisplay.aspx?file=PA_Soils2022.xml
Processing metadata file 1495 of 2038: FullMetadataDisplay.aspx?file=NAIPQQIndex.xml
Processing metadata file 1496 of 2038: FullMetadataDisplay.aspx?file=nationalagricultureimageryprogram_2004.xml
Processing metadata file 1497 of 2038: FullMetadataDisplay.aspx?file=nationalagricultureimageryprogram_2004.xml
Processing metadata file 1498 of 2038: FullMetadataDisplay.aspx?file=nationalagricultureimageryprogram_2004.xml
Processing metadata file 1499 of 2038: FullMetadataDisplay.aspx?file=NAIP_CountyMosaics2005.xml
Processing metadata file 1500 of 2038: FullMetadataDisplay.aspx?file=nationalagricultureimageryprogram_2008.xml
Processing metadata file 1501 of 2038: FullMetadataDisplay.aspx?file=nationalagricultureimageryprogram_2008.xml
Processing metadata file 1502 of 2038: FullMetadataDisplay.aspx?file=NAIP_PA2010.xml
Pr

Processing metadata file 1583 of 2038: FullMetadataDisplay.aspx?file=EPARenewableEnergySites.xml
Processing metadata file 1584 of 2038: FullMetadataDisplay.aspx?file=EPAToxicReleaseInv1994.xml
Processing metadata file 1585 of 2038: FullMetadataDisplay.aspx?file=EPAToxicReleaseInv2017.xml
Processing metadata file 1586 of 2038: FullMetadataDisplay.aspx?file=EPAToxicReleaseInv2018.xml
Processing metadata file 1587 of 2038: FullMetadataDisplay.aspx?file=EPAToxicReleaseInv2019.xml
Processing metadata file 1588 of 2038: FullMetadataDisplay.aspx?file=chestercountywetandsline.xml
Processing metadata file 1589 of 2038: FullMetadataDisplay.aspx?file=chestercountywetlandspolygon.xml
Processing metadata file 1590 of 2038: FullMetadataDisplay.aspx?file=NWI_DE.xml
Processing metadata file 1591 of 2038: FullMetadataDisplay.aspx?file=NWI_MD.xml
Processing metadata file 1592 of 2038: FullMetadataDisplay.aspx?file=NWI_NJ.xml
Processing metadata file 1593 of 2038: FullMetadataDisplay.aspx?file=NWI_NY.xml

Processing metadata file 1672 of 2038: FullMetadataDisplay.aspx?file=NLCD_2001_Impervious_PA_20210604.xml
Processing metadata file 1673 of 2038: FullMetadataDisplay.aspx?file=NLCD_2004_Impervious_PA_20210604.xml
Processing metadata file 1674 of 2038: FullMetadataDisplay.aspx?file=NLCD_2006_Impervious_PA_20210604.xml
Processing metadata file 1675 of 2038: FullMetadataDisplay.aspx?file=NLCD_2008_Impervious_PA_20210604.xml
Processing metadata file 1676 of 2038: FullMetadataDisplay.aspx?file=NLCD_2011_Impervious_PA_20210604.xml
Processing metadata file 1677 of 2038: FullMetadataDisplay.aspx?file=NLCD_2013_Impervious_PA_20210604.xml
Processing metadata file 1678 of 2038: FullMetadataDisplay.aspx?file=NLCD_2016_Impervious_PA_20210604.xml
Processing metadata file 1679 of 2038: FullMetadataDisplay.aspx?file=NLCD_2019_Impervious_PA_20210604.xml
Processing metadata file 1680 of 2038: FullMetadataDisplay.aspx?file=NLCD_2001_Land_Cover_PA_20210604.xml
Processing metadata file 1681 of 2038: FullMet

Processing metadata file 1764 of 2038: FullMetadataDisplay.aspx?file=PA_NorthCentral_QL1_DEM.xml
Processing metadata file 1765 of 2038: FullMetadataDisplay.aspx?file=PA_NorthCentral_32498_NorthQL2_DEM.xml
Processing metadata file 1766 of 2038: FullMetadataDisplay.aspx?file=PA_NorthCentral_32498_SouthQL2_DEM.xml
Processing metadata file 1767 of 2038: FullMetadataDisplay.aspx?file=PA_NorthCentral_32498_NorthQL2_Intensity.xml
Processing metadata file 1768 of 2038: FullMetadataDisplay.aspx?file=PA_NorthCentral_32498_SouthQL2_Intensity.xml
Processing metadata file 1769 of 2038: FullMetadataDisplay.aspx?file=pasdaquads24k.xml
Processing metadata file 1770 of 2038: FullMetadataDisplay.aspx?file=Western_PA_QL1North_Breakline2020.xml
Processing metadata file 1771 of 2038: FullMetadataDisplay.aspx?file=Western_PA_QL1North_ClassifiedPointCloud2020.xml
Processing metadata file 1772 of 2038: FullMetadataDisplay.aspx?file=Western_PA_QL1North_Contour2020.xml
Processing metadata file 1773 of 2038: Ful

Processing metadata file 1853 of 2038: FullMetadataDisplay.aspx?file=AllentownPA_HydroFlattenedBareEarthRasterDEM2016.xml
Processing metadata file 1854 of 2038: FullMetadataDisplay.aspx?file=AllentownPA_IntensityImages2016.xml
Processing metadata file 1855 of 2038: FullMetadataDisplay.aspx?file=AllentownPA_Orthos2016.xml
Processing metadata file 1856 of 2038: FullMetadataDisplay.aspx?file=AllentownPA_Orthos2016.xml
Processing metadata file 1857 of 2038: FullMetadataDisplay.aspx?file=AllentownPA_Orthos2016.xml
Processing metadata file 1858 of 2038: FullMetadataDisplay.aspx?file=USGS_LiDAR2017.xml
Processing metadata file 1859 of 2038: FullMetadataDisplay.aspx?file=USGS_LiDAR2017.xml
Processing metadata file 1860 of 2038: FullMetadataDisplay.aspx?file=USGS_LiDAR2017.xml
Processing metadata file 1861 of 2038: FullMetadataDisplay.aspx?file=USGS_LiDAR2017.xml
Processing metadata file 1862 of 2038: FullMetadataDisplay.aspx?file=USGSLiDAR_DauphinCoPA_2016.xml
Processing metadata file 1863 of 

Processing metadata file 1937 of 2038: FullMetadataDisplay.aspx?file=WashingtonCounty_Parcels202312.xml
Processing metadata file 1938 of 2038: FullMetadataDisplay.aspx?file=WashingtonCounty_Parks202311.xml
Processing metadata file 1939 of 2038: FullMetadataDisplay.aspx?file=WashingtonCounty_PollingLocations202207.xml
Processing metadata file 1940 of 2038: FullMetadataDisplay.aspx?file=WashingtonCounty_RoadCenterlines202401.xml
Processing metadata file 1941 of 2038: FullMetadataDisplay.aspx?file=WashingtonCounty_SchoolDistricts202108.xml
Processing metadata file 1942 of 2038: FullMetadataDisplay.aspx?file=DRB_ProtectedLands202401.xml
Processing metadata file 1943 of 2038: FullMetadataDisplay.aspx?file=DRB_ProtectedLands202401.xml
Processing metadata file 1944 of 2038: FullMetadataDisplay.aspx?file=DRB_ProtectedLands202401.xml
Processing metadata file 1945 of 2038: FullMetadataDisplay.aspx?file=DRB_ProtectedLands202401.xml
Processing metadata file 1946 of 2038: FullMetadataDisplay.aspx?f

Processing metadata file 2027 of 2038: FullMetadataDisplay.aspx?file=Pennsylvania_soil_metadata_201409.xml
Processing metadata file 2028 of 2038: FullMetadataDisplay.aspx?file=YorkCounty_HYDRO_Streams202209.xml
Processing metadata file 2029 of 2038: FullMetadataDisplay.aspx?file=YorkCounty_Transit_Routes202209.xml
Processing metadata file 2030 of 2038: FullMetadataDisplay.aspx?file=YorkCounty_TRANSP_Transit_Stops202209.xml
Processing metadata file 2031 of 2038: FullMetadataDisplay.aspx?file=YorkCounty_ENVIR_Unique_Features202209.xml
Processing metadata file 2032 of 2038: FullMetadataDisplay.aspx?file=YorkCounty_DIST_Voting202209.xml
Processing metadata file 2033 of 2038: FullMetadataDisplay.aspx?file=YorkCounty_Zipcodes202209.xml
Processing metadata file 2034 of 2038: FullMetadataDisplay.aspx?file=YorkCounty_Zoning202209.xml
Processing metadata file 2035 of 2038: FullMetadataDisplay.aspx?file=YorkCounty_ZONING_Overlays202209.xml
Processing metadata file 2036 of 2038: FullMetadataDispla

In [8]:
# Look up the per-file extraction results for every row through its metadata file name
metadata_keys = df['Metadata File']
df['Bounding Box'] = metadata_keys.map(bounding_boxes)
df['Resource Type'] = metadata_keys.map(spatial_reference_methods)

# Rows whose resource type is exactly 'Raster data' are classed as imagery;
# every other row (including rows without a match) is classed as a dataset.
is_raster = df['Resource Type'].eq('Raster data')
df['Resource Class'] = np.where(is_raster, 'Imagery', 'Datasets')

# Series assignment aligns on the index labels of df
df['Keyword'] = theme_keywords_series
df['Place Names'] = place_keywords_series

In [None]:
# Optional sanity check: display the table, which now includes the columns
# added from the metadata files (bounding box, resource type/class, keywords).
df

## Part 4: add default and calculated values

In [9]:
def date_range_formatter(date_issued):
    """Build a ``YYYY-YYYY`` date range from the years found in ``date_issued``.

    Args:
        date_issued: Date text to scan for four-digit years. Non-string
            values (for example ``None`` or ``NaN`` for a missing date) are
            returned unchanged instead of raising ``TypeError``.

    Returns:
        ``"YYYY-YYYY"`` with the single year repeated when exactly one year
        is found, ``"YYYY-YYYY"`` from the first and second year when exactly
        two are found, and the original value otherwise.
    """
    # re.findall only accepts text; let missing/non-text values pass through.
    if not isinstance(date_issued, str):
        return date_issued

    years = re.findall(r'(\d{4})', date_issued)

    # One year: duplicate it so the value is still a range.
    if len(years) == 1:
        return f"{years[0]}-{years[0]}"
    # Two years: use them as start and end.
    if len(years) == 2:
        return f"{years[0]}-{years[1]}"
    # No year, or more than two: leave the input untouched.
    return date_issued

# Derive the 'Date Range' column element-wise from 'Date Issued'
df['Date Range'] = df['Date Issued'].map(date_range_formatter)

In [10]:
# Constant values applied to every record
default_values = {
    'Code': '08a-01',
    'Access Rights': 'Public',
    'Accrual Method': 'HTML',
    'Date Accessioned': time.strftime('%Y-%m-%d'),  # current local date
    'Language': 'eng',
    'Is Part Of': '08a-01',
    'Member Of': 'ba5cc745-21c5-4ae9-954b-72dd8db6815a',
    'Provider': 'Pennsylvania Spatial Data Access (PASDA)',
}
for column_name, constant in default_values.items():
    df[column_name] = constant

# The identifier is a copy of the existing 'Information' column
df['Identifier'] = df['Information']
df['Format'] = 'File'

### Clean up the titles

1. Title-case the "Alternative Title".
2. Check for a county name in the title and reformat it as `[Pennsylvania--{county} County]`.
3. If no county is found, check for a city name in the title and reformat it as `[Pennsylvania--{city}]`.
4. If neither a county nor a city is found, replace the first "PA" or "Pennsylvania" with `[Pennsylvania]`.
5. Capture the content in brackets, remove it from its original position, and append it to the end of the title.
6. Perform some specific cleanup transformations on the result.
7. Append the value from 'Date Issued' at the end of the title, surrounded by curly brackets.

In [11]:
# Load the Pennsylvania county and city name lists used by the title and
# creator clean-up functions. locations.json is expected in the "data"
# directory two levels above this notebook.
json_path = os.path.join('../../', 'data', 'locations.json')

# Name the encoding explicitly so the read does not depend on the platform
# default (the HTML file earlier in this notebook is read the same way).
with open(json_path, 'r', encoding='utf-8') as file:
    locations = json.load(file)

counties_in_pennsylvania = locations['counties_in_pennsylvania']
cities_in_pennsylvania = locations['cities_in_pennsylvania']



def transform_title(row):
    """Build the display title for one record.

    The first county, city or state reference found in the record's
    'Alternative Title' is rewritten as a bracketed place name, moved to the
    end of the title, and followed by the 'Date Issued' value in curly
    brackets, e.g. ``"Roads [Pennsylvania--Adams County] {2024}"``.

    Reads the module-level lists ``counties_in_pennsylvania`` and
    ``cities_in_pennsylvania``.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys
            'Alternative Title' and 'Date Issued'.

    Returns:
        The transformed title string.
    """
    alt_title = row['Alternative Title']
    # A missing title (None/NaN) is treated as empty instead of crashing.
    if not isinstance(alt_title, str):
        alt_title = ''

    # Counties take priority over cities, and cities over the state name.
    # Names are escaped so regex metacharacters in them are matched literally,
    # and the replacement is a callable so it is inserted verbatim.
    for county in counties_in_pennsylvania:
        county_pattern = rf"{re.escape(county)} County"
        if re.search(county_pattern, alt_title, re.I):
            alt_title = re.sub(
                county_pattern,
                lambda _match: f"[Pennsylvania--{county} County]",
                alt_title, count=1, flags=re.I,
            )
            break
    else:
        for city in cities_in_pennsylvania:
            # Raw string: in a plain string "\b" is a backspace character,
            # not a word boundary, so the city would never match.
            city_pattern = rf"\b{re.escape(city)}\b"
            if re.search(city_pattern, alt_title, re.I):
                alt_title = re.sub(
                    city_pattern,
                    lambda _match: f"[Pennsylvania--{city}]",
                    alt_title, count=1, flags=re.I,
                )
                break
        else:
            alt_title = re.sub(r"\b(PA|Pennsylvania)\b", "[Pennsylvania]", alt_title, flags=re.I, count=1)

    # Capture content in brackets
    bracket_content = re.findall(r'\[(.*?)\]', alt_title)

    if bracket_content:
        # Remove bracketed content from its original position ...
        alt_title = re.sub(r'\[.*?\]', '', alt_title).strip()
        # ... and append the first bracketed value to the end of the title
        alt_title = f"{alt_title} [{bracket_content[0]}]"

    # Drop connecting phrases left dangling in front of the bracket
    for dangling_phrase in (r"For \[", r"For The \[", r"For The City Of \["):
        alt_title = re.sub(dangling_phrase, "[", alt_title, flags=re.I)

    # Remove a leading dash entirely; a dash just before the bracket becomes a
    # single space so the title text is not glued to the bracket.
    alt_title = re.sub(r"^\s*-\s*", "", alt_title)
    alt_title = re.sub(r"\s*-\s*(?=\[)", " ", alt_title)

    # Collapse the gaps left where the place name was cut out
    alt_title = re.sub(r"\s{2,}", " ", alt_title).strip()

    # Make sure the first letter is capitalized (skipped for an empty title)
    if alt_title:
        alt_title = alt_title[0].capitalize() + alt_title[1:]

    # Append the value from 'Date Issued' surrounded by curly brackets
    return f"{alt_title} {{{row['Date Issued']}}}".strip()

# Build the 'Title' column row by row; transform_title reads each row's
# 'Alternative Title' and 'Date Issued' values.
df['Title'] = df.apply(transform_title, axis=1)
    


In [12]:
def transform_publisher(publisher):
    """Normalize a publisher name.

    Lookup order: an exact match in the federal-agency table, then the first
    county whose "<name> County" occurs in the publisher, then the first city
    matched as "City of <name>" or as the whole publisher string. Anything
    else is returned unchanged.

    Reads the module-level lists ``counties_in_pennsylvania`` and
    ``cities_in_pennsylvania``.
    """
    # Exact-match rewrites for federal agencies
    federal_agency_names = {
        "U S Geological Survey": "Geological Survey (U.S.)",
        "U S Fish and Wildlife Service": "U.S. Fish and Wildlife Service",
        "U S Environmental Protection Agency": "United States. Environmental Protection Agency",
        "U S Department of Agriculture": "United States. Department of Agriculture",
        "U S Census Bureau": "U.S. Census Bureau"
    }

    mapped_name = federal_agency_names.get(publisher)
    if mapped_name is not None:
        return mapped_name

    # First county named anywhere in the publisher string
    matching_county = next(
        (name for name in counties_in_pennsylvania if name + " County" in publisher),
        None,
    )
    if matching_county is not None:
        return f"Pennsylvania--{matching_county} County"

    # First city named as "City of X", or equal to the whole publisher string
    matching_city = next(
        (
            name
            for name in cities_in_pennsylvania
            if f"City of {name}" in publisher or name == publisher
        ),
        None,
    )
    if matching_city is not None:
        return f"Pennsylvania--{matching_city}"

    # Nothing recognised: keep the publisher as given
    return publisher

# Normalize every value of the 'Creator' column element-wise
df['Creator'] = df['Creator'].map(transform_publisher)

In [13]:
# Conversion table from source keywords to theme values.
# Keys must be entirely lowercase: keywords are lowercased before lookup, so a
# key containing uppercase letters can never match.
subject_sm_mapping = {
    "farming": "Agriculture",
    "farmin": "Agriculture",
    "biota": "Biology",
    "boundaries": "Boundaries",
    "climatologymeteorologyatmosphere": "Climate",
    "economy": "Economy",
    "elevation": "Elevation",
    "elevation data": "Elevation",
    "environment": "Environment",
    "environmental": "Environment",
    "society; climatologymeteorologyatmosphere": "Events",
    "geoscientificinformation": "Geology",
    "health": "Health",
    "imagerybasemapsearthcover": "Imagery|Land Cover",
    "inlandwaters": "Inland Waters",
    "location": "Location",
    "intelligencemilitary": "Military",
    "oceans": "Oceans",
    "planningcadastre": "Property",
    "planning": "Property",
    "parcel": "Property",
    "zoning": "Property",
    "society": "Society",
    "structure": "Structure",
    "transportation": "Transportation",
    "utilitiescommunication": "Utilities"

    # Add more key-value pairs for other conversions as needed
}


def convert_and_join(row):
    """Translate a row's pipe-delimited 'Keyword' value into theme values.

    Each keyword is lowercased and looked up in ``subject_sm_mapping``;
    keywords without an entry are dropped. The mapped values are joined with
    '|'. A missing (NaN/None) 'Keyword' yields an empty string.
    """
    keyword_field = row['Keyword']

    # Nothing to split when the keyword value is missing
    if pd.isna(keyword_field):
        return ''

    lowered_keywords = (keyword.lower() for keyword in keyword_field.split('|'))
    return '|'.join(
        subject_sm_mapping[keyword]
        for keyword in lowered_keywords
        if keyword in subject_sm_mapping
    )

# Map each row's keywords to themes and store them in the new "Theme" column
df['Theme'] = df.apply(convert_and_join, axis=1)

# Remove repeated themes within each row, keeping first-occurrence order
df['Theme'] = df['Theme'].str.split('|').apply(
    lambda themes: '|'.join(dict.fromkeys(themes))
)

In [14]:
def remove_special_characters(title):
    """Strip the leading run of characters that are not ASCII letters or digits.

    Only the start of the string is affected; the rest is returned as is.
    """
    leading_run = re.match(r'^[^a-zA-Z0-9]+', title)
    if leading_run is None:
        return title
    return title[leading_run.end():]

# Clean the start of every value in the "Title" column
df['Title'] = df['Title'].map(remove_special_characters)

In [15]:
# Final column order for the exported table
desired_order = [
    # Place and descriptive fields
    'Place Names', 'Spatial Coverage',
    'Title', 'Alternative Title', 'Description',
    'Language', 'Format',
    # Responsibility
    'Creator', 'Provider',
    # Classification
    'Resource Class', 'Resource Type', 'Theme', 'Keyword',
    # Dates and extent
    'Date Issued', 'Date Range', 'Date Accessioned', 'Bounding Box',
    # Relations and accrual
    'Member Of', 'Is Part Of', 'Accrual Method',
    # Links and identifiers
    'Download', 'HTML', 'Information', 'ID', 'Identifier',
    'Access Rights',
    # Add more columns as needed in the desired order
]

# reindex keeps only the listed columns, in this order; a listed column that
# does not exist yet is created empty (NaN)
df = df.reindex(columns=desired_order)

In [28]:
# Run the shared helper imported from modules.spatial_coverage_transformer.
# NOTE(review): presumably fills the spatial-coverage fields - confirm against the module.
df = apply_transformations(df)

In [30]:
def clean_punctuation(text):
    """Trim pipes, dashes and spaces from both ends of a string value.

    Values that are not strings are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    return text.strip('|- ')

# Apply the function to each cell in the DataFrame.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map; use whichever this pandas version provides.
elementwise_apply = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
df = elementwise_apply(clean_punctuation)

In [31]:
# Write the finished table to a CSV named after today's date (YYYYMMDD)
actionDate = time.strftime('%Y%m%d')
output_csv = f'output_{actionDate}.csv'
df.to_csv(output_csv, index=False)
print('#### Job done ####')

#### Job done ####
