## Import Modules

In [136]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [137]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import csv 
import time
import urllib.request
from urllib.request import urlopen
import numpy as np
import os
import re
import json

import sys
sys.path.append('../../')  # Add the parent directory to the path

actionDate = time.strftime('%Y%m%d')



## Part 1: Obtain a list of dataset pages and query the discovery metadata

We use PASDA's search feature to return a page (https://www.pasda.psu.edu/uci/SearchResults.aspx?Keyword=+) that lists all of the active dataset landing pages along with some metadata. Then, we use the Beautiful Soup module to parse this page and harvest the following values:

- Title
- Date Issued
- Publisher
- Description
- Metadata file link
- Download link

### MANUAL STEP!!

1. Open https://www.pasda.psu.edu/uci/SearchResults.aspx?Keyword=+ in a browser
2. Download the page
3. Save the file as "pasda-search.html" in the same directory as this notebook

### Read the downloaded file into a pandas dataframe

In [138]:
file_path = 'pasda-search.html'  # Modify this to the correct path to your downloaded HTML file
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()

soup = BeautifulSoup(html_content, 'html.parser')

# All left-aligned table rows of the results page. The harvest loop below does not
# use this list; it works from the title anchors instead.
datasets = soup.select('tr[align="left"]')

# Prefix that turns the relative hrefs found on the search page into absolute URLs.
PASDA_BASE_URL = "https://www.pasda.psu.edu/uci/"


def _link_href(container, label):
    """Return the href of the <a> in `container` whose text is exactly `label`.

    Returns '' when no such link exists, so a dataset without that link
    is recorded with a blank value instead of raising.
    """
    link = container.find('a', href=True, string=label)
    return link['href'] if link is not None else ''


data = []

# One anchor per dataset: the title link that points at the DataSummary landing page.
dataset_entries = soup.select('td > h3 > a[href^="DataSummary.aspx?dataset="]')

for entry in dataset_entries:
    # The selector guarantees the anchor sits in <td><h3><a>, so two parents up is the <td>.
    cell = entry.parent.parent

    publisher = entry.find_next("td").text.strip()
    date = entry.find_previous("td").find_previous("td").text.strip()
    title = entry.text.strip()
    description = entry.find_next("span", id=lambda x: x and x.startswith('DataGrid1_Label3_')).text.strip()

    # Relative metadata href plus its absolute form; both blank when the link is absent.
    metadataFile = _link_href(cell, 'Metadata')
    metadataLink = PASDA_BASE_URL + metadataFile if metadataFile else ''

    # Not every dataset offers a download; a missing link is recorded as ''.
    download = _link_href(cell, 'Download')

    # obtain full landing page and create ID from the value after the last "=" in the URL
    landing_page = PASDA_BASE_URL + entry['href']  # Landing page URL
    iden = 'pasda-' + landing_page.rsplit("=", 1)[1]

    data.append([publisher, date, title, description, metadataFile, metadataLink, download, landing_page, iden])


# Convert to pandas dataframe
df = pd.DataFrame(data, columns=['Creator', 'Date Issued', 'Alternative Title', 'Description', 'Metadata File', 'metadata_html', 'download', 'full_layer_description', 'ID'])
    

In [139]:
# Save a dated snapshot of the raw harvest before any filtering or clean-up.
# (To inspect the frame, put `df.head()` on the last line of its own cell; an
# expression in the middle of a cell is evaluated but never displayed.)
actionDate = time.strftime('%Y%m%d')
df.to_csv(f'pasda-minimal_{actionDate}.csv', index=False)
print('#### Job done ####')

#### Job done ####


### Drop federal datasets

In [140]:
# List of creator values to drop
values_to_drop = [
    "United States Army Corps of Engineers USACE",
    "U S Geological Survey",
    "U S Fish and Wildlife Service",
    "U S Environmental Protection Agency",
    "U S Department of Justice",
    "U S Department of Commerce",
    "U S Department of Agriculture",
    "U S Census Bureau",
    "National Weather Service NOAA NWS",
    "National Renewable Energy Laboratory NREL",
    "National Park Service",
    "National Geodetic Survey",
    "National Aeronautics and Space Administration NASA"
]

# Drop rows whose 'Creator' exactly equals one of these values (isin is an exact
# match, not a substring test). The explicit .copy() makes the result an independent
# frame, so the column assignments in later cells do not raise SettingWithCopyWarning.
df = df[~df['Creator'].isin(values_to_drop)].copy()


## Part 2: Add default and calculated values

In [141]:
def date_range_formatter(date_issued):
    """Turn a 'Date Issued' string into a 'YYYY-YYYY' range.

    Every run of four digits in the input is treated as a year:
      * one year   -> that year repeated as both ends of the range
      * two years  -> first-second, in the order they appear
      * otherwise  -> the input string is returned unchanged
    """
    found_years = re.findall(r'(\d{4})', date_issued)

    # Anything other than one or two years cannot be expressed as a range here.
    if len(found_years) not in (1, 2):
        return date_issued

    # With a single year, the first and last element are the same item.
    range_start, range_end = found_years[0], found_years[-1]
    return f"{range_start}-{range_end}"

# Derive the 'Date Range' column element-wise from the harvested 'Date Issued' strings.
df['Date Range'] = df['Date Issued'].map(date_range_formatter)

In [142]:
# Append default values: every row receives the same value for these fields, except
# 'Identifier', which copies each row's landing-page URL. The mapping keeps the
# original column order.
default_fields = {
    'Code': '08a-01',
    'Access Rights': 'Public',
    'Accrual Method': 'HTML',
    'Date Accessioned': time.strftime('%Y-%m-%d'),
    'Language': 'eng',
    'Is Part Of': '08a-01',
    'Member Of': 'ba5cc745-21c5-4ae9-954b-72dd8db6815a',
    'Provider': 'Pennsylvania Spatial Data Access (PASDA)',
    'Identifier': df['full_layer_description'],
    'Format': 'File',
    'Resource Class': 'Datasets',
}

for field_name, field_value in default_fields.items():
    df[field_name] = field_value

### Clean up the titles

The next cell builds a cleaned `Title` from the "Alternative Title" in these steps:

1. Title-case the "Alternative Title".
2. Check for a county name in the title and reformat it as `[Pennsylvania--<County> County]`.
3. If no county is found, check for a city name in the title and reformat it as `[Pennsylvania--<City>]`.
4. If neither a county nor a city is found, replace the first "PA" or "Pennsylvania" with `[Pennsylvania]`.
5. Capture the content in brackets, remove it from its original position, and append it to the end of the title.
6. Perform some specific cleanup transformations (for example, dropping a dangling "For" left in front of the bracket).
7. Append the value from 'Date Issued' to the end of the title, surrounded by curly brackets.

In [143]:
from modules.spatial_coverage_transformer import apply_transformations

# Lookup file holding the Pennsylvania county and city name lists used by the
# title and publisher clean-up functions.
json_path = os.path.join('../../', 'data', 'locations.json')

# Read as UTF-8 explicitly so the result does not depend on the platform's
# default encoding (same convention as the HTML file opened earlier).
with open(json_path, 'r', encoding='utf-8') as file:
    locations = json.load(file)

counties_in_pennsylvania = locations['counties_in_pennsylvania']
cities_in_pennsylvania = locations['cities_in_pennsylvania']



def transform_title(row):
    """Build the cleaned display title for one dataset row.

    Reads row['Alternative Title'] and row['Date Issued'] and relies on the
    module-level lists `counties_in_pennsylvania` and `cities_in_pennsylvania`.

    Steps:
      1. Tag the first place reference found, trying in order: "<county> County",
         a city name as a whole word, then "PA"/"Pennsylvania". Matching is
         case-insensitive and the tag is written in brackets.
      2. Remove all bracketed text from the title and re-append the first
         bracketed item at the end.
      3. Drop a dangling "For" / "For The" / "For The City Of" left directly in
         front of the bracket, and stray dashes at the start or before the bracket.
      4. Capitalise the first character and append " {<Date Issued>}".

    Returns the new title string.
    """
    alt_title = row['Alternative Title']

    # Search for a county name first, then a city name, then the state itself.
    for county in counties_in_pennsylvania:
        # re.escape keeps punctuation in a name from being read as regex syntax.
        county_match = re.search(rf"{re.escape(county)} County", alt_title, re.I)
        if county_match:
            # Splice the canonical tag over the first occurrence only.
            alt_title = (
                alt_title[:county_match.start()]
                + f"[Pennsylvania--{county} County]"
                + alt_title[county_match.end():]
            )
            break
    else:
        for city in cities_in_pennsylvania:
            # Raw string is essential: in a normal string "\b" is a backspace
            # character, not the regex word boundary, and nothing would match.
            city_match = re.search(rf"\b{re.escape(city)}\b", alt_title, re.I)
            if city_match:
                alt_title = (
                    alt_title[:city_match.start()]
                    + f"[Pennsylvania--{city}]"
                    + alt_title[city_match.end():]
                )
                break
        else:
            alt_title = re.sub(r"\b(PA|Pennsylvania)\b", "[Pennsylvania]", alt_title, flags=re.I, count=1)

    # Capture content in brackets
    bracket_content = re.findall(r'\[(.*?)\]', alt_title)

    if bracket_content:
        # Remove bracketed content from original position
        alt_title = re.sub(r'\[.*?\]', '', alt_title).strip()

        # Append the first bracketed item to the end of the title
        alt_title = f"{alt_title} [{bracket_content[0]}]"

    # Cleanup phrases post-transformation using case-insensitive matching
    for dangling_phrase in (r"For \[", r"For The \[", r"For The City Of \["):
        alt_title = re.sub(dangling_phrase, "[", alt_title, flags=re.I)

    # Remove unwanted dashes at the beginning or just before a bracket
    alt_title = re.sub(r"^\s*-\s*|\s*-\s*(?=\[)", "", alt_title)

    # Make sure first letter is capitalized (guarded: an empty title has no first character)
    if alt_title:
        alt_title = alt_title[0].capitalize() + alt_title[1:]

    # Append the value from 'Date Issued' surrounded by curly brackets
    alt_title += f" {{{row['Date Issued']}}}"

    return alt_title

# Build the 'Title' column row by row; axis=1 passes each row to transform_title,
# which reads that row's 'Alternative Title' and 'Date Issued'.
df['Title'] = df.apply(transform_title, axis=1)


    


In [144]:
# Pass the frame through the project's spatial-coverage helper and keep what it returns.
# NOTE(review): the columns it adds or changes are defined in
# modules.spatial_coverage_transformer, which is not visible here; presumably it
# fills 'Spatial Coverage' — confirm against that module.
df = apply_transformations(df)

In [145]:
def transform_publisher(publisher):
    """Map a publisher string to a 'Pennsylvania--<place>' value where possible.

    Checks are case-sensitive and made in this order:
      1. the first county whose "<county> County" occurs in the publisher string;
      2. the first city for which "City of <city>" occurs in the string, or whose
         name equals the whole string.
    If neither matches, the publisher string is returned unchanged.
    """
    matched_county = next(
        (name for name in counties_in_pennsylvania if f"{name} County" in publisher),
        None,
    )
    if matched_county is not None:
        return f"Pennsylvania--{matched_county} County"

    matched_city = next(
        (name for name in cities_in_pennsylvania
         if f"City of {name}" in publisher or name == publisher),
        None,
    )
    if matched_city is not None:
        return f"Pennsylvania--{matched_city}"

    # No county or city recognised: leave the value as harvested.
    return publisher

# Rewrite each 'Creator' value element-wise with the county/city lookup above.
df['Creator'] = df['Creator'].map(transform_publisher)

In [146]:
## fill spatial values

from modules.spatial_values import load_csv_data, match_and_append_values


# Location of the county-level spatial lookup table
csv_path = os.path.join('../../', 'data', 'spatial_counties.csv')

# Best-effort enrichment: if loading or matching fails, the problem is reported
# and the notebook carries on with df as it was before this cell.
try:
    csv_data = load_csv_data(csv_path)
    # The helper's return value replaces df (a rebind, not an in-place edit here).
    df = match_and_append_values(df, csv_data)
except Exception as err:
    print(f"An error occurred: {err}")


In [147]:
# Fallback spatial values, applied to rows that have none of their own
default_values = {
    'Bounding Box': "-80.52,39.72,-74.69,42.27",
    'Geometry': "MultiPolygon(((-75.6 39.8, -75.8 39.7, -80.5 39.7, -80.5 42.3, -79.8 42.5, -79.8 42, -75.3 42, -75.1 41.8, -75 41.5, -74.7 41.4, -75.1 41, -75.1 40.9, -75.2 40.7, -74.7 40.2, -75.1 39.9, -75.6 39.8)))",
    'GeoNames ID': "http://sws.geonames.org/6254927"
}

# An existing column only has its missing (NaN) cells filled; a column that does
# not exist yet is created holding the default in every row.
# NOTE(review): fillna does not touch empty strings — confirm the upstream helper
# leaves blanks as NaN rather than ''.
for column, default in default_values.items():
    df[column] = df[column].fillna(default) if column in df.columns else default

In [148]:
# Helper that trims non-alphanumeric clutter from the front of a title
def remove_special_characters(title):
    """Return `title` without its leading run of characters outside a-z, A-Z and 0-9.

    Only the start of the string is affected; a string that already begins with
    an ASCII letter or digit comes back unchanged.
    """
    return re.sub(r'^[^a-zA-Z0-9]+', '', title)

# Strip leading non-alphanumeric characters from every value in the "Title" column
df['Title'] = df['Title'].map(remove_special_characters)

In [149]:
# Output column layout, in the order the columns should appear
desired_order = [
    # descriptive fields
    'Spatial Coverage',
    'Title',
    'Alternative Title',
    'Description',
    'Language',
    'Format',
    'Creator',
    'Provider',
    'Resource Class',
    'Resource Type',
    'Theme',
    'Keyword',
    # dates
    'Date Issued',
    'Date Range',
    'Date Accessioned',
    # spatial fields
    'Bounding Box',
    'Geometry',
    'GeoNames ID',
    # administrative fields
    'Member Of',
    'Is Part Of',
    'Accrual Method',
    'ID',
    'Identifier',
    'Access Rights',
    # link fields
    'download',
    'metadata_html',
    'full_layer_description',
]

# reindex keeps exactly the listed columns in the listed order: a listed column that
# df lacks is added and filled with NaN, and any column not listed (e.g. 'Code' and
# 'Metadata File', if still present) is dropped.
df = df.reindex(columns=desired_order)

In [150]:
def clean_punctuation(text):
    """Strip pipes, dashes and spaces from both ends of a string value.

    Non-string values (numbers, NaN, None, ...) are handed back untouched, and
    characters in the interior of the string are never altered.
    """
    if not isinstance(text, str):
        return text
    return text.strip('|- ')

# Apply the function to each cell in the DataFrame.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map, so
# use `map` when this pandas has it and fall back to `applymap` on older versions.
# The check is made on the class so a column named 'map' cannot confuse it.
_elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
df = _elementwise(clean_punctuation)

In [151]:
# First CSV: the metadata record, i.e. every column except the link columns
link_fields = ['full_layer_description', 'download', 'metadata_html']
df_first_csv = df.drop(columns=link_fields)
df_first_csv.to_csv(f'{actionDate}_pasda-metadata.csv', index=False)

# Second CSV: one row per (dataset, link type) for every link that is neither
# missing nor empty. Only "download" links carry a label, taken from the row's
# 'Format' value; the other link types get an empty label.
rows = [
    {
        'friendlier_id': record['ID'],
        'reference_type': link_field,
        'distribution_url': record[link_field],
        'label': record['Format'] if link_field.lower() == 'download' else '',
    }
    for _, record in df.iterrows()
    for link_field in link_fields
    if pd.notna(record[link_field]) and record[link_field] != ""
]

df_second_csv = pd.DataFrame(rows, columns=['friendlier_id', 'reference_type', 'distribution_url', 'label'])
df_second_csv.to_csv(f'{actionDate}_pasda-links.csv', index=False)

print("CSV files have been created successfully.")

CSV files have been created successfully.
