In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


import csv 
import time
import urllib.request
from urllib.request import urlopen
import numpy as np
import os
import re



## Part 1: Obtain a list of dataset pages and query the discovery metadata

We will query the website to return:

- Title
- Description
- Metadata file link
- Download link
- Web Services link

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_landing_pages(base_url):
    """Collect the landing page URL of every dataset listed under ``{base_url}/data``.

    Each ``.item-list`` section of the listing page is treated as one theme:
    the text of its ``<h3>`` becomes the theme name, and every ``.views-row``
    inside it contributes one dataset.

    Parameters
    ----------
    base_url : str
        Site root without a trailing slash, e.g. ``'https://example.org'``.

    Returns
    -------
    list of tuple
        ``(theme, title, landing_page)`` triples, in page order.

    Raises
    ------
    requests.HTTPError
        If the listing page answers with an error status.
    """
    from urllib.parse import urljoin

    listing_url = f"{base_url}/data"
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(listing_url, timeout=30)
    # Fail loudly instead of silently parsing an error page into an empty list.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    landing_pages = []
    for theme_section in soup.select('.item-list'):
        heading = theme_section.find('h3')
        # Sections without a heading are kept, with an empty theme.
        theme = heading.text.strip() if heading is not None else ''
        for dataset in theme_section.select('.views-row'):
            link = dataset.select_one('.views-field-title a[href]')
            if link is None:
                # A row without a link has no landing page to harvest.
                continue
            title = dataset.select_one('.views-field-title').text.strip()
            # urljoin handles root-relative, page-relative and absolute hrefs.
            landing_pages.append((theme, title, urljoin(listing_url, link['href'])))

    return landing_pages


# The call must come after the definition so the cell runs on a fresh kernel.
base_url = 'https://clearinghouse.isgs.illinois.edu'
landing_pages = get_landing_pages(base_url)

def get_dataset_info(theme, title, landing_page):
    """Scrape one dataset landing page into a flat metadata record.

    Parameters
    ----------
    theme : str
        Theme heading the dataset was listed under; stored as ``'Keyword'``.
    title : str
        Title shown on the listing page; stored as ``'Alternative Title'``.
    landing_page : str
        URL of the dataset page; stored as ``'Information'`` and used as the
        base for resolving relative links found on the page.

    Returns
    -------
    dict
        Keys: ``Keyword``, ``Alternative Title``, ``Information``,
        ``Description``, ``HTML``, ``FGDC``, ``Documentation``,
        ``ImageServer``, ``MapServer``, ``FeatureServer``, ``Download``.
        Fields that are not found on the page stay as empty strings.
    """
    from urllib.parse import urljoin

    # A timeout keeps one stalled page from hanging the whole harvest.
    response = requests.get(landing_page, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    dataset_info = {
        'Keyword': theme,
        'Alternative Title': title,
        'Information': landing_page,
        'Description': '',
        'HTML': '',
        'FGDC': '',
        'Documentation': '',
        'ImageServer': '',
        'MapServer': '',
        'FeatureServer': '',
        'Download': ''
    }

    def absolute(href):
        # Resolve against the page itself rather than a notebook-level global;
        # links that are already absolute are returned unchanged.
        return urljoin(landing_page, href)

    # Summary text: first field item inside the summary fieldset, if present.
    summary_fieldset = soup.select_one('fieldset.group-summary')
    summary_div = summary_fieldset.select_one('div.field-item') if summary_fieldset is not None else None
    if summary_div is not None:
        dataset_info['Description'] = summary_div.text.strip()

    # Download: keep only the first link that points to a .zip file.
    # The ``a[href]`` selector skips anchors that carry no href attribute.
    for anchor in soup.select('.group-downloads .field-item a[href]'):
        if anchor['href'].endswith('.zip'):
            dataset_info['Download'] = absolute(anchor['href'])
            break

    # Metadata links are categorised by file extension; when several links
    # share an extension the last one on the page wins.
    metadata_fields = (
        (('.htm', '.html'), 'HTML'),
        (('.xml',), 'FGDC'),
        (('.pdf',), 'Documentation'),
    )
    for anchor in soup.select('.group-metadata .field-item a[href]'):
        metadata_link = absolute(anchor['href'])
        for extensions, field in metadata_fields:
            if metadata_link.endswith(extensions):
                dataset_info[field] = metadata_link
                break

    # NOTE(review): this selector was previously '.group_services' (underscore)
    # while the sibling groups use hyphens; both spellings are matched here.
    # Confirm against the live page markup.
    service_selector = (
        '.group-services .field-item a[href], '
        '.group_services .field-item a[href]'
    )
    for anchor in soup.select(service_selector):
        service_link = absolute(anchor['href'])
        for service_type in ('ImageServer', 'MapServer', 'FeatureServer'):
            if service_link.endswith(f'/{service_type}'):
                dataset_info[service_type] = service_link
                break

    return dataset_info

def main(pages=None):
    """Harvest metadata for every landing page and return it as a DataFrame.

    Parameters
    ----------
    pages : iterable of (theme, title, landing_page), optional
        Landing pages to harvest. When omitted, the notebook-level
        ``landing_pages`` list is used.

    Returns
    -------
    pandas.DataFrame
        One row per dataset, built from the dicts returned by
        ``get_dataset_info``.
    """
    if pages is None:
        pages = landing_pages

    all_datasets = [
        get_dataset_info(theme, title, landing_page)
        for theme, title, landing_page in pages
    ]
    df = pd.DataFrame(all_datasets)

    # Report completion before returning; a statement after `return` never runs.
    print("Metadata harvesting completed.")
    return df


if __name__ == "__main__":
    df = main()



## Part 2: Configure additional fields

### Create an ID based upon URL

In [4]:
import urllib.parse

def generate_id(url, code="02a-01"):
    """Build a record ID from the part of a URL path after ``/data/`` or ``/datasets/``.

    The remainder of the path has its leading and trailing slashes dropped and
    its inner slashes replaced by dashes, then is prefixed with ``code`` and an
    underscore.

    Parameters
    ----------
    url : str
        Landing page URL.
    code : str, optional
        Prefix placed in front of the ID. Defaults to ``"02a-01"``.

    Returns
    -------
    str
        ``"{code}_{path-with-dashes}"``, or the literal string
        ``"Prefix not found"`` when the path contains neither prefix.
    """
    path = urllib.parse.urlparse(url).path

    # "/data/" is tried first; the first prefix found anywhere in the path wins.
    for prefix in ("/data/", "/datasets/"):
        start_pos = path.find(prefix)
        if start_pos == -1:
            continue
        # Strip outer slashes so a URL ending in "/" does not yield a dangling "-".
        relevant_path = path[start_pos + len(prefix):].strip('/')
        return f"{code}_{relevant_path.replace('/', '-')}"

    return "Prefix not found"

# Derive an ID for every row from its landing page URL.
landing_page_urls = df['Information']
df['ID'] = landing_page_urls.apply(generate_id)

### Transform the title and extract a year

In [5]:
import re

def transform_title_and_extract_year(title):
    """Normalise a dataset title and pull out the first four-digit number as a year.

    "in Illinois", "Illinois, " and "Illinois" are removed from the title. If a
    standalone four-digit number is present, the first one is returned as the
    year and all such numbers are removed from the title.

    Parameters
    ----------
    title : str
        Title as scraped from the listing page.

    Returns
    -------
    tuple
        ``(transformed_title, year)`` where ``transformed_title`` is
        ``"<title> [Illinois] {<year>}"`` when a year was found, otherwise
        ``"<title> [Illinois]"`` with ``year`` set to ``None``. ``year`` is a
        string when present.
    """
    def tidy(text):
        # Removing a word from the middle of a title leaves a double space;
        # collapse whitespace runs, then trim stray spaces and dashes at the ends.
        return re.sub(r'\s+', ' ', text).strip(' -')

    # Order matters: the longer phrases must go before the bare word.
    for pattern in (r'\bin Illinois\b', r'\bIllinois, \b', r'\bIllinois\b'):
        title = re.sub(pattern, '', title)
    title = tidy(title)

    year_match = re.search(r'\b(\d{4})\b', title)
    if year_match is None:
        return f"{title} [Illinois]", None

    year = year_match.group(1)
    # Every standalone four-digit number is dropped, not only the first match.
    title = tidy(re.sub(r'\b\d{4}\b', '', title))
    return f"{title} [Illinois] {{{year}}}", year

# Expand each (title, year) pair into two columns and attach them to the frame.
title_year_pairs = df['Alternative Title'].apply(
    lambda alt_title: pd.Series(transform_title_and_extract_year(alt_title))
)
df[['Title', 'Temporal Coverage']] = title_year_pairs

### Adjust spelling of ISO Topics and determine Resource Class

In [6]:
# Conversion table from the clearinghouse theme headings to ISO topic names.
# A value may hold several topics separated by '|'.
subject_sm_mapping = {
    "Climate": "Climate",
    "Coastal": "Environment|Inland Waters",
    "Elevation": "Elevation",
    "Geology": "Geology",
    "Hydrology": "Inland Waters",
    "Imagery": "Imagery",
    "Infrastructure": "Boundaries",
    "Landcover": "Land Cover",
    "Reference": "Boundaries"
    }


def convert_and_join(row):
    """Translate a row's pipe-separated ``Keyword`` value into ISO topics.

    Each keyword is stripped of surrounding whitespace and looked up in
    ``subject_sm_mapping``; keywords with no mapping are dropped. Topics are
    emitted once each, in first-seen order.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'Keyword'`` (e.g. a DataFrame row).

    Returns
    -------
    str
        Topics joined with ``'|'``, or ``''`` when the keyword is missing
        (NaN/None) or nothing maps.
    """
    subject_values = row['Keyword']
    if pd.isna(subject_values):
        return ''

    converted_values = []
    for value in subject_values.split('|'):
        # Scraped headings can carry stray whitespace that would miss the lookup.
        mapped = subject_sm_mapping.get(value.strip())
        if mapped is None:
            continue
        # A mapped value can itself hold several topics; split so duplicates
        # such as "Boundaries|Boundaries" are not emitted.
        for topic in mapped.split('|'):
            if topic not in converted_values:
                converted_values.append(topic)
    return '|'.join(converted_values)

# Build the "Theme" column by converting each row's keywords.
df['Theme'] = df.apply(convert_and_join, axis='columns')

# Resource Class is decided from the source keyword: only an exact
# 'Imagery' keyword is classed as Imagery, everything else as Datasets.
df['Resource Class'] = [
    'Imagery' if keyword == 'Imagery' else 'Datasets'
    for keyword in df['Keyword']
]


### Append default values

In [7]:
df['Code'] = '02a-01'
df['Access Rights'] = 'Public'
df['Accrual Method'] = 'HTML'
df['Date Accessioned'] = time.strftime('%Y-%m-%d')
df['Language'] = 'eng'
df['Is Part Of'] = '02a-01'
df['Member Of'] = 'ba5cc745-21c5-4ae9-954b-72dd8db6815a'
df['Provider'] = 'Illinois Geospatial Data Clearinghouse'
df['Identifier'] = df['Information']
df['Format'] = 'File'
df['Bounding Box'] = '-91.51,36.97,-87.02,42.51'

## Part 3: Print to CSV

In [8]:
# Stamp the output file name with today's date (YYYYMMDD) and write the frame
# without its index column.
actionDate = time.strftime('%Y%m%d')
output_name = '02a-01_' + actionDate + '.csv'
df.to_csv(output_name, index=False)