## Purpose

This script will scan the DCAT API for the OpenDataPhilly data portal and return the metadata for all items as a CSV file in the GeoBTAA Metadata Application Profile. It will also create a secondary CSV file for the associated multiple downloads. 

## Import modules and declare file names.

In [None]:
# Standard libraries
import csv
import json
import time
import re


# Third-party libraries
import pandas as pd
import numpy as np

# Stamp the output file name with today's date in 'YYYYMMDD' format
action_date = time.strftime('%Y%m%d')
csv_file = f"{action_date}_08c-01-metadata.csv"

## Download the JSONs and merge them into a single pandas dataframe

In [None]:
import pandas as pd
import requests
import json


# URLs of the APIs
url1 = "https://opendataphilly.org/datasets.json"
url2 = "https://opendataphilly.org/data.json"

# Downloading JSON data.
# The timeout keeps the notebook from hanging forever on an unresponsive
# server, and raise_for_status() stops right away on an HTTP error instead of
# failing later with a confusing JSON-decoding or KeyError message.
response1 = requests.get(url1, timeout=60)
response1.raise_for_status()
data1 = response1.json()

response2 = requests.get(url2, timeout=60)
response2.raise_for_status()
data2 = response2.json()

# Load JSONs into pandas dataframes
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2['dataset'])  # Here's where we're targeting the "dataset" key

# Merge dataframes by matching the titles. The inner join keeps only titles
# present in both sources; column names shared by both frames (other than
# 'title') receive the pandas default '_x' / '_y' suffixes.
merged_df = pd.merge(df1, df2, on='title', how='inner')

In [None]:
# Map every source column we retain (key) to the name it carries in the
# output table (value); columns that are not listed here are left behind.
columns_to_keep_and_rename = dict(
    title="Alternative Title",
    organization="Creator",
    description="Description",
    category_x="Keyword",
    url="url",
    license="Rights",
    distribution="distribution",
)

# Pick out the listed columns, then apply the new names in one pass
df = merged_df[[*columns_to_keep_and_rename]].rename(
    columns=columns_to_keep_and_rename
)

## Analyze the distribution arrays

There are many datasets in this portal that have dozens of links with various types of formats listed in the distribution section. Since this array is fairly heterogeneous, we are only going to pull records that have Shapefile downloads.

In [None]:
# Optional section: This code gives you an idea of how many links and formats there are

# # Create a new column that counts the number of items in the 'distribution' column
# df['distribution_count'] = df['distribution'].apply(lambda x: len(x))

# # Initialize a dictionary to count the different formats
# format_counts = {}

# # Iterate through the 'distribution' column to collect formats
# for distribution in df['distribution']:
#     for item in distribution:
#         format_type = item['format']
#         format_counts[format_type] = format_counts.get(format_type, 0) + 1

# # Summary of distribution counts
# distribution_summary = df['distribution_count'].describe()
# print("Distribution count summary:")
# print(distribution_summary)

# # Summary of format counts
# print("\nFormat counts:")
# for format_type, count in format_counts.items():
#     print(f"{format_type}: {count}")


In [None]:
# Function to process the 'distribution' column
def process_distribution(row):
    """Attach the first usable Shapefile download link to a record.

    Scans the entries in ``row['distribution']`` for the first one whose
    ``format`` is ``"SHP"`` and whose ``downloadURL`` is not hosted on
    phl.carto.com, and stores it in ``row['Download']`` together with
    ``row['Format'] = "Shapefile"``.

    Args:
        row: A mapping-like record (pandas Series or dict) that has a
            'distribution' entry. It is modified in place.

    Returns:
        The same ``row``. 'Download' and 'Format' are always present
        afterwards; they are ``None`` when no usable Shapefile was found.
    """
    # Make sure both fields exist on every record. Otherwise, when no record
    # at all has a Shapefile, the resulting table has no 'Download' column and
    # the later dropna(subset=['Download']) fails with a KeyError.
    for field in ('Download', 'Format'):
        if field not in row:
            row[field] = None

    distributions = row['distribution']
    # A record without a distribution list (e.g. NaN) has nothing to scan
    if not isinstance(distributions, (list, tuple)):
        return row

    for item in distributions:
        # Skip malformed entries and anything that is not a Shapefile
        if not isinstance(item, dict) or item.get('format') != "SHP":
            continue
        download_url = item.get('downloadURL')
        if not isinstance(download_url, str):
            continue
        # Downloads from phl.carto.com are not working, so we will drop them
        if download_url.startswith("https://phl.carto.com"):
            continue
        row['Download'] = download_url
        row['Format'] = "Shapefile"
        break
    return row

# Run the Shapefile lookup on every record, one row at a time
df = df.apply(process_distribution, axis="columns")

# Keep only the records for which a usable Shapefile link was found
df = df.dropna(subset=['Download'])


### Use the 'url' value to create landing page, Identifier and ID

In [None]:
# Landing page: the base URL already ends with "/", and the 'url' values begin
# with "/datasets/", so strip the leading slash to avoid a double slash
# ("https://opendataphilly.org//datasets/...") in the link.
df['Information'] = "https://opendataphilly.org/" + df['url'].str.lstrip('/')
# Remove the beginning "/datasets/" (regex=False: treat it as literal text)
df['Identifier'] = df['url'].str.replace('/datasets/', '', regex=False)
df['Identifier'] = df['Identifier'].str.rstrip('/')  # Remove the trailing "/"
# Prefix the slug so the ID is recognizable as coming from this portal
df['ID'] = "opendataphilly_" + df['Identifier']

### If the creator is Philadelphia, reformat

In [None]:
# Swap the portal's organization name for the place-name form; every other
# creator value is left untouched.
creator_renames = {"City of Philadelphia": "Pennsylvania--Philadelphia"}
df['Creator'] = df['Creator'].replace(creator_renames)

### Get a date from the title if possible

In [None]:
# The function uses a regular expression to find the first standalone 4-digit number in the alternative title and returns it as the year (or None if there is none). The title itself is left unchanged.

import re

# Standalone 4-digit number (word boundaries on both sides), compiled once
_YEAR_PATTERN = re.compile(r'\b\d{4}\b')


def extract_year(row):
    """Return the first standalone 4-digit number in the alternative title.

    Args:
        row: A mapping-like record (pandas Series or dict) that may have an
            'Alternative Title' entry.

    Returns:
        The matched 4-digit string, or ``None`` when the title is missing,
        is not a string (e.g. NaN), or contains no such number.
    """
    alternative_title = row.get('Alternative Title', '')
    # A missing title can show up as NaN/None; searching it would raise
    if not isinstance(alternative_title, str):
        return None
    match = _YEAR_PATTERN.search(alternative_title)
    return match.group(0) if match else None

# Look up the year for every record, then store it both as the temporal
# coverage and as a single-year range of the form "YYYY-YYYY"
title_years = df.apply(extract_year, axis="columns")
df['Temporal Coverage'] = title_years
df['Date Range'] = title_years + "-" + title_years

### Parse keywords and add values to Theme

In [None]:
# Define the predefined list of terms
predefined_terms = [
    'Boundaries',
    'Economy',
    'Environment',
    'Transportation'
]

# Each rule: (substrings to look for in a tag, themes added when any is found).
# The order of the rules is the order in which themes are emitted for a tag.
_THEME_RULES = (
    (("Zoning", "Land Records", "Real Estate"), ("Property",)),
    (("Health",), ("Health",)),
    (("Arts",), ("Society",)),
    (("Budget",), ("Economy",)),
    (("Education",), ("Society",)),
    (("Elections",), ("Society", "Events")),
)


# Function to process the 'Keyword' column
def process_tags(row):
    """Derive the 'Theme' value from a record's keywords.

    A tag that exactly equals one of ``predefined_terms`` is used as a theme
    directly; in addition, tags containing certain substrings (see
    ``_THEME_RULES``) contribute the mapped themes. Each theme is emitted only
    once, in first-seen order.

    Args:
        row: A mapping-like record (pandas Series or dict) with a 'Keyword'
            entry holding a list of tag strings. A single string is treated
            as one tag; any other value (e.g. NaN) is treated as no tags.
            The record is modified in place.

    Returns:
        The same ``row`` with 'Theme' set to the pipe-joined themes and
        'Keyword' replaced by the pipe-joined tags.
    """
    tags = row['Keyword']
    if isinstance(tags, str):
        # Iterating a bare string would split it into single characters
        tags = [tags]
    elif not isinstance(tags, (list, tuple)):
        tags = []
    # Only text tags can be matched and joined
    tags = [tag for tag in tags if isinstance(tag, str)]

    theme_values = []
    for tag in tags:
        if tag in predefined_terms:
            theme_values.append(tag)
        for needles, themes in _THEME_RULES:
            if any(needle in tag for needle in needles):
                theme_values.extend(themes)

    # dict.fromkeys drops repeats while keeping first-seen order, so several
    # tags mapping to the same theme no longer yield e.g. "Society|Society"
    row['Theme'] = '|'.join(dict.fromkeys(theme_values))
    # Keep the original tag values, flattened to a pipe-delimited string
    row['Keyword'] = '|'.join(tags)
    return row

# Derive 'Theme' and flatten 'Keyword' for every record, one row at a time
df = df.apply(process_tags, axis="columns")

## Add default and constructed columns

In [None]:
# Constructed title: the portal's title followed by the place name in brackets
df['Title'] = df['Alternative Title'] + " [Pennsylvania--Philadelphia]"

# Constant values shared by every record harvested from this portal
df['Resource Class'] = "Datasets"
df['Date Accessioned'] = action_date
# Was "Pennsylvania--Pennsylvania", which is not a place name; the title
# suffix, the Creator value and the bounding box below all refer to
# Philadelphia.
df['Spatial Coverage'] = "Pennsylvania--Philadelphia"
df['Code'] = "08c-01"
df['Is Part Of'] = "08c-01"
df['Member Of'] = "ba5cc745-21c5-4ae9-954b-72dd8db6815a"
df['Accrual Method'] = "JKAN"
df['Access Rights'] = "Public"
df['Language'] = "eng"
df['Provider'] = "OpenDataPhilly"
df['Bounding Box'] = "-75.280298,39.867005,-74.955833,40.137959"
df['Display Note'] = "Tip: This dataset was automatically cataloged from the OpenDataPhilly data portal. Click the View Source button to see more formats and layers associated with this record."

In [None]:
# Columns of the output file, in the order they should appear
desired_order = [
    'Title',
    'Alternative Title',
    'Description',
    'Language',
    'Display Note',
    'Creator',
    'Provider',
    'Resource Class',
    'Theme',
    'Temporal Coverage',
    'Date Range',
    'Spatial Coverage',
    'Bounding Box',
    'Member Of',
    'Is Part Of',
    'Format',
    'Information',
    'Download',
    'ID',
    'Identifier',
    'Rights',
    'Access Rights',
    'Date Accessioned',
    'Code',
    'Accrual Method',
]

# Arrange the table to match the list above. Columns that are not listed
# (such as 'url' and 'distribution') are dropped by this step.
# NOTE(review): 'Keyword' is not listed either, so the pipe-joined keywords
# are dropped here as well -- confirm that this is intended.
df = df.reindex(desired_order, axis="columns")


In [None]:
# Write the finished table to the dated CSV file: no index column, and every
# non-numeric value wrapped in quotes
df.to_csv(
    path_or_buf=csv_file,
    index=False,
    quoting=csv.QUOTE_NONNUMERIC,
)

In [None]:
# Apply .str.strip() method to all string columns in the DataFrame and replace newline and tab characters
# df = df.apply(lambda x: x.str.replace('\n', ' ').str.replace('\t', ' ').str.replace('<br/>', ' ').str.replace('<br/><br/>', '|').str.strip() if x.dtype == "object" else x)