## Introduction

This script runs the regular re-accession for CKAN portals. CKAN portals are updated less frequently than DCAT portals, so we typically run the script every three months.


> Originally created by Yijing Zhou (@YijingZhou33) and Ziying Cheng (@Ziiiiing)

> Updated January 15, 2021                           
> Updated by Ziying Cheng (@Ziiiiing)

> Updated July 05, 2021                           
> Updated by Ziying Cheng (@Ziiiiing)

## Set up directories

Verify that you have the following files and folders in the same directory as this Notebook:

- `resource` folder stores the resource names collected from each portal at every re-accession. The newly collected list is compared with the most recent stored one to identify both the created and the deleted datasets.
- `reports` folder stores the metadata for all **New** datasets in CSV files named `allNewItems_YYYYMMDD.csv`. **Deleted** datasets are likewise stored in CSV files named `allDeletedItems_YYYYMMDD.csv`.




## Import modules

In [None]:
import csv
import urllib.request
import json 
import time
import os
import pandas as pd
from html.parser import HTMLParser
import re
import ast
import decimal
import ssl
import sys
import numpy as np

In [None]:
# Auto-generate the current date as a 'YYYYMMDD' string (e.g. '20210705').
# time.strftime() without an explicit time tuple formats the local time.
actionDate = time.strftime('%Y%m%d')

In [None]:
import json
import os
import ssl
import urllib.request

# Specify the CKAN portal URL you want to harvest from
portalURL = "https://gisdata.mn.gov/"

# Construct the API URL for package search
packageURL = portalURL + 'api/3/action/package_search'

# Specify the number of items per page
items_per_page = 10

# Initialize variables for pagination
start = 0
total_results = 0

# List to store all metadata
all_metadata = []

# Build the SSL context once instead of once per page.
# NOTE(review): ssl._create_unverified_context() is a private helper that
# disables certificate verification, so the portal's identity is not checked.
# Prefer ssl.create_default_context() if the portal's certificate is valid.
context = ssl._create_unverified_context()

# Request metadata page by page until every item has been collected
while True:
    # Construct the API request URL with pagination parameters
    api_request_url = f"{packageURL}?start={start}&rows={items_per_page}"

    # Request metadata; the `with` block closes the HTTP connection even if
    # reading the body fails.
    with urllib.request.urlopen(api_request_url, context=context) as http_response:
        response = http_response.read()
    response_json = json.loads(response.decode('utf-8'))

    # Extract metadata from the response
    metadata = response_json['result']['results']
    all_metadata.extend(metadata)

    # Update pagination variables; 'count' is the total reported by the
    # server for the whole search, not the size of this page.
    start += items_per_page
    total_results = response_json['result']['count']

    # Break the loop if we have collected all items
    if start >= total_results:
        break

# Save the metadata to a local JSON file on your desktop
desktop_path = ""  # Replace with your desktop path
output_filename = "ckan_metadata.json"
# os.path.join inserts the path separator when desktop_path lacks a trailing
# one; with an empty desktop_path it yields just the file name.
output_path = os.path.join(desktop_path, output_filename)

with open(output_path, "w", encoding="utf-8") as json_file:
    json.dump(all_metadata, json_file, indent=4)

print(f"Metadata for {total_results} items saved to {output_path}")

In [None]:
import pandas as pd

# Read the harvested JSON file into a DataFrame.
# Use output_path (directory + file name), i.e. the exact location the
# metadata was written to; output_filename alone only resolves when the file
# was saved in the current working directory.
df = pd.read_json(output_path)

# Display the first few rows of the DataFrame
print(df.head())

In [None]:
# Function to process the resources
def process_resources(resources):
    """Map each resource's type to its URL for one dataset.

    Parameters
    ----------
    resources : list of dict
        The dataset's "resources" entries. Each entry is expected to carry
        "resource_type" and "url" keys.

    Returns
    -------
    pandas.Series
        Indexed by resource type, holding the matching URL. When several
        resources share a type, the last one wins. The Series is empty when
        ``resources`` is not a list (e.g. a missing value) or when no entry
        has a resource type.
    """
    result = {}
    # Same guard as flatten_groups / flatten_tags: a dataset without
    # resources may show up as a non-list value rather than an empty list.
    if not isinstance(resources, list):
        return pd.Series(result, dtype=object)
    for resource in resources:
        if not isinstance(resource, dict):
            continue
        resource_type = resource.get("resource_type")
        # Skip entries with no type: there is no column name to file them under.
        if resource_type is None:
            continue
        result[resource_type] = resource.get("url", "")
    # dtype=object keeps the dtype stable when the mapping is empty.
    return pd.Series(result, dtype=object)

# Apply the function to create the resource columns.
# process_resources returns a pandas Series for each row, so apply() expands
# the results into a DataFrame with one column per resource type.
resource_columns = df['resources'].apply(process_resources)

# Flatten the nested dictionaries within the "extras" array
def flatten_extras(extras):
    """Collapse a list of ``{"key": ..., "value": ...}`` items into one dict.

    Parameters
    ----------
    extras : list of dict
        The dataset's "extras" entries; each must have "key" and "value".

    Returns
    -------
    dict
        Maps every item's "key" to its "value". A later item overwrites an
        earlier one with the same key. Returns an empty dict when ``extras``
        is not a list (e.g. a missing value), mirroring the guard used by
        flatten_groups and flatten_tags.
    """
    if not isinstance(extras, list):
        return {}
    return {item["key"]: item["value"] for item in extras}

# Turn each row's "extras" list into a single key -> value dictionary and
# keep it on the DataFrame.
extras_as_dicts = df["extras"].apply(flatten_extras)
df["flattened_extras"] = extras_as_dicts

# Expand those dictionaries so that every extras key becomes its own column
flattened_extras_df = extras_as_dicts.apply(pd.Series)

# Flatten the "groups" dictionaries within the "groups" array
def flatten_groups(groups):
    """Join the "display_name" of every group with "|".

    Returns an empty string when ``groups`` is not a list.
    """
    if not isinstance(groups, list):
        return ""
    display_names = (entry["display_name"] for entry in groups)
    return "|".join(display_names)

# One "|"-separated string of group display names per dataset
df["group_titles"] = df["groups"].map(flatten_groups)

# Flatten the "tags" dictionaries within the "tags" array
def flatten_tags(tags):
    """Join the "display_name" of every tag with "|".

    Returns an empty string when ``tags`` is not a list.
    """
    labels = []
    if isinstance(tags, list):
        for tag_entry in tags:
            labels.append(tag_entry["display_name"])
    return "|".join(labels)
# One "|"-separated string of tag display names per dataset
df["tag_titles"] = df["tags"].map(flatten_tags)

# Pull the "title" out of each "organization" dictionary; anything that is
# not a dictionary becomes an empty string.
df["organization_title"] = df["organization"].map(
    lambda org: org["title"] if isinstance(org, dict) else ""
)

# Select the columns you want to keep from the original DataFrame
selected_columns = ["title", "license_title", "id", "type", "name", "notes"]

# Select the columns you want to keep from the flattened_extras DataFrame
selected_extras_columns = ["dsAccessConst", "dsCurrentRef", "dsMetadataUrl", "dsModifiedDate", "dsOriginator", "dsPeriodOfContent", "dsPurpose", "gdrsDsGuid", "spatial"]

# Combine the selected columns from the original, flattened_extras, and resource_columns DataFrames.
# reindex(columns=...) returns the listed columns in the listed order, exactly
# like df[...] when they all exist, but fills a column with NaN instead of
# raising KeyError when the harvested portal does not provide that field.
selected_df = pd.concat(
    [
        df.reindex(columns=selected_columns),
        flattened_extras_df.reindex(columns=selected_extras_columns),
        df["group_titles"],
        df["organization_title"],
        df["tag_titles"],
        resource_columns,
    ],
    axis=1,
)

# Specify the path for the CSV file
csv_file_path = "selected_ckan_metadata_with_groups.csv"  # Update with the desired path

# Write the selected DataFrame to a CSV file (without the row index)
selected_df.to_csv(csv_file_path, index=False)

print("Selected columns, flattened extras, group titles, tag titles, and resources have been written to CSV.")