# Harvest from UMedia (05d-01)

Purpose: This recipe harvests items from the [University of Minnesota's UMedia Digital Library](https://umedia.lib.umn.edu).

## Import modules 

In [None]:
import json 
import re
import requests
import time
import csv 
import urllib.request 
import pandas as pd
import numpy as np

In [None]:
import sys
sys.path.append('../')
from data_processing_utils import load_language_mapping, convert_languages_to_iso, clean_complex_fields, replace_with_pipes

## Declare variables

In [None]:
# CHANGE IF NEEDED: locations of the files this recipe writes.

# CSV file that receives the parsed metadata table
csv_file_path = 'east-asian.csv'

# JSON file that stores the raw records downloaded from the API
json_file_path = 'east-asian.json'

In [None]:
# Endpoint that the harvest requests are sent to
base_url = 'https://umedia.lib.umn.edu/search.json'

# Query parameters sent with every request.
# Change these to match the facets you want to harvest.
params = {}
params['facets[contributing_organization_name_s][]'] = 'University of Minnesota Libraries, John R. Borchert Map Library.'
# params['facets[super_collection_name_ss][]'] = 'Revealing Bound Maps'

# Upper bound on the number of records to download; change if needed
max_items = 10000

## Part 1: Download the metadata JSON

In [None]:
def fetch_metadata(base_url, params, max_items, page_size=20, timeout=60, session=None):
    """Download paginated JSON records until the results run out or a cap is hit.

    Each request sends ``params`` plus a ``page`` counter. Paging stops when a
    page comes back empty, when a page holds fewer than ``page_size`` records
    (taken to be the last page), or once ``max_items`` records are collected.

    Args:
        base_url: URL of the JSON search endpoint.
        params: Query parameters sent with every request. Not modified.
        max_items: Maximum number of records to return.
        page_size: Number of records a full page is expected to contain.
            Defaults to 20; adjust if the API pages differently.
        timeout: Seconds to wait for each response before giving up.
        session: Optional object exposing a ``get(url, params=..., timeout=...)``
            method (for example a ``requests.Session``). When omitted, the
            module-level ``requests`` functions are used.

    Returns:
        A list with at most ``max_items`` records, in the order received.

    Raises:
        requests.HTTPError: If a response carries an HTTP error status.
    """
    http = session if session is not None else requests
    items = []
    # NOTE(review): paging starts at 0, as before — confirm this matches the
    # API's first page index.
    page = 0

    while len(items) < max_items:
        current_params = dict(params)
        current_params['page'] = page

        response = http.get(base_url, params=current_params, timeout=timeout)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page as data.
        response.raise_for_status()
        batch = response.json()

        if not batch:
            break

        # Keep the page even when it crosses the cap; the final slice trims the
        # surplus, so the caller gets up to max_items rather than losing the
        # whole last page.
        items.extend(batch)

        if len(batch) < page_size:
            break

        page += 1

    return items[:max_items]



# Download the records that match the configured query
metadata_items = fetch_metadata(base_url, params, max_items)

# Write the records to disk as indented UTF-8 JSON, keeping non-ASCII characters as-is
with open(json_file_path, mode='w', encoding='utf-8') as json_out:
    json.dump(metadata_items, json_out, ensure_ascii=False, indent=4)

print(f"Data successfully saved to {json_file_path}")



## Part 2: Parse the metadata

In [None]:
# Timestamps taken when this cell runs: year-month, and full year-month-day
dateEnd = time.strftime('%Y-%m')
actionDate = time.strftime('%Y-%m-%d')

# List of field names (not referenced by the other cells shown in this notebook)
fieldnames = [
    'Title',
    'Alternative Title',
    'Description',
    'notes',
    'dimensions',
    'scale',
    'Language',
    'Creator',
    'Publisher',
    'Resource Type',
    'Keyword',
    'Date Issued',
    'Temporal Coverage',
    'Date Range',
    'Information',
    'Download',
    'Image',
    'Manifest',
    'Identifier',
    'ID',
    'Access Rights',
    'Provider',
    'Code',
    'Is Part Of',
    'Member Of',
    'Accrual Method',
    'Date Accessioned',
    'Rights',
    'Resource Class',
    'Format',
    'Date Added',
    'Collection Name',
    'Set',
    'City',
    'State',
    'Country',
    'Continent',
    'Region',
    'coordinates',
]

In [None]:
# Load the JSON content into a DataFrame
# NOTE(review): pd.read_json infers column dtypes by default, so values that
# look numeric may not come back as strings — confirm this is acceptable for
# the ID-like and date columns used below.
df = pd.read_json(json_file_path)

In [None]:
# Start from an empty frame; columns are added one at a time, which fixes their order
out_df = pd.DataFrame()

## copy fields that are read from df without a fallback
for target_column, source_column in (
    ('Title', 'title'),
    ('Alternative Title', 'title'),
    ('Publisher', 'publisher'),
):
    out_df[target_column] = df[source_column]

## join the values of each entry into a single delimited string
out_df['Keyword'] = df['subject'].str.join('|')
out_df['Date Issued'] = df['date_created'].str.join(';')


## extract content with variable presence

def _optional_column(frame, column, joiner=None):
    """Return ``frame[column]``, or ``''`` when the column cannot be used.

    Args:
        frame: DataFrame to read from.
        column: Name of the column to look up.
        joiner: When given, each entry is joined with this delimiter via
            ``.str.join``.

    Returns:
        The (optionally joined) column, or an empty string if the column is
        missing or does not support the ``.str`` accessor.
    """
    try:
        values = frame[column]
        if joiner is not None:
            values = values.str.join(joiner)
    except (KeyError, AttributeError):
        # KeyError: the harvest has no such column.
        # AttributeError: the column exists but is not string-like (.str is unavailable).
        # Anything else (including KeyboardInterrupt) now propagates instead of
        # being hidden by a bare ``except``.
        return ''
    return values


out_df['Creator'] = _optional_column(df, 'creator', joiner='|')
out_df['Provider'] = _optional_column(df, 'contributing_organization')
out_df['Rights'] = _optional_column(df, 'local_rights')
out_df['Identifier'] = _optional_column(df, 'persistent_url')

## These need to be manually checked
out_df['coordinates'] = _optional_column(df, 'coordinates', joiner='|')
        
    
## construct links
out_df['Information'] = 'https://umedia.lib.umn.edu/item/' + df['id']

# Pieces shared by the download and manifest URLs
collection_ids = df['set_spec']
parent_ids = df['parent_id'].astype(str)

download_prefix = 'http://cdm16022.contentdm.oclc.org/utils/getfile/collection/'
download_suffix = '/filename/print/page/download/fparams/forcedownload'
out_df['Download'] = download_prefix + collection_ids + '/id/' + parent_ids + download_suffix

out_df['B1G Image'] = df['thumb_url']

manifest_prefix = 'https://cdm16022.contentdm.oclc.org/iiif/info/'
out_df['Manifest'] = manifest_prefix + collection_ids + '/' + parent_ids + '/manifest.json'

out_df['ID'] = df['id']


## some hard-coded fields
# Every row receives the same value; dict order determines the column order.
constant_fields = {
    'Resource Type': '',
    'Code': '05d-01',
    'Is Part Of': '05d-01',
    'Member Of': '64bd8c4c-8e60-4956-b43d-bdc3f93db488',
    'Accrual Method': 'JSON API',
    'Access Rights': 'Public',
    'Date Accessioned': actionDate,
    'Resource Class': 'Maps',
    'Format': 'JPEG',
}
for column_name, constant_value in constant_fields.items():
    out_df[column_name] = constant_value


## useful info that is not part of the BTAA Metadata Profile
out_df['Date Added'] = df['date_added']

# Optional columns: only a missing column (KeyError) triggers the blank
# fallback, so unrelated failures are not silently swallowed by a bare except.
try:
    out_df['Collection Name'] = df['collection_name']
except KeyError:
    out_df['Collection Name'] = ''

try:
    out_df['Set'] = df['set_spec']
except KeyError:
    out_df['Set'] = ''

In [None]:
def create_date_range(date_str):
    """Turn a whitespace-separated list of years into a ``"start-end"`` range.

    Args:
        date_str: A string such as ``"1850 1860"`` or ``"1900"``. A bare number
            (e.g. ``1850`` or ``1850.0``) is treated as a single year, since a
            column of bare years can be loaded as numbers rather than strings.
            Any other type (``None``, booleans, ...) is treated as missing.

    Returns:
        ``"<earliest>-<latest>"`` built from the years found (a single year is
        used for both ends), or ``""`` when no usable year is present.
    """
    import numbers  # local import keeps this cell self-contained

    if isinstance(date_str, str):
        tokens = date_str.split()
    elif isinstance(date_str, numbers.Real) and not isinstance(date_str, bool):
        tokens = [date_str]
    else:
        return ""

    years = []
    for token in tokens:
        try:
            years.append(int(token))
        except (ValueError, OverflowError):
            # Skip anything that is not a whole number (text, NaN, infinity)
            # rather than aborting the whole harvest on one malformed value.
            continue

    # Empty or whitespace-only input, or no numeric token at all
    if not years:
        return ""

    # min == max for a single year, so one expression covers both cases
    return f"{min(years)}-{max(years)}"


# Apply the function to create the 'Date Range' column
# (each 'date_created_sort' value is passed individually to create_date_range)
out_df['Date Range'] = df['date_created_sort'].apply(create_date_range)


In [None]:
# Columns whose values are passed through clean_complex_fields
complex_columns = ['language']

# Overwrite each listed column in df with its cleaned version, one value at a time
for column_name in complex_columns:
    cleaned_values = df[column_name].apply(clean_complex_fields)
    df[column_name] = cleaned_values

In [None]:
columns_to_pipe = ['country', 'state', 'city', 'region', 'continent']

# Apply the function to each column in the list
for col in columns_to_pipe:
    if col in df.columns:
        df[col] = replace_with_pipes(df[col])
    else:
        # A harvest may lack one of these fields entirely. The following cells
        # read df['state'], df['country'] and df['city'] unconditionally, so
        # create a blank column here instead of failing with a KeyError.
        df[col] = ''

In [None]:
## update values for cities to match FAST format

# Step 1: Take the first pipe-separated state (blank when there is none),
# then fall back to the row's 'country' wherever that first state is blank
first_state = df['state'].str.split('|').str[0].fillna('')
df['state_or_country'] = first_state
df['state_or_country'] = df.apply(
    lambda record: record['state_or_country'] if record['state_or_country'] != '' else record['country'],
    axis=1,
)

# Step 2: Concatenate 'state_or_country' with each city in the 'city' column
def concatenate_location_city(row):
    """Prefix each city in a row with its state or country (``Place--City``).

    Args:
        row: Mapping-like record with a pipe-separated ``'city'`` value and a
            ``'state_or_country'`` value.

    Returns:
        The cities joined with ``|``, each written as ``"<prefix>--<city>"``.
        When no prefix is available the bare city names are returned, and
        ``''`` is returned when ``'city'`` is null or empty.
    """
    cities = row['city']
    if pd.isnull(cities) or cities == '':
        return ''

    # A missing prefix would otherwise be formatted as "nan--City" or "--City".
    # When the prefix itself holds several pipe-separated values, keep only the
    # first so that the pipe keeps meaning "next city" in the result.
    prefix = row['state_or_country']
    prefix = '' if pd.isnull(prefix) else str(prefix).split('|')[0].strip()

    # Drop empty fragments (e.g. from a trailing pipe) so no "Prefix--" entries appear
    names = [name.strip() for name in cities.split('|')]
    names = [name for name in names if name]

    if not prefix:
        return '|'.join(names)
    return '|'.join(f"{prefix}--{name}" for name in names)

# Apply the function to each row to augment 'city' values
df['city'] = df.apply(concatenate_location_city, axis=1)

out_df['City'] = df['city']

# Optional geographic columns: fall back to a blank column only when the source
# column is absent (KeyError), rather than hiding every error with a bare except.
try:
    out_df['State'] = df['state']
except KeyError:
    out_df['State'] = ''

try:
    out_df['Country'] = df['country']
except KeyError:
    out_df['Country'] = ''

try:
    out_df['Region'] = df['region']
except KeyError:
    out_df['Region'] = ''

In [None]:
# Concatenate

def concatenate_fields(row, fields):
    """Join the non-empty values of several fields into one pipe-delimited string.

    Args:
        row: A pandas Series (one DataFrame row); fields are looked up in its index.
        fields: Field names to combine, in output order. Names that are not in
            the row are skipped.

    Returns:
        The values joined with ``|``. Each value is split on ``|``, every
        segment is stripped of surrounding whitespace, and empty segments and
        null values are dropped. List or tuple values contribute one entry
        per element.
    """
    segments = []
    for field in fields:
        if field not in row.index:
            continue
        value = row[field]
        # Treat list-like values element by element: pd.isnull on a whole list
        # returns an array, which cannot be used as a single truth value.
        parts = value if isinstance(value, (list, tuple)) else [value]
        for part in parts:
            if pd.isnull(part):
                continue
            segments.extend(piece.strip() for piece in str(part).split('|'))
    # Filtering after the split also removes blanks that sit between pipes
    return '|'.join(piece for piece in segments if piece)

# Source fields that are merged into the 'Description' column
fields_for_description = ['description', 'notes', 'dimension', 'scale']

# Source fields that are merged into the 'Spatial Coverage' column
fields_for_spatial_coverage = ['state', 'city', 'region', 'country', 'continent']

# Build both columns row by row; the field list is handed to
# concatenate_fields as its second positional argument
out_df['Description'] = df.apply(concatenate_fields, axis=1, args=(fields_for_description,))
out_df['Spatial Coverage'] = df.apply(concatenate_fields, axis=1, args=(fields_for_spatial_coverage,))

In [None]:
# Load the mapping that convert_languages_to_iso uses
lang_to_iso = load_language_mapping('../language-vocabulary.csv')

# Keep the language values as-is in one column and the converted values in another
language_values = df['language']
out_df['Text Language'] = language_values
out_df['Language'] = language_values.apply(lambda value: convert_languages_to_iso(value, lang_to_iso))

In [None]:
# Write out_df to disk as CSV; index=False leaves the DataFrame index out of the file
out_df.to_csv(path_or_buf=csv_file_path, index=False)

print(f"Data successfully saved to {csv_file_path}")