In [4]:
# requirements
# beautifulsoup4==4.4.1 USE 'conda install -c anaconda beautifulsoup4'
# lxml==3.5.0
# python-ntlm3==1.0.2
# requests==2.9.1
# requests-ntlm==0.2.0 (not available from conda 4/28/2016, USE pip instead - 'pip install requests_ntlm')
# six==1.10.0
# wheel==0.29.0
# sciencebasepy USE 'pip install sciencebasepy'
# xmltodict USE 'pip install xmltodict'

from bs4 import BeautifulSoup
import requests

from requests_ntlm import HttpNtlmAuth
import sciencebasepy as pysb
import time
import os
import subprocess
import re
from getpass import getpass
import getpass
import json
from urllib.request import urlopen
import xmltodict
from IPython.core.display import display, HTML
import smtplib

import pandas as pd
import numpy as np

import warnings
# Install a blanket 'ignore' filter: no category or module restriction is given,
# so every warning raised after this point in the session is suppressed.
warnings.filterwarnings('ignore')

# Expose the built-in `input` under the legacy name `raw_input`.
# `input` is always defined as a built-in, so no NameError guard is needed.
raw_input = input

In [10]:
# get list of all public data releases
# Module-level ScienceBase session used by the functions and cells below.
# It is created with no arguments and no login call appears in the visible cells.
sb = pysb.SbSession()
def get_dr():
    """Query ScienceBase for items whose system type is "Data Release".

    The search asks for up to 1000 items per response and only the ``id`` and
    ``title`` fields.

    Returns:
        The response object produced by ``sb.find_items`` for this query.
    """
    search_params = dict(
        max=1000,
        filter0="systemType=Data Release",
        fields="id, title",
    )
    return sb.find_items(search_params)

def create_item_list(response):
    """Flatten a chain of search responses into one list of items.

    Args:
        response: A search response (a mapping with an ``'items'`` entry), or a
            falsy value / mapping without ``'items'`` to get an empty list.

    Returns:
        A list holding every element of ``'items'`` from ``response`` and from
        each follow-up response obtained through ``sb.next``.
    """
    collected = []
    page = response
    while True:
        # Stop as soon as there is no usable page left.
        if not page or 'items' not in page:
            return collected
        collected.extend(page['items'])
        page = sb.next(page)

# Fetch every data-release record, then keep only the ScienceBase IDs.
item_list = create_item_list(get_dr())
id_list = [release['id'] for release in item_list]

print(f"number of data releases: {len(id_list)}")

number of data releases: 3452


In [11]:
# get list of all public data release landing pages + child items
# Each landing-page ID is followed by the IDs that sb.get_ancestor_ids returns
# for it. NOTE(review): those are treated here as the item's descendants
# (child items) - confirm against the sciencebasepy documentation.

complete_id_list = []

# The loop variable is `release_id` rather than `id`: a module-level variable
# named `id` stays bound after the loop and hides the built-in id() in every
# later cell.
for release_id in id_list:
    complete_id_list.append(release_id)
    complete_id_list.extend(sb.get_ancestor_ids(release_id))

print("number of landing pages + child items: " + str(len(complete_id_list)))

number of landing pages + child items: 15492


In [12]:
# Echo the collected IDs for inspection.
# NOTE(review): this displays the entire list; a slice such as
# complete_id_list[:10] would keep the saved output short.
complete_id_list

['59a6fa44e4b0fd9b77cf6b4e',
 '5af4a5dee4b0da30c1b44ebd',
 '5b47d429e4b060350a176fc9',
 '5b75b428e4b0f5d5787fe815',
 '59e7a4fbe4b05fe04cd3a311',
 '597b94e2e4b0a38ca27565b0',
 '59937babe4b02da3062a17bf',
 '59937ce4e4b02da3062a17c3',
 '59937af8e4b02da3062a17ba',
 '59937d9ae4b02da3062a17c5',
 '59937c68e4b02da3062a17c1',
 '59937b71e4b02da3062a17bd',
 '5a006f5ee4b0531197b5a7aa',
 '56f98817e4b0a6037df06b8c',
 '5aa0648ee4b0b1c392e6c0cf',
 '5bd0de20e4b0b3fc5ce15cbd',
 '5bd0dddde4b0b3fc5ce15cb9',
 '5a281835e4b03852bafe0fc0',
 '5c9d1887e4b0b8a7f62e0aa5',
 '5ca37bb7e4b0b8a7f6333f36',
 '5ca37b5fe4b0b8a7f6333f2f',
 '5ca37c9ce4b0b8a7f6333f40',
 '5ca37c08e4b0b8a7f6333f38',
 '5ae9d2c4e4b0860c0f6f3689',
 '5af4aad8e4b0da30c1b44ee9',
 '5a2032eae4b09fc93ddbad12',
 '5abe8052e4b081f61ac14d0b',
 '586e9072e4b0f5ce109fccf1',
 '5a906bd6e4b069906067062e',
 '5aa6be45e4b0b1c392ed671f',
 '5ab2cc1fe4b081f61ab46269',
 '5ab2cc56e4b081f61ab4626e',
 '558194c3e4b023124e8f0b10',
 '5aa822fbe4b0b1c392ef3276',
 '5a87249de4b0

In [13]:
# run code on the ids of all landing pages and child items (i.e., complete_id_list)
# Keeps only the items whose response has NO 'files' entry.
complete_list = []

for item_id in complete_id_list:
    # Request just the title and the attached files for this item.
    item = sb.get_item(item_id, {'fields': 'title,files'})

    # Explicit key test instead of wrapping item['files'] in a bare `except:`.
    # The selection is unchanged (items without 'files' are collected), but
    # unrelated errors and KeyboardInterrupt are no longer swallowed.
    if 'files' not in item:
        complete_list.append(item)

In [14]:
# Echo the collected item dicts for inspection.
# NOTE(review): this displays the entire list; a slice such as
# complete_list[:5] would keep the saved output short.
complete_list

[{'link': {'rel': 'self',
   'url': 'https://www.sciencebase.gov/catalog/item/597b94e2e4b0a38ca27565b0'},
  'relatedItems': {'link': {'url': 'https://www.sciencebase.gov/catalog/itemLinks?itemId=597b94e2e4b0a38ca27565b0',
    'rel': 'related'}},
  'id': '597b94e2e4b0a38ca27565b0',
  'title': 'Concentrations of tetrachloroethylene in tree-core and passive soil-gas samples and interpolated tetrachloroethylene soil data at the Vienna Wells site, Maries County, Missouri, 2011-2016'},
 {'link': {'rel': 'self',
   'url': 'https://www.sciencebase.gov/catalog/item/59937babe4b02da3062a17bf'},
  'relatedItems': {'link': {'url': 'https://www.sciencebase.gov/catalog/itemLinks?itemId=59937babe4b02da3062a17bf',
    'rel': 'related'}},
  'id': '59937babe4b02da3062a17bf',
  'title': 'Concentrations of tetrachloroethylene in tree-core samples from tree 29 at the Vienna Wells site on May 21, 2015, Vienna, Missouri.'},
 {'link': {'rel': 'self',
   'url': 'https://www.sciencebase.gov/catalog/item/59937ce4

In [None]:
# Number of items collected in complete_list.
len(complete_list)

In [None]:
# Define - create dataframe for data releases

def make_clickable(val):
    """Turn a ``'<name>#<url>'`` string into an HTML link that opens in a new tab.

    Args:
        val: Text of the form ``name#url`` (the shape ``create_dr_df`` stores in
            ``title_hyperlink``), or an empty string.

    Returns:
        ``''`` when ``val`` is empty; otherwise an ``<a>`` element whose href is
        the URL and whose text is the name. A value containing no ``#`` is
        returned as HTML-escaped plain text instead of raising.
    """
    from html import escape  # local import keeps this cell self-contained

    if val == '':
        return ''

    # Split on the LAST '#': a title may itself contain '#', whereas the catalog
    # URL appended by create_dr_df is not expected to.
    name, sep, url = val.rpartition('#')
    if not sep:
        return escape(val, quote=False)

    # Escape both parts so characters such as '&' or '<' in a title cannot
    # break or inject markup; quotes are only escaped inside the attribute.
    return f'<a href="{escape(url, quote=True)}" target="_blank">{escape(name, quote=False)}</a>'

def create_dr_df(complete_list):
    """Build a two-column DataFrame from ScienceBase item dicts.

    Args:
        complete_list: Iterable of dicts, each with an ``'id'`` and a
            ``'title'`` key.

    Returns:
        A DataFrame with one row per item and the columns ``itemID`` (the item
        id) and ``title_hyperlink`` (``'<title>#<catalog url>'``, the format
        ``make_clickable`` consumes). An empty input yields an empty DataFrame
        that still has both columns.

    Raises:
        KeyError: If an item lacks ``'id'`` or ``'title'``.
    """
    base_url = 'https://www.sciencebase.gov/catalog/item/'

    records = [
        {
            'itemID': item['id'],
            'title_hyperlink': item['title'] + '#' + base_url + item['id'],
        }
        for item in complete_list
    ]

    # Name the columns explicitly so downstream code that looks up
    # 'title_hyperlink' also works when there are no records.
    return pd.DataFrame(records, columns=['itemID', 'title_hyperlink'])

In [None]:
# create dataframe
# One row per item in complete_list, with the columns itemID and title_hyperlink.

df = create_dr_df(complete_list)

In [None]:
# display dataframe with clickable URLs
# The Styler applies make_clickable to each title_hyperlink cell for display only;
# the values stored in df keep their 'title#url' text.

display(df.style.format({'title_hyperlink': make_clickable}))

In [None]:
# save dataframe as .csv
# Written relative to the current working directory, without the index column.

df.to_csv('metadata_output_complete.csv', index=False)

In [None]:
# Save a second copy without the index column. `index` must be the boolean
# False: the string 'false' is truthy, so it would write the index.
df.to_csv('public_data_releases.csv', index=False)