# SHARE Query Requests from the Community

This notebook collects reusable queries that answer common questions from members of the SHARE community.

## Setup

In [1]:
import json

import pandas as pd
import requests

# Endpoints of the SHARE API hosted on the OSF.
SHARE_SEARCH_API = 'https://osf.io/api/v1/share/search/'
SHARE_PROVIDERS_API = 'https://osf.io/api/v1/share/providers/'

# Fetch the provider metadata once, up front. The timeout keeps a stalled
# connection from hanging the kernel, and raise_for_status() surfaces an HTTP
# error here instead of as a confusing JSON/KeyError failure further down.
_provider_response = requests.get(SHARE_PROVIDERS_API, timeout=30)
_provider_response.raise_for_status()

# Keyed by provider short name; each value holds that provider's metadata,
# of which get_longname_for_shortname reads the 'long_name' entry.
ALL_PROVIDER_INFO = _provider_response.json()['providerMap']

def query_share(url, query, verify=True, timeout=30):
    """POST ``query`` as a JSON body to ``url`` and return the decoded JSON reply.

    Args:
        url: Endpoint to post to (for example ``SHARE_SEARCH_API``).
        query: JSON-serializable request body.
        verify: Passed through to ``requests.post``. TLS certificates are
            verified by default; pass ``False`` only for a host whose
            certificate is known to be broken.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body parsed from JSON.

    Raises:
        requests.exceptions.HTTPError: the server answered with a 4xx/5xx status.
        requests.exceptions.RequestException: connection problems or a timeout.
    """
    headers = {'Content-Type': 'application/json'}
    data = json.dumps(query)
    response = requests.post(url, headers=headers, data=data, verify=verify, timeout=timeout)
    # Fail loudly on an error status rather than handing back an error payload
    # that callers would immediately index into.
    response.raise_for_status()
    return response.json()

def get_longname_for_shortname(shortname):
    """Return the ``long_name`` recorded for a provider short name.

    Args:
        shortname: Provider key in ``ALL_PROVIDER_INFO``. UTF-8 encoded bytes
            are accepted as well and are decoded before the lookup.

    Returns:
        The provider's ``long_name``, or ``None`` when the short name is unknown.
    """
    if isinstance(shortname, bytes):
        # Keys are compared as text; 'replace' keeps undecodable input from
        # raising, so it simply falls through to the "unknown" result.
        shortname = shortname.decode('utf-8', 'replace')
    # Direct dictionary lookup instead of scanning every key for a match.
    provider = ALL_PROVIDER_INFO.get(shortname)
    if provider is None:
        return None
    return provider['long_name']
    

## Queries

In [2]:
# What's the earliest and latest document from each source?

# One bucket per source (the "_type" field), each carrying min/max statistics
# over that source's providerUpdatedDateTime values.
# NOTE(review): "size": 0 presumably asks for every bucket rather than a
# top-N list - confirm against the Elasticsearch version behind the API.
date_stats_agg = {
    "aggregations": {
        "sources": {
            "terms": {"field": "_type", "size": 0},
            "aggregations": {
                "source_stats": {
                    "stats": {"field": "providerUpdatedDateTime"}
                }
            }
        }
    }
}

date_results = query_share(SHARE_SEARCH_API, date_stats_agg)['aggregations']['sources']['buckets']

# Build every row first and construct the frame once. Keys and names stay as
# text: encoding them to bytes breaks the provider lookup on Python 3 and
# fails outright when a source has no known long name (None has no .encode).
# .get() is used for the formatted dates in case a bucket comes back without them.
date_results_df = pd.DataFrame(
    [
        {
            'source_shortname': bucket['key'],
            'source_longname': get_longname_for_shortname(bucket['key']),
            'earliest_date': bucket['source_stats'].get('min_as_string'),
            'latest_date': bucket['source_stats'].get('max_as_string'),
        }
        for bucket in date_results
    ],
    columns=['source_shortname', 'source_longname', 'earliest_date', 'latest_date'],
)
date_results_df



Unnamed: 0,source_shortname,source_longname,earliest_date,latest_date
0,datacite,DataCite MDS,2015-07-26T00:03:30.000Z,2016-04-26T01:59:00.000Z
1,crossref,CrossRef,2014-08-03T00:00:00.000Z,2016-04-28T00:00:00.000Z
2,figshare,figshare,2014-10-28T00:00:00.000Z,2016-04-28T17:21:00.000Z
3,pubmedcentral,PubMed Central,2014-12-28T00:00:00.000Z,2016-04-28T00:00:00.000Z
4,dataone,DataONE: Data Observation Network for Earth,2015-04-11T00:00:00.000Z,2016-04-28T00:00:00.000Z
5,arxiv_oai,ArXiv,2014-10-03T00:00:00.000Z,2016-04-28T00:00:00.000Z
6,scitech,DoE's SciTech Connect Database,2014-10-03T00:00:00.000Z,2016-04-28T00:00:00.000Z
7,rcaap,RCAAP - Repositório Científico de Acesso Abert...,2015-12-27T02:00:54.000Z,2016-04-28T04:34:54.000Z
8,citeseerx,CiteSeerX Scientific Literature Digital Librar...,2008-07-01T00:00:00.000Z,2016-04-28T00:00:00.000Z
9,cyberleninka,CyberLeninka - Russian open access scientific ...,2015-12-22T00:00:00.000Z,2016-04-27T19:10:35.000Z


In [3]:
# Set SAVE_TO_FILES to True when running locally to export the table to CSV
# and Excel files in the current working directory. It is off by default so
# that running the whole notebook never writes files unasked.
SAVE_TO_FILES = False

if SAVE_TO_FILES:
    date_results_df.to_csv('SHARE_Min_Max_dates.csv')
    date_results_df.to_excel('SHARE_Min_Max_dates.xlsx')