# SHARE Query Requests from the Community

Here's where we can keep track of code for common things that members of the SHARE community might like to know!

## Setup

In [1]:
import json
import requests

SHARE_SEARCH_API = 'https://osf.io/api/v1/share/search/'
ALL_PROVIDER_INFO = requests.get('https://osf.io/api/v1/share/providers/').json()['providerMap']

def query_share(url, query):
    headers = {'Content-Type': 'application/json'}
    data = json.dumps(query)
    return requests.post(url, headers=headers, data=data, verify=False).json()

def get_longname_for_shortname(shortname):
    for source in ALL_PROVIDER_INFO.keys():
        if source == shortname:
            return ALL_PROVIDER_INFO[source]['long_name']
    

## Queries

In [None]:
# What's the earliest and latest document from each source?

import pandas as pd

date_stats_agg = {
    "aggregations": {
        "sources": {
            "terms": {"field": "_type", "size": 0},
            "aggregations": {
                "source_stats": {
                    "stats": {"field": "providerUpdatedDateTime"}
                }
            }
        }
    }
}

date_results = query_share(SHARE_SEARCH_API, date_stats_agg)['aggregations']['sources']['buckets']

date_results_df = pd.DataFrame()
date_results_df['source_shortname'] = [result['key'].encode('utf-8') for result in date_results]
date_results_df['source_longname'] = [get_longname_for_shortname(name).encode('utf-8') for name in date_results_df['source_shortname']]
date_results_df['earliest_date'] = [result['source_stats']['min_as_string'] for result in date_results]
date_results_df['latest_date'] = [result['source_stats']['max_as_string'] for result in date_results]
date_results_df

In [None]:
# Uncomment the following lines if running locally - will save to file formats

# date_results_df.to_csv('SHARE_Min_Max_dates.csv')
# date_results_df.to_excel('SHARE_Min_Max_dates.xlsx')

## Lucene Search and NOT Queries

A user wanted to know how to query for one term but exclude another

In [None]:
query = '?q=pedigree NOT child'

In [None]:
results = requests.get(SHARE_SEARCH_API + query).json()
results

## Querying by Document Type

Currently, document type is not curated by SHARE. However, we do collected many sources that are using the OAI-PMH metadata protocol, which includes dc:type. You can search that field in SHARE for now, until the harvesters collect and curate document type.

In [None]:
query = '?q=otherProperties.properties.type:article'


In [None]:
results = requests.get(SHARE_SEARCH_API + query).json()

for result in results['results']:
    for prop in result['otherProperties']:
        if prop['name'] == 'type':
            print(prop)
    print(result['title'])
    print(result['uris']['canonicalUri'])
    

Here is an analysis of the top terms found in SHARE's collected dc:type field

In [None]:
import pandas as pd
from sharepa import ShareSearch, basic_search
from sharepa.helpers import pretty_print

type_search = ShareSearch()
total_documents = basic_search.count()

type_search.aggs.bucket(
    'typeTermFilter',  # Every aggregation needs a name
    'terms',  # There are many kinds of aggregations
    field='otherProperties.properties.type',
    exclude= "of|and|or",
    size=50,
)

type_results_executed = type_search.execute()

type_results = type_results_executed.aggregations.typeTermFilter.to_dict()['buckets']

type_dataframe = pd.DataFrame(type_results)
type_dataframe['percent'] = (type_dataframe['doc_count'] / total_documents)*100

In [None]:
type_dataframe

## Query by Exact Phrase

Question -- Is there a way to search SHARE for a specific phrase? For example, information literacy, information AND literacy, and "information literacy" give results with both terms, but not necessarily as the phrase "information literacy." Information and literacy can be in different parts of the record.

In [7]:
phrase_query = {
    "query": {
        "match_phrase" : {
            "title" : "information literacy"
        }
    }
}

results = query_share(SHARE_SEARCH_API, phrase_query)

for result in results['results']:
    print(
        '{} -- from {} -- {}'.format(
            result['title'].encode('utf-8'),
            result['shareProperties']['source'].encode('utf-8'),
            result['uris']['canonicalUri']
        )
    )


Information Literacy -- from uiucideals -- http://hdl.handle.net/2142/41497
Information Literacy Inventory -- from crossref -- http://dx.doi.org/10.1037/t32581-000
Information Literacy in Schools -- from crossref -- http://dx.doi.org/10.1201/b19843-8
Information Literacy Doll -- from datacite -- http://dx.doi.org/10.6084/M9.FIGSHARE.1012828.V1
MOBILE INFORMATION LITERACY CURRICULUM -- from uwashington -- http://hdl.handle.net/1773/34803
Information literacy at work -- from crossref -- http://dx.doi.org/10.1108/el-04-2014-0063




UnicodeEncodeError: 'ascii' codec can't encode character u'\u201c' in position 0: ordinal not in range(128)

In [None]:

# Using sharepa

phrase_search = ShareSearch()

phrase_search = phrase_search.query(
    'match_phrase',
    title="information literacy"
)

results = phrase_search.execute()

for result in results:
    print(
        '{} -- from {} -- {}'.format(
            result.title.encode('utf-8'),
            result.shareProperties.source.encode('utf-8'),
            result.uris.canonicalUri
        )
    )