In [1]:
import requests
import json
import pandas as pd
import numpy as np
import urllib
import os
import inspect

# Getting datasets by DOI from R script

In [2]:
# read in dataframe and drop NAs
doi_to_id = pd.read_csv("/Users/chrischen/r_dataverse_ids.csv").dropna()

# only keep columns we need
doi_to_id = doi_to_id[["doi", "id", "filename"]]

# make doi directory friendly 
doi_to_id['doi_folder'] = [doi.replace("/", "-").replace(":", "--") for doi in doi_to_id['doi']]

In [3]:
doi_to_id

Unnamed: 0,doi,id,filename,doi_folder
0,doi:10.7910/DVN/HAU0EX,"2794357,2794256,2794232,2794422,2794195,279443...","00-checkDependencies.r,01-runEstimation.md,02-...",doi--10.7910-DVN-HAU0EX
1,doi:10.7910/DVN/26589,"2475186,2475195,2481464,2475187,2475193,247519...","00-install-packages.R,01-get-twitter-data.R,02...",doi--10.7910-DVN-26589
3,doi:10.7910/DVN/SI5KBL,300786530078643007863,"00-preprocessing.R,01-analysis.R,WV6.tab",doi--10.7910-DVN-SI5KBL


In [4]:
# take the first set of filenames and files as an example
my_doi = doi_to_id.loc[3]['doi']
my_doi_folder = doi_to_id.loc[3]['doi_folder']
my_files = doi_to_id.loc[3]['id'].split(",")
my_filenames = doi_to_id.loc[3]['filename'].split(",")

assert(len(my_filenames) == len(my_files))

In [5]:
# create the current directory 
if not os.path.exists(my_doi_folder):   
    os.makedirs(my_doi_folder)

In [6]:
for current_file, current_filename in zip(my_files, my_filenames):
    response = requests.get("https://dataverse.harvard.edu/api/access/datafile/" + current_file, 
                            params={"key": "670994aa-dbf5-4240-a3a6-74cca05a9f07"})
    # write the response to a new file with the correct filename
    with open(my_doi_folder + "/" + current_filename, 'w') as handle:
        handle.write(response.content)

# Attempting to get DOI purely using python and raw search API

In [2]:
from tqdm import tqdm as tqdm
from __future__ import print_function
import re

In [3]:
# defining some constants
r_file_query = "fileContentType:type/x-r-syntax"
dataverse_key = "670994aa-dbf5-4240-a3a6-74cca05a9f07"

## Getting total number of r files

In [76]:
total_number =  int(requests.get("https://dataverse.harvard.edu/api/search/", 
                                 params={"q": r_file_query, "type": "file",
                                         "key": dataverse_key})
                    .json()['data']['total_count'])

In [77]:
int(np.ceil(total_number / 10))

297

In [78]:
total_number

2972

## See how many pages of data we can get

In [75]:
page_num = 0
total_results = 0
while True:
    print(page_num, end=", ")
    num_results = len(requests.get("https://dataverse.harvard.edu/api/search/", 
                                   params= {"q": "fileContentType:type/x-r-syntax", "type": "file",
                                            "key": "670994aa-dbf5-4240-a3a6-74cca05a9f07",
                                            "start": str(page_num),
                                            "per_page": str(1000)}).json()['data']['items'])
    total_results += num_results
    page_num += 1
    if num_results < 1000:
        break
print(page_num)
print(num_results)

0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 

KeyboardInterrupt: 

It seems we can continue fetching data past the number of pages one would expect to be valid.

## Check if the API returns duplicates

We will do this by checking if the API returns duplicate `(name, dataset_citation)` tuples using a python set.

In [90]:
# initialize variables to store current state of scraping
results_set = set()
total_results = 0
page_num = 0
duplicates = 0
# Try to scrape until we obtain 100 duplicate (name, citation) tuples
while True:
    print(page_num, end=", ")
    # query the API for 1000 results
    myresults = requests.get("https://dataverse.harvard.edu/api/search/",
                             params= {"q": r_file_query, "type": "file",
                                      "key": dataverse_key,
                                      "start": str(page_num),
                                      "per_page": str(1000)}).json()['data']['items']
    # iterate through results
    for myresult in myresults:
        # record duplicates
        result_tuple = (myresult['name'], myresult['dataset_citation'])
        if result_tuple in results_set:
            duplicates += 1
        # stop if duplicates above 100
        if duplicates >= 100:
            print(total_results)
            print(page_num)
            print(duplicates)
            assert(False)
        results_set.add(result_tuple)
        total_results += 1
    page_num += 1

0, 1, 1097
1
100


AssertionError: 

After scraping 1097 files, above 100 of them are duplicates. This is very strange behavior. It looks like we will have to keep building up this set until the set is as large (or approximately as large) as the number of reported unique files. 

In [92]:
# initialize variables to store current state of scraping
result_dict = dict()
total_results = 0
page_num = 0
duplicates = 0
# Try to scrape until we obtain 100 duplicate (name, citation) tuples
while True:
    print(page_num, end=", ")
    # query the API for 1000 results
    myresults = requests.get("https://dataverse.harvard.edu/api/search/",
                             params= {"q": r_file_query, "type": "file",
                                      "key": dataverse_key,
                                      "start": str(page_num),
                                      "per_page": str(1000)}).json()['data']['items']
    # iterate through results
    for myresult in myresults:
        # record duplicates
        result_tuple = (myresult['name'], myresult['dataset_citation'])
        # if a duplicate is detected, print it
        if result_tuple in result_dict:
            print(myresult)
            print(result_dict[result_tuple])
            assert(False)
        # stop if duplicates above 100
        result_dict[result_tuple] = myresult
        total_results += 1
    page_num += 1

0, {u'name': u'Pollution_big.R', u'url': u'https://dataverse.harvard.edu/api/access/datafile/2809919', u'checksum': {u'type': u'MD5', u'value': u'c689cf0b9d2d08b508278280c6adc2ad'}, u'file_content_type': u'type/x-r-syntax', u'dataset_citation': u'Zigler, Cory, 2016, "Analysis Files", doi:10.7910/DVN/BPIXRS, Harvard Dataverse, V1', u'file_type': u'R Syntax', u'published_at': u'2016-06-30T14:05:25Z', u'file_id': u'2809919', u'type': u'file', u'size_in_bytes': 2560, u'md5': u'c689cf0b9d2d08b508278280c6adc2ad'}
{u'name': u'Pollution_big.R', u'url': u'https://dataverse.harvard.edu/api/access/datafile/2809947', u'checksum': {u'type': u'MD5', u'value': u'736035e7feb30797be47805272e783ae'}, u'file_content_type': u'type/x-r-syntax', u'dataset_citation': u'Zigler, Cory, 2016, "Analysis Files", doi:10.7910/DVN/BPIXRS, Harvard Dataverse, V1', u'file_type': u'R Syntax', u'published_at': u'2016-06-30T14:05:25Z', u'file_id': u'2809947', u'type': u'file', u'size_in_bytes': 2924, u'md5': u'736035e7feb3

AssertionError: 

here is a link to the dataset in question: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/BPIXRS. It seems there are two files with identical names but different file_ids and different file_sizes. Two conclusions: 
1. we should check files before upload in dataverse for duplicate names
2. the duplicates we flagged above weren't actually duplicates. We'll use `file_id` from now on to keep track of duplicates instead

In [99]:
# initialize variables to store current state of scraping
result_dict = dict()
total_results = 0
page_num = 0
duplicates = 0
# Try to scrape until we obtain 100 duplicate (name, citation) tuples
while True:
    print("Requesting page {} from API".format(page_num), end="... ")
    # query the API for 1000 results
    myresults = requests.get("https://dataverse.harvard.edu/api/search/",
                             params= {"q": r_file_query, "type": "file",
                                      "key": dataverse_key,
                                      "start": str(page_num),
                                      "per_page": str(1000)}).json()['data']['items']
    
    print("Parsing results from page {}".format(page_num), end="... ")
    # iterate through results
    for myresult in myresults:
        # record duplicates
        result_id = myresult['file_id']
        # if a duplicate is detected, print it
        if result_id in result_dict:
            print("\nDUPLICATE DETECTED \n")
            print("Page number: {}\n".format(page_num))
            print("Total results: {}\n".format(total_results))
            print("-----Current result:-----\n" + str(myresult))
            print("-----Duplicate result (already present):-----\n" + str(result_dict[result_id]))
            assert(False)
        # stop if duplicates above 100
        result_dict[result_id] = myresult
        total_results += 1
    page_num += 1

Requesting page 0 from API... Parsing results from page 0... Requesting page 1 from API... Parsing results from page 1... 
DUPLICATE DETECTED 

Page number: 1

Total results: 1000

-----Current result:-----
{u'name': u'Replication.R', u'url': u'https://dataverse.harvard.edu/api/access/datafile/2849010', u'checksum': {u'type': u'MD5', u'value': u'5adf202d220b916f0fb95a631906bb4d'}, u'file_content_type': u'type/x-r-syntax', u'dataset_citation': u'Schoonvelde, Martijn, 2016, "Replication data for: Media Freedom and the Institutional Underpinnings of Political Knowledge", doi:10.7910/DVN/24122, Harvard Dataverse, V1, UNF:6:y9R5zOnAx2D1LvpDes8+Bg==', u'file_type': u'R Syntax', u'published_at': u'2016-07-07T08:32:18Z', u'file_id': u'2849010', u'type': u'file', u'size_in_bytes': 6901, u'md5': u'5adf202d220b916f0fb95a631906bb4d'}
-----Duplicate result (already present):-----
{u'name': u'Replication.R', u'url': u'https://dataverse.harvard.edu/api/access/datafile/2849010', u'checksum': {u'type':

AssertionError: 

### THE SEARCH API DOES NOT ACTUALLY RETURN DUPLICATES; WE WERE JUST USING THE `start` PARAMETER INCORRECTLY

It turns out start doesn't refer to the desired page of results but rather which number result to start returning values for. To get pagination behavior, you have to specify `start = results_per_page * page_number`

## Scrape until we obtain the total number of R files

In [4]:
# initialize variables to store current state of scraping
id_to_citation = dict()
total_results = 0
page_num = 0
duplicates = 0
#  keep requesting until the API returns fewer than 1000 results
while True:
    print("Requesting page {} from API...".format(page_num))
    # query the API for 1000 results
    myresults = requests.get("https://dataverse.harvard.edu/api/search/",
                             params= {"q": r_file_query, "type": "file",
                                      "key": dataverse_key,
                                      "start": str(1000 * page_num),
                                      "per_page": str(1000)}).json()['data']['items']
    
    print("Parsing results from page {}...".format(page_num))
    # iterate through results, recording dataset_citations
    for myresult in myresults:
        # record duplicates if any, else add results to dictionary
        result_id = myresult['file_id']
        if result_id in id_to_citation:
            duplicates += 1
        else:
            id_to_citation[result_id] = myresult['dataset_citation']
            total_results += 1
    print("Unique results: {} | Duplicates: {}".format(total_results, duplicates))
    # if fewer than 1000 results were returned; we must have reached the end
    if len(myresults) < 1000:
        print("Reached last page of results. Done.")
        break
    page_num += 1

Requesting page 0 from API...
Parsing results from page 0...
Unique results: 1000 | Duplicates: 0
Requesting page 1 from API...
Parsing results from page 1...
Unique results: 2000 | Duplicates: 0
Requesting page 2 from API...
Parsing results from page 2...
Unique results: 2972 | Duplicates: 0
Reached last page of results. Done.


In [7]:
# backup the dictionary just in case
with open('id_to_citation.pkl', 'wb') as handle:
    pickle.dump(id_to_citation, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [8]:
# load it back and make sure the two are equal
with open('id_to_citation.pkl', 'rb') as handle:
    id_to_citation_pickle = pickle.load(handle)
assert(id_to_citation_pickle == id_to_citation)

## Extract DOI from dataset citations

In [9]:
# store all the doi's
r_dois = []
# iterate through dataset_citations (id_to_citation dictionary)
for citation in id_to_citation.values():
    # search the string for a regular expression matching DOI
    doi_match = re.search("(doi:[^,]*)", citation)
    if doi_match:
        r_dois.append(doi_match.group(1))
# remove duplicates
r_dois = list(set(r_dois))

In [10]:
len(r_dois)

729

A total of **729 datasets** on the Harvard Dataverse server contain R scripts!

## Get the file_ids of all files in each dataset (specified by a DOI)

In [5]:
# import for convenience
import pickle
from collections import defaultdict

In [12]:
# initialize as a defaultdict to make code cleaner
doi_to_fileids = defaultdict(list)

In [13]:
# iterate through dois
for i in tqdm(range(len(r_dois))):
    doi = r_dois[i]
    # handle exceptions in case the dataverse is not accessible
    try:
        # query the API for the relevant results
        myresults = requests.get("https://dataverse.harvard.edu/api/datasets/:persistentId",
                                     params= {"persistentId": doi,
                                              "key": dataverse_key})\
                        .json()['data']['latestVersion']['files']
        # parse the filename and file id out from the results
        for myresult in myresults:
            doi_to_fileids[doi].append((myresult['dataFile']['filename'], myresult['dataFile']['id']))
    except:
        pass

100%|██████████| 729/729 [23:13<00:00,  1.91s/it]


In [14]:
# backup the dictionary just in case
with open('doi_to_fileids.pkl', 'wb') as handle:
    pickle.dump(doi_to_fileids, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [15]:
# load it back and make sure the two are equal
with open('doi_to_fileids.pkl', 'rb') as handle:
    doi_to_fileids_pickle = pickle.load(handle)
assert(doi_to_fileids_pickle == doi_to_fileids)

In [24]:
doi_to_fileids['doi:10.7910/DVN/0DOFEF']

[(u'CC_China_2015_Adult_Final_9.07.15_cutduplicate88.tab', 2966406),
 (u'china-stm.R', 2966405),
 (u'chinopen-prep.R', 2966404),
 (u'chinopen-substitutions-lean.tab', 2839590),
 (u'CN_segmented_text_for_R.tab', 2966401),
 (u'Open_End_Question_Id_Vector.tab', 2966403),
 (u'stm-analysis.R', 2966402)]

## Download all datasets into separate directories

In [9]:
# make a doi a friendlier for a directory name
def doi_to_direct(doi_string):
    return doi_string.replace("/", "-").replace(":", "--")
# convert a directory name back to a doi
def direct_to_doi(direct_string):
    return direct_string.replace("--", ":").replace("-", "/")

In [10]:
# iterate through the key/value pairs in the dictionary
for mydoi, myfile_tuples in doi_to_fileids.items():
    # make the DOI a friendly directory name 
    mydirect = 'Rdatasets/' + doi_to_direct(mydoi)
    # if the dataset does not have a directory, create it
    if not os.path.exists(mydirect):   
        os.makedirs(mydirect)
    # iterate through list of filename, fileid tuples
    for filename, fileid in myfile_tuples:
        response = requests.get("https://dataverse.harvard.edu/api/access/datafile/" + str(fileid), 
                                params={"key": dataverse_key})
        # write the response to a new file with the correct filename
        with open(mydirect + "/" + filename, 'w') as handle:
            handle.write(response.content)

KeyboardInterrupt: 