# Scraping 3ie's Impact Evaluation Registry
The following code downloads nearly all of the data from the 3ie impact evaluation registry.

## Download list of valid study numbers
In 3ie's registry, details of each impact evaluation are listed on a webpage with a URL of the form "http://www.3ieimpact.org/en/evidence/impact-evaluations/details/<study_number>", where study_number appears to be an arbitrarily assigned number. First, I get all of the valid study numbers from the main webpage, which lists links to all of the impact evaluation webpages.

In [7]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import re
import pandas as pd

# Fetch the registry index page, which links to every impact evaluation page.
index_url = "http://www.3ieimpact.org/en/evidence/impact-evaluations/?q=&all=on&sort_by=alphabet"
r = requests.get(index_url)
soup = BeautifulSoup(r.content, "lxml")

# Keep only the anchors whose href points at a study details page and pull
# the numeric study id (as a string) out of each of them.
details_pattern = re.compile(r"details/([\d]+)")
all_links = soup.findAll("a", href=True)
details_matches = (details_pattern.search(anchor["href"]) for anchor in all_links)
study_nums = [found.group(1) for found in details_matches if found]

# Convert the ids to integers and arrange them in ascending order.
valid_study_nums = np.sort(np.asarray(study_nums).astype("int"))

## Download metadata for all studies
Iterate through the list of valid study numbers to download the metadata for each study and then combine metadata for all studies into a dataframe.

In [68]:
# Download the details page of every valid study, parse its metadata into a
# dict, and collect the dicts in `studies` so they can be turned into a dataframe.
studies = []
# Cap on how many study numbers are processed. Per the original note, 3402
# (indices 0 through 3401) is the full range; lower it to test on a few pages.
max_studies = 3402
# every Xth study save all the results so that they aren't lost if the connection goes down.
# i didn't end up using these files
x = 100
for idx, val in enumerate(valid_study_nums[:max_studies]):
    try:
        print("loading data for study: " + str(val))
        url = "http://www.3ieimpact.org/en/evidence/impact-evaluations/details/" + str(val)
        # a timeout keeps a single stalled connection from hanging the whole run
        r = requests.get(url, timeout=60)
        soup = BeautifulSoup(r.content, "lxml")
        temp_dict = {}
        # record the number of the study being scraped (this previously read
        # the undefined name `i` instead of the loop variable)
        temp_dict['study_num'] = val
        # there appears to only be one HTML definition list in each webpage.
        # within that definition list I search for all the dt elements
        dt_elements_of_dl = soup.find('dl').findAll('dt')
        for dt in dt_elements_of_dl:
            # key: the node right after the <dt> tag; value: reached by stepping
            # from the <dt>'s next sibling two nodes further into the document
            temp_dict[dt.next_element] = dt.next_sibling.next_element.next_element
        title_tag = soup.find('h1')
        if title_tag:
            temp_dict['title'] = title_tag.next_element
        time_tag = soup.find('time')
        if time_tag:
            temp_dict['year'] = time_tag.next_element
        try:
            source = soup.find('section', class_="evidence_source")
            temp_dict['source'] = source.find('p').next_element
            temp_dict['source_link'] = source.find('a')['href']
        except (AttributeError, TypeError, KeyError):
            # the section, its <p>, its <a>, or the href attribute is missing
            print('no evidence source found')
        # the following code to load the methodology and findings doesn't work for webpages in which
        # there is a context or synopsis before the methodology section. since there are only a few pages like
        # this i am ignoring it for now.
        try:
            method_findings = soup.findAll('section', class_='summary_item')
            temp_dict['methodology'] = method_findings[0].find('p').next_element
            temp_dict['findings'] = method_findings[1].find('p').next_element
        except (IndexError, AttributeError):
            # fewer than two summary sections, or a section without a <p>
            print("couldn't load methodology or findings")
        studies.append(temp_dict)
        if idx % x == 0:
            df = pd.DataFrame(studies)
            df.to_csv("/Users/douglasjohnson/Documents/coding/datasets/3ie/3ie_registry_" + str(idx))
    except Exception as err:
        # best effort: skip this study and carry on, but say which one failed
        # and why. `except Exception` (rather than a bare except) lets Ctrl-C
        # interrupt the scrape.
        print("loading metadata failed for study " + str(val) + ": " + repr(err))
df = pd.DataFrame(studies)
df.to_csv("/Users/douglasjohnson/Documents/coding/datasets/3ie_final.csv")

loading data for study: 1
couldn't load methodology or findings
loading data for study: 2
loading data for study: 3
couldn't load methodology or findings
loading data for study: 4
loading data for study: 5
no evidence source found
loading data for study: 6
loading data for study: 7
loading data for study: 8
loading data for study: 9
loading data for study: 10
loading data for study: 11
loading data for study: 12
loading data for study: 13
loading data for study: 15
loading data for study: 16
loading data for study: 17
loading data for study: 18
loading data for study: 19
loading data for study: 20
loading data for study: 21
loading data for study: 22
loading data for study: 23
loading data for study: 24
loading data for study: 25
loading data for study: 26
loading data for study: 27
loading data for study: 29
loading data for study: 30
couldn't load methodology or findings
loading data for study: 32
loading data for study: 33
loading data for study: 34
loading data for study: 35
loadin