In [1]:
%load_ext autoreload
%autoreload 2

### Objective
Scrape data from a research paper

In [2]:
import requests
from bs4 import BeautifulSoup, NavigableString
from urllib.parse import urlparse

In [3]:
url = "https://dspace.mit.edu/handle/1721.1/39126?show=full"

In [4]:
res = requests.get(url)

In [6]:
res.status_code

200

In [8]:
soup = BeautifulSoup(res.text, "html.parser")

In [20]:
metadata = soup.find("table")
table_rows = metadata.select(".ds-table-row")

In [32]:
# transform table rows into a better datastructure that we can iterate over and get properties
rows = []


for table_row in table_rows:
    name = table_row.find("td", attrs={"class": "label-cell"}).text
    value = table_row.find("td", attrs={"class": "word-break"}).text
    
    rows.append({"name": name, "value": value})

In [33]:
rows

[{'name': 'dc.contributor.author', 'value': 'Raghabednra Chattopadhyay'},
 {'name': 'dc.contributor.author', 'value': 'Esther Duflo'},
 {'name': 'dc.date.accessioned', 'value': '2007-10-05T05:20:05Z'},
 {'name': 'dc.date.available', 'value': '2007-10-05T05:20:05Z'},
 {'name': 'dc.date.issued', 'value': '2007-10-05T05:20:05Z'},
 {'name': 'dc.identifier', 'value': 'hdl:1902.1/USBFNOMLAT'},
 {'name': 'dc.identifier.uri', 'value': 'http://hdl.handle.net/1721.1/39126'},
 {'name': 'dc.description.abstract',
  'value': "This data set uses political reservations for women in India to study the impact of women's leadership on policy decisions. Using a dataset we collected on 265 village councils in West Bengal and Rajasthan, we compare the type of public goods provided in reserved and unreserved village?s councils. Data sets based upon information provided by GP Pradhans, local villagers, and the 1991 Indian Census"},
 {'name': 'dc.subject', 'value': 'gender'},
 {'name': 'dc.subject', 'value': 

### 1. Get Title

In [11]:
title = soup.select(".page-header.first-page-header")[0].text
title

'Women as Policy Makers: Evidence from a Randomized Policy Experiment in India'

### 2. Get Authors

In [34]:
authors = []

for row in rows:
    if row["name"] == "dc.contributor.author":
        authors.append(row["value"])

In [35]:
authors

['Raghabednra Chattopadhyay', 'Esther Duflo']

### 3. Get Abstract

In [37]:
abstracts = [row["value"] for row in rows if row["name"] == "dc.description.abstract"]
abstract = "\n".join(abstracts)

In [39]:
print(abstract)

This data set uses political reservations for women in India to study the impact of women's leadership on policy decisions. Using a dataset we collected on 265 village councils in West Bengal and Rajasthan, we compare the type of public goods provided in reserved and unreserved village?s councils. Data sets based upon information provided by GP Pradhans, local villagers, and the 1991 Indian Census


### 4. Get Date Issued

In [43]:
date_issued = ""

for row in rows:
    if row["name"] == "dc.date.issued":
        date_issued = row["value"]
        break

In [44]:
date_issued

'2007-10-05T05:20:05Z'

### 5. Get Keywords

In [50]:
keywords = []

for row in rows:
    if row["name"] == "dc.subject":
        split = row["value"].split(",")
        
        for keyword in split:
            keywords.append(keyword.strip())

In [51]:
keywords

['gender',
 'decentralization',
 'affirmative action',
 'political economy',
 'gender',
 'decentralization',
 'affirmative action',
 'political economy']

### 6. Get URL

In [53]:
paper_url = ""

for row in rows:
    if row["name"] == "dc.identifier.uri":
        paper_url = row["value"]

In [54]:
paper_url

'http://hdl.handle.net/1721.1/39126'

### 7. Get all files

In [58]:
files_rows = soup.select(".file-list .file-wrapper.row")

In [64]:
files = []

for file_row in files_rows:
    link = "https://dspace.mit.edu" + file_row.find("a")["href"]
    files.append(link)

In [65]:
files

['https://dspace.mit.edu/bitstream/handle/1721.1/39126/study.xml?sequence=1&isAllowed=y',
 'https://dspace.mit.edu/bitstream/handle/1721.1/39126/womenpolicymakers_census.dta?sequence=2&isAllowed=y',
 'https://dspace.mit.edu/bitstream/handle/1721.1/39126/womenpolicymakers_census_dta.tab?sequence=3&isAllowed=y',
 'https://dspace.mit.edu/bitstream/handle/1721.1/39126/womenpolicymakers_parta.dta?sequence=4&isAllowed=y',
 'https://dspace.mit.edu/bitstream/handle/1721.1/39126/womenpolicymakers_parta_dta.tab?sequence=5&isAllowed=y',
 'https://dspace.mit.edu/bitstream/handle/1721.1/39126/womenpolicymakers_partb.dta?sequence=6&isAllowed=y',
 'https://dspace.mit.edu/bitstream/handle/1721.1/39126/womenpolicymakers_partb_dta.tab?sequence=7&isAllowed=y',
 'https://dspace.mit.edu/bitstream/handle/1721.1/39126/womenpolicymakers_partc.dta?sequence=8&isAllowed=y',
 'https://dspace.mit.edu/bitstream/handle/1721.1/39126/womenpolicymakers_partc_dta.tab?sequence=9&isAllowed=y',
 'https://dspace.mit.edu/bit