In [1]:
from time import sleep
from bs4 import BeautifulSoup
import requests
import dataset
import json

In [2]:
def scrape_report(uri):
    """Download one report page and extract its metadata.

    Parameters
    ----------
    uri : str
        Address of the report page to fetch.

    Returns
    -------
    dict or None
        ``None`` when the server does not answer with status 200, when the
        page is the site's "Page not found" page, or when the page has no
        report title heading. Otherwise a dict with the keys:

        * ``"title"`` - text of the ``h1.restrict`` heading, followed by
          ``" - "`` and the ``h2.green`` text when that heading exists.
        * ``"description"`` - text of the first ``<p>`` element ("" if none).
        * ``"about"`` - label -> value pairs read from the
          ``table.resourcedata`` cells (empty if the table is absent).
        * ``"link"`` - the ``uri`` that was requested.

    Raises
    ------
    requests.RequestException
        If the request fails or exceeds the timeout.
    """
    # Without a timeout a stalled server would block the notebook forever.
    result = requests.get(uri, timeout=30)

    # Make sure the server returned a good status code
    if result.status_code != 200:
        return None

    soup = BeautifulSoup(result.content, "lxml")

    # Make sure we are not on a Page Not Found page
    first_heading = soup.find("h1")
    if first_heading is not None and first_heading.text == 'Page not found':
        return None

    # A page without the title heading is not a report we can describe
    title_heading = soup.find("h1", "restrict")
    if title_heading is None:
        return None

    # Build the title of the resource; the green sub-heading is optional
    report_title = title_heading.text
    extra_heading = soup.find("h2", "green")
    if extra_heading is not None:
        report_title += " - " + extra_heading.text

    # Get data for "About" this resource. Cells alternate label, value;
    # zip() pairs them up and drops an unpaired trailing cell instead of
    # running past the end of the list.
    about = {}
    resource_data = soup.find("table", "resourcedata")
    if resource_data is not None:
        table_cells = resource_data.find_all("td")
        for label_cell, value_cell in zip(table_cells[0::2], table_cells[1::2]):
            sec = label_cell.text.replace("\xa0", " ").strip(":")
            val = value_cell.text.replace("\xa0", " ")
            about[sec] = val

    # Add the description (first paragraph on the page, if there is one)
    first_paragraph = soup.find("p")
    if first_paragraph is not None:
        description = first_paragraph.text.replace("\xa0", " ")
    else:
        description = ""

    return {"title": report_title,
            "description": description,
            "about": about,
            "link": uri}

----

## Scraping search page results

These results cover everything added to the site from January 2000 through June 2017, which comes to 142 pages of results.

`http://www.syrialearning.org/resources.aspx?page=1&date=2000m1t2017m6`

In [3]:
# Harvest the report numbers from the search result pages.
# range() excludes its stop value, so pages = 143 visits pages 1 to 142.
pages = 143
articles = []

for n in range(1, pages):
    uri = "http://www.syrialearning.org/resources.aspx?page="
    uri += str(n)
    uri += "&date=2000m1t2017m6"

    # Request the contents of the page. The timeout keeps a stalled
    # connection from hanging the whole harvest, and a failed request only
    # costs this one page instead of aborting the loop.
    try:
        result = requests.get(uri, timeout=30)
    except requests.RequestException as exc:
        print("Failed", uri, repr(exc))
        result = None

    if result is not None and result.status_code != 200:
        # Error pages would otherwise be parsed as if they were results
        print("Skipped", uri, "status", result.status_code)
    elif result is not None:
        # Create a BeautifulSoup object to parse the raw page contents
        soup = BeautifulSoup(result.content, "lxml")

        # "Header 4" elements are used for report titles in the search
        # results; keep only those that wrap a link with a target.
        for h in soup.find_all("h4"):
            link = h.find("a")
            if link is None or not link.get("href"):
                continue
            # The report number is the last path segment of the link
            num = link["href"].split("/")[-1]
            articles.append((num, link.text))

    # Wait a few seconds between each request in an attempt to not
    # overload the Syria Learning server.
    sleep(2)

In [4]:
# Show the first harvested (report number, title) pair, then let the cell's
# final expression display how many pairs were collected.
first_article = articles[0]
print(first_article)
len(articles)

('24760', 'Palliative Health Care in Jordan for Syrian Refugees')


40

In [5]:
# Scrape the report pages to gather the data we need to insert into Discourse
reports = []

for article in articles:
    # Each entry is a (report number, title) pair; the number builds the URL
    uri_num = article[0]
    uri = "http://www.syrialearning.org/resource/" + uri_num

    try:
        rep = scrape_report(uri)
    except Exception as exc:
        # Keep going on a bad page, but say why it failed so it can be fixed
        print("Failed", uri, repr(exc))
    else:
        if rep:
            reports.append(rep)
        else:
            # scrape_report returned nothing usable for this page
            print("Skipped", uri)

    # Pause between requests to avoid overloading the server
    sleep(4)

In [6]:
# Display the first scraped report so its extracted fields can be inspected.
reports[0]

{'about': {'Agency': 'The Humanitarian Health Ethics Network',
  'Author(s)': 'McDonald, M. ',
  'Countries': 'Jordan, Syria',
  'Date published': '24 May 2017',
  'Keywords': 'Conflict, violence & peace, Health, Refugees/IDPs',
  'Language': 'English',
  'Pages': '35pp',
  'Resource type': 'Research, reports and studies'},
 'description': 'The Hashemite Kingdom of Jordan shares its northern border with the Syrian Arab Republic, and has been one of the main receiving countries of fleeing refugees since the beginning of the Syrian conflict in 2011.',
 'link': 'http://www.syrialearning.org/resource/24760',
 'title': 'Palliative Health Care in Jordan for Syrian Refugees'}

In [7]:
# Connect, through the dataset library, to the SQLite database file
# article_metadata.sqlite (a relative path in the connection URL).
db = dataset.connect("sqlite:///article_metadata.sqlite")

In [8]:
# Handle for the "articles" table; the report rows are inserted into it below.
tab_articles = db['articles']

In [9]:
# Collect every distinct "about" label seen across the scraped reports.
fields = set()
for report in reports:
    fields.update(report["about"])

# Map a database-friendly column name (spaces -> underscores, parentheses
# removed, lower-cased) back to the label it was derived from.
# Iterate in sorted order: the iteration order of a set of strings can change
# from one interpreter run to the next, which would make the order of this
# lookup - and of the columns built from it - differ between runs.
fields_lookup = {}
for field in sorted(fields):
    db_col = field.replace(" ","_").replace("(","").replace(")","").lower()
    fields_lookup[db_col] = field

fields_lookup

{'agency': 'Agency',
 'authors': 'Author(s)',
 'countries': 'Countries',
 'date_published': 'Date published',
 'keywords': 'Keywords',
 'language': 'Language',
 'pages': 'Pages',
 'publisher': 'Publisher',
 'resource_type': 'Resource type'}

In [10]:
# Flatten each scraped report into a single row and store it in the
# articles table: the four top-level values plus one column per "about" label.
# NOTE(review): every run of this cell inserts all reports again; nothing
# visible here removes earlier rows - confirm that is intended before re-running.
for report in reports:
    # Missing keys fall back to "" rather than failing the whole row.
    about_data = report.get("about")
    if not isinstance(about_data, dict):
        about_data = {}

    # Keep the complete "about" mapping as a JSON string as well
    try:
        about = json.dumps(report["about"])
    except (KeyError, TypeError, ValueError):
        about = ""

    record = {
        "title": report.get("title", ""),
        "link": report.get("link", ""),
        "description": report.get("description", ""),
        "about": about
    }

    # One column per known label; reports lacking a label get ""
    for db_col, field in fields_lookup.items():
        record[db_col] = about_data.get(field, "")

    tab_articles.insert(record)

In [11]:
# range() stops before its second argument, so this prints 1 through 9.
for n in range(1, 9 + 1):
    print(str(n))

1
2
3
4
5
6
7
8
9
