# INM430 Week 10 Practical - Part 1
Scrape the London Datastore  
Here we build an inventory of which datasets are available and where their files can be downloaded  

Part 1 of 3 
1. Get LDS column names
2. Get Health data column names  
3. Cross reference and determine scope  

In [223]:
# helper functions

def getLDSDownloadLinksPageCount():
    """Return the number of listing pages the London Datastore exposes.

    Fetches the first dataset listing page and reads the pager found near
    the footer, using the same general format as week 09.

    Returns:
        int: the highest page number shown in the pager, or 0 when no pager
        entry carries a number.

    Network errors raised by ``urlopen`` are not caught here.
    """
    from bs4 import BeautifulSoup
    import urllib.request as ur
    urlToScrape = "https://data.london.gov.uk/dataset"
    r = ur.urlopen(urlToScrape).read()
    soup = BeautifulSoup(r, "lxml")
    # looking for the paging links found near footer
    linkList = soup.find_all('li', attrs={'class': 'dp-search__pagelink'})
    # all being well, the list will look like this (where each line is a list element)

    #<li class="dp-search__pagelink dp-search__pagelink--disabled"><span>«</span></li>
    #<li class="dp-search__pagelink dp-search__pagelink--active"><span>1</span></li>
    #<li class="dp-search__pagelink"><a href="/dataset?page=2">2</a></li>
    #<li class="dp-search__pagelink"><a href="/dataset?page=3">3</a></li>
    #<li class="dp-search__pagelink"><a href="/dataset?page=4">4</a></li>
    #<li class="dp-search__pagelink"><span>...</span></li>
    #<li class="dp-search__pagelink"><a href="/dataset?page=78">78</a></li>
    #<li class="dp-search__pagelink"><a href="/dataset?page=2">»</a></li>

    # The entry we are interested in is the last page number (78 above).
    # Rather than trusting a fixed position in the list, which breaks when
    # the pager has fewer entries, take the largest numeric label and skip
    # the non-numeric ones («, ..., »).
    pageNumbers = []
    for linkListItem in linkList:
        try:
            pageNumbers.append(int(linkListItem.text.strip()))
        except ValueError:
            continue

    return max(pageNumbers, default=0)

def getLDSDownloadLinks(iPagenum):
    """Return the dataset links listed on one London Datastore listing page.

    Args:
        iPagenum: page number appended to the listing url as ``?page=``.

    Returns:
        list[dict]: one ``{"name": ..., "href": ...}`` dictionary per search
        result heading that contains a usable link. Headings without a
        matching anchor, or whose anchor has no href, are reported and
        skipped.

    Network errors raised by ``urlopen`` are not caught here.
    """
    from bs4 import BeautifulSoup
    import urllib.request as ur
    urlToScrape = "https://data.london.gov.uk/dataset?page=" + str(iPagenum)
    r = ur.urlopen(urlToScrape).read()
    soup = BeautifulSoup(r, "lxml")
    # look for h3 headers
    linkList = soup.find_all('h3', attrs={'class': 'dp-searchresult__heading'})
    # our return list
    results = []
    for linkListItem in linkList:
        linkHeader = linkListItem.find('a', attrs={'class': "dp-searchresult__heading-link"})
        try:
            # linkHeader is None when the heading holds no matching anchor
            # (AttributeError / TypeError); a missing href gives KeyError
            name = linkHeader.text
            href = linkHeader['href']
        except (AttributeError, TypeError, KeyError):
            print("Error - no links found")
            # skip this heading: appending here would either fail on an
            # unbound name or duplicate the previous result
            continue
        results.append({
            "name" : name,
            "href" : href,
        })
    return results

def getLDSFileDownloadLinks(href):
    """Return the file download links found on one dataset page.

    Collects every download link (pdf, xls, etc); deciding what to do with
    each file type is left for later.

    Args:
        href: site-relative path of the dataset page, appended to
            ``https://data.london.gov.uk``.

    Returns:
        list[dict]: one ``{"fileurl": ...}`` dictionary per resource block
        that contains a download anchor with an href. Blocks without one
        are reported, together with ``href``, and skipped.

    Network errors raised by ``urlopen`` are not caught here.
    """
    from bs4 import BeautifulSoup
    import urllib.request as ur
    urlToScrape = "https://data.london.gov.uk" + href
    r = ur.urlopen(urlToScrape).read()
    soup = BeautifulSoup(r, "lxml")
    download_links = soup.find_all('div', attrs={'class': 'dp-resource__indented'})
    results = []
    for download_link in download_links:
        link = download_link.find('a', attrs={'class': 'dp-resource__format'})
        try:
            # link is None when the block has no matching anchor (TypeError);
            # an anchor without an href raises KeyError
            fileurl = link['href']
        except (TypeError, KeyError):
            # include the dataset page so the failing entry can be traced
            print("Error occured parsing file download links", href)
            continue
        results.append({
            "fileurl" : fileurl,
        })
    # return a list of dictionaries
    return results

def makeDir(path):
    """Create the directory ``path``, with any missing parents, if required.

    A single leading forward slash is removed first, so ``"/dataset/x"`` is
    created as ``dataset/x`` relative to the current working directory.
    An empty path, or a path that already exists, is left alone.

    Args:
        path: directory path as a string.
    """
    from pathlib import Path
    # remove leading forward slash; startswith() is safe on an empty string,
    # where indexing path[0] would raise IndexError
    if path.startswith('/'):
        path = path[1:]
    target = Path(path)
    if not target.exists():
        target.mkdir(parents=True, exist_ok=True)
        
def jsonifier(links, toplevel):
    """Wrap ``links`` in a single-key JSON object and return it as bytes.

    Args:
        links: any JSON-serialisable value (here, the list of link
            dictionaries).
        toplevel: name of the single top-level key, as a string.

    Returns:
        bytes: the encoded text ``{"<toplevel>":<links as JSON>}``.

    Raises:
        TypeError: if ``toplevel`` is not a string.
    """
    import json
    if not isinstance(toplevel, str):
        raise TypeError("toplevel must be a str")
    # serialise the key with json.dumps so quotes and backslashes in it are
    # escaped; pasting it in between literal quotes produced invalid JSON
    jsonified = "{" + json.dumps(toplevel) + ":" + json.dumps(links) + "}"
    # to keep return type consistent with xmlifier
    return jsonified.encode()



In [224]:
import datetime
# Download London Datastore .xls files
# Timestamp start
now = datetime.datetime.now()
print ("London datastore scraper started:", now.strftime("%Y-%m-%d %H:%M:%S"))
# 1. get the number of pages
iPagenums = getLDSDownloadLinksPageCount()
# initialise our list of link dictionaries (one dictionary per dataset)
links = []
# 2. Get the links from every listing page; maxi records the last page
#    number requested and stays 0 when there are no pages
maxi = 0
for i in range(1, iPagenums + 1):
    maxi = i
    links.extend(getLDSDownloadLinks(i))
print("Downloaded", maxi, "page links")
# 3. Create a new 'fileurls' entry in each link dictionary,
#    holding a list with all the available files for download (pdf, xls, etc)
for i, entry in enumerate(links):
    entry['fileurls'] = getLDSFileDownloadLinks(entry['href'])
print("Downloaded page link files")
# 3.5 Save json file to filesystems, store, load tomorrow, rebuild list of dictionaries and carry on
myjsonfile = jsonifier(links, "links")
# the with-block closes the file even if the write fails
with open('lds-links.json', 'wb') as f:
    f.write(myjsonfile)
    print("Wrote LDS download links json file to disk.")
# Timestamp end
now = datetime.datetime.now()
print ("London datastore scraper ended:", now.strftime("%Y-%m-%d %H:%M:%S"))
# 4. Download all the files and save to local /dataset/<label> to keep local path
#    aligned with remote path
# NB might need to insert random delays between downloads to mask scraping activity a little
# 5. Open all the excel files, get the column headers and create a name cloud to get things moving

London datastore scraper started: 2018-11-29 22:35:14
Downloaded 80 page links
Error occured parsing file download links
Error occured parsing file download links
Downloaded page link files
Wrote LDS download links json file to disk.
London datastore scraper ended: 2018-11-29 22:46:58


In [192]:
# could we save the list as a json file?
# Build a fresh links list from one listing page, then attach the file
# download links of each dataset under its 'fileurls' key.
# NOTE(review): `i` is not assigned in this cell before its first use;
# presumably it is left over from an earlier cell - confirm
links = list(getLDSDownloadLinks(i))
for i, entry in enumerate(links):
    entry['fileurls'] = getLDSFileDownloadLinks(entry['href'])

In [217]:
# Print the url of every downloadable file attached to every dataset link
# (the dataset name and href are available on each entry if needed)
for i, entry in enumerate(links):
    for link in entry['fileurls']:
        # download .xls
        print(link['fileurl'])

/download/average-house-prices/f01b1cc7-6daa-4256-bd6c-94d8c83ee000/land-registry-house-prices-borough.xls
/download/average-house-prices/b1b0079e-698c-4c0b-b8c7-aa6189590ca4/land-registry-house-prices-borough.csv
/download/average-house-prices/fb8116f5-06f8-42e0-aa6c-b0b1bd69cdba/land-registry-house-prices-ward.xls
/download/average-house-prices/59be940c-ffb8-426d-a833-6146ea77de5c/land-registry-house-prices-ward.csv
/download/average-house-prices/fab83691-9c7e-4e53-97e8-564a010a56ce/land-registry-house-prices-MSOA.xls
/download/average-house-prices/bdf8eee7-41e1-4d24-90ce-93fe5cf040ae/land-registry-house-prices-MSOA.csv
/download/average-house-prices/af75f04c-5a21-4ab8-bf7c-aa57b42a84eb/land-registry-house-prices-LSOA.xls
/download/average-house-prices/9a92fbaf-c04e-498a-9f8c-6c85f280817e/land-registry-house-prices-LSOA.csv
/download/london-reservoir-levels/5ef2b35c-44d1-4255-ae59-5a599bf4642e/london-reservoir-levels.xls
/download/number-bicycle-hires/ac29363e-e0cb-47cc-a97a-e216d900

In [207]:
# Serialise the links list and save it to disk so it can be reloaded later
myjsonfile = jsonifier(links, "links")
# the with-block closes the file even if the write fails
with open('ldslinks.json', 'wb') as f:
    f.write(myjsonfile)
    # message now names the file actually written, matching the main scraper cell
    print("Wrote LDS download links json file to disk.")

Wrote city courses json file to disk.


In [222]:
# Validate links rebuilding from json file
# NOTE: the reload and print steps below are commented out, so the only
# statement this cell executes is f.close().
#import json
#f = open('ldslinks.json', 'r')
#data = json.load(f)
#mylinks = data['links']
# NOTE(review): with the open() above disabled, `f` is presumably the file
# handle left over from an earlier cell - confirm before re-enabling
f.close()
#for i in range (0, len(mylinks)):
    #print(links[i]['name'], links[i]['href'])
#    for link in mylinks[i]['fileurls']:
        # download .xls
#        print(link['fileurl'])