# INM430 Week 10 Practical - Part 1
Scrape London Datastore  
Here we build an inventory of what is available and where  

Part 1 of 3 
1. Get LDS column names
2. Get Health data column names  
3. Cross reference and determine scope  

In [267]:
# helper functions

def getLDSDownloadLinksPageCount():
    # get the number of London Datastore pages we can scrape
    # using the same general format from week 09
    from bs4 import BeautifulSoup
    import urllib.request as ur
    urlToScrape = "https://data.london.gov.uk/dataset"
    r = ur.urlopen(urlToScrape).read()
    soup = BeautifulSoup(r, "lxml")
    # looking for the paging links found near footer
    linkList = soup.find_all('li', attrs={'class': 'dp-search__pagelink'})
    # all being well, the list will look like this (where each line is a list element)
    
    #<li class="dp-search__pagelink dp-search__pagelink--disabled"><span>«</span></li>
    #<li class="dp-search__pagelink dp-search__pagelink--active"><span>1</span></li>
    #<li class="dp-search__pagelink"><a href="/dataset?page=2">2</a></li>
    #<li class="dp-search__pagelink"><a href="/dataset?page=3">3</a></li>
    #<li class="dp-search__pagelink"><a href="/dataset?page=4">4</a></li>
    #<li class="dp-search__pagelink"><span>...</span></li>
    #<li class="dp-search__pagelink"><a href="/dataset?page=78">78</a></li>
    #<li class="dp-search__pagelink"><a href="/dataset?page=2">»</a></li>
    
    # The line we are interested in is the next to last (page 78), number 78 being
    # the text property of the link (a href attribute), which is the 6th element of
    # the linkList list starting from index 0
    try:
        iPagenums = linkList[6].text
    except:
        # string data type for consistency
        iPagenums = "0"

    return int(iPagenums)

def getLDSDownloadLinks(iPagenum):
    # each page number will have a number of links,
    # with a label (name) and a url (href)
    from bs4 import BeautifulSoup
    import urllib.request as ur
    urlToScrape = "https://data.london.gov.uk/dataset?page=" + str(iPagenum)
    r = ur.urlopen(urlToScrape).read()
    soup = BeautifulSoup(r, "lxml")
    # look for h3 headers
    linkList = soup.find_all('h3', attrs={'class': 'dp-searchresult__heading'})
    # our return list
    results = []
    for linkListItem in linkList:
        try:
            linkHeader = linkListItem.find('a', attrs={'class': "dp-searchresult__heading-link"})
            name = linkHeader.text
            href = linkHeader['href']
            ldslinks = {
                "name" : name,
                "href" : href,
            }
        except Exception as e:
            print("Error - no links found")
        results.append(ldslinks)
    return results

def getLDSFileDownloadLinks(href):
    # get the file download links - pdf, xls, etc, decide later what to do
    from bs4 import BeautifulSoup
    import urllib.request as ur
    urlToScrape = "https://data.london.gov.uk" + href
    r = ur.urlopen(urlToScrape).read()
    soup = BeautifulSoup(r, "lxml")
    download_links = soup.find_all('div', attrs={'class': 'dp-resource__indented'})
    results = []
    for download_link in download_links:
        try:
            link = download_link.find('a', attrs={'class': 'dp-resource__format'})
            fileurl = link['href']
            links = {
                "fileurl" : fileurl,
            }
            results.append(links)
        except:
            # TODO add href to error message
            print("Error occured parsing file download links for href =", href)
    # return a list of dictionaries
    return results

def checkPath(path):
    # check if path or file exist
    from pathlib import Path
    retval = False
    # remove leading forward slash
    if(path[0] == '/'):
        path = path[1:]
    p = Path(path)
    if(p.exists() == True):
        retval = True
    return retval

def makeDir(path):
    # create directory if required
    from pathlib import Path
    # remove leading forward slash
    if(path[0] == '/'):
        path = path[1:]
    p = Path(path)
    if(p.exists() == False):
        Path(path).mkdir(parents=True, exist_ok=True)
        
def jsonifier(links, toplevel):
    import json
    jsonified = "{\"" + toplevel + "\":" + json.dumps(links) + "}"
    # to keep return type consistent with xmlifier
    return str.encode(jsonified)

def checkXls(downloadlink):
    isXls = False
    filetype = downloadlink[-4:]
    filetypes = ['.xls', 'xlsx']
    if(filetype in filetypes):
        isXls = True
    return isXls

def downloadFile(url, localPath):
    import urllib.request
    print(url)
    print(localPath)
    makeDir(localPath)
    baseurl = 'https://data.london.gov.uk/'
    linkToFile = baseurl + url[1:]  
    urllib.request.urlretrieve(linkToFile, localDestination)
    # print("urllib.request.urlretrieve(", linkToFile, "," , localPath[1:], ")")

In [224]:
import datetime
# Download London Datastore .xls files
# Timestamp start
now = datetime.datetime.now()
print ("London datastore scraper started:", now.strftime("%Y-%m-%d %H:%M:%S"))
# 1. get the number of pages
iPagenums = getLDSDownloadLinksPageCount()
# initialise our links dictionary
links = []
# 2. Get the links
maxi = 0
for i in range(1, iPagenums + 1):
    maxi = i
    links.extend(getLDSDownloadLinks(i))
print("Downloaded", maxi, "page links")
# 3. Create a new entry in our links dictionary, 
#    consisting of another dictionary with all the available files for download (pdf, xls, etc)
for i in range (0, len(links)):
    links[i]['fileurls'] = getLDSFileDownloadLinks(links[i]['href'])
print("Downloaded page link files")
# 3.5 Save json file to filesystems, store, load tomorrow, rebuild list of dictionaries and carry on
myjsonfile = jsonifier(links, "links")
f = open('lds-links.json', 'wb')
f.write(myjsonfile)
print("Wrote LDS download links json file to disk.")
f.close()
# Timestamp start
now = datetime.datetime.now()
print ("London datastore scraper ended:", now.strftime("%Y-%m-%d %H:%M:%S"))
# 4. Download all the files and save to local /dataset/<label> to keep local path
links = []
links.extend(getLDSDownloadLinks(i))
for i in range (0, len(links)):
    links[i]['fileurls'] = getLDSFileDownloadLinks(links[i]['href'])
    
# 4.5 Save to local file in json format for future reference

myjsonfile = jsonifier(links, "links")
f = open('lds-links.json', 'wb')
f.write(myjsonfile)
print("Wrote LDS download links json file to disk.")
f.close()

#    aligned with remote path
# NB might need to insert random delays between downloads to mask scraping activity a little
# 5. Open all the excel files, get the column headers and create a name cloud to get things moving

London datastore scraper started: 2018-11-29 22:35:14
Downloaded 80 page links
Error occured parsing file download links
Error occured parsing file download links
Downloaded page link files
Wrote LDS download links json file to disk.
London datastore scraper ended: 2018-11-29 22:46:58


In [192]:
# could we save the list as a json file?
links = []
links.extend(getLDSDownloadLinks(i))
for i in range (0, len(links)):
    links[i]['fileurls'] = getLDSFileDownloadLinks(links[i]['href'])

In [207]:
myjsonfile = jsonifier(links, "links")
f = open('lds-links.json', 'wb')
f.write(myjsonfile)
print("Wrote LDS json file to disk.")
f.close()

Wrote city courses json file to disk.


In [39]:
# Validate links rebuilding from json file
import json
f = open('lds-links.json', 'r')
data = json.load(f)
mylinks = data['links']
f.close()
for i in range (0, len(mylinks)):
    # print(mylinks[i]['name'], mylinks[i]['href'])
    # TODO CHECK existence of dictory
    #makeDir(mylinks[i]['href'])
    for link in mylinks[i]['fileurls']:
        # download .xls
        #print(link['fileurl'])
        if(checkXls(link['fileurl'])):
            downloadFile(mylinks[i]['href'], link['fileurl'])
            # print(mylinks[i]['href'])
            # print(link['fileurl'])

/dataset/medium-term-economic-forecast
/download/medium-term-economic-forecast/4d7c2717-e599-4968-9cd5-8e20b87ec001/GLA-london-economic-outlook-2018-11.xls
/dataset/medium-term-economic-forecast
/download/medium-term-economic-forecast/b12edf95-e0fe-4f3f-801f-658a601e97af/GLA-london-economic-outlook-2018-05.xls
/dataset/medium-term-economic-forecast
/download/medium-term-economic-forecast/94836c59-967d-4152-8e06-cd7f21eff23e/GLA-london-economic-outlook-2017-11.xls
/dataset/medium-term-economic-forecast
/download/medium-term-economic-forecast/a45af590-36b1-4651-aa23-835ac5572f1c/GLA-london-economic-outlook-2017-6.xls
/dataset/medium-term-economic-forecast
/download/medium-term-economic-forecast/508c7702-7c13-49ae-9b70-0fec2597a57b/gla-london-economic-outlook-2016-11.xls
/dataset/medium-term-economic-forecast
/download/medium-term-economic-forecast/00197d81-cfb3-417c-8314-65ce5721df1a/gla-london-economic-outlook-2016-5.xls
/dataset/medium-term-economic-forecast
/download/medium-term-econo

In [230]:
# this is the path on website where download link is to be found e.g.
# https://data.london.gov.uk/dataset/diversity-london-report-data
path = '/dataset/diversity-london-report-data'
# note we will use the same path locally to store the downloaded file

# this is the file donwload location e.g.
# https://data.london.gov.uk/download/diversity-london-report-data/66ee75f0-3424-4333-8ebf-d227bc74b562/diversity-in-london-data.xlsx
downloadlink = '/download/diversity-london-report-data/66ee75f0-3424-4333-8ebf-d227bc74b562/diversity-in-london-data.xlsx'
# So we check if the file exists locally e.g.
# /dataset/diversity-london-report-data/66ee75f0-3424-4333-8ebf-d227bc74b562/diversity-in-london-data.xlsx'
# downloadlink = downloadlink.split('/')


In [268]:
# check link pattern has '/' * 5 i.e. split array has 5 elements
if(len(downloadlink.split('/')) == 5):
    # check if local file exists
    localfile = path + '/' + downloadlink.split('/')[4]

(linkToFile, localDestination)

localPath = '/dataset/medium-term-economic-forecast'
url = '/download/medium-term-economic-forecast/ec246c96-7e35-4661-820f-da01ab123d08/gla-london-economic-outlook-2015-11.xls'

downloadFile(url, localPath)

/download/medium-term-economic-forecast/ec246c96-7e35-4661-820f-da01ab123d08/gla-london-economic-outlook-2015-11.xls
/dataset/medium-term-economic-forecast


In [262]:
path[1:]

'dataset/diversity-london-report-data'

In [235]:
localDowloadPath = path + '/' + downloadlink.split('/')[4]
localDowloadPath[1:]

'dataset/diversity-london-report-data/diversity-in-london-data.xlsx'

In [234]:
downloadlink.split('/')[4]

'diversity-in-london-data.xlsx'

# TODO
1. Check existence of path
2. Check existence of file
3. Download and save to path