# INM430 Week 10 Practical - Part 1
Scrape London Datastore  
Here we build an inventory of what is available and where  

Part 1 of 3 
1. Get LDS column names
2. Get Health data column names  
3. Cross reference and determine scope  

In [39]:
# helper functions

def getLDSDownloadLinksPageCount():
    # get the number of London Datastore pages we can scrape
    # using the same general format from week 09
    from bs4 import BeautifulSoup
    import urllib.request as ur
    urlToScrape = "https://data.london.gov.uk/dataset"
    r = ur.urlopen(urlToScrape).read()
    soup = BeautifulSoup(r, "lxml")
    # looking for the paging links found near footer
    linkList = soup.find_all('li', attrs={'class': 'dp-search__pagelink'})
    # all being well, the list will look like this (where each line is a list element)
    
    #<li class="dp-search__pagelink dp-search__pagelink--disabled"><span>«</span></li>
    #<li class="dp-search__pagelink dp-search__pagelink--active"><span>1</span></li>
    #<li class="dp-search__pagelink"><a href="/dataset?page=2">2</a></li>
    #<li class="dp-search__pagelink"><a href="/dataset?page=3">3</a></li>
    #<li class="dp-search__pagelink"><a href="/dataset?page=4">4</a></li>
    #<li class="dp-search__pagelink"><span>...</span></li>
    #<li class="dp-search__pagelink"><a href="/dataset?page=78">78</a></li>
    #<li class="dp-search__pagelink"><a href="/dataset?page=2">»</a></li>
    
    # The line we are interested in is the next to last (page 78), number 78 being
    # the text property of the link (a href attribute), which is the 6th element of
    # the linkList list starting from index 0
    try:
        iPagenums = linkList[6].text
    except:
        # string data type for consistency
        iPagenums = "0"

    return int(iPagenums)

def getLDSDownloadLinks(iPagenum):
    # each page number will have a number of links,
    # with a label (name) and a url (href)
    from bs4 import BeautifulSoup
    import urllib.request as ur
    urlToScrape = "https://data.london.gov.uk/dataset?page=" + str(iPagenum)
    r = ur.urlopen(urlToScrape).read()
    soup = BeautifulSoup(r, "lxml")
    # look for h3 headers
    linkList = soup.find_all('h3', attrs={'class': 'dp-searchresult__heading'})
    # our return list
    results = []
    for linkListItem in linkList:
        try:
            linkHeader = linkListItem.find('a', attrs={'class': "dp-searchresult__heading-link"})
            name = linkHeader.text
            href = linkHeader['href']
            ldslinks = {
                "name" : name,
                "href" : href,
            }
        except Exception as e:
            print("Error - no links found")
        results.append(ldslinks)
    return results

def getLDSFileDownloadLinks(href):
    # get the file download links - pdf, xls, etc, decide later what to do
    from bs4 import BeautifulSoup
    import urllib.request as ur
    urlToScrape = "https://data.london.gov.uk" + href
    r = ur.urlopen(urlToScrape).read()
    soup = BeautifulSoup(r, "lxml")
    download_links = soup.find_all('div', attrs={'class': 'dp-resource__indented'})
    results = []
    for download_link in download_links:
        try:
            link = download_link.find('a', attrs={'class': 'dp-resource__format'})
            fileurl = link['href']
            links = {
                "fileurl" : fileurl,
            }
            results.append(links)
        except:
            # TODO add href to error message
            print("Error occured parsing file download links for href =", href)
    # return a list of dictionaries
    return results

def checkPath(path):
    # check if path or file exist
    from pathlib import Path
    retval = False
    # remove leading forward slash
    if(path[0] == '/'):
        path = path[1:]
    p = Path(path)
    if(p.exists() == True):
        retval = True
    return retval

def makeDir(path):
    # create directory if required
    from pathlib import Path
    # remove leading forward slash
    if(path[0] == '/'):
        path = path[1:]
    p = Path(path)
    if(p.exists() == False):
        Path(path).mkdir(parents=True, exist_ok=True)
        
def jsonifier(links, toplevel):
    import json
    jsonified = "{\"" + toplevel + "\":" + json.dumps(links) + "}"
    # to keep return type consistent with xmlifier
    return str.encode(jsonified)

def checkXls(downloadlink):
    isXls = False
    filetype = downloadlink[-4:]
    filetypes = ['.xls', 'xlsx']
    if(filetype in filetypes):
        isXls = True
    return isXls

def downloadFile(linkspath, downloadpath):
    # expected values
    # linkspath ~ /dataset/migration-indicators
    # downloadpath ~ /download/migration-indicators/0db19902-5013-42af-972d-0e5481d7ac44/Long%20term%20international%20migration.xlsx
    import urllib.request
    makeDir(linkspath)
    baseurl = 'https://data.london.gov.uk/'
    # file donwload link fdlink will look like
    # https://data.london.gov.uk/download/migration-indicators/0db19902-5013-42af-972d-0e5481d7ac44/Long%20term%20international%20migration.xlsx
    fdlink = baseurl + downloadpath[1:]  
    localfile = linkspath[1:] + '/' + downloadpath.split('/')[4]
    #print(fdlink)
    #print(localfile)
    urllib.request.urlretrieve(fdlink, localfile)
    
def getDownloadLinks():
    # get the number of pages
    iPagenums = getLDSDownloadLinksPageCount()
    # initialise our links dictionary
    links = []
    # get the links to pages containling download file links
    maxi = 0
    for i in range(1, iPagenums + 1):
        maxi = i
        links.extend(getLDSDownloadLinks(i))
    print("Scraped", maxi, "page links")
    # 3. Create a new entry in our links dictionary, 
    #    consisting of another dictionary with all the available files for download (pdf, xls, etc)
    for i in range (0, len(links)):
        links[i]['fileurls'] = getLDSFileDownloadLinks(links[i]['href'])
    print("Scraped file download links")
    return links

def saveDownloadLinksToJSON(links):
    myjsonfile = jsonifier(links, "links")
    f = open('lds-links.json', 'wb')
    f.write(myjsonfile)
    print("Wrote LDS download links json lds-links.json file to disk.")
    f.close()

In [4]:
import datetime
# Download London Datastore .xls(x) files
# Timestamp start
now = datetime.datetime.now()
print ("London datastore download scraper started:", now.strftime("%Y-%m-%d %H:%M:%S"))

# get file download links
links = getDownloadLinks()

# save json file for future reference
saveDownloadLinksToJSON(links)

# file download count
k = 0
# iterate through list of dictionaries
for i in range (0, len(links)):
    # and check the download links dictionary within dictionary
    for link in links[i]['fileurls']:
        if(checkXls(link['fileurl'])):
            k = k + 1
            downloadFile(links[i]['href'], link['fileurl'])

print("Downloaded", k, ".xls(x) files")

# Timestamp end download link scraping
now = datetime.datetime.now()
print ("London datastore download scraper scraper ended:", now.strftime("%Y-%m-%d %H:%M:%S"))            

London datastore download scraper started: 2018-12-04 16:18:54
Downloaded 80 page links
Error occured parsing file download links for href = /dataset/curio-canopy-cover-geodatabase
Error occured parsing file download links for href = /dataset/addressbase-plus-for-contractors
Downloaded page link files
Wrote LDS download links json lds-links.json file to disk.
/dataset/migration-indicators
/download/migration-indicators/0db19902-5013-42af-972d-0e5481d7ac44/Long%20term%20international%20migration.xlsx


PermissionError: [Errno 13] Permission denied: 'download/migration-indicators/0db19902-5013-42af-972d-0e5481d7ac44/Long%20term%20international%20migration.xlsx'

In [40]:
#def downloadFile2(url, localPath):
def downloadFile2(linkspath, downloadpath):
    # expected values
    # linkspath ~ /dataset/migration-indicators
    # downloadpath ~ /download/migration-indicators/0db19902-5013-42af-972d-0e5481d7ac44/Long%20term%20international%20migration.xlsx
    import urllib.request
    makeDir(linkspath)
    baseurl = 'https://data.london.gov.uk/'
    # file donwload link fdlink will look like
    # https://data.london.gov.uk/download/migration-indicators/0db19902-5013-42af-972d-0e5481d7ac44/Long%20term%20international%20migration.xlsx
    fdlink = baseurl + downloadpath[1:]  
    # local file path and name localfile will look like
    # dataset/migration-indicators/Long%20term%20international%20migration.xlsx
    localfile = linkspath[1:] + '/' + downloadpath.split('/')[4]
    #print(fdlink)
    #print(localfile)
    urllib.request.urlretrieve(fdlink, localfile)
    
# iterate through list of dictionaries
# for i in range (0, len(links)):
for i in range (0, 1):
    # and check the download links dictionary within dictionary
    for link in links[i]['fileurls']:
        if(checkXls(link['fileurl'])):
            downloadFile(links[i]['href'], link['fileurl'])

In [192]:
# as an aside, links could be rebuilt from json file
# download excel files
# Validate links rebuilding from json file
# import json
# f = open('lds-links.json', 'r')
# data = json.load(f)
# links = data['links']
# f.close()

# Extras, sanity checks

In [14]:
# this is the path on website where download link is to be found e.g.
# https://data.london.gov.uk/dataset/diversity-london-report-data
path = '/dataset/diversity-london-report-data'
# note we will use the same path locally to store the downloaded file

# this is the file donwload location e.g.
# https://data.london.gov.uk/download/diversity-london-report-data/66ee75f0-3424-4333-8ebf-d227bc74b562/diversity-in-london-data.xlsx
downloadlink = '/download/diversity-london-report-data/66ee75f0-3424-4333-8ebf-d227bc74b562/diversity-in-london-data.xlsx'
# So we check if the file exists locally e.g.
# /dataset/diversity-london-report-data/66ee75f0-3424-4333-8ebf-d227bc74b562/diversity-in-london-data.xlsx'
# downloadlink = downloadlink.split('/')
downloadFile(url, localPath)

NameError: name 'url' is not defined

In [16]:
# another sanity check
localPath = '/dataset/medium-term-economic-forecast'
url = '/download/medium-term-economic-forecast/ec246c96-7e35-4661-820f-da01ab123d08/gla-london-economic-outlook-2015-11.xls'
downloadFile2(url, localPath)

PermissionError: [Errno 13] Permission denied: 'download/medium-term-economic-forecast/ec246c96-7e35-4661-820f-da01ab123d08/gla-london-economic-outlook-2015-11.xls'

In [262]:
path[1:]

'dataset/diversity-london-report-data'

In [235]:
localDowloadPath = path + '/' + downloadlink.split('/')[4]
localDowloadPath[1:]

'dataset/diversity-london-report-data/diversity-in-london-data.xlsx'

In [234]:
downloadlink.split('/')[4]

'diversity-in-london-data.xlsx'