In [2]:
import os
import tqdm
from tqdm import trange, tqdm_notebook
import pandas as pd
import csv
import urllib3
import requests
import urllib.request
import numpy as np
import time
from bs4 import BeautifulSoup
import re
from IPython.display import Markdown, display
import urllib3
import boto3
import s3fs
from urllib3 import PoolManager
from tqdm import tqdm_notebook as tqdm
import ipywidgets as widgets
from ipywidgets import TwoByTwoLayout
from ipywidgets import interact, interactive, fixed, interact_manual
from ipywidgets import VBox, HBox, Label, Box
from ipywidgets import Button, Layout
import shutil
from os import path

urllib3.disable_warnings()
manager = PoolManager(10)
def printmd(string):
    display(Markdown(string))

#Set the display max output
pd.options.display.max_colwidth = 100

#Create connection to AWS S3
s3_client = boto3.client('s3')

#Set the base url
url_base = "https://data.london.gov.uk/dataset"

#Beautiful soup part
response = requests.get(url_base)
soup_main = BeautifulSoup(response.text, "html.parser")
dataframe_website_topic = pd.DataFrame(columns=['Topic', 'Topic_Count', 'Topic_href'])
dataframe_website_format = pd.DataFrame(columns=['Format', 'Format_Count', 'Format_href'])

# Method to get the available values and frequencies for the requested parameter.

Method parses the given URL page and stores all the values together with their count.

In [3]:
def get_subject_count(req_subject_url):
    count_array = []
    subjects_array = []
    href_array = []
    for subject in req_subject_url:
        string = (str(subject.find("span", {"class":"dp-facet__count"}).text))
        value = re.findall(r'\((.*?)\)', string)[0]
        count_array.append(int(value))
        subject_string = subject.find(text=True)
        subject_string_start = re.search(r'\w', subject_string)
        subject_string_end = subject_string.rfind('\n')
        subject_string = subject_string[int(str(subject_string_start.start(0))): int(str(subject_string_end))]
        subjects_array.append(subject_string)
        href_array.append(subject['href'][subject['href'].find('=') + 1:])
    return subjects_array, count_array, href_array

# Get all available topics with their count from the website

Find all the tags that correspond to a topic available on the website using BeatufiulSoup methods "find" and "find_all".  

In [4]:
url_topics = soup_main.find("div", {"class":"dp-facet dp-facet--topics"})
url_topics = url_topics.find_all("a", {"class": ["dp-facet__link--hasimg dp-facet__link dp-facet__link--facet-transition", "dp-facet__link--hasimg dp-facet__link dp-facet__link--facet-before-transition dp-facet__link--facet-transition"]},recursive=False)

## Store the topics with their count

Create a DataFrame to store the topics and their count.

In [5]:
dataframe_website_topic['Topic'], dataframe_website_topic['Topic_Count'], dataframe_website_topic['Topic_href'] = get_subject_count(url_topics)                   
printmd("## Available topics on the website data.london.gov.uk:\n")
dataframe_website_topic

## Available topics on the website data.london.gov.uk:


Unnamed: 0,Topic,Topic_Count,Topic_href
0,Demographics,165,248ec7c0-025e-4b5a-925d-0fccdd1f9e96
1,Transparency,147,1d5852ed-0315-4472-927a-3d1bdaa4f630
2,Environment,138,fb70a4e6-311c-41c1-8429-3fe27ebc928a
3,Employment and Skills,136,50f66ade-4ef9-4814-b2cb-94e5b316d7f6
4,Business and Economy,108,737cc837-707b-42a9-8f58-c70f35a8eb28
5,Housing,106,67b1cea4-806d-4b63-90d7-155cf3ac3c03
6,Planning,104,b781eefa-ff44-44b8-be63-fc3acd37547c
7,Health,94,4199df0d-d454-4373-b710-aeed29098a59
8,Transport,89,b35ef9b1-8875-4f7b-8aca-8373cff77d17
9,Education,69,2c4d2275-67a6-401b-89ca-4ed62556b901


# Get all available format on the website

Find all the tags that correspond to a topic available on the website using BeatufiulSoup methods "find" and "find_all".  

In [6]:
url_format = soup_main.find("div", {"class":"dp-facet dp-facet--format"})
url_format = url_format.find_all("a", {"class": ["dp-facet__link--hasimg dp-facet__link dp-facet__link--facet-transition", "dp-facet__link--hasimg dp-facet__link dp-facet__link--facet-before-transition dp-facet__link--facet-transition"]},recursive=False)

## Store the formats with their count

In [7]:
dataframe_website_format['Format'], dataframe_website_format['Format_Count'], dataframe_website_format['Format_href'] = get_subject_count(url_format)
printmd("## Available formats on the website data.london.gov.uk:\n")
dataframe_website_format

## Available formats on the website data.london.gov.uk:


Unnamed: 0,Format,Format_Count,Format_href
0,Spreadsheet,564,spreadsheet
1,CSV File,207,csv
2,PDF File,196,pdf
3,Website,122,html
4,ZIP File,78,zip
5,API Endpoint,24,api
6,GeoPackage,24,geopackage
7,Shapefile,24,shp
8,XML File,24,xml
9,Image,12,image


# Example request URL creation using input from the DataFrame

Creating a URL request to retrieve all resources that a using the DataFrames created previously.

In [12]:
# Get files from multiple topics/formats
url_file = url_base + "?topics=" + dataframe_website_topic[dataframe_website_topic['Topic']=='Demographics']['Topic_href']
print(url_file)

0    https://data.london.gov.uk/dataset?topics=248ec7c0-025e-4b5a-925d-0fccdd1f9e96
Name: Topic_href, dtype: object


# Create URL request using the provided method and widgets

Method to create a request url with the given topics/formats/search query. Select the parameters for your search query.

In [10]:
def create_url_request(req_topics, req_formats, req_search_query):
    url_request = url_base + "?"
    if req_topics != ():
        for topic in req_topics:
            index_topic = dataframe_website_topic.index[dataframe_website_topic['Topic'] == topic].tolist()[0]
            url_request = url_request + "topics=" + dataframe_website_topic.iloc[index_topic]['Topic_href'] + "&"
    if req_formats != ():
        for format in req_formats:
            try:
                index_format = dataframe_website_format.index[dataframe_website_format['Format'] == format].tolist()[0]
                url_request = url_request + "format=" + dataframe_website_format.iloc[index_format]['Format_href'] + "&"
            except IndexError:
                print("Please enter a valid format!")
                return 0
    if req_search_query != ():
        url_request = url_request + "q="
        for query_term in req_search_query:
            url_request = url_request + query_term + "%20"
    else:
        url_request = url_request[:-1]
    return url_request

In [14]:
widget_topic = widgets.SelectMultiple(
    options=dataframe_website_topic['Topic'],
    rows=10,
    description='Topics:',
    disabled=False
)

widget_format = widgets.SelectMultiple(
    options=dataframe_website_format['Format'],
    rows=10,
    description='Formats:',
    disabled=False
)

widget_search_query = widgets.Textarea(
    value='',
    placeholder='Input your search query',
    description='Search:',
    disabled=False
)

topics_arr = list()
format_arr = list()
search_q = list()
def topics(x):
    topics_arr.append(x)
def formats(x):
    format_arr.append(x)
def search_query(x):
    search_q.append(x)

    
w1 = interactive(topics,  x=widget_topic)
w2 = interactive(formats, x=widget_format)
w3 = interactive(search_query, x=widget_search_query)
printmd("## Create a search query:")
HBox([w1, w2, w3])


## Create a search query:

HBox(children=(interactive(children=(SelectMultiple(description='Topics:', options=('Demographics', 'Transpare…

## Generate the search query link

In [17]:
# Test the method
try:
    topics_elements = topics_arr[len(topics_arr) - 1] 
    format_elements= format_arr[len(format_arr) - 1] 
    search_elements = search_q[len(search_q) - 1]
except IndexError:
    pass
url_request = create_url_request(req_topics = topics_elements, req_formats= format_elements, req_search_query = search_elements.split())

## Print the request URL

In [18]:
printmd("### The request URL: " + url_request)

### The request URL: https://data.london.gov.uk/dataset?topics=67b1cea4-806d-4b63-90d7-155cf3ac3c03&q=

# Retrieve the number of pages for the generated search query

# Method to get the number of pages

In [22]:
def get_number_pages_result(req_url):
    response = requests.get(req_url)
    req_soup_main = BeautifulSoup(response.text, "html.parser")
    pages_links = req_soup_main.find_all("li", {"class": "dp-search__pagelink"})
    has_results = req_soup_main.find("div", {"class" : "dp-search__no-datasets"})
    if has_results != None:
        return 0
    else:
        pages = [val.string for val in pages_links]
        number_pages = pages[len(pages) - 2]
        if number_pages == 0:
            return number_pages + 1
        else:
            return number_pages

# Test the method

In [23]:
#Test the method
printmd("### The request url is: " + url_request)
number_pages = int(get_number_pages_result(url_request))
if number_pages == 0:
    printmd("### No results found.")
else:
    printmd("### The number of pages for the request is: " + str(number_pages))

### The request url is: https://data.london.gov.uk/dataset?topics=67b1cea4-806d-4b63-90d7-155cf3ac3c03&q=

### The number of pages for the request is: 11

# Get all available file formats for a resource 

The input for this method can be a resource URL(e.g. https://data.london.gov.uk/dataset/approved-food-establishments-)

In [305]:
def get_all_available_file_formats(req_url_resource):
    response = requests.get(req_url_resource)
    current_soup_main = BeautifulSoup(response.text, "html.parser")
    files_available = []
    resource_link = current_soup_main.find_all("a",{"class":"dp-resource__format"})
    for val in resource_link:
        try:
            files_available.append(val['href'][val['href'].rfind('.') + 1:])
        except:
            pass
    if(files_available == []):
        return pd.Series()
    files_available = pd.Series(files_available)
    return files_available.value_counts()

widget_format_desc = widgets.Textarea(
    value= '',
    placeholder='Input your resource link',
    description='Get formats:',
    disabled=False,
    layout=Layout(width='500px', height='90%')
)
formats_link = list()
def formats_desc(x):
    formats_link.append(x)

w5 = interactive(formats_desc, x=widget_format_desc)
HBox([w5])

HBox(children=(interactive(children=(Textarea(value='', description='Get formats:', layout=Layout(height='90%'…

## Test the method

In [25]:
link = formats_link[len(formats_link) - 1]
if link != "":
    printmd("### Available file format for the resource:")
    print(get_all_available_file_formats(link))
else:
    printmd("### Please insert a valid link.")

### Please insert a valid link.

# Get resource description

The input for this method can be a resource URL(e.g. https://data.london.gov.uk/dataset/approved-food-establishments-)

In [26]:
def get_resource_description(req_url_resource):
    response = requests.get(req_url_resource)
    current_soup_main = BeautifulSoup(response.text, "html.parser")
    description = current_soup_main.find("div",{"class":"dp-dataset__description content"}).text
    return description

widget_resource_desc = widgets.Textarea(
    value= '',
    placeholder='Input your resource link',
    description='Get desc.:',
    disabled=False,
    layout=Layout(width='500px', height='90%')
)
resource_link = list()
def resource_desc(x):
    resource_link.append(x)

w4 = interactive(resource_desc, x=widget_resource_desc)
HBox([w4])
#"https://data.london.gov.uk/dataset/annual-london-survey-2014"

HBox(children=(interactive(children=(Textarea(value='', description='Get desc.:', layout=Layout(height='90%', …

# Test the method

In [28]:
try:
    if resource_link[len(resource_link) - 1] != "":
        printmd("### Description of the resource:")
    print(get_resource_description(resource_link[len(resource_link) - 1]))
except:
    printmd("### Please enter a valid link.")

### Please enter a valid link.

# Get all resources for a search

def get_request_results(req_url_request):
    request_response = requests.get(req_url_request)
    request_soup_main = BeautifulSoup(request_response.text, "html.parser")
    #Get topics/count from the website
    request_url_result = request_soup_main.find("ul", {"class":"dp-search__results"})
    request_url_result = request_url_result.find_all("div", {"class":"dp-searchresult__content"});
    if request_url_result == []:
        printmd("**No results for your seach query.**")
        return pd.DataFrame(columns=['Title', 'Last_Modified', 'Description', 'Formats_Available', 'Href']), req_url_request
    else:
        dataframe_request_result = pd.DataFrame(columns=['Title', 'Last_Modified', 'Description', 'Formats_Available', 'Href'])
        number_pages = get_number_pages_result(req_url_request)
        if number_pages == 0:
            number_pages = 1
        printmd("## Number of pages for the query: " +str(number_pages))
        if(int(number_pages) >= 1):
            req_url_request = req_url_request + "&page=" 
            for index_page in tqdm(range(1, int(number_pages) + 1), desc = "Pages"):
                time.sleep(1)
                req_url_request = req_url_request[:req_url_request.rfind('=') + 1] + str(index_page)
                request_response = requests.get(req_url_request)
                request_soup_main = BeautifulSoup(request_response.text, "html.parser")
                request_url_result = request_soup_main.find("ul", {"class":"dp-search__results"})
                for result in request_url_result:
                    #Href
                    href = result.find("a", {"class":"dp-searchresult__heading-link"})['href']
                    #Title
                    title = result.find("a", {"class":"dp-searchresult__heading-link"}).text
                    #Last modified
                    last_modified = result.find("div", {"class":"dp-searchresult__modified"}).text
                    #Description
                    description = get_resource_description("https://data.london.gov.uk" + href)
                    #Published by
                    published_by = result.find("div", {"class":"dp-searchresult__publishedby"}).text
                    publishedby_start = re.search(r'\w', published_by)
                    publishedby_end = published_by.rfind('\n')
                    published_by = published_by[int(str(publishedby_start.start(0))): int(str(publishedby_end))]
                    #Available formats
                    url_files = "https://data.london.gov.uk" + href
                    formats_available = get_all_available_file_formats(url_files)
                    #Add to dataframe
                    dataframe_request_result = dataframe_request_result.append({'Title' : title, 'Last_Modified': last_modified, 'Description' : description, 'Published_By' : published_by, "Href" : href, "Formats_Available" : formats_available}, ignore_index = True)        
            #url_request = url_request[:url_request.rfind('=') + 1]
            req_url_request = req_url_request[:req_url_request.rfind("&")]
            return dataframe_request_result, req_url_request


# Store the results in a DataFrame
df_result, url_request = get_request_results(url_request)

In [430]:
def get_request_results(req_url_request):
    request_response = requests.get(req_url_request)
    request_soup_main = BeautifulSoup(request_response.text, "html.parser")
    #Get topics/count from the website
    request_url_result = request_soup_main.find("ul", {"class":"dp-search__results"})
    request_url_result = request_url_result.find_all("div", {"class":"dp-searchresult__content"});
    dataframe_request_result = pd.DataFrame(columns=['Title', 'Last_Modified', 'Description', 'Href', 'Published_By'])
    dataframe_request_formats = pd.DataFrame(columns=[req_format for req_format in dataframe_website_format['Format_href']])
    dataframe_request_result = pd.concat([dataframe_request_result, dataframe_request_formats],sort=False)
    if request_url_result == []:
        printmd("**No results for your seach query.**")
        return dataframe_request_result, req_url_request
    else:
        number_pages = get_number_pages_result(req_url_request)
        if number_pages == 0:
            number_pages = 1
        printmd("## Number of pages for the query: " +str(number_pages))
        if(int(number_pages) >= 1):
            req_url_request = req_url_request + "&page=" 
            for index_page in tqdm(range(1, int(number_pages) + 1), desc = "Pages"):
                time.sleep(1)
                req_url_request = req_url_request[:req_url_request.rfind('=') + 1] + str(index_page)
                request_response = requests.get(req_url_request)
                request_soup_main = BeautifulSoup(request_response.text, "html.parser")
                request_url_result = request_soup_main.find("ul", {"class":"dp-search__results"})
                for result in request_url_result:
                    #Href
                    href = result.find("a", {"class":"dp-searchresult__heading-link"})['href']
                    
                    #Title
                    title = result.find("a", {"class":"dp-searchresult__heading-link"}).text
                    
                    #Last modified
                    last_modified = result.find("div", {"class":"dp-searchresult__modified"}).text
                    
                    #Description
                    description = get_resource_description("https://data.london.gov.uk" + href)
                    
                    #Published by
                    published_by = result.find("div", {"class":"dp-searchresult__publishedby"}).text
                    publishedby_start = re.search(r'\w', published_by)
                    publishedby_end = published_by.rfind('\n')
                    published_by = published_by[int(str(publishedby_start.start(0))): int(str(publishedby_end))]
                    
                    #Available formats
                    url_files = "https://data.london.gov.uk" + href
                    formats_available = get_all_available_file_formats(url_files)
                    
                    #Add to dataframe
                    info_series = pd.Series({'Title' : title, 'Last_Modified': last_modified, 'Description' : description, 'Published_By' : published_by, "Href" : href})
                    info_series = pd.concat([info_series, formats_available])
                    dataframe_request_result = dataframe_request_result.append(info_series, ignore_index = True)        
            
            #url_request = url_request[:url_request.rfind('=') + 1]
            req_url_request = req_url_request[:req_url_request.rfind("&")]
            return dataframe_request_result, req_url_request


# Store the results in a DataFrame
df_result, url_request = get_request_results(url_request)
df_result = df_result.fillna(0)
column_names = df_result.columns[1:5].tolist()
column_names.append("Formats")


## Number of pages for the query: 11

HBox(children=(IntProgress(value=0, description='Pages', max=11, style=ProgressStyle(description_width='initia…




## Print the resource results for the query

In [493]:
if df_result.empty == True:
    printmd("## No results.")
else:
    printmd("## The resources available matching your query:")
    display(df_result)

## The resources available matching your query:

Unnamed: 0,Title,Last_Modified,Description,Href,Published_By,spreadsheet,csv,pdf,html,zip,...,uk/gla/rest/services/apps/planning_data_map_03/MapServer/208,uk/gla/rest/services/apps/planning_data_map_03/MapServer/209,uk/gla/rest/services/apps/planning_data_map_03/MapServer/201,uk/gla/rest/services/apps/planning_data_map_03/MapServer/107,uk/gla/rest/services/apps/planning_data_map_03/MapServer,uk/priorities/housing-land/land-assets/land-and-property-database,jpg,png,uk,asp
0,Planning permissions on the London Development Database (LDD),Updated 9 days ago,The London Development Database (LDD) records significant planning permissions in London.\nThe d...,/dataset/planning-permissions-on-the-london-development-database--ldd-,Greater London Authority (GLA),0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,London's Economy Today,Updated 13 days ago,"The most up-to-date information on London's economy, published by email every month. Each issue ...",/dataset/london-economy-today,Greater London Authority (GLA),0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Indices of Deprivation,Updated 14 days ago,\n This page contains all the English Indices of Deprivation (ID) data for London at LSOA and bo...,/dataset/indices-of-deprivation,"Ministry of Housing, Communities & Local Government (MHCLG)",0,1,2,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,GLA Grants data,Updated 14 days ago,The GLA Grants Dataset contains data relating to grants which have been awarded by the GLA since...,/dataset/gla-grants-data,Greater London Authority (GLA),0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Lending by Postcode Sector,Updated 15 days ago,"Value of outstanding residential mortgage lending by postcode sector for 9,273 Great Britain pos...",/dataset/lending-by-postcode-sector,Council of Mortgage Lenders,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101,Census 2001 Key Statistics 20: Household Composition,Updated 6 years ago,Census Key Statistics Table KS20: Household composition\n\nA dependent child is a person in a ho...,/dataset/census-2001-key-statistics-20-household-composition,Office for National Statistics (ONS),0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
102,"Solid wall and off gas network properties, (LSOA)",Created 6 years ago,Number and percentage of properties with solid walls and those not connected to gas mains at Low...,/dataset/solid-wall-and-off-gas-network-properties-lsoa,Greater London Authority (GLA),0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
103,"Dwelling Stock by Tenure and Condition, Borough",Updated 6 years ago,"Number and percentage of dwellings by tenure, and type of dwelling, including condition.\n\nDown...",/dataset/dwelling-stock-tenure-and-condition-borough,Office for National Statistics (ONS),0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
104,European Quality of Life Survey,Updated 6 years ago,The European Quality of Life survey (EQLS) examines both the objective circumstances of European...,/dataset/european-quality-life-survey,Greater London Authority (GLA),0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# See details of a specific resource

In [597]:
def get_available_formats(req_df, req_resource):
    resource_formats = req_df[req_df['Title'] == req_resource].iloc[:, 5:].iloc[0]
    available_formats_resource = np.where(resource_formats > 0)[0] + 5
    resource_formats = req_df[req_df['Title'] == req_resource].iloc[:, available_formats_resource].squeeze(axis = 0)
    return resource_formats
    
    
def retrieve_information(req_df, req_resource, req_information):
    result = pd.DataFrame()
    if req_information != ():
        for current_resource_name in req_resource:
            current_resource_info = pd.Series()
            current_resource_info['Title'] = current_resource_name
            for info in req_information:
                    if info != 'Formats':
                        current_resource_info[info] = req_df[req_df['Title'] == current_resource_name][info].iloc[0]
                    else:
                        resource_formats = get_available_formats(req_df, current_resource_name)
                        current_resource_info = pd.concat([current_resource_info, resource_formats])
            result = result.append(current_resource_info, ignore_index=True)
    else:
        for current_resource_name in req_resource:
            current_resource_info = req_df[req_df['Title'] == current_resource_name].iloc[:, :5].iloc[0]    
            resource_formats = get_available_formats(req_df, current_resource_name)
            current_resource_info = pd.concat([current_resource_info, resource_formats])
            result = result.append(current_resource_info, ignore_index=True)
    return result.fillna(0)

In [454]:
widget_select_resource = widgets.SelectMultiple(
    options=df_result['Title'],
    rows=15,
    description='Resource:',
    disabled=False
)


widget_select_information = widgets.SelectMultiple(
    options=column_names,
    rows=6,
    description='Information:',
    disabled=False
)

resources_arr = list()
information_arr = list()

def resources(x):
    resources_arr.append(x)
def information(x):
    information_arr.append(x)
    
w1 = interactive(resources, x=widget_select_resource)
w2 = interactive(information, x=widget_select_information)
printmd("## Select resource and information required")
HBox([w1, w2])

## Select resource and information required

HBox(children=(interactive(children=(SelectMultiple(description='Resource:', options=('Planning permissions on…

In [607]:
# Test the method
try:
    resources_elements = resources_arr[len(resources_arr) - 1] 
    information_elements= information_arr[len(information_arr) - 1] 
except IndexError:
    pass
retrieve_information(df_result, resources_elements, information_elements)

Unnamed: 0,Description,Href,Last_Modified,Published_By,Title,xlsx
0,The London Development Database (LDD) records significant planning permissions in London.\nThe d...,/dataset/planning-permissions-on-the-london-development-database--ldd-,Updated 9 days ago,Greater London Authority (GLA),Planning permissions on the London Development Database (LDD),5.0


# Select which file/format to download. Select where to download the resource

In [606]:
# def get_available_formats_result(req_series):
#     result = set()
#     for val in req_series:
#         try:
#             a = val.index.tolist()
#             for val2 in a:
#                 result.add(val2)
#         except AttributeError:
#             pass
#     return result
# print(get_available_formats_result(df_result['Formats']))

In [605]:
def get_available_formats_result(req_series):
    result = set()
    for value_1 in req_series:
        try:
            val_format = value_1.index.tolist()
            for value_2 in val_format:
                result.add(value_2)
        except AttributeError:
            pass
    return result

widget_file = widgets.SelectMultiple(
    options = df_result['Title'],
    rows = 10,
    description = 'Topics',
    disabled = False,
    layout=Layout(width='90%', height='90%')
)

widget_file_format = widgets.SelectMultiple(
    options = df_result.columns[5:],
    rows = 10,
    description = 'Formats',
    disabled = False,
    layout=Layout(width='90%', height='100%')
)

widget_local = widgets.Checkbox(
    value=False,
    description='Local storage',
    disabled=False,
    indent=False
)

widget_s3 = widgets.Checkbox(
    value=False,
    description='S3 Storage',
    disabled=False,
    indent=False
)


file_arr = list()
file_format_arr = list()
def files(x):
    file_arr.append(x)
def files_formats(x):
    file_format_arr.append(x)
def store_local(x):
    pass
def store_s3(x):
    pass

    
w2_1= interactive(files,  x=widget_file)
w2_2 = interactive(files_formats, x=widget_file_format)
w2_3 = interactive(store_local, x=widget_local)
w2_4 = interactive(store_s3, x=widget_s3)
form_item_layout = Layout(
    display='inline-flex',
    flex_flow='row',
    justify_content='space-between'
)

form_items = [w2_1, w2_2, w2_3,w2_4]
if df_result.empty == False:
    form = Box(form_items, layout=Layout(
        display='flex',
        flex_flow='column',
        align_items='stretch',
        width='90%'
    ))
    display(form)
else:
    printmd("**No results for the search query.**")

Box(children=(interactive(children=(SelectMultiple(description='Topics', layout=Layout(height='90%', width='90…

## Get the selected files/formats

In [621]:
try:
    files_download = file_arr[len(file_arr) - 1] 
    files_format_download= file_format_arr[len(file_format_arr) - 1] 
except IndexError:
    pass
except NameError:
    pass

In [622]:
#Parameters = req_request_page -> url with the searchj/ req_files -> files to be downloaded
def download_items(req_request_page_url, req_files = (), req_format = ()):
    number_pages = int(get_number_pages_result(req_request_page_url))
    if number_pages == 0:
        number_pages = 1
    try:
        if path.exists("../London_DataStore") != True:
            os.mkdir("../London_DataStore")
        else:
            pass
    except OSError:
        print("Cannot create folder.")
        time.sleep(1)
        return 0
    for index_page in tqdm(range(1, number_pages + 1), desc = "Pages", leave = False):
        current_url = req_request_page_url + "&page=" + str(index_page)
        response = requests.get(current_url)
        current_soup_main = BeautifulSoup(response.text, "html.parser")
        search_results = current_soup_main.find_all("a", {"class": "dp-searchresult__heading-link"})
        search_results_headings = list(map(lambda x: x.text, search_results))
        if req_files != None:
            search_results_found = list(map(lambda x: True if x in req_files else False, search_results_headings))
            indexes_documents = np.where(np.array(search_results_found) == True)[0]
        else:
            indexes_documents = np.arange(len(search_results_headings))
        for current_file_found in tqdm_notebook(indexes_documents, desc = "Resources", leave = False):
            url_file = "https://data.london.gov.uk" + search_results[current_file_found]['href']
            response = requests.get(url_file)
            title_format = search_results_headings[current_file_found].replace("/", "_")
            time.sleep(1)
            current_soup_main = BeautifulSoup(response.text, "html.parser")
            search_results_buttons = current_soup_main.find_all("a", {"class": "dp-resource__button"})
            if  req_format != ():
                search_results_formats_found = list(map(lambda x: True if x['href'][x['href'].rfind(".") + 1:] in req_format else False, search_results_buttons))
                index_files = np.where(np.array(search_results_formats_found) == True)[0]
                if len(index_files) != 0:
                    for current_index in tqdm_notebook(index_files, desc = "Files", leave = False):
                        filedata = manager.urlopen('GET', url = "https://data.london.gov.uk" + search_results_buttons[current_index]['href'], preload_content=False)
                        datatowrite = filedata.read()
                        title_file = search_results_buttons[current_index]['href'][search_results_buttons[current_index]['href'].rfind("/"):]
                        #print(title_file)
                        #print("https://data.london.gov.uk" + search_results_buttons[current_index]['href'])
                        try:
                            os.mkdir("../London_DataStore/" + title_format)
                        except OSError:
                            time.sleep(1)
                        try:
                            with open("../London_DataStore/" + title_format + title_file, 'wb') as f:
                                f.write(datatowrite)              
                            if widget_s3.value == True:   
                                s3_client.upload_file("../London_DataStore/" + title_format + title_file, "uom.bioinformatics", "Demo/" + title_format + title_file) 
                            if widget_local.value != True:
                                shutil.rmtree("../London_DataStore/" + title_format + title_file)
                            print("File downloaded: " + title_file + " from " + title_format)
                        except:
                            pass
            else:
                for current_file in tqdm_notebook(search_results_buttons, desc = "Files", leave = False):
                    try:
                        filedata = manager.urlopen('GET', url = "https://data.london.gov.uk" + current_file['href'], preload_content=False)
                        datatowrite = filedata.read()
                        title_file = current_file['href'][current_file['href'].rfind("/"):]
                        try:
                            os.mkdir("../London_DataStore/" + title_format)
                        except OSError:
                            time.sleep(1)
                        try:
                            with open("../London_DataStore/" + title_format + title_file, 'wb') as f:
                                f.write(datatowrite)
                            if widget_s3.value == True:   
                                s3_client.upload_file("../London_DataStore/" + title_format + title_file, "uom.bioinformatics", "Demo/" + title_format + title_file)
                            if widget_local.value != True:
                                shutil.rmtree("../London_DataStore/" + title_format + title_file)
                            print("File downloaded: " + title_file + " from " + title_format)
                        except:
                            pass
                    except:
                        pass
    if widget_local.value != True:           
        shutil.rmtree("../London_DataStore")


In [None]:
if widget_local.value == False and widget_s3.value == False:
    printmd("**Please select a location to download the files.**")
else:
    download_items(url_request, req_files = files_download, req_format = files_format_download)
    printmd("**Finished downloading the files**")

HBox(children=(IntProgress(value=0, description='Pages', max=11, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='Resources', max=1, style=ProgressStyle(description_width='ini…

HBox(children=(IntProgress(value=0, description='Files', max=5, style=ProgressStyle(description_width='initial…