In [288]:
import google # https://pypi.org/project/google/#files
from googlesearch import search 
import pandas as pd
import tabula
import regex as re
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup, SoupStrainer
import httplib2
import validators
import urllib.parse
import time
import warnings
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import sys
import copy

warnings.filterwarnings(action='ignore')

sys.setrecursionlimit(1000)
# More on the google module here:
#https://python-googlesearch.readthedocs.io/en/latest/index.html#module-googlesearch

# An alternative here maybe:
# https://github.com/Nv7-GitHub/googlesearch/blob/master/googlesearch/__init__.py

In [2]:
# Get links hosting each LAs account statments.
def getLinks(names):
    
    # Empty list for saving links.
    links = []
    
    # Initialize search number (for pausing after a number of searches)
    searchNum = 0
    
    for council in names:
        print(council)
        
        # Increment search number by 1
        searchNum += 1
        print(searchNum)
        
        # Search terms (on query/council)
        query = r'{} statement of accounts'.format(council)
        
        # Google search using 'query' above and save as object.
        queryObj = search(query, tld = 'com', stop = 10)
        
        # Quick and dirty way of getting something from a generator object.
        for j in queryObj:
            if not j.endswith('pdf'):
                links.append(j)
                break
        
        # Add a pause to the search every 30th term to avoid 'Too Many Requests'
        if searchNum%30 == 0:
            time.sleep(1000)
        
        
    return links    

In [None]:
# query = r'{} statement of accounts "gov.uk"'.format('bedford')

# # Google search using 'query' above and save as object.
# queryObj = search(query, tld = 'com', stop = 10)

# # Quick and dirty way of getting something from a generator object.
# for j in queryObj:
#     if not j.endswith('pdf'):
#         print(j)

In [4]:
# # Get list of all councils
# councilNames = pd.read_csv('./councilNames.csv', sep = ",")
# councilNames = councilNames['x'].str.lower()

# # Get link to page where account statements will
# # be for each council.
# councilLinks = getLinks(councilNames)

# # Save as df -> csv 
# dfLinks = pd.DataFrame(data = {"Councils": councilNames, "Link": councilLinks})
# dfLinks.to_csv("./councilLinks.csv", sep = ',', index = False)

# Load csv
councilLinks_df = pd.read_csv('./councilLinks.csv')
# councilLinks = councilLinks_df['Link'] 
councilLinks_df.set_index('Councils', inplace = True)

print(councilLinks_df.shape[0])
print(len(councilNames))

# They have the same length, but we don't know 
# if we actually got one link per LA 
# e.g. some searches might've returned news articles or
# other stuff.

# It's not straighforward to check, so will go ahead
# with the assumption that we got all relevant links
# and see if there's any anomalies when getting the 
# PDFs from the pages. 

370
370


In [None]:
# Changing some of the links manually (based on results below)
councilLinks_df.loc['woking']['Link'] = 'http://datashare.woking.gov.uk/View/financial-documents/statement-of-accounts'
councilLinks_df.loc['basingstoke & deane']['Link'] = 'https://www.basingstoke.gov.uk/finance'
councilLinks_df.loc['bassetlaw']['Link'] = 'https://data.bassetlaw.gov.uk/statement-of-accounts/'
councilLinks_df.loc['cumbria']['Link'] = 'https://cumbria.gov.uk/finance/finance/statementofaccounts.asp#'
councilLinks_df.loc['hartlepool ua']['Link'] = 'https://www.hartlepool.gov.uk/downloads/download/299/statement_of_accounts'
councilLinks_df.loc['central bedfordshire ua']['Link'] = 'https://www.centralbedfordshire.gov.uk/info/27/about_your_council/178/annual_accounts_fees_and_charges_budget_statements_and_budget_books'
councilLinks_df.loc['central bedfordshire ua']['Link'] = 'https://info.westberks.gov.uk/article/30394'
councilLinks_df.loc['redcar & cleveland ua']['Link'] = 'https://www.redcar-cleveland.gov.uk/about-the-council/budgets/statement-of-accounts/Pages/summary-statement-of-accounts.aspx'
councilLinks_df.loc['bolsover']['Link'] ='https://www.bolsover.gov.uk/a/196-accounts-and-budgets/244-accounts'


In [350]:
def addLink(pdfs, councilLink, pdfLink):
    
    
    # Join 'base' link of council with link for statement.
    fullLink = urllib.parse.urljoin(councilLink, pdfLink['href'])
    
    # Add link to pdfs dict.
    # 'Description': 'link'
    pdfs[pdfLink.text] = fullLink
    
    return pdfs
    

def makeSearchTerms(yearL, yearH, other = []):
    
    # Different combinations of years
    # Would be nice to get a nice regex here.
    substrings_v1 = [yearL, yearH]
    substrings_v2 = [yearL, yearH[2:]]
    substrings_v3 = [yearL[2:], yearH[2:]]
    substrings_v4 = [yearL + yearH]
    substrings_v5 = [yearL + yearH[2:]]
    
    allSubstrings = [substrings_v1, substrings_v2, substrings_v3, substrings_v4, substrings_v5]
    
    if other != []:
        for ss in allSubstrings:
            ss.extend(other)
    
    return allSubstrings
    


def accessLink(councilLink):
    try:
        response = requests.get(councilLink, 
                                timeout = 15,
                               headers = {'User-Agent': 'Mozilla/5.0'})
        soup = BeautifulSoup(response.text, parse_only=SoupStrainer('a', href = True))
        return soup
    except requests.exceptions.RequestException as e:
        return 0


def conditions(subStrVers, link):
    '''
    Checks if all elements of a list (of strings) are present in a link.
    subStrVers is a list of lists, so that different versions of the lists can be checked.
    Makes more sense after looking at findStatements() below.
    
    Input: 
        subStrVers - a list of lists.
        link - a string. 
    
    Outputs True if conditions are satisfied, False otherwise.
    '''
    satisfied = False
    
    # Search in file name/descriptions.
    for ss in subStrVers:
        if all(s in link.text.lower() for s in ss):
            satisfied = True
            break
            
    
    # Search in link name.
    for ss in subStrVers:
        if all(s in link['href'].lower() for s in ss):
            satisfied = True
            break
    

    return satisfied        

    
    

# List all pdfs from a url.
def findStatements(councilLink, yearL, yearH, otherTerms = [], pdfs = {}):  
      
    soup = accessLink(councilLink)
    
    # If site can't be accessed stop and return empty pdfs.
    if soup == 0:
        return pdfs

    
    allSubstrings = makeSearchTerms(yearL, yearH, other = otherTerms)
    
    # Find all files containing 'pdf'
    for link in soup.select("a[href*='pdf']"):
        
        if conditions(allSubstrings, link):
            addLink(pdfs, councilLink, link)
        

    # If none found above, look at all links and see if terms
    # can be found in the link or description
    if pdfs == {}:
        for link in soup.select("a"):
            if conditions(allSubstrings, link):
                
                # Add link to 'pdfs' dict. 
                addLink(pdfs, councilLink, link)
                

            
    # Some councils have statements for different years
    # on different webpages. If this is the case, figure out
    # the relevant page, and call findStatements recurseively. 
    if pdfs == {}:
        # This is really messy as I was basically trying 
        # to add conditions as I went, could probably use some cleaning up.
        
        # Initial candidate for new council link (i.e. selection)
        selection = soup.select("a[href*='{}-{}']".format(yearL,yearH))
        
        # If a link containing 'yearL'-'yearH' not available.
        if len(selection) == 0:
            # Try 'yearL' - 'yearH'[2:], e.g. '2017' - '18'
            selection = soup.select("a[href*='{}-{}']".format(yearL,yearH[2:]))
            
        # If 'yearL' - 'yearH'[2:] not available, try 'yearL'[2:] - 'yearH'[2:]
        if len(selection) == 0:
            selection = soup.select("a[href*='{}-{}']".format(yearL[2:],yearH[2:]))
                    
        # If still haven't found anything, try the above w/ different format.
        # e.g. '20172018' (instead of '2017'-'2018')
        if len(selection) == 0:
            selection = soup.select("a[href*='{}{}']".format(yearL,yearH))  
            
        if len(selection) == 0:
            selection = soup.select("a[href*='{}{}']".format(yearL,yearH[2:]))  
            
        if len(selection) == 0:
            selection = soup.select("a[href*='{}{}']".format(yearL[2:],yearH[2:]))     
            
        # If none are available look for 'historical' in link name.       
        if len(selection) == 0:
            selection = soup.select("a[href*='historical']")                  

        # W/ the final selection, clean up the link and call finalStatements again
        # with the new link.
        for link in selection:
#             print("New council link (relavant year)", link['href'])

            newLink = urllib.parse.urljoin(councilLink, link['href'])
            print("Old link:", councilLink)
            print("New line:", newLink)
    
            if newLink == councilLink:
                break
            else:    
                findStatements(newLink, yearL, yearH, otherTerms, pdfs)

    # Some of the accounts aren't actually pdf files
    # So looking at the description of the file as opposed to
    # the actual file extention for the term 'pdf'. 
    if pdfs == {}:
        for link in soup.find_all('a', string = re.compile("pdf", re.IGNORECASE)):
            
            # Add link to 'pdfs' dict.
            addLink(pdfs, councilLink, link)
            break
    
    # If all else fails, just get all the pdfs
    if pdfs == {}:
        for link in soup.select("a[href*='.pdf']"):
            
            # Add link to pdfs dict.    
            addLink(pdfs, councilLink, link)

    # If that also fails get all asp.        
    if pdfs == {}:
        for link in soup.select("a[href*='asp']"):

            # Add link to pdfs dict.    
            addLink(pdfs, councilLink, link)
        
            
    return pdfs 


def getStatements(yearL, yearH):

    allStatements = {el:{} for el in councilLinks_df.index.values.tolist()}
    noPDFs = []
    for el in allStatements:

        allStatements[el] = findStatements(councilLink = councilLinks_df.loc[el]['Link'], \
                                           yearL = yearL, \
                                           yearH = yearH, \
#                                            otherTerms = ['accounts'],\
                                           pdfs = {})
        
#         allStatements[el] = checkStatements(allStatements[el])
        print("Got council: {} ({} links found)".format(el, len(allStatements[el])))

        if len(allStatements[el]) == 0:
            noPDFs.append(el)
            
            
    return allStatements, noPDFs        


def conditionsDict(subStrVers, key):
    
    satisfied = False
    
    for ss in subStrVers:
        if all(s in key for s in ss):
            satisfied = True
            
    return satisfied   



def checkStatement(pdfs, yearL, yearH, otherTerms = []):
    
                   
    testTerms = makeSearchTerms(yearL, yearH, other = otherTerms)
    

    res = [term for key, 
           term in pdfs.items() 
           if (conditionsDict(testTerms, key) or
          conditionsDict(testTerms, term))]
    
    if res != []:
        return res[0]

    
    return res
  

def checkAllStatements(allStatements, yearL, yearH, otherTerms = []):

    for council in allStatements:
        
        allStatements[council] = checkStatement(allStatements[council],
                                               yearL, yearH,
                                               otherTerms = otherTerms)
            
    return allStatements        


def countLinks(allStatements):
    
    count_zero = 0
    count_more = 0

    for el in allStatements:
        if len(allStatements[el]) == 0:
#             print("Zero:", el)
            count_zero +=1

        if len(allStatements[el]) > 1:
#             print("More than 1: ", el)
            count_more +=1

    print("\tLAs with 0 links:", count_zero)
#     print("\tLAs with more than 1 link:", count_more)

In [394]:
# # 2017 - 2018
st_17_18, missing_17_18 = getStatements('2017', '2018')


Got council: bath & north east somerset ua (1 links found)
Got council: bristol ua (1 links found)
Got council: south gloucestershire ua (6 links found)
Got council: north somerset ua (1 links found)
Got council: luton ua (2 links found)
Got council: bedford ua (2 links found)
Got council: central bedfordshire ua (1 links found)
Got council: bracknell forest ua (2 links found)
Got council: west berkshire ua (7 links found)
Got council: reading ua (2 links found)
Got council: slough ua (1 links found)
Got council: windsor & maidenhead ua (4 links found)
Got council: wokingham ua (1 links found)
Got council: milton keynes ua (1 links found)
Got council: buckinghamshire (3 links found)
Got council: aylesbury vale (1 links found)
Got council: chiltern (1 links found)
Got council: south bucks (1 links found)
Got council: wycombe (1 links found)
Got council: peterborough ua (8 links found)
Got council: cambridgeshire (1 links found)
Got council: cambridge (1 links found)
Got council: east ca

Got council: king's lynn & west norfolk (2 links found)
Got council: north norfolk (3 links found)
Got council: norwich (1 links found)
Got council: south norfolk (1 links found)
Got council: york ua (1 links found)
Got council: north yorkshire (4 links found)
Got council: craven (1 links found)
Got council: hambleton (0 links found)
Got council: richmondshire (1 links found)
Got council: scarborough (2 links found)
Got council: harrogate (1 links found)
Got council: ryedale (15 links found)
Got council: selby (1 links found)
Got council: northamptonshire (2 links found)
Got council: corby (1 links found)
Got council: daventry (5 links found)
Got council: east northamptonshire (1 links found)
Got council: kettering (2 links found)
Got council: northampton (1 links found)
Got council: south northamptonshire (0 links found)
Got council: wellingborough (2 links found)
Got council: northumberland ua (4 links found)
Got council: city of nottingham ua (0 links found)
Got council: nottinghams

Got council: yorkshire dales national park authority (3 links found)
Got council: the broads authority (12 links found)
Got council: new forest national park authority (2 links found)
Got council: south downs national park authority (1 links found)
Got council: lee valley regional park authority (0 links found)


In [395]:
# 2018 - 2019
st_18_19, missing_18_19 = getStatements('2018', '2019')



Got council: bath & north east somerset ua (1 links found)
Got council: bristol ua (1 links found)
Got council: south gloucestershire ua (6 links found)
Got council: north somerset ua (1 links found)
Got council: luton ua (2 links found)
Got council: bedford ua (3 links found)
Got council: central bedfordshire ua (2 links found)
Got council: bracknell forest ua (2 links found)
Got council: west berkshire ua (7 links found)
Got council: reading ua (2 links found)
Got council: slough ua (1 links found)
Got council: windsor & maidenhead ua (5 links found)
Got council: wokingham ua (4 links found)
Got council: milton keynes ua (4 links found)
Got council: buckinghamshire (3 links found)
Got council: aylesbury vale (1 links found)
Got council: chiltern (2 links found)
Got council: south bucks (1 links found)
Got council: wycombe (2 links found)
Got council: peterborough ua (7 links found)
Got council: cambridgeshire (3 links found)
Got council: cambridge (1 links found)
Got council: east ca

Got council: king's lynn & west norfolk (2 links found)
Got council: north norfolk (4 links found)
Got council: norwich (2 links found)
Got council: south norfolk (1 links found)
Got council: york ua (2 links found)
Got council: north yorkshire (4 links found)
Got council: craven (2 links found)
Got council: hambleton (0 links found)
Got council: richmondshire (1 links found)
Got council: scarborough (2 links found)
Got council: harrogate (1 links found)
Got council: ryedale (15 links found)
Got council: selby (2 links found)
Got council: northamptonshire (2 links found)
Got council: corby (1 links found)
Got council: daventry (3 links found)
Got council: east northamptonshire (2 links found)
Got council: kettering (1 links found)
Got council: northampton (1 links found)
Got council: south northamptonshire (0 links found)
Got council: wellingborough (2 links found)
Got council: northumberland ua (4 links found)
Got council: city of nottingham ua (1 links found)
Got council: nottinghams

In [396]:
# 2019 - 2020
st_19_20, missing_19_20 = getStatements('2019', '2020')



Got council: bath & north east somerset ua (2 links found)
Got council: bristol ua (2 links found)
Got council: south gloucestershire ua (12 links found)
Got council: north somerset ua (2 links found)
Got council: luton ua (3 links found)
Got council: bedford ua (7 links found)
Got council: central bedfordshire ua (2 links found)
Got council: bracknell forest ua (7 links found)
Got council: west berkshire ua (2 links found)
Got council: reading ua (2 links found)
Got council: slough ua (2 links found)
Got council: windsor & maidenhead ua (12 links found)
Got council: wokingham ua (8 links found)
Got council: milton keynes ua (6 links found)
Got council: buckinghamshire (4 links found)
Got council: aylesbury vale (2 links found)
Got council: chiltern (3 links found)
Got council: south bucks (2 links found)
Got council: wycombe (3 links found)
Got council: peterborough ua (10 links found)
Got council: cambridgeshire (3 links found)
Got council: cambridge (1 links found)
Got council: east

Got council: ryedale (17 links found)
Got council: selby (1 links found)
Got council: northamptonshire (4 links found)
Got council: corby (2 links found)
Got council: daventry (5 links found)
Got council: east northamptonshire (3 links found)
Got council: kettering (3 links found)
Got council: northampton (1 links found)
Got council: south northamptonshire (0 links found)
Got council: wellingborough (5 links found)
Got council: northumberland ua (5 links found)
Got council: city of nottingham ua (4 links found)
Got council: nottinghamshire (3 links found)
Got council: ashfield (12 links found)
Got council: bassetlaw (6 links found)
Got council: broxtowe (4 links found)
Got council: gedling (3 links found)
Got council: mansfield (0 links found)
Got council: newark & sherwood (2 links found)
Got council: rushcliffe (3 links found)
Got council: oxfordshire (5 links found)
Got council: cherwell (7 links found)
Got council: oxford (3 links found)
Got council: south oxfordshire (5 links foun

In [397]:
# Checking statements and getting single links.
st_17_18_checked = copy.deepcopy(st_17_18)
st_17_18_checked = checkAllStatements(st_17_18_checked, '2017', '2018')

st_18_19_checked = copy.deepcopy(st_18_19)
st_18_19_checked = checkAllStatements(st_18_19_checked, '2018', '2019')

st_19_20_checked = copy.deepcopy(st_19_20)
st_19_20_checked = checkAllStatements(st_19_20_checked, '2019', '2020')

In [398]:
print("\nCounts before checks 2017 - 2018 (# LA):")
countLinks(st_17_18)

print("\nCounts after checks 2017 - 2018 (# LA):")
countLinks(st_17_18_checked)

print("\n\n")


print("\nCounts before checks 2018 - 2019 (# LA):")
countLinks(st_18_19)

print("\nCounts after checks 2018 - 2019 (# LA):")
countLinks(st_18_19_checked)

print("\n\n")

print("\nCounts before checks 2019 - 2020 (# LA):")
countLinks(st_19_20)

print("\nCounts after checks 2019 - 2020 (# LA):")
countLinks(st_19_20_checked)


Counts before checks 2017 - 2018 (# LA):
	LAs with 0 links: 40

Counts after checks 2017 - 2018 (# LA):
	LAs with 0 links: 88




Counts before checks 2018 - 2019 (# LA):
	LAs with 0 links: 36

Counts after checks 2018 - 2019 (# LA):
	LAs with 0 links: 69




Counts before checks 2019 - 2020 (# LA):
	LAs with 0 links: 26

Counts after checks 2019 - 2020 (# LA):
	LAs with 0 links: 30


In [399]:
import json

def save_statements(statements, filename):
    filename = filename
    with open(filename, 'w') as f:
        f.write(json.dumps(statements))

# save_statements(st_17_18_checked, 'st_17_18')
        
def load_statements(statements, filename):
    with open(filename) as f:
        pet = json.loads(f.read())
    return statements


In [400]:
# Save the statements (json-like documents)
save_statements(st_17_18_checked, 'st_17_18')
save_statements(st_18_19_checked, 'st_18_19')
save_statements(st_19_20_checked, 'st_19_20')

# to load: e.g. st_17_18_test = load_statements(st_17_18_checked, 'st_17_18')

In [401]:
# tables = tabula.read_pdf(urls['Spelthorne'], pages = "all", multiple_tables = True)
#tables[-25][tables[-25].apply(lambda row: row.astype(str).str.contains('Current Assets' and 'Current Liabilities').any(), axis=1)]

def getTables(statements, authority_idx):
    
    url = statements[councilLinks_df.index[authority_idx]]
    print(url)
    
    if url == []:
        return 
    
    tables = tabula.read_pdf(url, pages = 'all', multiple_tables = True)
    
    print("There are {} tables for {}".format(len(tables), councilLinks_df.index[authority_idx]))
    
    newTables = list()

    for i in range(len(tables)):

        terms1 = tables[i][tables[i].apply(lambda row: \
                                          row.astype(str).str.contains('Current Assets' \
                                                                       and 'Current Liabilities' \
                                                                       and 'Long term Liabilities'\
                                                                       and 'Long term Assets', \
                                                                       flags=re.IGNORECASE,
                                                                       regex = True
                                                                      ).any(), axis=1)]

        terms2 = tables[i][tables[i].apply(lambda row: \
                                          row.astype(str).str.contains('Current Assets' \
                                                                       and 'Current Liabilities' \
                                                                       and 'Long-term Liabilities'\
                                                                       and 'Long-term Assets', \
                                                                       flags=re.IGNORECASE,
                                                                       regex = True
                                                                      ).any(), axis=1)]
        
        if (terms1.shape[0] != 0) or (terms2.shape[0] != 0):
            newTables.append(tables[i])
    
    
    print("\n{} tables that might be of interest".format(len(newTables)))
    
    return newTables

In [375]:
test_all_tables  = getTables(st_18_19_checked,2)

https://www.southglos.gov.uk//documents/Audited-Annual-Financial-Report-201819.pdf
There are 97 tables for south gloucestershire ua

2 tables that might be of interest


In [376]:
test_all_tables[0]

Unnamed: 0.1,01 April,31 March,Unnamed: 0,31 March.1,Note
0,2017,2018,,2019,
1,Restated,Restated,,,
2,£'000,£'000,,£'000,
3,797477,854992,"Property, Plant and Equipment",895264,14.0
4,621,493,Heritage Assets,493,
5,5073,5351,Investment Property,18556,17.0
6,900,1290,Intangible Assets,1702,
7,15323,7115,Assets Held for Sale,945,17.0
8,7719,4709,Long term Investments,34338,15.0
9,1442,1804,Long Term Debtors,2884,


# Code dump

In [None]:
# import json

# def save_pet(pet):
#     filename = <Whatever filename you want>
#     with open(filename, 'w') as f:
#         f.write(json.dumps(pet))

# def load_pet(filename):
#     with open(filename) as f:
#         pet = json.loads(f.read())
#     return pet

In [None]:
# urls = dict({
#     'Spelthorne': 'https://www.spelthorne.gov.uk/media/23068/Draft-Statement-of-Accounts-2019-20/pdf/SoA_2019-20_100920_update1.pdf?m=637353494717970000',
#     'Nottingham': 'https://www.nottinghaminsight.org.uk/d/aaCFHALE'})

In [None]:
# # List all pdfs from a url.
# def findStatements(council, yearL, yearH, pdfs = []):  
    
#     councilLink = councilLinks_df.loc[council, ['Link']][-1]
                
#     #print("Council link:", councilLink)
    
#     session = requests.Session()
#     # This is needed for non-responsive links. Retry 3 time before moving on.
#     retry = Retry(connect=3, backoff_factor=0.5) 
#     adapter = HTTPAdapter(max_retries=retry)
#     session.mount('http://', adapter)
#     session.mount('https://', adapter)
#     response = session.get(councilLink, verify = False)
    
#     soup = BeautifulSoup(response.text, parse_only=SoupStrainer('a', href = True)) # parse_only=SoupStrainer('a', href = True)     

# #     term1 = 'statement'
# #     term2 = 'accounts'
# #     substrings_v1 = (term1, term2, yearL, yearH)
# #     substrings_v2 = (term1, term2, yearL, yearH[2:])
# #     substrings_v3 = (term1, term2, yearL[2:], yearH[2:])

#     # Different combinations of years
#     # Would be nice to get a nice regex here.
#     substrings_v1 = [yearL, yearH]
#     substrings_v2 = [yearL, yearH[2:]]
#     substrings_v3 = [yearL[2:], yearH[2:]]
#     substrings_v4 = [yearL + yearH]
#     substrings_v5 = [yearL + yearH[2:]]
#     allSubstrings = [substrings_v1, substrings_v2, substrings_v3, substrings_v4, substrings_v5]
    
#     # Find all files containing 'pdf'
#     for link in soup.select("a[href*='pdf']"):
        
#         if conditions(allSubstrings, link, council):
            
#             # Join 'base' link of council with link for statement.
#             fullLink = urllib.parse.urljoin(councilLink, link['href'])
#             #print("PDF link:", fullLink)
                
#             # Add link to pdfs.    
#             pdfs.append(fullLink)    
# #             print(pdfs)

#     # If none found above, look at all files containing 'download'
#     if pdfs == []:
#         for link in soup.select("a[href*='download']"):
#             if conditions(allSubstrings, link):

#                 # Join 'base' link of council with link for statement.
#                 fullLink = urllib.parse.urljoin(councilLink, link['href'])
#                 #print("PDF link:", fullLink)

#                 # Add link to pdfs.    
#                 pdfs.append(fullLink)    
# #                 print(pdfs)
            
#     # Some councils have statements for different years
#     # on different webpages. If this is the case, figure out
#     # the relevant page, and call findStatements recurseively. 
#     if pdfs == []:
        
#         if len(soup.select("a[href*='{}-{}']".format(yearL,yearH))) == 0:
#             yearH = yearH[2:]
            
#             if len(soup.select("a[href*='{}-{}']".format(yearL,yearH))) == 0:
#                 yearL = yearL[2:]
            

#         for link in soup.select("a[href*='{}-{}']".format(yearL,yearH)):
#             #print("New council link (relavant year)", link['href'])

#             newLink = urllib.parse.urljoin(councilLink, link['href'])
#             findStatements(newLink, yearL, yearH, pdfs)

#     # Some of the accounts aren't actually pdf files
#     # So looking at the description of the file as opposed to
#     # the actual file extention. 
#     if pdfs == []:
#         #print('Not a pdf')
#         for link in soup.find_all('a', string = re.compile("pdf", re.IGNORECASE)):
#             fullLink = urllib.parse.urljoin(councilLink, link['href'])
#             pdfs.append(fullLink)
#             break
    
#     # If all else fails, just get all the pdfs.
#     if pdfs == []:
#         for link in soup.select("a[href*='.pdf']"):
#             # Join 'base' link of council with link for statement.
#             fullLink = urllib.parse.urljoin(councilLink, link['href'])
#             #print("PDF link:", fullLink)
                
#             # Add link to pdfs.    
#             pdfs.append(fullLink)  
        
            
#     return pdfs 

In [None]:
# councilLink = 'https://www.centralbedfordshire.gov.uk/info/27/about_your_council/178/annual_accounts_fees_and_charges_budget_statements_and_budget_books'

# yearL = '2014'
# yearH = '2015'

# session = requests.Session()
# # This is needed for non-responsive links. Retry 3 time before moving on.
# retry = Retry(connect=3, backoff_factor=0.5) 
# adapter = HTTPAdapter(max_retries=retry)
# session.mount('http://', adapter)
# session.mount('https://', adapter)
# response = session.get(councilLink, verify = False)

# soup = BeautifulSoup(response.text, parse_only=SoupStrainer('a', href = True))


# substrings_v1 = [yearL, yearH]
# substrings_v2 = [yearL, yearH[2:]]
# substrings_v3 = [yearL[2:], yearH[2:]]
# substrings_v4 = [yearL + yearH]
# substrings_v5 = [yearL + yearH[2:]]
# allSubstrings = [substrings_v1, substrings_v2, substrings_v3, substrings_v4, substrings_v5]

# for link in soup.select("a"):
#     if link.has_attr('href'):
# #     if conditions(allSubstrings, link):
#         print(link.text)


In [None]:
# Accessible links

# for council in councilLinks_df.iterrows():
#     print(councilLinks_df.loc[council[0]]['Link'])

In [None]:
# councilLinks_df.loc['south gloucestershire ua']['Link']
# # bla = {}
# # headers = {'User-Agent': 'Mozilla/5.0'}
# # councilLink = councilLinks_df.loc['south gloucestershire ua']['Link']
# allSubstrings = makeSearchTerms('2018', '2019', other = ['accounts'])
# # response = requests.get(councilLinks_df.loc['south gloucestershire ua']['Link'], 
# #                         timeout = 10,
# #                         headers=headers)

# # soup = BeautifulSoup(response.text, parse_only=SoupStrainer('a', href = True))
# bla = {}
# soup = accessLink(councilLink)

# # Find all files containing 'pdf'
# for link in soup.select("a[href*='pdf']"):

#     if conditions(allSubstrings, link):
#         print(allSubstrings)
# #             print(link)

#         addLink(bla, councilLink, link)
    
# bla
# bla == {}