In [None]:
import google # https://pypi.org/project/google/#files
from googlesearch import search 
import pandas as pd
import tabula
import regex as re
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup, SoupStrainer
import httplib2
import validators
import urllib.parse
import time
import warnings
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import sys
import copy

# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Cap on recursion depth (findStatements calls itself when it follows links).
sys.setrecursionlimit(1000)

# Documentation for the google / googlesearch module used in this notebook:
# https://python-googlesearch.readthedocs.io/en/latest/index.html#module-googlesearch
#
# A possible alternative implementation:
# https://github.com/Nv7-GitHub/googlesearch/blob/master/googlesearch/__init__.py

In [None]:
# Get links hosting each LA's account statements.
def getLinks(names, pauseEvery = 30, pauseSeconds = 1000):
    '''
    Google '<council> statement of accounts' for every council and keep the
    first result that is not a direct link to a PDF.

    Input:
        names - iterable of council names (strings).
        pauseEvery - pause after every this many searches to avoid
                     'Too Many Requests'; 0 or None disables pausing.
        pauseSeconds - length of each pause, in seconds.

    Outputs a list with exactly one entry per council, in the same order as
    names. The entry is None when none of the results qualified, so the list
    always lines up with names.
    '''
    # Materialise once so the total is known (lets us skip a useless final pause).
    names = list(names)

    # One entry per council, appended in order.
    links = []

    for searchNum, council in enumerate(names, start = 1):
        print(council)
        print(searchNum)

        # Search terms (one query per council)
        query = r'{} statement of accounts'.format(council)

        # Google search using 'query' above; this is a generator of result URLs.
        queryObj = search(query, tld = 'com', stop = 10)

        # First result that is not itself a PDF (case-insensitive check),
        # or None so that this council still occupies its slot in the list.
        firstPage = next((url for url in queryObj
                          if not url.lower().endswith('pdf')), None)
        links.append(firstPage)

        # Pause periodically, but not after the very last search.
        if pauseEvery and searchNum % pauseEvery == 0 and searchNum < len(names):
            time.sleep(pauseSeconds)

    return links

In [None]:
# Checking a single query

# query = r'{} statement of accounts "gov.uk"'.format('bedford')

# # Google search using 'query' above and save as object.
# queryObj = search(query, tld = 'com', stop = 10)

# # Quick and dirty way of getting something from a generator object.
# for j in queryObj:
#     if not j.endswith('pdf'):
#         print(j)

In [None]:
# Get list of all councils (names are read from column 'x' and lower-cased).
councilNames = pd.read_csv('./councilNames.csv', sep = ",")
councilNames = councilNames['x'].str.lower()


# Get link to page where account statements will be for each council.
councilLinks = getLinks(councilNames)

# Save as df -> csv 
dfLinks = pd.DataFrame(data = {"Councils": councilNames, "Link": councilLinks})
dfLinks.to_csv("./councilLinks.csv", sep = ',', index = False)

# Lookup table (council name -> link) used by getStatements().
# It has to be defined here, otherwise a fresh kernel hits a NameError below.
councilLinks_df = dfLinks.set_index('Councils')

# If the search above already ran in an earlier session, it can be skipped
# and the saved results reloaded instead:
# councilLinks_df = pd.read_csv('./councilLinks.csv').set_index('Councils')

# Check if they have the same length.
print(councilLinks_df.shape[0])
print(len(councilNames))

# They have the same length, but we don't know 
# if we actually got one relevant link per LA,
# e.g. some searches might have returned news articles or
# other stuff.

# It's not straightforward to check, so we go ahead
# with the assumption that we got all relevant links
# and see if there are any anomalies when getting the 
# PDFs from the pages. 

In [None]:
def addLink(pdfs, councilLink, pdfLink):
    '''
    Record one anchor in the pdfs dict as {anchor text: absolute URL}.

    Input:
        pdfs - dict that is updated in place.
        councilLink - URL of the page the anchor was found on; used as the
                      base for resolving relative hrefs.
        pdfLink - anchor element exposing .text and ['href'].

    Outputs the same pdfs dict that was passed in.
    '''
    # Resolve the (possibly relative) href against the council page.
    absoluteUrl = urllib.parse.urljoin(councilLink, pdfLink['href'])

    # 'Description': 'link'
    pdfs.update({pdfLink.text: absoluteUrl})
    return pdfs
    

def makeSearchTerms(yearL, yearH, other = []):
    '''
    Build the groups of substrings that identify a financial year.

    Input:
        yearL, yearH - four-character year strings, e.g. '2017' and '2018'.
        other - extra terms appended to every group.

    Outputs a list of five lists, e.g. for '2017'/'2018':
        ['2017', '2018'], ['2017', '18'], ['17', '18'], ['20172018'], ['201718']
    '''
    # Two-digit forms of the years.
    shortL = yearL[2:]
    shortH = yearH[2:]

    variants = [
        [yearL, yearH],
        [yearL, shortH],
        [shortL, shortH],
        [yearL + yearH],
        [yearL + shortH],
    ]

    # Every group must additionally contain the extra terms, if any were given.
    if other != []:
        for variant in variants:
            variant.extend(other)

    return variants
    


def accessLink(councilLink):
    '''
    Download a page and parse only its <a href=...> elements.

    Input:
        councilLink - URL to fetch (15 second timeout, browser-like User-Agent).

    Outputs the parsed BeautifulSoup object, or 0 when the request raised a
    requests RequestException.
    '''
    browserHeaders = {'User-Agent': 'Mozilla/5.0'}
    try:
        page = requests.get(councilLink, timeout = 15, headers = browserHeaders)
        # NOTE(review): no parser is named here, so bs4 chooses one itself.
        anchorsOnly = SoupStrainer('a', href = True)
        return BeautifulSoup(page.text, parse_only = anchorsOnly)
    except requests.exceptions.RequestException:
        return 0


def conditions(subStrVers, link):
    '''
    Decide whether an anchor matches at least one group of search terms.

    A group matches when every one of its terms occurs either in the
    lower-cased anchor text or in the lower-cased href (the two are checked
    separately; a group is not split across them).

    Input: 
        subStrVers - a list of lists of strings (alternative term groups,
                     see makeSearchTerms()).
        link - anchor element exposing .text and ['href'].

    Outputs True if any group matches, False otherwise.
    '''
    # Search in file name/description.
    textHit = any(all(term in link.text.lower() for term in group)
                  for group in subStrVers)

    # Search in link target.
    hrefHit = any(all(term in link['href'].lower() for term in group)
                  for group in subStrVers)

    return textHit or hrefHit

    
    

def _candidateYearPages(soup, yearL, yearH):
    '''
    Find anchors that may lead to a page dedicated to the yearL/yearH accounts.

    The href fragments below are tried in order and the anchors for the first
    fragment that matches anything are returned; 'historical' is the last
    resort. Outputs an empty list when nothing matches.
    '''
    shortL, shortH = yearL[2:], yearH[2:]
    hrefFragments = [
        '{}-{}'.format(yearL, yearH),      # e.g. '2017-2018'
        '{}-{}'.format(yearL, shortH),     # e.g. '2017-18'
        '{}-{}'.format(shortL, shortH),    # e.g. '17-18'
        '{}{}'.format(yearL, yearH),       # e.g. '20172018'
        '{}{}'.format(yearL, shortH),      # e.g. '201718'
        '{}{}'.format(shortL, shortH),     # e.g. '1718'
        'historical',
    ]

    for fragment in hrefFragments:
        selection = soup.select("a[href*='{}']".format(fragment))
        if len(selection) > 0:
            return selection

    return []


# List all pdfs from a url.
def findStatements(councilLink, yearL, yearH, otherTerms = None, pdfs = None, _visited = None):
    '''
    Collect links to the statement of accounts for one financial year.

    Strategies are tried in order, each only if nothing was found so far:
      1. anchors whose href contains 'pdf' and which match the year terms;
      2. any anchor matching the year terms;
      3. follow links that look like a per-year (or 'historical') page and
         repeat the whole search there;
      4. the first anchor whose text mentions 'pdf';
      5. every anchor whose href contains '.pdf';
      6. every anchor whose href contains 'asp'.

    Input:
        councilLink - URL of the page to inspect.
        yearL, yearH - four-character year strings, e.g. '2017', '2018'.
        otherTerms - optional extra terms every match must contain.
        pdfs - optional dict to add results to; a new dict is created for
               each call when omitted.
        _visited - internal: set of URLs already inspected during this
                   search, so pages linking to each other cannot cause
                   endless recursion. Callers should leave it unset.

    Outputs the pdfs dict ({anchor text: absolute URL}); it is returned
    unchanged if the page cannot be accessed.
    '''
    # Fresh containers per call: defaults written as {} / [] in the signature
    # would be created once and shared by every call.
    if pdfs is None:
        pdfs = {}
    if otherTerms is None:
        otherTerms = []
    if _visited is None:
        _visited = set()
    _visited.add(councilLink)

    soup = accessLink(councilLink)

    # If site can't be accessed stop and return pdfs as it is.
    if soup == 0:
        return pdfs

    allSubstrings = makeSearchTerms(yearL, yearH, other = otherTerms)

    # 1. Anchors pointing at something containing 'pdf' that match the terms.
    for link in soup.select("a[href*='pdf']"):
        if conditions(allSubstrings, link):
            addLink(pdfs, councilLink, link)

    # 2. If none found above, look at all links and see if the terms
    #    can be found in the link or its description.
    if not pdfs:
        for link in soup.select("a"):
            if conditions(allSubstrings, link):
                addLink(pdfs, councilLink, link)

    # 3. Some councils have statements for different years on different
    #    webpages. If so, work out the relevant page(s) and search those.
    if not pdfs:
        for link in _candidateYearPages(soup, yearL, yearH):
            newLink = urllib.parse.urljoin(councilLink, link['href'])
            print("Old link:", councilLink)
            print("New line:", newLink)

            # A link back to the current page ends the search of candidates.
            if newLink == councilLink:
                break

            # Already inspected during this search: skip it, otherwise two
            # pages that link to each other would recurse without end.
            if newLink in _visited:
                continue

            findStatements(newLink, yearL, yearH, otherTerms, pdfs, _visited)

    # 4. Some of the accounts aren't actually pdf files, so look for the
    #    term 'pdf' in the description rather than in the file extension.
    if not pdfs:
        for link in soup.find_all('a', string = re.compile("pdf", re.IGNORECASE)):
            # Only the first such anchor is kept.
            addLink(pdfs, councilLink, link)
            break

    # 5. If all else fails, just get all the pdfs.
    if not pdfs:
        for link in soup.select("a[href*='.pdf']"):
            addLink(pdfs, councilLink, link)

    # 6. If that also fails get all asp.
    if not pdfs:
        for link in soup.select("a[href*='asp']"):
            addLink(pdfs, councilLink, link)

    return pdfs


def getStatements(yearL, yearH):
    '''
    Run findStatements() for every council in the global councilLinks_df.

    Input:
        yearL, yearH - four-character year strings, e.g. '2017', '2018'.

    Outputs a tuple (allStatements, noPDFs):
        allStatements - dict {council: whatever findStatements returned}.
        noPDFs - list of councils for which nothing was found.
    '''
    councils = councilLinks_df.index.values.tolist()
    allStatements = {council: {} for council in councils}
    noPDFs = []
    print(yearL, yearH)

    for council in allStatements:
        # A fresh dict is handed in for every council. No extra search terms
        # are passed (an otherTerms list such as ['accounts'] could be added).
        found = findStatements(councilLink = councilLinks_df.loc[council]['Link'],
                               yearL = yearL,
                               yearH = yearH,
                               pdfs = {})
        allStatements[council] = found

        print("Got council: {} ({} links found)".format(council, len(found)))

        # Keep track of the councils that came back empty.
        if len(found) == 0:
            noPDFs.append(council)

    return allStatements, noPDFs


def conditionsDict(subStrVers, key):
    '''
    Checks if all terms of at least one group occur in key.

    Matching ignores case, in line with conditions(), which lower-cases the
    text it searches; without this a term such as 'accounts' would not be
    found in 'Statement of Accounts'.

    Input:
        subStrVers - a list of lists of strings (alternative term groups).
        key - the string to search (a link description or a URL).

    Outputs True if any group is fully contained in key, False otherwise.
    '''
    def _fold(value):
        # Only strings are case-folded; anything else is compared as given.
        return value.lower() if isinstance(value, str) else value

    haystack = _fold(key)

    return any(all(_fold(term) in haystack for term in group)
               for group in subStrVers)



def checkStatement(pdfs, yearL, yearH, otherTerms = []):
    '''
    Pick the first link in pdfs whose description or URL matches the year.

    Input:
        pdfs - dict {description: url}.
        yearL, yearH - four-character year strings, e.g. '2017', '2018'.
        otherTerms - extra terms every match must contain.

    Outputs the URL (a string) of the first matching entry, or an empty
    list when no entry matches.
    '''
    testTerms = makeSearchTerms(yearL, yearH, other = otherTerms)

    # Every entry is tested; matching URLs are kept in dict order.
    matches = []
    for description, url in pdfs.items():
        if conditionsDict(testTerms, description) or conditionsDict(testTerms, url):
            matches.append(url)

    # First hit if there is one, otherwise the (empty) list itself.
    return matches[0] if matches else matches
  

def checkAllStatements(allStatements, yearL, yearH, otherTerms = []):
    '''
    Replace every council's dict of links by the result of checkStatement().

    Input:
        allStatements - dict {council: {description: url}}; modified in place.
        yearL, yearH - four-character year strings, e.g. '2017', '2018'.
        otherTerms - extra terms passed on to checkStatement().

    Outputs the same allStatements dict that was passed in.
    '''
    # Snapshot of the keys; only the values are reassigned below.
    for council in list(allStatements):
        candidates = allStatements[council]
        allStatements[council] = checkStatement(candidates, yearL, yearH,
                                                otherTerms = otherTerms)

    return allStatements


def countLinks(allStatements):
    '''
    Print how many councils (LAs) have no links at all.

    Input:
        allStatements - dict {council: links}; an entry counts as empty
                        when len() of its value is 0.

    Outputs nothing; the count is printed.
    '''
    emptyCount = sum(1 for council in allStatements
                     if len(allStatements[council]) == 0)

    print("\tLAs with 0 links:", emptyCount)

In [None]:
# Get statements from 2017 - 2018
st_17_18, missing_17_18 = getStatements('2017', '2018')

# Run some checks on a deep copy, so the raw results in st_17_18 stay as found.
# st_17_18_checked is needed by the count and save cells further down.
st_17_18_checked = copy.deepcopy(st_17_18)
st_17_18_checked = checkAllStatements(st_17_18_checked, '2017', '2018')

In [None]:
# Statements for the 2018 - 2019 accounts.
st_18_19, missing_18_19 = getStatements('2018', '2019')

# Check a deep copy so the raw results in st_18_19 are left as found.
st_18_19_checked = checkAllStatements(copy.deepcopy(st_18_19), '2018', '2019')

In [None]:
# Statements for the 2019 - 2020 accounts.
st_19_20, missing_19_20 = getStatements('2019', '2020')

# Check a deep copy so the raw results in st_19_20 are left as found.
st_19_20_checked = checkAllStatements(copy.deepcopy(st_19_20), '2019', '2020')

In [None]:
# Raw and checked results per financial year, in reporting order.
yearlyResults = [
    ("2017 - 2018", st_17_18, st_17_18_checked),
    ("2018 - 2019", st_18_19, st_18_19_checked),
    ("2019 - 2020", st_19_20, st_19_20_checked),
]

for position, (label, before, after) in enumerate(yearlyResults):
    # Blank lines between the years (not before the first one).
    if position > 0:
        print("\n\n")

    print("\nCounts before checks {} (# LA):".format(label))
    countLinks(before)

    print("\nCounts after checks {} (# LA):".format(label))
    countLinks(after)

In [None]:
import json

def save_statements(statements, filename):
    '''
    Write statements to filename as JSON text (the file is overwritten).

    Input:
        statements - JSON-serialisable object.
        filename - path of the file to write.
    '''
    with open(filename, 'w') as handle:
        serialised = json.dumps(statements)
        handle.write(serialised)
        
def load_statements(filename):
    '''
    Read a JSON document from filename.

    Input:
        filename - path of a file holding JSON text.

    Outputs the decoded object.
    '''
    with open(filename) as handle:
        return json.load(handle)


In [None]:
# Save the checked statements of every year (json-like documents).
for statements, filename in [(st_17_18_checked, 'st_17_18'),
                             (st_18_19_checked, 'st_18_19'),
                             (st_19_20_checked, 'st_19_20')]:
    save_statements(statements, filename)