# Scottish Widows Document Scraping

For the literature library search:  
https://adviser.scottishwidows.co.uk/literature-library.html

For specific search criteria, for example *guides*:  
https://adviser.scottishwidows.co.uk/literature-library.html?n=1000&filter=swe:literaturelibrary/contenttype/guides

In [None]:
import io
import os
from urllib.parse import urljoin, urlparse

import pandas as pd
import PyPDF2
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm


# Let pandas display up to 100 rows and 100 columns of a DataFrame.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100


## PDF Content Extraction

In [None]:
# Using pypdf2 to read a pdf uri
# Exploratory cell: download one sample PDF and print the text of every page.

pdf_url = "https://adviser.scottishwidows.co.uk/assets/literature/docs/42365.pdf"
#pdf_url = "https://adviser.scottishwidows.co.uk//assets/literature/docs/27316.pdf"

response = requests.get(url=pdf_url)

# Sanity check: continue only when the server answered 200 OK.
assert response.status_code == requests.codes.ok


# Wrap the downloaded bytes in an in-memory stream and hand it to PdfReader.
pdf_reader = PyPDF2.PdfReader( io.BytesIO(response.content) )

print(f"Total pages: {len(pdf_reader.pages)}")

# enumerate() is 0-based, so i+1 gives the human-readable page number.
for i, page in enumerate(pdf_reader.pages):
    page_text = page.extract_text()
    print(f"Page {i+1}: {page_text}")


In [None]:
def get_pdf_pages(pdf_url, timeout=30):
    """Extract the text of a PDF page by page.

    Args:
        pdf_url: Location of the PDF. A plain filesystem path or a ``file://``
            URL is read from disk; any other scheme is downloaded with
            ``requests``.
        timeout: Seconds to wait for the server when downloading; passed to
            ``requests.get``. Not used for local files.

    Returns:
        A DataFrame with one row per page and the columns ``page_number``
        (1-based) and ``page_text``. Both columns are present even when the
        PDF has no pages.

    Raises:
        FileNotFoundError: If a local path does not exist.
        requests.HTTPError: If the download is not answered with 200 OK.
    """
    url_parsed = urlparse(pdf_url)
    if url_parsed.scheme in ("file", ""):  # possibly a local file
        # Explicit raise instead of ``assert``: asserts vanish under ``python -O``.
        if not os.path.exists(url_parsed.path):
            raise FileNotFoundError(f"PDF file not found: {url_parsed.path}")
        pdf_file = url_parsed.path
    else:  # possibly a remote url, need to fetch it first
        response = requests.get(url=pdf_url, timeout=timeout)
        if response.status_code != requests.codes.ok:
            raise requests.HTTPError(
                f"Unexpected status {response.status_code} for {pdf_url}",
                response=response,
            )
        pdf_file = io.BytesIO(response.content)

    pdf_reader = PyPDF2.PdfReader(pdf_file)

    # Passing ``columns`` keeps the schema stable for a zero-page document.
    return pd.DataFrame(
        [
            {"page_number": i + 1, "page_text": page.extract_text()}
            for i, page in enumerate(pdf_reader.pages)
        ],
        columns=["page_number", "page_text"],
    )


In [None]:
# Alternative input: a PDF stored on disk instead of a URL.
# local_file = "../data/56036.pdf"
# df = get_pdf_pages(local_file)

# Extract the sample PDF into a page-per-row DataFrame.
df = get_pdf_pages(pdf_url)

# Preview the first pages.
df.head()

## Search the literature library

In [None]:
# single pdf
# https://adviser.scottishwidows.co.uk/literature-library.html?filter=swe:literaturelibrary/contenttype/guides#search

# search default: 10 items in a page
search_url = "https://adviser.scottishwidows.co.uk/literature-library.html?filter=swe:literaturelibrary/contenttype/guides#search"

# search and display all with number of items set to 1000
# NOTE: this second assignment replaces the one above; only this URL is used below.
search_url = "https://adviser.scottishwidows.co.uk/literature-library.html?n=1000&filter=swe:literaturelibrary/contenttype/guides"


In [None]:
# Download the search results page.
search_response = requests.get(url=search_url)

# Parse the raw HTML bytes with the "html.parser" backend.
soup = BeautifulSoup(search_response.content, "html.parser")

#search_response.content
# Print the <title> element as a quick check of what was fetched.
print(soup.title)

### All the links in a page

In [None]:
# Print the page title, then count and list every anchor element on the page.
print(soup.title.string)

# all links in the page (collected once and reused for the count and the listing)
all_links = soup.find_all('a')
nb_links = len(all_links)
print(f"There are {nb_links} links in this page.\n")

# text from the page
#print(soup.get_text())

# A plain loop: the prints are side effects, so no throwaway list is built.
for a in all_links:
    print(a)

In [None]:
# all the resulting pdf files are in the anchor elements with "title" class
# find_all(class_="title") returns every element carrying that CSS class.
download_links = soup.find_all(class_="title")

print(f"Total downlaodable links: {len(download_links)}")

# Show the first match to inspect its markup.
download_links[0]

In [None]:
# For the first result, print the link target and the visible link text.
print(download_links[0].get("href"))
print(download_links[0].string.strip())

In [None]:
# Collect the href of every result link and print it as an absolute URL.
pdf_uris = [download_link.get("href") for download_link in download_links]

# urljoin resolves the href against the site root, so an href that already
# starts with "/" no longer yields a doubled slash ("...co.uk//assets/...").
for uri in pdf_uris:
    print(urljoin("https://adviser.scottishwidows.co.uk/", uri))

In [None]:
def get_all_pdf_links(entry_page_url, timeout=30):
    """Collect the PDF links listed on a literature-library results page.

    Args:
        entry_page_url: URL of the results page to scan.
        timeout: Seconds to wait for the server; passed to ``requests.get``.

    Returns:
        A DataFrame with the columns ``title`` (link text, whitespace
        stripped) and ``url`` (absolute link target), one row per element
        that has the CSS class ``title`` and an ``href`` attribute. Both
        columns are present even when nothing matches.

    Raises:
        requests.HTTPError: If the page request returns a 4xx/5xx status.
    """
    site_root = "https://adviser.scottishwidows.co.uk/"

    response = requests.get(url=entry_page_url, timeout=timeout)
    # Fail loudly instead of parsing an error page into an empty result.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # href=True skips "title" elements that carry no link target at all.
    download_links = soup.find_all(class_="title", href=True)

    return pd.DataFrame(
        [
            {
                # get_text() also works when the element has several children,
                # where ``.string`` would be None.
                "title": pdf_link.get_text(strip=True),
                # urljoin handles root-relative, relative and absolute hrefs.
                "url": urljoin(site_root, pdf_link["href"]),
            }
            for pdf_link in download_links
        ],
        columns=["title", "url"],
    )


In [None]:
# Build the title/url table for every document on the search results page.
pdf_urls_df = get_all_pdf_links(search_url)
# Preview the last rows.
pdf_urls_df.tail()

In [None]:
# (rows, columns) of the collected link table.
pdf_urls_df.shape

### Validating the end points

In [None]:
def check_url_exist(url, timeout=30):
    """Report whether a GET request to *url* is answered with 200 OK.

    Args:
        url: The endpoint to probe.
        timeout: Seconds to wait for the server; passed to ``requests.get``.

    Returns:
        True when the response status is 200. False for any other status and
        when the request itself fails (invalid URL, connection error, timeout).
    """
    try:
        # stream=True defers the body download, so probing a large PDF only
        # costs the headers; the ``with`` block releases the connection.
        with requests.get(url=url, stream=True, timeout=timeout) as response:
            return response.status_code == requests.codes.ok
    except requests.RequestException:
        # An endpoint that cannot be reached is treated as not existing.
        return False

def clean_pdf_urls(urls_df):
    """Keep only the rows of *urls_df* whose ``url`` passes ``check_url_exist``.

    The check runs once per row, sequentially (ToDo: do it in parallel).
    The input frame is not modified; the filtered selection is returned.
    """
    url_column = urls_df["url"]
    is_valid = url_column.apply(check_url_exist)
    kept_rows = urls_df.loc[is_valid]
    return kept_rows

def is_encrypted(pdf_url, timeout=30):
    """Download a PDF and report whether PyPDF2 flags it as encrypted.

    Args:
        pdf_url: URL of the PDF to download.
        timeout: Seconds to wait for the server; passed to ``requests.get``.

    Returns:
        The ``is_encrypted`` attribute of the ``PyPDF2.PdfReader`` built from
        the downloaded bytes.

    Raises:
        requests.HTTPError: If the download is not answered with 200 OK.
    """
    response = requests.get(url=pdf_url, timeout=timeout)
    # Explicit raise instead of ``assert``: asserts vanish under ``python -O``.
    if response.status_code != requests.codes.ok:
        raise requests.HTTPError(
            f"Unexpected status {response.status_code} for {pdf_url}",
            response=response,
        )

    pdf_reader = PyPDF2.PdfReader(io.BytesIO(response.content))

    return pdf_reader.is_encrypted

# Try the helpers: encryption flag of the sample PDF ...
print(is_encrypted(pdf_url))

# ... and URL validation on the first ten collected links.
clean_pdf_urls(pdf_urls_df[0:10])

In [None]:
# Same filter as clean_pdf_urls on the first ten rows, written inline:
# .loc receives a callable that builds the boolean mask from the sliced frame.
pdf_urls_df[0:10].loc[lambda _s: _s.url.apply(check_url_exist)]

In [None]:
#%%timeit -n 1 -r 1

# Validate every collected URL (check_url_exist issues one request per row).
df1 = clean_pdf_urls(pdf_urls_df)

# Shapes before and after filtering, to see how many rows were dropped.
(pdf_urls_df.shape, df1.shape)

In [None]:
#%%timeit -n 1 -r 1

# Apply is_encrypted to every validated URL; each call downloads the PDF.
# NOTE: despite its name, df2 is a Series (one value per row of df1).
df2 = df1.url.apply(is_encrypted)

df2.shape

In [None]:
# some files are encrypted
# df2 is used as a mask here: keep the rows of df1 flagged by is_encrypted.
df1.loc[df2]

## Collect all the PDF Contents

In [None]:
# Dry run on the first two links: extract the pages and tag them with the
# document title taken from the link table.
for index, row in pdf_urls_df.iloc[0:2].iterrows():
    print(f"-------------{index}----")
    print(get_pdf_pages(row.url).assign(title=row.title))


In [None]:

# Extract the pages of the first ten documents into one DataFrame, skipping
# URLs that fail check_url_exist.
# NOTE: check_url_exist and get_pdf_pages each issue their own GET, so every
# kept URL is requested twice.
df = pd.concat(
    [
        get_pdf_pages(row.url).assign(title=row.title)
        for index, row in pdf_urls_df.iloc[0:10].iterrows() if check_url_exist(row.url)
    ],
    axis=0, 
    ignore_index=True)  # ignore_index renumbers the rows 0..n-1 across documents

print(df.shape)
df.head()

In [None]:
# Preview the last rows of the combined frame.
df.tail()

In [None]:
#%%timeit -n 1 -r 1 # about 3.5 minutes to run
import time

# Same collection as the ten-document trial, now over every link in
# pdf_urls_df, with the elapsed wall-clock seconds printed at the end.
# NOTE: check_url_exist and get_pdf_pages each issue their own GET, so every
# kept URL is requested twice.
start = time.time()
guides_df = pd.concat(
    [
        get_pdf_pages(row.url).assign(title=row.title)
        for index, row in pdf_urls_df.iterrows() if check_url_exist(row.url)
    ],
    axis=0, 
    ignore_index=True)
print(time.time() - start)

print(guides_df.shape)

guides_df.tail()

In [None]:
# Per-column memory in bytes; deep=True also measures the contents of object columns.
guides_df.memory_usage(deep=True)

In [None]:
# Path of the parquet file used below to store the extracted guide pages.
all_guides_file = "../data/scottish_widows_all_guides.pq"

In [None]:
# Write the collected pages to disk in parquet format.
guides_df.to_parquet(all_guides_file)

In [None]:
# Read the file back (reusing the name df2) to verify the round trip.
df2 = pd.read_parquet(all_guides_file)
df2.head()

In [None]:
# (rows, columns) of the reloaded frame.
df2.shape

In [None]:
# Raises an AssertionError if the reloaded frame differs from the in-memory one.
pd.testing.assert_frame_equal(df2, guides_df)

## Scratch

In [None]:
# Scratch cell: candidate PDFs for trying get_pdf_pages by hand; only the
# uncommented try_url is used.
#try_url = "https://adviser.scottishwidows.co.uk//assets/literature/docs/42365.pdf"
#try_url = "https://adviser.scottishwidows.co.uk//assets/literature/docs/fsaSWplcFSAReturn2007.pdf"
#try_url = "https://adviser.scottishwidows.co.uk//assets/literature/docs/27316.pdf"
#try_url = "https://adviser.scottishwidows.co.uk//assets/literature/docs/28742a.pdf"
#try_url = "https://adviser.scottishwidows.co.uk//assets/literature/docs/56241.pdf"

#try_url = "https://adviser.scottishwidows.co.uk/assets/literature/docs/52125.pdf"
#try_url = "https://adviser.scottishwidows.co.uk/assets/literature/docs/56696.pdf"
try_url = "https://adviser.scottishwidows.co.uk/assets/literature/docs/56036.pdf"
    
get_pdf_pages(try_url)