### Scraping NLB books that I have bookmarked 

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [3]:
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from glob import glob
from tqdm import tqdm
import pandas as pd
import warnings
import pygsheets
import math
import time
import re
import os

In [4]:
# Some notebook configs
# Silence every warning raised by later cells.
warnings.filterwarnings('ignore')
# Let pandas display up to 1000 characters per column before truncating.
pd.set_option('display.max_colwidth', 1000)

In [5]:
from nlb_fun import *

### Clean files first 
- If you have run this script before, the information for each book was saved as an RTF file on your local machine. 
- To ensure that there are no overlaps, these RTF files are found and removed every time you re-run the script.

In [6]:
# List the .rtf files in the current working directory and show how many there are.
# NOTE(review): the per-book results written later in this notebook use a .csv
# extension, so this pattern never matches them — confirm whether stale *.csv
# results from earlier runs should be cleared here as well.
file_list = glob("*.rtf")
len(file_list)

0

In [7]:
# Delete every file collected in file_list.
for files in file_list:
    os.remove(files)

### Go to start the scraping 

In [8]:
# Start the browser session used for the bookmark pages.
# The helper is not defined in this notebook; presumably it comes from nlb_fun.
browser = activate_chrome_selenium_latest(have_pic=False, is_headless=False)



Current google-chrome version is 94.0.4606
Get LATEST driver version for 94.0.4606
Driver [/Users/cliff/.wdm/drivers/chromedriver/mac64/94.0.4606.61/chromedriver] found in cache


In [9]:
# Path of the credentials CSV, read from the `nlb_login` environment variable.
auth_csv_file: str = os.environ['nlb_login']

info = pd.read_csv(auth_csv_file)
# The first two entries of the `values` column are used as account name and password.
account_name: str = info['values'][0]
password: str = info['values'][1]

# Log in; whatever the helper returns replaces `browser`.
browser = log_in_nlb(browser, account_name, password)

### Hit the bookmarks page to get the number of page iterations needed 

In [10]:
# Open the bookmarks page and parse the rendered HTML with the html5lib parser.
url_link = "https://www.nlb.gov.sg/mylibrary/Bookmarks"    
browser.get(url_link)
soup = bs(browser.page_source, 'html5lib')

In [11]:
# Take the second-to-last space-separated word of the first <div> whose text
# matches "Showing" and convert it to a number.
# NOTE(review): presumably this word is the total record count — verify against the page.
max_records = float(soup.find_all("div", text=re.compile("Showing"))[0].text.split(" ")[-2])
# One iteration per 20 records, rounded up (assumes 20 bookmarks per page — TODO confirm).
range_list = range(1, int(math.ceil(max_records / 20)) + 1)
range_list

range(1, 5)

### Loop through the pages! 

In [12]:
# Collect the book URLs of every bookmarks page.
# Key 0 holds the page loaded directly; key i holds the page shown after the i-th click.
book_urls_dict = dict()

browser.get("https://www.nlb.gov.sg/mylibrary/Bookmarks")
soup = bs(browser.page_source, 'html5lib')
book_urls_dict[0] = list(set(get_book_urls_on_page(soup)))
time.sleep(1)

for i in range_list:
    try:
        # find_element(By.XPATH, ...) replaces find_element_by_xpath, which newer
        # Selenium releases no longer provide.
        # NOTE(review): button[6] is presumably the "next page" control — confirm.
        browser.find_element(By.XPATH, '//*[@id="bookmark-folder-content"]/div[2]/button[6]').click()
    except WebDriverException as exc:
        # A missing or unclickable button is treated as the end of the listing.
        # Only Selenium errors stop the loop; anything else surfaces instead of
        # being swallowed silently.
        print(f"Stopped paging at iteration {i}: {type(exc).__name__}")
        break
    # Give the page a moment to render before reading its source.
    time.sleep(1)
    soup = bs(browser.page_source, 'html5lib')
    book_urls_dict[i] = list(set(get_book_urls_on_page(soup)))

In [13]:
# Flatten the per-page URL lists into a single list, following key order 0, 1, 2, ...
all_book_url_lists = [
    book_url
    for page_no in range(len(book_urls_dict))
    for book_url in book_urls_dict[page_no]
]

In [14]:
# Number of book URLs collected across all pages.
len(all_book_url_lists)

74

In [15]:
# The bookmark listing is done; close this browser window.
browser.close()

#### Note
- This is a troublesome script: it goes to each link that I have and checks whether I am on the page with the correct book info. If not, it means that I still need to do more clicking.
- **Brace yourself.** Because this portion of the code goes through each book to get the relevant information, it can be quite slow if you have many books in your bookmarks.

In [16]:
def return_needed_url(id_: str):
    """Return the eservice item-holding URL with ``id_`` filled in as the ``id`` query value."""
    holdings_url = f"https://eservice.nlb.gov.sg/item_holding.aspx?id={id_}&type=bid&app=mylibrary"
    return holdings_url

def nlb_page_crawl(selenium_browser, url_link: str, wait_seconds: float = 10):
    """
    Load the item-holding page for the book behind ``url_link`` and parse it.

    The last run of digits in ``url_link`` is used as the book id.

    Parameters
    ----------
    selenium_browser : browser object exposing ``get`` and ``page_source``.
    url_link : URL that contains the book id as its last run of digits.
    wait_seconds : seconds to sleep after requesting the page, before its
        source is read. Defaults to the previously hard-coded 10.

    Returns
    -------
    The page source parsed with the ``html5lib`` parser.

    Raises
    ------
    ValueError
        If ``url_link`` contains no digits, so no id can be extracted.
    """
    digit_runs = re.findall(r'\d+', url_link)
    if not digit_runs:
        raise ValueError(f"No numeric id found in URL: {url_link!r}")
    new_url = return_needed_url(digit_runs[-1])

    selenium_browser.get(new_url)
    # Fixed pause so the page can finish rendering before its source is read.
    time.sleep(wait_seconds)
    return bs(selenium_browser.page_source, 'html5lib')

def extract_table_values(selenium_obj, tag: str) -> list:
    """
    Collect every element that ``selenium_obj.find_all(tag)`` yields.

    The matches are returned as a plain list, in the order they are yielded.
    """
    return [element for element in selenium_obj.find_all(tag)]

def extract_table_values_nested(selenium_obj, input_1, input_2):
    """
    Two-level extraction for nested table tags.

    For each ``input_1`` element found in ``selenium_obj``, gather the
    ``input_2`` elements inside it. Returns one inner list per outer element.
    """
    return [
        extract_table_values(outer_element, input_2)
        for outer_element in extract_table_values(selenium_obj, input_1)
    ]

In [43]:
def extract_info_from_nlb_book_page(selenium_browser, url_link :str):
    """
    Scrape the holdings table of one book and save it as ``<id>.csv``.

    The page is fetched with ``nlb_page_crawl``. The table with class
    ``table table-bordered table-striped table-list`` is turned into a
    DataFrame, the page's first ``<h4>`` text is added as ``Title``, and the
    ``Title``, ``Library``, ``Call No`` and ``Due Date`` columns are written
    to a CSV in the working directory. The CSV is named after the last run of
    digits in ``url_link``.

    Parameters
    ----------
    selenium_browser : browser object passed through to ``nlb_page_crawl``.
    url_link : book URL containing the numeric id.

    Raises
    ------
    ValueError
        If the page holds no matching table.
    """
    selenium_page = nlb_page_crawl(selenium_browser, url_link)

    tables = selenium_page.find_all(class_= 'table table-bordered table-striped table-list')
    if not tables:
        # Fail with a clear message instead of hitting unbound locals further down.
        raise ValueError(f"No holdings table found for {url_link}")

    # If several tables match, the last one is used.
    holdings = tables[-1]
    col_names = extract_table_values(holdings, 'th')
    col_values = extract_table_values_nested(holdings, 'tr', 'td')

    bk_table = pd.DataFrame([list(cells) for cells in col_values])
    bk_table.columns = [header.text for header in col_names]
    bk_table['Title'] = selenium_page.find('h4').text
    # Drop rows with no Library cell (presumably the header row, which has no <td>).
    # .copy() makes the later column assignments act on an independent frame.
    bk_table = bk_table[bk_table.Library.notnull()].copy()

    # Keep only the text after the last ": " in each cell.
    for col in ["Call No", "Status/Desc", "Due Date", "Library"]:
        bk_table[col] = [cell.text.split(": ")[-1] for cell in bk_table[col]]

    # Cut the library name at the first "B3" / "B2" marker.
    bk_table['Library'] = [name.split("B3")[0].split("B2")[0] for name in bk_table['Library']]
    # A due date of "-" is recorded as "Available".
    bk_table['Due Date'] = ["Available" if due == '-' else due for due in bk_table['Due Date']]

    id_ = re.findall(r'\d+', url_link)[-1]
    bk_table[['Title', 'Library', 'Call No', 'Due Date']].to_csv(f"{id_}.csv", index=False)

In [44]:
sel_browser = activate_chrome_selenium_latest(have_pic=False, is_headless=False)

try:
    # Only the last five collected URLs are processed here.
    # NOTE(review): confirm whether the [-5:] slice is intended or a leftover from testing.
    for url in tqdm(all_book_url_lists[-5:]):
        try:
            extract_info_from_nlb_book_page(sel_browser, url)
        except Exception as exc:
            # Best effort: report the book and the actual reason, then continue.
            # Exception (not a bare except) lets Ctrl-C still interrupt the loop.
            print(f"Most probably hit an eBook {url} ({type(exc).__name__}: {exc})")
finally:
    # Close the browser window even if the loop is interrupted.
    sel_browser.close()



Current google-chrome version is 94.0.4606
Get LATEST driver version for 94.0.4606
Driver [/Users/cliff/.wdm/drivers/chromedriver/mac64/94.0.4606.61/chromedriver] found in cache
100%|██████████| 5/5 [01:03<00:00, 12.62s/it]


In [36]:
def list_book_csv_files(directory: str = "") -> list:
    """
    Return the per-book result CSVs found in ``directory``, sorted by path.

    The scraper names each result ``<id>.csv``, where the id is a run of
    digits. Only CSVs whose file stem is all digits are returned, so unrelated
    CSVs in the same folder are not loaded as book data. An empty
    ``directory`` means the current working directory.
    """
    pattern = os.path.join(directory, "*.csv")
    return sorted(
        path
        for path in glob(pattern)
        if re.fullmatch(r'\d+', os.path.splitext(os.path.basename(path))[0])
    )

csv_files = list_book_csv_files()

In [38]:
# Read every per-book CSV and stack the frames into one table.
# The frames are read first and concatenated once: DataFrame.append is gone in
# pandas 2.0 and re-copies the growing frame on every iteration.
book_frames = [pd.read_csv(filename) for filename in csv_files]
# pd.concat raises on an empty list, so keep the empty frame the loop used to produce.
final_table = pd.concat(book_frames) if book_frames else pd.DataFrame()

In [39]:
# (rows, columns) of the combined table.
final_table.shape

(45, 5)

In [40]:
# Peek at the first row to check the columns that were loaded.
final_table.head(1)

Unnamed: 0.1,Unnamed: 0,Title,Library,Call No,Due Date
0,1,Indistractable : how to control your attention and choose your life / Nir Eyal with Julie Li.,['Ang Mo Kio Public Library'],153.8 EYA,Available


In [None]:
from multiprocessing import Pool
import requests
from bs4 import BeautifulSoup

base_url = 'http://quotes.toscrape.com/page/'

all_urls = list()

def generate_urls():
    """Append the URLs for pages 1 through 10 of ``base_url`` to ``all_urls``."""
    for i in range(1,11):
        all_urls.append(base_url + str(i))

def scrape(url):
    """Fetch ``url`` and print the response status code and final URL."""
    # Without a timeout a stalled connection would block its worker indefinitely.
    res = requests.get(url, timeout=10)
    print(res.status_code, res.url)

generate_urls()

# The context manager terminates the pool on exit, including when map() raises,
# so worker processes are not left behind.
with Pool(10) as p:
    p.map(scrape, all_urls)
p.join()

### Taking locally saved files and loading into Google 

In [23]:
# Rename by column name rather than by position. A positional assignment fails
# (or mislabels data) as soon as a CSV carries extra columns, such as the
# `Unnamed: 0` index columns visible in the saved output above.
# Re-running the cell is harmless: names that are already lowercase are left alone.
final_table = final_table.rename(columns={
    'Title': 'title',
    'Library': 'library',
    'Call No': 'number',
    'Due Date': 'availability',
})
# Keep only the four working columns, in the order used downstream.
final_table = final_table[['library', 'title', 'number', 'availability']]

In [24]:
# Count distinct titles; a missing title counts as one value of its own.
unique_book_count = final_table['title'].nunique(dropna=False)
unique_book_count

2

### Thinking about how to include testing into my script

In [None]:
# final_table[~final_table.availability.isin(['Available', 'For Reference Only'])]

In [None]:
# Keep rows that have an availability value and are not reference-only copies.
keep_mask = final_table.availability.notnull() & (final_table.availability != "For Reference Only")
final_table = final_table[keep_mask]

### Thinking about testing my code 

In [None]:
# Sanity check: shape of the subset of rows whose availability is null.
final_table[final_table.availability.isnull()].shape

In [None]:
# Sanity check: shape of the subset of rows whose call number is null.
final_table[final_table['number'].isnull()].shape

In [None]:
# Show the rows whose call number is null.
final_table[final_table['number'].isnull()]

### Processing 

In [None]:
def _strip_title(raw_title):
    """Keep the title text before the first " | " and before the first "/", trimmed."""
    # Missing or non-string cells (e.g. NaN from an empty CSV field) pass through
    # untouched instead of raising AttributeError.
    if not isinstance(raw_title, str):
        return raw_title
    return raw_title.split(" | ")[0].split("/")[0].strip()

def _strip_language(call_number):
    """Remove the words "English" and "Chinese" from a call number."""
    if not isinstance(call_number, str):
        return call_number
    return call_number.replace("English", "").replace("Chinese", "")

final_table['title'] = final_table['title'].map(_strip_title)
final_table['number'] = final_table['number'].map(_strip_language)
# Copies held in the Repository Used Book Collection are marked as reference-only.
final_table.loc[final_table.library == "Repository Used Book Collection", 'availability'] = "For Reference Only"

In [None]:
# Rows for Bishan Public Library only, ordered by availability.
is_bishan = final_table.library == "Bishan Public Library"
ffinal_table = final_table[is_bishan].sort_values('availability')
ffinal_table.shape

### Cleaning Bookmarks Sheet 

In [None]:
# Authorise pygsheets with the service file named by the `gsheet_cred`
# environment variable, then open the spreadsheet titled 'NLB Project'.
google_auth = os.environ['gsheet_cred']
gc = pygsheets.authorize(service_file=google_auth)
sh = gc.open('NLB Project')

### Checking just Bishan library

In [None]:
# Open the "Bookmarks" worksheet and clear the range A2:E1000.
bishan = sh.worksheet_by_title("Bookmarks")
bishan.clear('A2:E1000')

# Write the rows whose library name contains "Bishan", passing (1,1) as the position.
bishan_table = final_table[final_table.library.str.contains("Bishan")]
bishan.set_dataframe(bishan_table,(1,1))

### Checking in all libraries

In [None]:
# Open the "All" worksheet and clear the range A2:F1000.
all_ = sh.worksheet_by_title("All")
all_.clear('A2:F1000') 

# Write the full table, passing (1,1) as the position.
all_.set_dataframe(final_table,(1,1))

### [Link](https://docs.google.com/spreadsheets/d/1s5oYU59jyU_QO3IIhCClyWGoC_MpW9L_h4l4djDUKO0/edit#gid=1021888748) to my Google Sheet

### Popular books 
- Identify popular books

In [None]:
# total = final_table.groupby('title').availability.count().reset_index()
# available = final_table[final_table.availability == "Available"].groupby('title').availability.count().reset_index()
# total.columns = ['title', 'total']
# available.columns = ['title', 'avail_count']
# total.merge(available)

In [None]:
# Distinct titles that have at least one row marked 'Available'.
is_available = final_table.availability == 'Available'
available_books = list(set(final_table.loc[is_available, 'title'].tolist()))

In [None]:
# Rows for titles that have no 'Available' row at all.
final_table[~final_table.title.isin(available_books)]