### Scraping NLB books that I have bookmarked 

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from multiprocessing import Pool
from concurrent.futures import ThreadPoolExecutor

from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By

from bs4 import BeautifulSoup as bs
from itertools import cycle
from glob import glob
from tqdm import tqdm
import pandas as pd
import numpy as np

import warnings
import pygsheets
import math
import time
import re
import os

In [3]:
# Notebook-wide configuration.
# Silence every Python warning for the rest of the session
# (note: this also hides deprecation notices from pandas/selenium).
warnings.filterwarnings('ignore')
# Let pandas display up to 1000 characters per cell before truncating.
pd.set_option('display.max_colwidth', 1000)

In [4]:
from nlb_fun import *

### Clean files first 
- If you have run this script before, the information for each book was saved as an rtf file on your local machine. 
- To ensure that there are no overlaps, these rtf files are checked and removed every time you re-run the script.

In [5]:
# Collect leftovers from earlier runs so they cannot leak into this run.
# Besides legacy *.rtf files, the scraper below writes one "<book id>.csv"
# per book; an interrupted run leaves those behind and they would otherwise
# be picked up again by the later glob("*.csv") aggregation.
# Only purely numeric CSV names are matched so unrelated CSVs are not touched.
stale_rtf_files = glob("*.rtf")
stale_book_csvs = [name for name in glob("*.csv") if re.fullmatch(r"\d+\.csv", name)]
file_list = stale_rtf_files + stale_book_csvs
len(file_list)

0

In [6]:
# Delete every stale file collected in file_list so this run starts clean.
for stale_path in file_list:
    os.remove(stale_path)

### Go to start the scraping 

In [7]:
# Start the Chrome WebDriver used for the logged-in bookmark pages.
# activate_chrome_selenium_latest is not defined in this notebook — presumably
# it comes from the `nlb_fun` star-import; it is called here in headless mode
# with have_pic=False.
browser = activate_chrome_selenium_latest(have_pic=False, is_headless=True)



Current google-chrome version is 94.0.4606
Get LATEST driver version for 94.0.4606
Driver [/Users/cliff/.wdm/drivers/chromedriver/mac64/94.0.4606.61/chromedriver] found in cache


In [8]:
# Path to the credentials CSV is taken from the `nlb_login` environment
# variable (raises KeyError if it is not set), so no secrets live in the notebook.
auth_csv_file: str = os.environ['nlb_login']

# The CSV must have a `values` column: row 0 holds the account name,
# row 1 holds the password.
info = pd.read_csv(auth_csv_file)
account_name: str = info['values'][0]
password: str = info['values'][1]

# log_in_nlb is not defined in this notebook (presumably from nlb_fun);
# its return value replaces `browser` for the rest of the notebook.
browser = log_in_nlb(browser, account_name, password)

### Hit the bookmarks page once to get the number of page iterations needed 

In [9]:
# Open the bookmarks page in the logged-in session and parse the rendered
# HTML with BeautifulSoup (html5lib parser) so the record count can be read.
url_link = "https://www.nlb.gov.sg/mylibrary/Bookmarks"    
browser.get(url_link)
soup = bs(browser.page_source, 'html5lib')

In [10]:
# Number of bookmarks assumed per page when converting the record count
# into a page count.
RECORDS_PER_PAGE = 20

# The page contains a <div> whose text includes "Showing"; the second-to-last
# space-separated token of that text is read as the total number of records.
showing_divs = soup.find_all("div", text=re.compile("Showing"))
if not showing_divs:
    # Fail with a readable message instead of an opaque IndexError.
    raise RuntimeError(
        "No 'Showing ...' banner found on the bookmarks page; "
        "the page may not have loaded or the login may have failed."
    )

max_records = float(showing_divs[0].text.split(" ")[-2])
# One entry per page: range(1, number_of_pages + 1).
range_list = range(1, int(math.ceil(max_records / RECORDS_PER_PAGE)) + 1)
range_list

range(1, 4)

### Loop through the pages! 

In [11]:
# Maps page index -> de-duplicated list of book URLs found on that page.
book_urls_dict = dict()

# Page 0: the bookmarks landing page itself.
browser.get("https://www.nlb.gov.sg/mylibrary/Bookmarks")
soup = bs(browser.page_source, 'html5lib')
book_urls_dict[0] = list(set(get_book_urls_on_page(soup)))
time.sleep(1)

# The fifth button in the pager area is clicked to advance one page.
next_page_xpath = '//*[@id="bookmark-folder-content"]/div[2]/button[5]'

for i in range_list:
    try:
        # find_element(By.XPATH, ...) replaces find_element_by_xpath, which
        # was removed in Selenium 4.
        browser.find_element(By.XPATH, next_page_xpath).click()
        time.sleep(1)  # give the next page a moment to render
        soup = bs(browser.page_source, 'html5lib')
        book_urls_dict[i] = list(set(get_book_urls_on_page(soup)))
    except Exception as exc:
        # Any failure ends paging, as before, but KeyboardInterrupt is no
        # longer swallowed and the reason for stopping is reported.
        print(f"Stopped paging at iteration {i}: {type(exc).__name__}")
        break

In [12]:
# Flatten the per-page URL lists into one list, in page order (0, 1, 2, ...).
all_book_url_lists = [
    book_url
    for page_index in range(0, len(book_urls_dict))
    for book_url in book_urls_dict[page_index]
]

In [13]:
# Total number of book URLs collected across all scraped pages.
len(all_book_url_lists)

60

In [14]:
# quit() ends the whole WebDriver session (all windows plus the driver
# process); close() would only close the current window.
browser.quit()

#### Note
- This is a troublesome script that goes to each link that I have and checks whether I am on the page with the correct book info. If not, it means that I still need to do more clicking.
- **Brace yourself.** Because this portion of the code goes through each book to get the relevant information, this part can be quite slow if you have quite a few books in your bookmarks.

In [15]:
def return_needed_url(id_: str):
    """Build the eservice item-holding URL for the book with identifier `id_`."""
    holding_url_template = "https://eservice.nlb.gov.sg/item_holding.aspx?id={}&type=bid&app=mylibrary"
    return holding_url_template.format(id_)

def nlb_page_crawl(selenium_browser, url_link: str, wait_seconds: float = 10):
    """
    Load the item-holding page for the book referenced by `url_link` and parse it.

    The book id is the last run of digits found in `url_link`; it is turned
    into the eservice URL via `return_needed_url`.

    Parameters
    ----------
    selenium_browser : Selenium WebDriver used to load the page.
    url_link : bookmark URL that contains the numeric book id.
    wait_seconds : fixed pause after navigation before the page source is
        read (defaults to the previous hard-coded 10 seconds).

    Returns
    -------
    BeautifulSoup document built from the page source with the html5lib parser.

    Raises
    ------
    ValueError : if `url_link` contains no digits at all.
    """
    digit_runs = re.findall(r'\d+', url_link)
    if not digit_runs:
        # Clearer than the IndexError the [-1] lookup would raise.
        raise ValueError(f"No numeric book id found in URL: {url_link!r}")
    new_url = return_needed_url(digit_runs[-1])

    selenium_browser.get(new_url)
    # Fixed wait so the page has time to finish rendering before it is parsed.
    time.sleep(wait_seconds)
    return bs(selenium_browser.page_source, 'html5lib')

def extract_table_values(selenium_obj, tag: str) -> list:
    """
    Return every element matching `tag` inside `selenium_obj` as a plain list.

    `selenium_obj` only needs a `find_all(tag)` method; the elements are
    returned in the order `find_all` yields them.
    """
    return [element for element in selenium_obj.find_all(tag)]

def extract_table_values_nested(selenium_obj, input_1, input_2):
    """
    Collect nested tags from an HTML table.

    For each `input_1` element found in `selenium_obj` (e.g. a row), return
    the list of `input_2` elements found inside it (e.g. its cells). The
    result is a list of lists, one inner list per outer element.
    """
    return [
        extract_table_values(outer_element, input_2)
        for outer_element in extract_table_values(selenium_obj, input_1)
    ]

In [16]:
def extract_info_from_nlb_book_page(url_link: str, selenium_browser):
    """
    Scrape the holdings table for one book and save it as "<book id>.csv".

    The page is loaded through `nlb_page_crawl`. Its holdings table (class
    'table table-bordered table-striped table-list') becomes a DataFrame,
    the cell text is cleaned, and the columns Title / Library / Call No /
    Due Date are written to a CSV named after the last run of digits in
    `url_link`.

    Pages without an <h4> title or without a holdings table are reported as
    probable eBooks and skipped. Any other failure is reported with its cause
    instead of being silently swallowed. Nothing is returned.

    Parameters
    ----------
    url_link : bookmark URL containing the numeric book id.
    selenium_browser : Selenium WebDriver used to load the page.
    """
    # Label for messages; replaced by the real title once the page is parsed,
    # so the error path never touches an unbound or None object.
    label = url_link

    try:
        selenium_page = nlb_page_crawl(selenium_browser, url_link)

        heading = selenium_page.find('h4')
        if heading is None:
            print(f"Most probably hit an eBook {label}")
            return
        label = heading.text

        holdings_tables = selenium_page.find_all(class_='table table-bordered table-striped table-list')
        if not holdings_tables:
            print(f"Most probably hit an eBook {label}")
            return

        # When several tables match, the last one wins, as in the original loop.
        holdings = holdings_tables[-1]
        col_names = extract_table_values(holdings, 'th')
        col_values = extract_table_values_nested(holdings, 'tr', 'td')

        bk_table = pd.DataFrame([[x for x in i] for i in col_values])
        bk_table.columns = [i.text for i in col_names]
        bk_table['Title'] = label
        # Rows without cells (e.g. the header row) have no Library value.
        # .copy() avoids mutating a view of the unfiltered frame below.
        bk_table = bk_table[bk_table.Library.notnull()].copy()

        # Cells are still HTML elements here: keep only the text after the
        # last ": " separator.
        for col in ["Call No", "Status/Desc", "Due Date", "Library"]:
            bk_table[col] = [i.text.split(": ")[-1] for i in bk_table[col]]

        # Drop everything from the first "B3"/"B2" marker in the library text.
        bk_table['Library'] = [i.split("B3")[0].split("B2")[0] for i in bk_table['Library']]
        # A due date of '-' means the copy is on the shelf.
        bk_table['Due Date'] = ["Available" if i == '-' else i for i in bk_table['Due Date']]

        id_ = re.findall(r'\d+', url_link)[-1]
        bk_table[['Title', 'Library', 'Call No', 'Due Date']].to_csv(f"{id_}.csv", index=False)

        print(f"{label} is done")

    except Exception as exc:
        # Best-effort per book: report and move on, but say what went wrong
        # and do not swallow KeyboardInterrupt.
        print(f"Most probably hit an eBook {label} ({type(exc).__name__}: {exc})")

In [17]:
# sel_browser = activate_chrome_selenium_latest(have_pic=False, is_headless=True)

# for url in tqdm(all_book_url_lists[-5:]):
#     try:
#         extract_info_from_nlb_book_page(url, sel_browser)
#     except:
#         print(f"Most probably hit an eBook {url}")

# sel_browser.close()

In [18]:
def setup_workers():
    """
    Scrape every URL in `all_book_url_lists` using a small pool of browsers.

    Three Chrome drivers are started and three worker threads process the
    URLs. Each task borrows an idle driver from a queue and returns it when
    done, so a driver is never used by two threads at once. All drivers are
    quit at the end, even if something fails.
    """
    import queue  # stdlib; imported locally so this cell stays self-contained

    workers = 3
    files = all_book_url_lists

    drivers = []
    try:
        for _ in range(workers):
            drivers.append(activate_chrome_selenium_latest(have_pic=False, is_headless=False))

        # Pool of currently idle drivers. Pairing URLs with cycle(drivers)
        # could hand the same driver to two tasks running at the same time.
        idle_drivers = queue.Queue()
        for driver in drivers:
            idle_drivers.put(driver)

        def _crawl_with_idle_driver(url_link):
            """Borrow a driver, scrape one book, and always give the driver back."""
            driver = idle_drivers.get()
            try:
                extract_info_from_nlb_book_page(url_link, driver)
            except Exception as exc:
                # Keep the run best-effort: one bad page must not cancel the rest.
                print(f"Failed to process {url_link}: {type(exc).__name__}: {exc}")
            finally:
                idle_drivers.put(driver)

        with ThreadPoolExecutor(max_workers=workers) as executor:
            # list() drains the lazy map so every task is awaited here.
            list(executor.map(_crawl_with_idle_driver, files))
    finally:
        # Iterate the finite list, not an endless cycle(), and quit each driver
        # once. The `with` block has already shut the executor down.
        for driver in drivers:
            driver.quit()

    print("Process has ended")

In [19]:
# Run the multi-browser scrape; writes one "<book id>.csv" per processed book.
setup_workers()



Current google-chrome version is 94.0.4606
Get LATEST driver version for 94.0.4606
Driver [/Users/cliff/.wdm/drivers/chromedriver/mac64/94.0.4606.61/chromedriver] found in cache


Current google-chrome version is 94.0.4606
Get LATEST driver version for 94.0.4606
Driver [/Users/cliff/.wdm/drivers/chromedriver/mac64/94.0.4606.61/chromedriver] found in cache


Current google-chrome version is 94.0.4606
Get LATEST driver version for 94.0.4606
Driver [/Users/cliff/.wdm/drivers/chromedriver/mac64/94.0.4606.61/chromedriver] found in cache


Value proposition design : how to create products and services customers want. get started with... / written by Alex Osterwalder, Yves Pigneur, Greg Bernarda, Alan Smith ; designed by Trish Papadakos. is doneReact design patterns and best practices : design, build and deploy production-ready web applications using standard industry practices / Carlos Santana Roldán. is done

Most probably hit an eBook Genius makers : the mavericks who brought A.I. to Google, Facebook, and the world / Cade Metz.
React Hooks in action : with suspense and concurrent mode / John Larsen. is done
Genius makers : the mavericks who brought A.I. to Google, Facebook, and the world / Cade Metz. is done
Clean code in JavaScript : develop reliable, maintainable, and robust JavaScript / James Padolsey. is done
React 17 design patterns and best practices : design, build, and deploy production-ready web applications using industry-standard practices / Carlos Santana Roldan. is done
Most probably hit an eBook IQ, EQ, 

KeyboardInterrupt: 

In [20]:
# Every CSV in the working directory is treated as a per-book result file.
csv_files = glob("*.csv")
# Number of result files found.
len(csv_files)

56

In [21]:
# Read every per-book CSV and stack them in one pass.
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# each iteration; pd.concat builds the result once.
book_frames = [pd.read_csv(filename) for filename in csv_files]
# pd.concat raises on an empty list, so keep the old "empty frame" result
# when no CSVs were found.
final_table = pd.concat(book_frames) if book_frames else pd.DataFrame()

In [22]:
# (rows, columns) of the combined holdings table.
final_table.shape

(729, 4)

In [23]:
# Peek at the first row to confirm the column layout.
final_table.head(1)

Unnamed: 0,Title,Library,Call No,Due Date
0,"IQ, EQ, DQ : new intelligence in the AI age / Yuhyun Park.",Ang Mo Kio Public Library,303.4833 PAR,Available


### Taking locally saved files and loading them into Google Sheets 

In [24]:
# Rename by explicit mapping rather than by position. Re-running this cell
# after the reorder below then cannot mislabel columns, because names that
# are already renamed are simply left alone.
final_table = final_table.rename(columns={
    'Title': 'title',
    'Library': 'library',
    'Call No': 'number',
    'Due Date': 'availability',
})
# Put the library first for the sheet layout.
final_table = final_table[['library', 'title', 'number', 'availability']]

In [25]:
# Number of distinct titles in the combined table.
distinct_titles = final_table['title'].drop_duplicates()
unique_book_count = len(distinct_titles)
unique_book_count

56

### Thinking about how to include testing into my script

In [26]:
# final_table[~final_table.availability.isin(['Available', 'For Reference Only'])]

In [27]:
# Keep only rows that have an availability value and are not reference-only.
has_availability = final_table.availability.notnull()
is_reference_only = final_table.availability == "For Reference Only"
final_table = final_table[has_availability & ~is_reference_only]

### Thinking about testing my code 

In [28]:
# Sanity check: shape of the rows with a missing availability; (0, 4) is the
# expected result after the filter in the previous cell.
final_table[final_table.availability.isnull()].shape

(0, 4)

In [29]:
# Sanity check: shape of the rows with a missing call number.
final_table[final_table['number'].isnull()].shape

(0, 4)

In [30]:
# Show the rows with a missing call number, if any, for inspection.
final_table[final_table['number'].isnull()]

Unnamed: 0,library,title,number,availability


### Processing 

In [31]:
# Keep only the part of the title before the first " | ".
final_table['title'] = [raw_title.split(" | ")[0] for raw_title in final_table['title']]
# Strip the language labels that are fused into the call number.
final_table['number'] = [
    call_no.replace("English", "").replace("Chinese", "")
    for call_no in final_table['number']
]
# Copies in the repository collection are marked as reference-only.
in_repository = final_table.library == "Repository Used Book Collection"
final_table.loc[in_repository, 'availability'] = "For Reference Only"
# Drop the author part after the first "/" and trim surrounding whitespace.
final_table['title'] = [short_title.split(r"/")[0].strip() for short_title in final_table['title']]

In [32]:
# Holdings at Bishan Public Library only, ordered by availability.
is_bishan = final_table.library == "Bishan Public Library"
ffinal_table = final_table[is_bishan].sort_values('availability')
ffinal_table.shape

(36, 4)

### Cleaning Bookmarks Sheet 

In [33]:
# Path to the Google service-account credentials file comes from the
# `gsheet_cred` environment variable (KeyError if it is not set).
google_auth = os.environ['gsheet_cred']
# Authorise pygsheets with that service file and open the spreadsheet
# titled 'NLB Project'.
gc = pygsheets.authorize(service_file=google_auth)
sh = gc.open('NLB Project')

### Checking just Bishan library

In [34]:
# The "Bookmarks" worksheet holds the Bishan-only view.
bishan = sh.worksheet_by_title("Bookmarks")
# Clear previously uploaded cells (range 'A2:E1000') before writing fresh data.
bishan.clear('A2:E1000')

# Substring match: any library whose name contains "Bishan".
bishan_table = final_table[final_table.library.str.contains("Bishan")]
# Write the frame with (1,1) as the start cell.
bishan.set_dataframe(bishan_table,(1,1))

### Checking in all libraries

In [35]:
# The "All" worksheet holds holdings for every library.
all_ = sh.worksheet_by_title("All")
# Clear previously uploaded cells (range 'A2:F1000') before writing fresh data.
all_.clear('A2:F1000') 

# Write the full table with (1,1) as the start cell.
all_.set_dataframe(final_table,(1,1))

### [Link](https://docs.google.com/spreadsheets/d/1s5oYU59jyU_QO3IIhCClyWGoC_MpW9L_h4l4djDUKO0/edit#gid=1021888748) to my Google Sheet

In [36]:
# The per-book CSVs have been uploaded; remove them from the working directory.
for csv_path in csv_files:
    os.remove(csv_path)

### Popular books 
- Identify popular books

In [37]:
# total = final_table.groupby('title').availability.count().reset_index()
# available = final_table[final_table.availability == "Available"].groupby('title').availability.count().reset_index()
# total.columns = ['title', 'total']
# available.columns = ['title', 'avail_count']
# total.merge(available)

In [38]:
# Distinct titles that have at least one copy marked 'Available'.
is_available = final_table.availability == 'Available'
available_books = list(set(final_table.loc[is_available, 'title']))

In [39]:
# Rows for titles with no 'Available' copy in any library.
final_table[~final_table.title.isin(available_books)]

Unnamed: 0,library,title,number,availability
