### Scraping NLB books that I have bookmarked 

In [1]:
%load_ext autoreload
%autoreload 2

from multiprocessing import Pool
from concurrent.futures import ThreadPoolExecutor

from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By

from bs4 import BeautifulSoup as bs
from itertools import cycle
from glob import glob
from tqdm import tqdm
import pandas as pd
import numpy as np

import warnings
import pygsheets
import math
import time
import re
import os

# Some notebook configs
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', 1000)

In [2]:
from nlb_fun import *

### Clean files first 
- If you have ran this script before, information from each book is saved as a rtf file in your local machine. 
- To ensure that there is no overlaps, these rtf files are checked and removed everytime you re-run your script

In [3]:
file_list = glob("*.rtf")
len(file_list)

0

In [4]:
for files in file_list:
    os.remove(files)

### Go to start the scraping 

In [5]:
browser = activate_chrome_selenium_latest(have_pic=False, is_headless=False)



Current google-chrome version is 104.0.5112
Get LATEST chromedriver version for 104.0.5112 google-chrome
Driver [/Users/cliff/.wdm/drivers/chromedriver/mac64/104.0.5112.79/chromedriver] found in cache


In [6]:
auth_csv_file: str = os.environ['nlb_login']

info = pd.read_csv(auth_csv_file)
account_name: str = info['values'][0]
password: str = info['values'][1]

browser = log_in_nlb(browser, account_name, password)

### Add hit to get number of page iterations needed 

In [57]:
url_link = "https://www.nlb.gov.sg/mylibrary/Bookmarks"
browser.get(url_link)
time.sleep(5)
soup = bs(browser.page_source, 'html5lib')

max_records = float(soup.find_all("div", text=re.compile("Showing"))[0].text.split(" ")[-2])
range_list = range(1, int(math.ceil(max_records / 20)) + 1)

# To indicate when the NEXT button is at
counter = range_list[-1] + 2
print(counter)
range_list

5


range(1, 4)

### Loop through the pages! 

In [58]:
book_urls_dict = dict()
# time.sleep(5)
# soup = bs(browser.page_source, 'html5lib')
book_urls_dict[0] = list(set(get_book_urls_on_page(soup)))
next_button = "//*[contains(text(), 'Next')]"

for i in range(1,counter-2):
    print(i)
    time.sleep(5)
    browser.find_element_by_xpath(next_button).click()
    soup = bs(browser.page_source, 'html5lib')
    book_urls_dict[i] = list(set(get_book_urls_on_page(soup)))
    time.sleep(2)

1


ElementClickInterceptedException: Message: element click intercepted: Element is not clickable at point (413, 2702)
  (Session info: chrome=104.0.5112.79)
Stacktrace:
0   chromedriver                        0x000000010f595149 chromedriver + 4469065
1   chromedriver                        0x000000010f51f233 chromedriver + 3985971
2   chromedriver                        0x000000010f1b5fe8 chromedriver + 409576
3   chromedriver                        0x000000010f1f3938 chromedriver + 661816
4   chromedriver                        0x000000010f1f14c3 chromedriver + 652483
5   chromedriver                        0x000000010f1eeb84 chromedriver + 641924
6   chromedriver                        0x000000010f1ed7f5 chromedriver + 636917
7   chromedriver                        0x000000010f1e1579 chromedriver + 587129
8   chromedriver                        0x000000010f209c62 chromedriver + 752738
9   chromedriver                        0x000000010f1e0e35 chromedriver + 585269
10  chromedriver                        0x000000010f209d6e chromedriver + 753006
11  chromedriver                        0x000000010f21c611 chromedriver + 828945
12  chromedriver                        0x000000010f209b53 chromedriver + 752467
13  chromedriver                        0x000000010f1df905 chromedriver + 579845
14  chromedriver                        0x000000010f1e0955 chromedriver + 584021
15  chromedriver                        0x000000010f5666ad chromedriver + 4277933
16  chromedriver                        0x000000010f56ab3a chromedriver + 4295482
17  chromedriver                        0x000000010f56fcdf chromedriver + 4316383
18  chromedriver                        0x000000010f56b857 chromedriver + 4298839
19  chromedriver                        0x000000010f54464f chromedriver + 4138575
20  chromedriver                        0x000000010f5861f8 chromedriver + 4407800
21  chromedriver                        0x000000010f58637f chromedriver + 4408191
22  chromedriver                        0x000000010f59ccb5 chromedriver + 4500661
23  libsystem_pthread.dylib             0x00007fff207f68fc _pthread_start + 224
24  libsystem_pthread.dylib             0x00007fff207f2443 thread_start + 15


In [None]:
book_urls_dict

In [None]:
for i in range_list:
    print(i)
    try:
        print(i)
        x_path = f'//*[@id="bookmark-folder-content"]/div[2]/button[{counter}]'
        print(x_path)
        time.sleep(7)
        browser.find_element_by_xpath(x_path).click()
        soup = bs(browser.page_source, 'html5lib')
        book_urls_dict[i] = list(set(get_book_urls_on_page(soup)))
    except:
        print("Reached the end")
        break

In [None]:
all_book_url_lists = list()
for i in range(0, len(book_urls_dict)):
    all_book_url_lists = all_book_url_lists + book_urls_dict[i]

In [None]:
len(all_book_url_lists)

In [None]:
browser.close()

#### Executing the main scraping of books 

In [None]:
def return_needed_url(id_: str):
    return f"https://eservice.nlb.gov.sg/item_holding.aspx?id={id_}&type=bid&app=mylibrary"

def nlb_page_crawl(selenium_browser, url_link: str):
    id_ = re.findall(r'\d+', url_link)[-1]
    new_url = return_needed_url(id_)

    selenium_browser.get(new_url)
    time.sleep(10)
    return bs(selenium_browser.page_source, 'html5lib'), new_url

def extract_table_values(selenium_obj, tag: str) -> list:
    """
    Extracting values from a HTML` table structure based on tag
    """
    output = []
    for col in selenium_obj.find_all(tag):
        output.append(col)
    return output

def extract_table_values_nested(selenium_obj, input_1, input_2):
    """
    Accounts for nested table tags in a HTML table.
    """
    output = []

    for i in extract_table_values(selenium_obj, input_1):
        output.append(extract_table_values(i, input_2))
        
    return output

In [None]:
# processed_urls = [return_needed_url(re.findall(r'\d+', url)[-1]) for url in all_book_url_lists]
# len(processed_urls)

In [None]:
def extract_info_from_nlb_book_page(url_link :str, selenium_browser):
    try:
        selenium_page, new_url = nlb_page_crawl(selenium_browser, url_link)

        for i in selenium_page.find_all(class_= 'table table-bordered table-striped table-list'):
            col_names = extract_table_values(i, 'th')
            col_values = extract_table_values_nested(i, 'tr', 'td')

        bk_table = pd.DataFrame([[x for x in i] for i in col_values])
        bk_table.columns = [i.text for i in col_names]
        bk_table['Title'] = selenium_page.find('h4').text
        bk_table = bk_table[bk_table.Library.notnull()]

        for col in ["Call No", "Status/Desc", "Due Date", "Library"]:
            bk_table[col] = [i.text.split(": ")[-1] for i in bk_table[col]]

        bk_table['Library'] = [i.split("B3")[0].split("B2")[0] for i in bk_table['Library']]
        bk_table['Due Date'] = ["Available" if i == '-' else i for i in bk_table['Due Date']]
        bk_table['url'] = new_url

        id_ = re.findall(r'\d+', new_url)[-1]
        bk_table[['Title', 'Library', 'Call No', 'Due Date', 'url']].to_csv(f"{id_}.csv", index=False)
    
    except:
        id_ = re.findall(r'\d+', url_link)[-1]
        new_url = return_needed_url(id_)
        print(f"Got issue with {new_url}")

In [None]:
# sel_browser = activate_chrome_selenium_latest(have_pic=False, is_headless=True)

# for url in tqdm(all_book_url_lists[-5:]):
#     try:
#         extract_info_from_nlb_book_page(url, sel_browser)
#     except:
#         print(f"Most probably hit an eBook {url}")

# sel_browser.close()

In [None]:
def setup_workers():
    workers = 3
    files = all_book_url_lists
    drivers = cycle([activate_chrome_selenium_latest(have_pic=False, is_headless=False) for _ in range(workers)])

    with ThreadPoolExecutor(max_workers=workers) as executor:
        executor.map(extract_info_from_nlb_book_page, files, drivers)

    [driver.quit() for driver in drivers]
    ThreadPoolExecutor.shutdownNow()
    print("Process has ended")

In [None]:
setup_workers()

In [None]:
csv_files = glob("*.csv")
len(csv_files)

In [None]:
final_table = pd.DataFrame()

for filename in csv_files:
    final_table = final_table.append(pd.read_csv(filename))

In [None]:
final_table.shape

In [None]:
final_table.head(1)

### Taking locally saved files and loading into Google 

In [None]:
final_table.columns = ['title', "library", "number", 'availability', 'url']
final_table = final_table[['library', 'title', 'number', 'availability', 'url']]

In [None]:
unique_book_count = len(final_table.title.drop_duplicates().tolist())
unique_book_count

### Thinking about how to include testing into my script

In [None]:
# final_table[~final_table.availability.isin(['Available', 'For Reference Only'])]

In [None]:
final_table = final_table[final_table.availability.notnull()]
final_table = final_table[final_table.availability != "For Reference Only"]

### Thinking about testing my code 

In [None]:
final_table[final_table.availability.isnull()].shape

In [None]:
final_table[final_table['number'].isnull()].shape

In [None]:
final_table[final_table['number'].isnull()]

### Processing 

In [None]:
final_table.title = [i.split(" | ")[0] for i in final_table.title]
# final_table['number'] = [i.replace("English", "").replace("Chinese", "") for i in final_table['number']]
final_table.loc[final_table.library == "Repository Used Book Collection", 'availability'] = "For Reference Only"
final_table['title'] = [i.split(r"/")[0].strip() for i in final_table['title']]

ffinal_table = final_table[(final_table.library=="Bishan Public Library")]
ffinal_table = ffinal_table.sort_values('availability')
ffinal_table.shape

### Cleaning Bookmarks Sheet 

In [None]:
google_auth = os.environ['gsheet_cred']
gc = pygsheets.authorize(service_file=google_auth)
sh = gc.open('NLB Project')

### Checking just Bishan library

In [None]:
bishan = sh.worksheet_by_title("Bookmarks")
bishan.clear('A2:E1000')

bishan_table = final_table[final_table.library.str.contains("Bishan")]
bishan.set_dataframe(bishan_table,(1,1))

### Checking in all libraries

In [None]:
all_ = sh.worksheet_by_title("All")
all_.clear('A2:F1000') 

all_.set_dataframe(final_table,(1,1))

### [Link](https://docs.google.com/spreadsheets/d/1s5oYU59jyU_QO3IIhCClyWGoC_MpW9L_h4l4djDUKO0/edit#gid=1021888748) to my Google Sheet

In [None]:
for files in csv_files:
    os.remove(files)

### Popular books 
- Identify popular books

In [None]:
# total = final_table.groupby('title').availability.count().reset_index()
# available = final_table[final_table.availability == "Available"].groupby('title').availability.count().reset_index()
# total.columns = ['title', 'total']
# available.columns = ['title', 'avail_count']
# total.merge(available)

In [None]:
available_books = list(set(final_table[final_table.availability == 'Available'].title.tolist()))

In [None]:
final_table[~final_table.title.isin(available_books)]