### Scraping NLB books that I have bookmarked 

In [44]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [45]:
from multiprocessing import Pool
from concurrent.futures import ThreadPoolExecutor

from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By

from bs4 import BeautifulSoup as bs
from itertools import cycle
from glob import glob
from tqdm import tqdm
import pandas as pd
import numpy as np

import warnings
import pygsheets
import math
import time
import re
import os

In [46]:
# Some notebook configs
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', 1000)

In [47]:
from nlb_fun import *

### Clean files first 
- If you have run this script before, information from each book is saved as an rtf file on your local machine. 
- To ensure that there are no overlaps, these rtf files are checked and removed every time you re-run the script

In [48]:
# Collect leftover per-book .rtf files from a previous run in the current directory.
# NOTE(review): the scraping function further down saves each book as "<id>.csv",
# not .rtf, so this pattern may not match leftovers from an interrupted run -
# confirm which extension is intended.
file_list = glob("*.rtf")
len(file_list)

0

In [49]:
# Delete every leftover file found by the glob so stale results are not mixed into this run.
for files in file_list:
    os.remove(files)

### Go to start the scraping 

In [50]:
# Start the Selenium Chrome session used for logging in and paging through bookmarks.
# The helper is not defined in this notebook (presumably it comes from the nlb_fun
# star-import); the flags request a headless browser with have_pic disabled.
browser = activate_chrome_selenium_latest(have_pic=False, is_headless=True)



Current google-chrome version is 94.0.4606
Get LATEST driver version for 94.0.4606
Driver [/Users/cliff/.wdm/drivers/chromedriver/mac64/94.0.4606.61/chromedriver] found in cache


In [51]:
# The path to the credentials CSV is read from the `nlb_login` environment variable
# (KeyError if it is not set), so no secrets are hard-coded in the notebook.
auth_csv_file: str = os.environ['nlb_login']

# The CSV needs a `values` column: label 0 holds the account name, label 1 the password.
info = pd.read_csv(auth_csv_file)
account_name: str = info['values'][0]
password: str = info['values'][1]

# log_in_nlb is not defined in this notebook (presumably from the nlb_fun star-import);
# its return value replaces `browser` for the cells that follow.
browser = log_in_nlb(browser, account_name, password)

### Hit the bookmarks page to get the number of page iterations needed 

In [52]:
# Load the bookmarks page once and parse it, so the next cell can read the
# "Showing ..." summary out of the HTML.
url_link = "https://www.nlb.gov.sg/mylibrary/Bookmarks"    
browser.get(url_link)
soup = bs(browser.page_source, 'html5lib')

In [53]:
# Locate the <div> whose text contains "Showing"; the second-to-last
# space-separated word of that text is read as the total number of records.
# `string=` is the current name of BeautifulSoup's deprecated `text=` filter.
summary_div = soup.find("div", string=re.compile("Showing"))
if summary_div is None:
    raise ValueError("Could not find the 'Showing ...' summary on the bookmarks page")
max_records = float(summary_div.text.split(" ")[-2])

# Records are divided into pages of 20 (presumably the site's page size - confirm).
n_pages = int(math.ceil(max_records / 20))
range_list = range(1, n_pages + 1)

# Index of the NEXT button inside the pager; it is interpolated into the XPath used
# by the paging loop. Computed from n_pages so that zero records does not raise.
counter = n_pages + 2
range_list

range(1, 4)

### Loop through the pages! 

In [54]:
# Maps a page counter to the unique book URLs found on that page (key 0 = first page).
book_urls_dict = dict()

browser.get("https://www.nlb.gov.sg/mylibrary/Bookmarks")
soup = bs(browser.page_source, 'html5lib')
book_urls_dict[0] = list(set(get_book_urls_on_page(soup)))
time.sleep(1)

for i in range_list:
    try:
        # find_element(By.XPATH, ...) replaces find_element_by_xpath, which was
        # deprecated and later removed in Selenium 4; this form also works on Selenium 3.
        browser.find_element(By.XPATH, f'//*[@id="bookmark-folder-content"]/div[2]/button[{counter}]').click()
        # Short pause before re-reading the page source after the click.
        time.sleep(1)
        soup = bs(browser.page_source, 'html5lib')
        book_urls_dict[i] = list(set(get_book_urls_on_page(soup)))
    except Exception as exc:
        # Best effort: stop paging as soon as the button cannot be clicked or the page
        # cannot be parsed. `Exception` (not a bare except) lets Ctrl-C through, and
        # the reason is reported instead of being dropped silently.
        print(f"Stopped paging at iteration {i}: {type(exc).__name__}")
        break

In [55]:
# Flatten the per-page lists in page order in a single pass (no repeated list
# concatenation). dict.fromkeys drops a URL that shows up on more than one page
# while keeping first-seen order, so no book is scraped twice.
all_book_url_lists = list(dict.fromkeys(
    url
    for i in range(0, len(book_urls_dict))
    for url in book_urls_dict[i]
))

In [56]:
len(all_book_url_lists)

60

In [57]:
# quit() ends the whole WebDriver session (every window plus the driver process);
# close() only closes the current window and can leave the driver process running.
browser.quit()

#### Executing the main scraping of books 

In [100]:
def return_needed_url(id_: str):
    """
    Build the eservice item-holding URL for the book with the given id.
    """
    holding_url = f"https://eservice.nlb.gov.sg/item_holding.aspx?id={id_}&type=bid&app=mylibrary"
    return holding_url

def nlb_page_crawl(selenium_browser, url_link: str, wait_seconds: float = 10):
    """
    Load the item-holding page for a bookmarked book and parse its HTML.

    Args:
        selenium_browser: Selenium WebDriver used to load the page.
        url_link: Bookmark URL; the last run of digits in it is used as the book id.
        wait_seconds: Seconds to pause after requesting the page before its source
            is read. Defaults to 10, the value that used to be hard-coded.

    Returns:
        A tuple ``(soup, new_url)``: the page source parsed with html5lib and the
        item-holding URL that was loaded.

    Raises:
        ValueError: if ``url_link`` contains no digits, so no id can be extracted.
    """
    digit_runs = re.findall(r'\d+', url_link)
    if not digit_runs:
        raise ValueError(f"No book id found in {url_link!r}")
    new_url = return_needed_url(digit_runs[-1])

    selenium_browser.get(new_url)
    # Fixed pause before reading page_source (presumably to let the page finish rendering).
    time.sleep(wait_seconds)
    return bs(selenium_browser.page_source, 'html5lib'), new_url

def extract_table_values(selenium_obj, tag: str) -> list:
    """
    Return every element matching `tag` found under the given parsed HTML node,
    as a plain list in document order.
    """
    return [element for element in selenium_obj.find_all(tag)]

def extract_table_values_nested(selenium_obj, input_1, input_2):
    """
    Handle nested tags in an HTML table: for every `input_1` element under the
    node, collect its `input_2` elements. Returns a list of lists.
    """
    return [
        extract_table_values(outer, input_2)
        for outer in extract_table_values(selenium_obj, input_1)
    ]

In [None]:
# processed_urls = [return_needed_url(re.findall(r'\d+', url)[-1]) for url in all_book_url_lists]
# len(processed_urls)

In [105]:
def extract_info_from_nlb_book_page(url_link :str, selenium_browser):
    """
    Scrape the holdings table of one bookmarked book and save it as "<id>.csv".

    The item-holding page is loaded through ``nlb_page_crawl``; the table with class
    'table table-bordered table-striped table-list' is turned into a DataFrame,
    cleaned up, and the columns Title, Library, Call No, Due Date and url are
    written to a CSV named after the book id in the current directory.

    Args:
        url_link: Bookmark URL of the book; its last run of digits is the book id.
        selenium_browser: Selenium WebDriver used to load the page.

    Returns:
        None. Any failure while scraping is reported with a "Got issue with ..."
        message (including the reason) instead of being raised, so one bad page
        does not stop a batch. KeyboardInterrupt and SystemExit still propagate.
    """
    try:
        selenium_page, new_url = nlb_page_crawl(selenium_browser, url_link)

        tables = selenium_page.find_all(class_= 'table table-bordered table-striped table-list')
        if not tables:
            raise ValueError("holdings table not found on page")
        # When several tables match, the last one is used (same as the previous loop).
        holdings = tables[-1]
        col_names = extract_table_values(holdings, 'th')
        col_values = extract_table_values_nested(holdings, 'tr', 'td')

        bk_table = pd.DataFrame([list(row) for row in col_values])
        bk_table.columns = [i.text for i in col_names]
        bk_table['Title'] = selenium_page.find('h4').text
        # Rows without a Library cell are dropped. .copy() makes the column
        # assignments below act on an independent frame rather than on a slice.
        bk_table = bk_table[bk_table.Library.notnull()].copy()

        # Cells are still HTML elements here: keep the text after the last ": ".
        for col in ["Call No", "Status/Desc", "Due Date", "Library"]:
            bk_table[col] = [i.text.split(": ")[-1] for i in bk_table[col]]

        # Cut the library name at the first "B3" / "B2" marker.
        bk_table['Library'] = [i.split("B3")[0].split("B2")[0] for i in bk_table['Library']]
        # A due date of '-' is recorded as "Available".
        bk_table['Due Date'] = ["Available" if i == '-' else i for i in bk_table['Due Date']]
        bk_table['url'] = new_url

        id_ = re.findall(r'\d+', new_url)[-1]
        bk_table[['Title', 'Library', 'Call No', 'Due Date', 'url']].to_csv(f"{id_}.csv", index=False)

    except Exception as exc:
        # Report which page failed and why. The handler must not raise itself,
        # so fall back to the raw link when it holds no digits.
        ids = re.findall(r'\d+', url_link)
        new_url = return_needed_url(ids[-1]) if ids else url_link
        print(f"Got issue with {new_url} ({type(exc).__name__}: {exc})")

In [102]:
# sel_browser = activate_chrome_selenium_latest(have_pic=False, is_headless=True)

# for url in tqdm(all_book_url_lists[-5:]):
#     try:
#         extract_info_from_nlb_book_page(url, sel_browser)
#     except:
#         print(f"Most probably hit an eBook {url}")

# sel_browser.close()

In [106]:
def setup_workers():
    """
    Scrape every URL in ``all_book_url_lists`` with a small pool of browser workers.

    One browser is started per worker. URL number ``i`` is handled by driver
    ``i % workers``, and each driver is driven by exactly one thread, so a
    WebDriver instance is never used from two threads at the same time. All
    drivers are quit when the work is done, including when an error is raised.
    """
    workers = 2
    files = all_book_url_lists
    drivers = []

    def _crawl_share(driver, urls):
        # Process one driver's share of the URLs sequentially on that driver.
        for url in urls:
            extract_info_from_nlb_book_page(url, driver)

    try:
        for _ in range(workers):
            drivers.append(activate_chrome_selenium_latest(have_pic=False, is_headless=False))

        # Strided slices: share k holds URLs k, k + workers, k + 2 * workers, ...
        shares = [files[start::workers] for start in range(workers)]

        with ThreadPoolExecutor(max_workers=workers) as executor:
            # list() drains the results so an exception raised in a worker
            # surfaces here instead of being silently discarded.
            list(executor.map(_crawl_share, drivers, shares))
    finally:
        # Iterate the finite list of drivers. The `with` block above has already
        # shut the executor down, so no separate shutdown call is needed.
        for driver in drivers:
            driver.quit()

    print("Process has ended")

In [107]:
setup_workers()



Current google-chrome version is 94.0.4606
Get LATEST driver version for 94.0.4606
Driver [/Users/cliff/.wdm/drivers/chromedriver/mac64/94.0.4606.61/chromedriver] found in cache


Current google-chrome version is 94.0.4606
Get LATEST driver version for 94.0.4606
Driver [/Users/cliff/.wdm/drivers/chromedriver/mac64/94.0.4606.61/chromedriver] found in cache


Got issue with https://eservice.nlb.gov.sg/item_holding.aspx?id=203977460&type=bid&app=mylibrary
Got issue with https://eservice.nlb.gov.sg/item_holding.aspx?id=205444828&type=bid&app=mylibrary
Got issue with https://eservice.nlb.gov.sg/item_holding.aspx?id=205459582&type=bid&app=mylibrary
Got issue with https://eservice.nlb.gov.sg/item_holding.aspx?id=203905600&type=bid&app=mylibrary


KeyboardInterrupt: 

In [108]:
# Every .csv in the current directory is treated as a per-book result file;
# the count shows how many books were scraped successfully.
csv_files = glob("*.csv")
len(csv_files)

59

In [109]:
# Read every per-book CSV and combine them in a single concat. DataFrame.append
# in a loop copies the growing frame each time and no longer exists in pandas 2.x.
# Row labels are kept as read (no ignore_index), as before; with no CSV files the
# result is an empty DataFrame rather than an error.
book_frames = [pd.read_csv(filename) for filename in csv_files]
final_table = pd.concat(book_frames) if book_frames else pd.DataFrame()

In [110]:
final_table.shape

(781, 5)

In [111]:
final_table.head(1)

Unnamed: 0,Title,Library,Call No,Due Date,url
0,"IQ, EQ, DQ : new intelligence in the AI age / Yuhyun Park.",Ang Mo Kio Public Library,303.4833 PAR,Available,https://eservice.nlb.gov.sg/item_holding.aspx?id=205513426&type=bid&app=mylibrary


### Taking locally saved files and loading into Google 

In [112]:
# Rename the columns by position. This relies on the column order written to each
# CSV by extract_info_from_nlb_book_page: Title, Library, Call No, Due Date, url.
final_table.columns = ['title', "library", "number", 'availability', 'url']
# Reorder so that the library comes first.
final_table = final_table[['library', 'title', 'number', 'availability', 'url']]

In [113]:
# Number of distinct titles across all scraped holdings.
unique_book_count = final_table.title.drop_duplicates().shape[0]
unique_book_count

59

### Thinking about how to include testing into my script

In [26]:
# final_table[~final_table.availability.isin(['Available', 'For Reference Only'])]

In [114]:
# Keep only the rows that have an availability value and are not reference-only copies.
has_availability = final_table.availability.notnull()
not_reference_only = final_table.availability != "For Reference Only"
final_table = final_table[has_availability & not_reference_only]

### Thinking about testing my code 

In [115]:
final_table[final_table.availability.isnull()].shape

(0, 5)

In [116]:
final_table[final_table['number'].isnull()].shape

(0, 5)

In [117]:
final_table[final_table['number'].isnull()]

Unnamed: 0,library,title,number,availability,url


### Processing 

In [118]:
# Keep only the part of the title before the first " | ".
final_table.title = [i.split(" | ")[0] for i in final_table.title]
# Remove the substrings "English" and "Chinese" from the call number.
final_table['number'] = [i.replace("English", "").replace("Chinese", "") for i in final_table['number']]
# NOTE(review): "For Reference Only" rows were already filtered out in an earlier
# cell, so the rows relabelled here stay in the table - confirm that is intended.
final_table.loc[final_table.library == "Repository Used Book Collection", 'availability'] = "For Reference Only"
# Keep only the text before the first "/" and trim surrounding whitespace (in the
# sample row shown earlier, the text after "/" is the author).
final_table['title'] = [i.split(r"/")[0].strip() for i in final_table['title']]

In [119]:
# Rows whose library is exactly "Bishan Public Library", sorted by availability;
# the shape gives the number of Bishan holdings.
ffinal_table = final_table[(final_table.library=="Bishan Public Library")]
ffinal_table = ffinal_table.sort_values('availability')
ffinal_table.shape

(39, 5)

### Cleaning Bookmarks Sheet 

In [120]:
# The path to the Google service file is read from the `gsheet_cred` environment
# variable (KeyError if it is not set), so no credentials live in the notebook.
google_auth = os.environ['gsheet_cred']
gc = pygsheets.authorize(service_file=google_auth)
# Open the spreadsheet titled 'NLB Project'.
sh = gc.open('NLB Project')

### Checking just Bishan library

In [121]:
# Clear the range A2:E1000 of the "Bookmarks" worksheet before writing, so rows
# from a previous run do not linger below the new data.
bishan = sh.worksheet_by_title("Bookmarks")
bishan.clear('A2:E1000')

# Any library whose name contains "Bishan" is included (substring match).
bishan_table = final_table[final_table.library.str.contains("Bishan")]
# Write the table starting at the top-left cell (1, 1).
bishan.set_dataframe(bishan_table,(1,1))

### Checking in all libraries

In [122]:
# Clear the range A2:F1000 of the "All" worksheet, then write the full table
# starting at the top-left cell (1, 1).
all_ = sh.worksheet_by_title("All")
all_.clear('A2:F1000') 

all_.set_dataframe(final_table,(1,1))

### [Link](https://docs.google.com/spreadsheets/d/1s5oYU59jyU_QO3IIhCClyWGoC_MpW9L_h4l4djDUKO0/edit#gid=1021888748) to my Google Sheet

In [123]:
# Remove the per-book CSV files collected in csv_files now that their contents
# have been uploaded.
for files in csv_files:
    os.remove(files)

### Popular books 
- Identify popular books

In [78]:
# total = final_table.groupby('title').availability.count().reset_index()
# available = final_table[final_table.availability == "Available"].groupby('title').availability.count().reset_index()
# total.columns = ['title', 'total']
# available.columns = ['title', 'avail_count']
# total.merge(available)

In [124]:
# Distinct titles that have at least one row marked 'Available'.
available_books = list(set(final_table[final_table.availability == 'Available'].title.tolist()))

In [125]:
# Rows for titles with no 'Available' copy in any library.
final_table[~final_table.title.isin(available_books)]

Unnamed: 0,library,title,number,availability,url
