In [1]:
%load_ext autoreload
%autoreload 2

from multiprocessing import Pool
from concurrent.futures import ThreadPoolExecutor

from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By

from bs4 import BeautifulSoup as bs
from itertools import cycle
from glob import glob
import pandas as pd
import numpy as np
import rpa as r

import warnings
import pygsheets
import math
import time
import re
import os

# Notebook-wide configuration.
# Silence every Python warning for the rest of the session; note that this
# also hides deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')
# Allow up to 1000 characters per cell when pandas displays a frame, so long
# values such as URLs are not truncated.
pd.set_option('display.max_colwidth', 1000)

In [2]:
from nlb_fun import *

In [4]:
# Start the rpa session in headless mode.
r.init(headless_mode=True)

# Point the rpa browser at the NLB login page.
url_link = "https://cassamv2.nlb.gov.sg/cas/login"
r.url(url_link)

# Path of the CSV file with the login details, read from the `nlb_login`
# environment variable (raises KeyError when the variable is not set).
auth_csv_file: str = os.environ['nlb_login']

In [5]:
# The credentials CSV is expected to have a `values` column whose first entry
# is the account name and whose second entry is the password.
info = pd.read_csv(auth_csv_file)
account_name: str = info['values'][0]
password: str = info['values'][1]

# Type the credentials into the form fields, located by element id.
r.type('//*[@id="username"]', f'{account_name}')
r.type('//*[@id="password"]', f'{password}')

# XPath of the submit control: the fourth <input> inside the form's <section>.
login_button_2 = """//*[@id="fm1"]/section/input[4]"""
r.click(login_button_2)

True

### Hit the bookmarks page to get the number of page iterations needed

In [9]:
r.url("https://www.nlb.gov.sg/mylibrary/Bookmarks")
time.sleep(5)  # fixed wait so the bookmarks page can render before it is read

soup = bs(r.read('page'), 'html5')

# The total record count is the second-to-last space-separated token of the
# first <div> whose text contains "Showing". Look the element up once and fail
# with a readable message when it is absent instead of a bare IndexError.
showing_divs = soup.find_all("div", text=re.compile("Showing"))
if not showing_divs:
    raise RuntimeError(
        "Could not find the 'Showing ...' summary on the bookmarks page; "
        "check that the login succeeded and the page finished loading."
    )

max_records = float(showing_divs[0].text.split(" ")[-2])
# One entry per results page, numbered from 1 (the code assumes 20 records per page).
range_list = range(1, int(math.ceil(max_records / 20)) + 1)

# Index of the NEXT button inside the pagination list: two past the last page
# number. len() equals range_list[-1] for a non-empty range and, unlike
# indexing, does not raise when there are no records at all.
counter = len(range_list) + 2
print(counter)
range_list

In [11]:
# Key 0 holds the URLs of the page the rpa browser is showing right now
# (presumably the first bookmarks page); keys 1..counter hold the pages
# reached by successive clicks.
book_urls_dict = {}
soup = bs(r.read('page'), 'html5')
book_urls_dict[0] = list(set(get_book_urls_on_page(soup)))

# `counter` is constant for the whole loop, so the XPath is built once.
click_thru_pages = f'//*[@id="bookmark-folder-content"]/nav/ul/li[{counter}]/a'

# NOTE(review): the loop clicks `counter` times, which looks like more clicks
# than there are remaining pages - confirm this over-run is intended. Repeated
# URLs are removed later by set().
for i in range(1, counter + 1):
    print(i)
    time.sleep(2)
    r.click(click_thru_pages)
    time.sleep(2)
    soup = bs(r.read('page'), 'html5')
    book_urls_dict[i] = list(set(get_book_urls_on_page(soup)))

In [12]:
# Close the rpa session that was started with r.init() above.
r.close()

True

In [15]:
# Flatten the per-page URL lists into one list, visiting keys 0..len-1 in order.
all_book_url_lists = [
    book_url
    for page_no in range(len(book_urls_dict))
    for book_url in book_urls_dict[page_no]
]

# Unique count first, then the raw count (which includes cross-page repeats).
print(len(set(all_book_url_lists)))
print(len(all_book_url_lists))

#### Executing the main scraping of books 

In [None]:
def return_needed_url(id_: str) -> str:
    """Build the eservice item-holding URL for the given item id."""
    endpoint = "https://eservice.nlb.gov.sg/item_holding.aspx"
    return f"{endpoint}?id={id_}&type=bid&app=mylibrary"

def nlb_page_crawl(selenium_browser, url_link: str):
    """
    Load the item-holding page for a book and return it parsed.

    The last run of digits found in `url_link` is taken as the item id and
    turned into the holding-page URL via `return_needed_url`.

    Returns a `(parsed_page, holding_url)` tuple. Raises IndexError when
    `url_link` contains no digits.
    """
    item_id = re.findall(r'\d+', url_link)[-1]
    holding_url = return_needed_url(item_id)

    selenium_browser.get(holding_url)
    # Fixed wait before the page source is read.
    time.sleep(10)

    parsed_page = bs(selenium_browser.page_source, 'html5lib')
    return parsed_page, holding_url

def extract_table_values(selenium_obj, tag: str) -> list:
    """
    Return every element that `selenium_obj.find_all(tag)` yields, as a plain list.
    """
    return list(selenium_obj.find_all(tag))

def extract_table_values_nested(selenium_obj, input_1, input_2):
    """
    Collect nested tags from an HTML table structure.

    For each `input_1` element found under `selenium_obj`, gather the
    `input_2` elements inside it. The result is a list with one inner list
    per `input_1` element.
    """
    return [
        extract_table_values(outer, input_2)
        for outer in extract_table_values(selenium_obj, input_1)
    ]

In [None]:
def extract_info_from_nlb_book_page(url_link: str, selenium_browser):
    """
    Scrape one book's holdings table and save it as `<id>.csv` in the working directory.

    Parameters
    ----------
    url_link : str
        URL of the bookmarked book; its last run of digits is used as the item id.
    selenium_browser
        Browser object handed through to `nlb_page_crawl`.

    Returns
    -------
    None
        The result is written to disk. Failures are reported with `print`,
        including the underlying error, and are not raised, so one bad page
        does not stop a batch run. KeyboardInterrupt and SystemExit are
        deliberately not caught.
    """
    try:
        selenium_page, new_url = nlb_page_crawl(selenium_browser, url_link)
        time.sleep(5)

        tables = selenium_page.find_all(class_='table table-bordered table-striped table-list')
        if not tables:
            # Name the real cause instead of failing later on unbound variables.
            raise ValueError("holdings table not found on page")

        # When several tables match, the last one wins.
        holdings = tables[-1]
        col_names = extract_table_values(holdings, 'th')
        col_values = extract_table_values_nested(holdings, 'tr', 'td')

        bk_table = pd.DataFrame([list(row) for row in col_values])
        bk_table.columns = [header.text for header in col_names]
        bk_table['Title'] = selenium_page.find('h4').text
        # Rows without any <td> (such as a header row) come out empty and are
        # dropped here. .copy() makes the column assignments below act on an
        # independent frame.
        bk_table = bk_table[bk_table.Library.notnull()].copy()

        # Each cell is still an HTML element: keep the text after the last ": ".
        for col in ["Call No", "Status/Desc", "Due Date", "Library"]:
            bk_table[col] = [cell.text.split(": ")[-1] for cell in bk_table[col]]

        bk_table.loc[bk_table["Status/Desc"] == 'Reserved', 'Due Date'] = 'Reserved'
        # Keep only the library text that precedes the first "B3" or "B2" marker.
        bk_table['Library'] = [name.split("B3")[0].split("B2")[0] for name in bk_table['Library']]
        bk_table['Due Date'] = ["Available" if due == '-' else due for due in bk_table['Due Date']]
        bk_table['url'] = new_url

        id_ = re.findall(r'\d+', new_url)[-1]
        bk_table[['Title', 'Library', 'Call No', 'Due Date', 'url']].to_csv(f"{id_}.csv", index=False)

    except Exception as exc:
        # Rebuild the URL for the message without letting the handler itself
        # fail when url_link contains no digits.
        digit_runs = re.findall(r'\d+', url_link)
        failed_url = return_needed_url(digit_runs[-1]) if digit_runs else url_link
        print(f"Got issue with {failed_url}: {exc!r}")

In [None]:
def setup_workers(workers: int = 3, urls=None):
    """
    Scrape every bookmarked book in parallel, one browser per worker thread.

    Parameters
    ----------
    workers : int, default 3
        Maximum number of browsers / threads to run. Never more browsers are
        started than there are URLs to scrape.
    urls : iterable of str, optional
        URLs to scrape. Defaults to the notebook-level `all_book_url_lists`.

    The de-duplicated URLs are split into one batch per worker. Each batch is
    processed sequentially on its own browser, so a browser is never used by
    two threads at the same time. Every browser that was started is quit
    afterwards, even when a worker raises.
    """
    files = list(set(all_book_url_lists if urls is None else urls))
    worker_count = min(workers, len(files))
    if worker_count < 1:
        print("Process has ended")
        return

    def _scrape_batch(batch, driver):
        # Runs inside one worker thread: this thread is the only user of `driver`.
        for url in batch:
            extract_info_from_nlb_book_page(url, driver)

    drivers = []
    try:
        for _ in range(worker_count):
            drivers.append(activate_chrome_selenium_latest(have_pic=False, is_headless=False))

        # Round-robin split: batch k gets files[k], files[k + worker_count], ...
        batches = [files[offset::worker_count] for offset in range(worker_count)]

        with ThreadPoolExecutor(max_workers=worker_count) as executor:
            # list() drains the result iterator so that an exception raised in
            # a worker is re-raised here instead of being silently dropped.
            list(executor.map(_scrape_batch, batches, drivers))
    finally:
        # `drivers` is a finite list, so this loop terminates. A failure to
        # quit one browser must not prevent the others from being closed.
        for driver in drivers:
            try:
                driver.quit()
            except Exception as exc:
                print(f"Could not quit driver: {exc!r}")

    print("Process has ended")

In [None]:
# Run the scrape over all collected book URLs; each book that is scraped
# successfully is written to its own CSV file in the working directory.
setup_workers()

In [None]:
# Collect only the per-book files written by the scraper, which are named
# "<digits>.csv". A bare "*.csv" would also match unrelated CSV files in the
# working directory; those would then be merged into the table and deleted by
# the clean-up cell at the end. Sorting gives a stable row order between runs.
csv_files = sorted(
    path
    for path in glob("*.csv")
    if os.path.splitext(path)[0].isdigit()
)
len(csv_files)

In [None]:
# Read every per-book CSV and stack them in a single concat. DataFrame.append
# was removed in pandas 2.0, and calling it in a loop re-copied the whole table
# on every iteration. ignore_index gives the combined table one clean 0..n-1
# index instead of repeating each file's own row numbers.
book_frames = [pd.read_csv(filename) for filename in csv_files]
# pd.concat raises on an empty list, so fall back to an empty frame.
final_table = pd.concat(book_frames, ignore_index=True) if book_frames else pd.DataFrame()

final_table.shape

### Taking locally saved files and loading them into Google Sheets

In [None]:
# Rename by label, not by position, so that re-running this cell is safe.
# After the reorder below, a positional `columns = [...]` assignment would put
# the 'title' label on the library column and vice versa. rename() ignores
# labels that are already in their final form.
column_names = {
    'Title': 'title',
    'Library': 'library',
    'Call No': 'number',
    'Due Date': 'availability',
    'url': 'url',
}
final_table = final_table.rename(columns=column_names)
final_table = final_table[['library', 'title', 'number', 'availability', 'url']]
unique_book_count = len(final_table.title.drop_duplicates().tolist())
unique_book_count

### Thinking about how to include testing into my script

In [None]:
# Keep rows that have an availability value and are not reference-only copies.
has_availability = final_table.availability.notnull()
is_reference_only = final_table.availability == "For Reference Only"
final_table = final_table[has_availability & ~is_reference_only]

### Thinking about testing my code 

In [None]:
# Sanity check: shape of the rows that still have no availability value.
final_table.loc[final_table['availability'].isna()].shape

In [None]:
# Sanity check: shape of the rows that have no call number.
final_table.loc[final_table['number'].isna()].shape

In [None]:
# Display the rows that have no call number for manual inspection.
final_table.loc[final_table['number'].isna()]

### Processing 

In [None]:
# Titles: keep the text before the first " | ", then the text before the first
# "/", with surrounding whitespace removed.
final_table['title'] = [
    raw_title.split(" | ")[0].split(r"/")[0].strip()
    for raw_title in final_table['title']
]

# Rows from the "Repository Used Book Collection" are labelled reference-only.
final_table.loc[final_table.library == "Repository Used Book Collection", 'availability'] = "For Reference Only"

### Cleaning Bookmarks Sheet 

In [None]:
# Path of the Google service credentials file, read from the `gsheet_cred`
# environment variable (raises KeyError when the variable is not set).
google_auth = os.environ['gsheet_cred']
gc = pygsheets.authorize(service_file=google_auth)
# Open the spreadsheet titled 'NLB Project'.
sh = gc.open('NLB Project')

### Checking in all libraries

In [None]:
# Worksheet titled "All" inside the opened spreadsheet.
all_ = sh.worksheet_by_title("All")
# Clear the previous data before writing the fresh table.
# NOTE(review): pygsheets documents `clear` as taking separate `start` and
# `end` cell labels - confirm that the combined 'A2:F1000' string clears the
# intended range.
all_.clear('A2:F1000') 

# Write the table with its top-left corner at row 1, column 1.
all_.set_dataframe(final_table,(1,1))

### [Link](https://docs.google.com/spreadsheets/d/1s5oYU59jyU_QO3IIhCClyWGoC_MpW9L_h4l4djDUKO0/edit#gid=1021888748) to my Google Sheet

In [None]:
# Remove every file listed in csv_files from disk.
for csv_path in csv_files:
    os.remove(csv_path)