### Scraping NLB books that I have bookmarked 

In [14]:
# Load IPython's autoreload extension and use mode 2, which re-imports
# modules before each cell is executed so edits to imported code take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
import math
import os
import re
import time
import warnings
from glob import glob

import numpy as np
import pandas as pd
import pygsheets
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
from tqdm import tqdm

In [16]:
# Notebook-wide settings: silence every warning and let pandas display cell
# values of up to 1000 characters without truncating them.
warnings.simplefilter('ignore')
pd.options.display.max_colwidth = 1000

In [17]:
from nlb_fun import *

### Clean files first 
- If you have run this script before, the information for each book was saved as an `.rtf` file on your local machine.
- To ensure that there are no overlaps, these `.rtf` files are listed and removed every time you re-run the script.

In [18]:
# Collect the .rtf files currently sitting in the working directory and show
# how many there are.
rtf_pattern = "*.rtf"
file_list = glob(rtf_pattern)
len(file_list)

0

In [19]:
# Delete every file found above so this run starts from a clean directory.
for stale_rtf in file_list:
    os.unlink(stale_rtf)

### Go to start the scraping 

In [20]:
# Create the Selenium browser session used by every scraping cell below.
# NOTE(review): activate_chrome_selenium_latest presumably comes from nlb_fun;
# have_pic / is_headless look like "load images" and "run without a window"
# switches — confirm against that module.
browser = activate_chrome_selenium_latest(have_pic=False, is_headless=False)



Current google-chrome version is 93.0.4577
Get LATEST driver version for 93.0.4577
Driver [/Users/cliff/.wdm/drivers/chromedriver/mac64/93.0.4577.63/chromedriver] found in cache


In [21]:
# The `nlb_login` environment variable holds the path of a CSV file whose
# `values` column stores the account name in row 0 and the password in row 1,
# so no credentials appear in the notebook itself.
auth_csv_file = os.environ['nlb_login']
info = pd.read_csv(auth_csv_file)

account_name, password = info.loc[0, 'values'], info.loc[1, 'values']

# Sign in with the existing session and keep whatever handle the helper returns.
browser = log_in_nlb(browser, account_name, password)

### Load the bookmarks page once to work out how many page iterations are needed

In [22]:
# Open the bookmarks page and parse the rendered HTML with html5lib; the next
# cell reads the record count out of this soup.
url_link = "https://www.nlb.gov.sg/mylibrary/Bookmarks"
browser.get(url_link)
soup = bs(markup=browser.page_source, features='html5lib')

In [23]:
# Take the first <div> whose text matches "Showing" and read its
# second-to-last space-separated word as the total number of records.
showing_div = soup.find_all("div", text=re.compile("Showing"))[0]
max_records = float(showing_div.text.split(" ")[-2])

# Dividing by 20 turns the record count into a page count
# (presumably 20 bookmarks are listed per page — confirm on the site).
page_total = int(math.ceil(max_records / 20))
range_list = range(1, page_total + 1)
range_list

range(1, 5)

### Loop through the pages! 

In [35]:
# Collect the bookmarked book URLs page by page. Key 0 is the page shown right
# after loading the bookmarks URL; keys 1, 2, ... are the pages reached by
# clicking the pagination button.
book_urls_dict = dict()

browser.get("https://www.nlb.gov.sg/mylibrary/Bookmarks")
soup = bs(browser.page_source, 'html5lib')
book_urls_dict[0] = list(set(get_book_urls_on_page(soup)))
time.sleep(1)

# Pagination button clicked on every iteration (presumably "next page").
next_page_xpath = '//*[@id="bookmark-folder-content"]/div[2]/button[6]'

for i in range_list:
    try:
        # find_element(By.XPATH, ...) exists in both Selenium 3 and 4, whereas
        # the find_element_by_* helpers were removed in Selenium 4.3.
        browser.find_element(By.XPATH, next_page_xpath).click()
        time.sleep(1)  # give the next page a moment to render before parsing
        soup = bs(browser.page_source, 'html5lib')
        book_urls_dict[i] = list(set(get_book_urls_on_page(soup)))
    except Exception as exc:
        # Best effort: stop paging at the first failure, but say why.
        # Catching Exception (not a bare except) keeps Ctrl-C working and makes
        # real errors visible instead of silently ending the loop.
        print("Stopped paging at iteration {}: {!r}".format(i, exc))
        break

In [36]:
# Flatten the per-page URL lists into one list, in page order.
# dict.fromkeys drops a URL that was collected more than once (for example
# when the same page is scraped twice) while keeping first-seen order, and the
# single pass avoids re-copying the growing list for every page.
all_book_url_lists = list(dict.fromkeys(
    url
    for page in range(len(book_urls_dict))
    for url in book_urls_dict[page]
))

In [37]:
# Sanity check: number of book URLs gathered across all pages.
len(all_book_url_lists)

67

In [39]:
# Bind `final_table` to an empty frame. The parsing cell further down assigns
# `final_table` afresh before using it, so this only guarantees the name exists.
final_table = pd.DataFrame()

#### Note
- This is a troublesome script: it goes to each link that I have and checks whether I have landed on the page with the correct book info. If not, it means that more clicks are needed to get there.
- **Brace yourself.** Because this portion of the code goes through each book to get the relevant information, this part can be quite slow if you have quite a few books in your bookmark

In [40]:
# Visit every bookmarked book, open its availability view and save the rendered
# page as "<n> <last part of the URL after '='>.rtf" for offline parsing later.
link_on_book = """//*[@id="result-content-grid"]/div/div/div/div[2]/h5/a"""
link_on_availability = """//*[@id="mainContent"]/div[3]/div[2]/div[3]/div[1]/div/div/div[1]/a"""

# enumerate supplies the running file number, starting at 1.
for count, urls in enumerate(tqdm(all_book_url_lists), start=1):
    browser.get(urls)
    time.sleep(3)  # wait for the page to render

    try:
        # Presumably some URLs land on a result grid rather than on the book
        # page itself; clicking the title link in the grid opens the book page.
        browser.find_element(By.XPATH, link_on_book).click()
    except Exception:
        # No such link (or it cannot be clicked): carry on as if already on
        # the book page. Catching Exception instead of using a bare except
        # means Ctrl-C can still interrupt this long-running loop.
        pass

    # find_element(By.XPATH, ...) works in Selenium 3 and 4; the
    # find_element_by_* helpers were removed in Selenium 4.3.
    browser.find_element(By.XPATH, link_on_availability).click()
    time.sleep(3)

    book = bs(browser.page_source, 'html5lib')
    with open("{} {}.rtf".format(count, urls.split('=')[-1]), "wb") as text_file:
        text_file.write(book.encode('utf-8'))

100%|██████████| 67/67 [13:39<00:00, 12.23s/it]


### Taking locally saved files and loading into Google 

In [41]:
# List the pages saved by the scraping loop and show how many there are.
rtf_pattern = "*.rtf"
file_list = glob(rtf_pattern)
len(file_list)

67

In [42]:
# Parse every saved page into rows of (library, call number, availability)
# plus the page title. The three scraped columns are still labelled 0, 1, 2
# at this point.
book_tables = []  # one frame per table child, concatenated once at the end
for files in tqdm(file_list):
    with open(files, encoding="utf8") as f:
        # NOTE(review): no parser is named, so BeautifulSoup picks whichever one
        # is installed; left unchanged so the pages are parsed as before.
        book = bs(f.read())

    availability_table = book.find(class_='table table-stacked')
    if availability_table is None:
        # Skip a page saved without the table instead of failing with
        # "'NoneType' object is not iterable" and losing the whole run.
        print("No availability table found in", files)
        continue

    # Same for every child of the table, so look it up once per file.
    title = book.find('title').get_text()

    for i in availability_table:

        lib = list()
        code = list()
        avail = list()

        for tr in i.find_all('tr'):
            for td in tr.find_all('td'):
                tmp = td.text
                last_word = tmp.split(" ")[-1]
                if last_word == 'Library' or last_word == "7" or 'library' in tmp or "Repository Used Book" in tmp or 'LLiBrary' in tmp:
                    # Library-looking cells that contain a '.' are dropped.
                    if '.' not in tmp:
                        lib.append(tmp)
                elif 'English' in tmp or 'Chinese' in tmp:
                    code.append(tmp)
                elif 'Available' == tmp or 'Onloan' in tmp or 'Transit' in tmp or "For Reference Only" in tmp or "Reserved" == tmp:
                    avail.append(tmp)

        # A comparison never raises, so the mismatch has to be tested
        # explicitly; wrapping it in try/except would never report anything.
        if not (len(lib) == len(code) == len(avail)):
            print("Error in length", files, len(lib), len(code), len(avail))

        book_table = pd.DataFrame([lib, code, avail]).T
        book_table['title'] = title
        book_tables.append(book_table)

# DataFrame.append was removed in pandas 2.0; one concat at the end also avoids
# re-copying the growing frame on every iteration.
final_table = pd.concat(book_tables) if book_tables else pd.DataFrame()

100%|██████████| 67/67 [00:07<00:00,  8.95it/s]


In [43]:
# Number of distinct titles parsed.
len(final_table['title'].unique())

67

In [44]:
# Give the three scraped columns (labelled 0, 1, 2 by the parsing cell) proper
# names and move 'title' to second position.
# rename() maps by label, so re-running this cell is a no-op; assigning a list
# to `.columns` positionally would mislabel the already-reordered columns.
final_table = final_table.rename(columns={0: 'library', 1: 'number', 2: 'availability'})
final_table = final_table[['library', 'title', 'number', 'availability']]

### Thinking about how to include testing into my script

In [45]:
# final_table[~final_table.availability.isin(['Available', 'For Reference Only'])]

In [46]:
# Keep only rows that have an availability value and are not
# "For Reference Only".
has_availability = final_table['availability'].notnull()
is_reference_only = final_table['availability'] == "For Reference Only"
final_table = final_table[has_availability & ~is_reference_only]

### Thinking about testing my code 

In [47]:
# Shape of the rows whose availability is missing.
final_table.loc[final_table['availability'].isna()].shape

(0, 4)

In [48]:
# Shape of the rows whose call number is missing.
final_table.loc[final_table['number'].isna()].shape

(0, 4)

In [49]:
# Display the rows whose call number is missing, if any.
final_table.loc[final_table['number'].isna()]

Unnamed: 0,library,title,number,availability


### Processing 

In [50]:
# Tidy the scraped strings column by column.
# title: keep only the part before the first " | ".
final_table['title'] = final_table['title'].map(lambda text: text.split(" | ")[0]).tolist()

# number: strip the "English" / "Chinese" labels.
final_table['number'] = final_table['number'].map(
    lambda text: text.replace("English", "").replace("Chinese", "")
).tolist()

# Rows from the "Repository Used Book Collection" are marked reference-only.
in_repository = final_table['library'] == "Repository Used Book Collection"
final_table.loc[in_repository, 'availability'] = "For Reference Only"

# availability: drop the "Onloan - Due: " prefix so only the date remains.
final_table['availability'] = final_table['availability'].map(
    lambda text: text.replace("Onloan - Due: ", "")
).tolist()

In [51]:
# Keep only the text before the first "/" in each title and trim whitespace.
final_table['title'] = final_table['title'].map(
    lambda text: text.partition("/")[0].strip()
).tolist()

In [52]:
# Rows whose library is exactly "Bishan Public Library", ordered by
# availability; the last line shows the resulting shape.
at_bishan = final_table['library'] == "Bishan Public Library"
ffinal_table = final_table.loc[at_bishan].sort_values(by='availability')
ffinal_table.shape

(44, 4)

### Cleaning Bookmarks Sheet 

In [53]:
# The `gsheet_cred` environment variable supplies the value passed to
# pygsheets as `service_file`, so no credentials are stored in the notebook.
# The spreadsheet titled 'NLB Project' is then opened for the cells below.
google_auth = os.environ['gsheet_cred']
gc = pygsheets.authorize(service_file=google_auth)
sh = gc.open('NLB Project')

### Checking just Bishan library

In [54]:
# Refresh the "Bookmarks" worksheet: clear the range A2:E1000, then write every
# row whose library name contains "Bishan", passing (1, 1) as the start position.
bishan = sh.worksheet_by_title("Bookmarks")
bishan.clear('A2:E1000')

bishan_rows = final_table['library'].str.contains("Bishan")
bishan_table = final_table.loc[bishan_rows]
bishan.set_dataframe(bishan_table, (1, 1))

### Checking in all libraries

In [55]:
# Refresh the "All" worksheet: clear the range A2:F1000, then write the whole
# of final_table, passing (1, 1) as the start position.
all_ = sh.worksheet_by_title("All")
all_.clear('A2:F1000') 

all_.set_dataframe(final_table,(1,1))

### [Link](https://docs.google.com/spreadsheets/d/1s5oYU59jyU_QO3IIhCClyWGoC_MpW9L_h4l4djDUKO0/edit#gid=1021888748) to my Google Sheet

### Popular books 
- Identify popular books

In [60]:
# total = final_table.groupby('title').availability.count().reset_index()
# available = final_table[final_table.availability == "Available"].groupby('title').availability.count().reset_index()
# total.columns = ['title', 'total']
# available.columns = ['title', 'avail_count']
# total.merge(available)

In [57]:
# De-duplicated (unordered) titles that have at least one row marked 'Available'.
is_available = final_table['availability'] == 'Available'
available_books = list(set(final_table.loc[is_available, 'title']))

In [59]:
# Show every row whose title has no 'Available' copy in any library.
final_table.loc[~final_table['title'].isin(available_books)]

Unnamed: 0,library,title,number,availability
0,Pasir Ris Public Library,The culture map : breaking through the invisible boundaries of global business,658.049 MEY -[BIZ],04 Sep 2021
1,Woodlands Regional Library,The culture map : breaking through the invisible boundaries of global business,658.049 MEY -[BIZ],In-Transit (Set: 19 Sep 2021)
