## 16a_Scrape_URLs_Random_Sample

Author: Daniel Hui

License: MIT

This notebook looks up each ISBN from a random sample of the inventory in the Seattle Public Library online catalog and records the URL of the matching item page. The catalog is managed by a third-party provider, BiblioCommons of Toronto.

In [41]:
import pandas as pd
import requests
import numpy as np

from __future__ import print_function, division

### Global Variables

In [42]:
# Label for this run; it is embedded in every output file name below.
location = "random"
# Divisor used to decide how many chunk files the ISBN list is split into.
max_range = 250

### Extract ISBN List From the Random Sample

In [43]:
# Read the random-sample inventory; the CSV's first column is used as the row index.
inventory_path = '../01_Data/04_Scraped/Random_Sample.csv'
inventory = pd.read_csv(inventory_path, index_col=0)

In [44]:
# Keep one row per distinct (BibNum, isbn) pair.
isbn_df = inventory[["BibNum","isbn"]].drop_duplicates()
# Drop rows whose ISBN is "0" (presumably a placeholder for a missing ISBN -- TODO confirm).
# The comparison is done on the string form so the filter works whether pandas parsed
# the column as integers or as text; comparing an integer column to "0" removes nothing.
isbn_df = isbn_df[isbn_df["isbn"].astype(str) != "0"]
isbn_df.head()

Unnamed: 0,BibNum,isbn
121017,3161651,1620401371
5167,3203068,985673486
187823,2851182,1618101110
56292,2640953,375864326
15312,2301442,792271351


In [45]:
isbn_df.shape[0]     # number of (BibNum, isbn) rows left to look up

3497

### Selenium Scrape
Load Selenium and point it to a start page

In [51]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

import os

# Path to the chromedriver executable. Set the CHROMEDRIVER_PATH environment variable to
# point at a different install without editing the notebook; the default is unchanged.
chromedriver = os.environ.get("CHROMEDRIVER_PATH", "/Applications/chromedriver")
os.environ["webdriver.chrome.driver"] = chromedriver

In [52]:
# Start a Chrome session through chromedriver; `driver` is the shared browser used by the scraper.
start_page = "https://seattle.bibliocommons.com/item/show/3339147030"  # SPL's online catalog starting point
driver = webdriver.Chrome(chromedriver)
driver.get(start_page)

### Scraping Function

In [48]:
def get_url(isbn):
    """Look up one ISBN in the SPL catalog and return the URL of its item page.

    Loads the catalog search page for ``isbn`` in the module-level ``driver``,
    clicks the title link (``data-key='bib-title'``) found in the results, and
    returns the URL the browser ends up on.

    Parameters
    ----------
    isbn
        Value interpolated into the ``query`` parameter of the search URL.

    Returns
    -------
    str
        ``driver.current_url`` after the click, or ``"N/A"`` when locating or
        clicking the link fails (e.g. the ISBN isn't in the catalog).
    """
    driver.get(f'https://seattle.bibliocommons.com/v2/search?query={isbn}&searchType=smart')  # search URL
    try:
        # "xpath" is the value of selenium's By.XPATH. find_element(by, value) exists in
        # Selenium 3 and 4, while find_element_by_xpath is gone from newer Selenium 4
        # releases; there the old call would fail on every ISBN and be reported as "N/A".
        title_link = driver.find_element("xpath", "//a[@data-key = 'bib-title']")
        title_link.click()
        return driver.current_url
    except Exception:
        # Best-effort lookup: any ordinary failure is recorded as "N/A".
        # Catching Exception rather than using a bare except lets KeyboardInterrupt
        # propagate, so a long scrape can still be stopped by hand.
        return "N/A"

### Scraping Loop

In [53]:
# Split the ISBN list into chunks so each one can be scraped and saved incrementally.
total_loops = (len(isbn_df) // max_range) + 1
# Split the row *positions* and select with iloc instead of passing the DataFrame to
# np.array_split: that path relies on the deprecated DataFrame.swapaxes in recent pandas,
# and it returns slices that raise SettingWithCopy warnings when a column is added later.
# The chunk boundaries are the same because the positions array has the same length.
chunk_positions = np.array_split(np.arange(len(isbn_df)), total_loops)
isbn_dframes = [isbn_df.iloc[positions] for positions in chunk_positions]

In [50]:
# Scrape one chunk at a time and save it immediately, so an interrupted run loses at
# most the chunk in progress. Chunks whose CSV already exists are skipped, which lets
# the cell resume where it stopped without hand-editing a start index; delete a chunk
# file to force that chunk to be scraped again.
for i in range(len(isbn_dframes)):
    chunk_path = f"../01_Data/04_Scraped/isbn_url_{location}_{i}.csv"
    if os.path.exists(chunk_path):
        continue
    # assign() builds a new frame rather than writing into a slice of isbn_df.
    isbn_dframes[i] = isbn_dframes[i].assign(link=isbn_dframes[i]["isbn"].apply(get_url))
    isbn_dframes[i].to_csv(chunk_path)

### Unite the CSVs Together

In [41]:
# Number of per-chunk CSVs to read; same formula as the chunk count used when splitting.
total_files = len(isbn_df) // max_range + 1

# Read every chunk file, then concatenate once. Concatenating inside the loop re-copies
# the accumulated frame on every iteration.
chunk_frames = [
    pd.read_csv(f'../01_Data/04_Scraped/isbn_url_{location}_{i}.csv',index_col=0)
    for i in range(total_files)
]

# ignore_index=True renumbers the rows 0..n-1, like reset_index(drop=True) after concat.
book_df = pd.concat(chunk_frames, ignore_index=True)

In [42]:
# Write the combined ISBN-to-URL table next to the per-chunk files.
combined_path = f"../01_Data/04_Scraped/isbn_url_{location}.csv"
book_df.to_csv(combined_path)