* Purpose: Script to extract edition information from WorldCat for a given OCLC number (`oclc_number`)
* License: GPLv3 (Free Software) 
* Date: Oct 15, 2018

In [1]:
# Load required libraries
from urllib.request import urlopen
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

In [2]:
# Finds the total number of editions listed for a given book
# Based on Oct 15, 2018 OCLC editions page structure
def find_total_editions_count(page_url):
    """Return the total number of editions reported on an editions page.

    The page is fetched and parsed; the count is read from the second
    ``<strong>`` element inside the first ``<table class="tableResultsInfo">``.

    Parameters
    ----------
    page_url : str
        URL of the editions page to fetch.

    Returns
    -------
    int
        The total number of editions shown on the page.

    Raises
    ------
    IndexError
        If the results-info table or the count element is not present.
    ValueError
        If the count text is not an integer.
    """
    # The context manager closes the HTTP response once parsing is finished,
    # so repeated calls do not leak connections.
    with urlopen(page_url) as page:
        page_html = BeautifulSoup(page, "html.parser")

    table_results_info = page_html.find_all("table", {"class": "tableResultsInfo"})
    if not table_results_info:
        raise IndexError("no 'tableResultsInfo' table found at " + str(page_url))

    count_info = table_results_info[0].find_all("strong")
    if len(count_info) < 2:
        raise IndexError("editions count not found in results info at " + str(page_url))

    # Tolerate surrounding whitespace and thousands separators (e.g. "1,234").
    count_text = count_info[1].text.strip().replace(",", "")
    return int(count_text)

In [3]:
# Finds how many pages of edition info exist for a given book
# Based on Oct 15, 2018 OCLC editions page structure
def get_editions_page_count(oclc_number):
    """Return the number of result pages needed to list every edition.

    Parameters
    ----------
    oclc_number : int or str
        OCLC number of the book; it is interpolated into the editions URL.

    Returns
    -------
    int
        Number of pages, assuming 10 editions per page. Zero when the
        editions page reports no editions.
    """
    editions_per_page = 10
    page_url = "https://www.worldcat.org/oclc/" + str(oclc_number) + "/editions"
    items_count = find_total_editions_count(page_url)
    # Ceiling division: a partially filled last page still has to be fetched.
    # round() would drop it (e.g. 14 editions -> 1 page, 5 editions -> 0 pages).
    page_count = (items_count + editions_per_page - 1) // editions_per_page
    return page_count

In [4]:
# Gets a page of editions info and extracts required info
def _text_or_none(node):
    """Return the stripped text of a parsed node, or None when the node is absent."""
    return node.text.strip() if node is not None else None


def get_edition_info(oclc_number, start_edition, edition_info_df):
    """Fetch one page of editions and add its rows to ``edition_info_df``.

    Each ``<td class="result">`` cell in the first ``<table class="table-results">``
    yields one row. The title comes from the first link in the cell, the author
    from its ``<div class="author">``, and format, language, date and publisher
    from the four ``<td>`` elements that follow the cell, in that order.
    A field whose element is missing is stored as ``None`` rather than
    reusing the previous row's value.

    Parameters
    ----------
    oclc_number : int or str
        OCLC number of the book; stored in the ``oclc_number`` column.
    start_edition : int
        Value of the ``start_edition`` query parameter for the requested page.
    edition_info_df : pandas.DataFrame or None
        Rows collected so far. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A frame holding the previous rows followed by this page's rows, with a
        fresh 0..n-1 index. ``edition_info_df`` itself is returned when the page
        has no result cells.

    Raises
    ------
    IndexError
        If the page has no ``table-results`` table.
    """
    page_url = "https://www.worldcat.org/oclc/" + str(oclc_number) + "/editions?start_edition=" + str(start_edition)
    # The context manager closes the HTTP response once parsing is finished.
    with urlopen(page_url) as page:
        page_html = BeautifulSoup(page, "html.parser")

    table_results = page_html.find_all("table", {"class": "table-results"})
    result_nodes = table_results[0].find_all("td", attrs={"class": "result"})

    records = []
    for td_node in result_nodes:
        title = _text_or_none(td_node.find("a", href=True))
        author = _text_or_none(td_node.find("div", attrs={"class": "author"}))

        # Walk the next four <td> elements; once one is missing, every later
        # field is None as well instead of raising AttributeError.
        following_texts = []
        cell = td_node
        for _ in range(4):
            cell = cell.find_next("td") if cell is not None else None
            following_texts.append(_text_or_none(cell))
        book_format, language, date, publisher = following_texts

        records.append({
            'oclc_number': oclc_number,
            'title': title,
            'author': author,
            'format': book_format,
            'language': language,
            'date': date,
            'publisher': publisher,
        })

    if not records:
        return edition_info_df

    # Build the page's frame once; DataFrame.append was removed in pandas 2.0
    # and appending row by row copies the whole frame on every iteration.
    page_df = pd.DataFrame(records)
    if edition_info_df is None or (edition_info_df.empty and edition_info_df.columns.empty):
        return page_df
    return pd.concat([edition_info_df, page_df], ignore_index=True)

In [5]:
# Read the input CSV listing the books to look up.
# The `oclc_number` column is the one used by the scraping loop below.
# The path is relative to the notebook's working directory.
book_info_df = pd.read_csv("data/source/book_info.csv") 
# Last expression of the cell: displays the first rows as a quick sanity check.
book_info_df.head()

Unnamed: 0,oclc_number,title
0,62123162,Sociological theory
1,549028728,The structure of sociological theory


In [6]:
# Loop through the OCLC numbers, fetch the edition info for each one,
# and accumulate the rows in a pandas DataFrame.
# Note: this issues one HTTP request per results page, plus one per book for the page count.
edition_info_df = pd.DataFrame()

for index, row in book_info_df.iterrows():
    oclc_number = row["oclc_number"]
    page_count = get_editions_page_count(oclc_number)
    
    # start_edition takes the values 1, 11, 21, ...: one request per page,
    # stepping by the same 10-per-page size used by get_editions_page_count.
    for page_number in range(page_count):
        start_edition = (page_number * 10) + 1
        edition_info_df = get_edition_info(oclc_number, start_edition, edition_info_df)    

In [8]:
# Write the result set DataFrame to a CSV file.
# Only the listed columns are written, in this order, with a header row.
# `index` is left at its default, so the row index is also written as the first column.
edition_info_df.to_csv("data/extracted/edition_info.csv", header=True, columns=["oclc_number","title","author","format","language","date","publisher"])