* Purpose: Script to extract  books metadata information from worldcat api results   
* License: GPLv3 (Free Software) 
* Date: Oct 15, 2018
* Additional Notes: API GET Request: http://www.worldcat.org/webservices/catalog/content/62123162?wskey={built-in-api-key}&recordSchema=info%3Asrw%2Fschema%2F1%2Fdc

In [1]:
# Load required libraries
from urllib.request import urlopen
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
from geopy.geocoders import Nominatim

In [2]:
def get_metadata(oclc_number):
    oclc_url = 'http://www.worldcat.org/webservices/catalog/content/'+ oclc_number + '?wskey={built-in-api-key}&recordSchema=info%3Asrw%2Fschema%2F1%2Fdc'
    oclc_info = urlopen(oclc_url).read()
    page_xml = BeautifulSoup(oclc_info, "xml")
    return page_xml

In [3]:
def parse_metadata(page_xml, oclc_number, booK_metadata_df):
    root = page_xml.findAll("oclcdcs")[0]
    title = root.find("dc:title").text.strip()

    creator = ""
    if root.find("dc:creator") is not None:
        creator = root.find("dc:creator").text.strip()

    contributor = ""
    if root.find("dc:contributor") is not None:
        description = root.find("dc:contributor").text.strip()
    
    description = ""
    if root.find("dc:description") is not None:
        description = root.find("dc:description").text.strip()
    
    date = ""
    if root.find("dc:date") is not None:
        date = root.find("dc:date").text.strip()
    
    book_format = ""
    if root.find("dc:format") is not None:
        book_format = root.find("dc:format").text.strip()
    
    language = ""
    if root.find("dc:language") is not None:
        language = root.find("dc:language").text.strip()

    publisher = ""
    if root.find("dc:publisher") is not None:
        publisher = root.find("dc:publisher").text.strip()

    oclc_record_creationdate = ""
    if root.find("oclcterms:recordCreationDate") is not None:
        oclc_record_creationdate = root.find("oclcterms:recordCreationDate").text.strip()

    booK_metadata_df = booK_metadata_df.append({'oclc_number':oclc_number, 'title': title, 'creator': creator, 'contributor': contributor, 'description': description, 'date': date, 'book_format': book_format, 'language': language, 'publisher': publisher}, ignore_index=True)
    return booK_metadata_df

In [15]:
booK_metadata_df = pd.DataFrame()
book_info_df = pd.read_csv("data/source/sociology_oclc_info.csv") 

for index, row in book_info_df.iterrows():
    oclc_number = str(row["oclc_number"])
    page_xml = get_metadata(oclc_number)
    booK_metadata_df = parse_metadata(page_xml, oclc_number, booK_metadata_df)
booK_metadata_df.head()

Unnamed: 0,book_format,contributor,creator,date,description,language,oclc_number,publisher,title
0,246 pages 24 cm,,,"[1973, ©1972]","""This book consists of working papers which ar...",eng,824231,MIT Press,New directions in sociological theory
1,[288] p. ; 24 cm,,"Horowitz, Irving Louis.",1995,New ed.,eng,69205232,Oxford University Press,Decomposition of sociology
2,"viii, 137 pages ; 22 cm",,"Carter, Hugh, 1895-","[1968, ©1927]",Reprint. Originally published: University of N...,eng,152432104,Kennikat Press,The social theories of L.T. Hobhouse
3,,,"Schutz, Alfred, 1899-1959.",1964,,eng,270808917,Nijhoff,"Collected papers [of] Alfred Schutz. Vol. 2, S..."
4,220 p,,"Mouzelis, Nicos P.",1995,,eng,906380481,Routledge,Sociological theory : what went wrong? : diagn...


In [16]:
booK_metadata_df_reorder = booK_metadata_df[["oclc_number","title","creator","contributor", "date","publisher","language", "description", "book_format"]]
booK_metadata_df_reorder.to_csv('data/extracted/books_metadata.csv', index=False)

In [17]:
booK_metadata_df

Unnamed: 0,book_format,contributor,creator,date,description,language,oclc_number,publisher,title
0,246 pages 24 cm,,,"[1973, ©1972]","""This book consists of working papers which ar...",eng,824231,MIT Press,New directions in sociological theory
1,[288] p. ; 24 cm,,"Horowitz, Irving Louis.",1995,New ed.,eng,69205232,Oxford University Press,Decomposition of sociology
2,"viii, 137 pages ; 22 cm",,"Carter, Hugh, 1895-","[1968, ©1927]",Reprint. Originally published: University of N...,eng,152432104,Kennikat Press,The social theories of L.T. Hobhouse
3,,,"Schutz, Alfred, 1899-1959.",1964,,eng,270808917,Nijhoff,"Collected papers [of] Alfred Schutz. Vol. 2, S..."
4,220 p,,"Mouzelis, Nicos P.",1995,,eng,906380481,Routledge,Sociological theory : what went wrong? : diagn...
5,"XIII, 381 str. ; 23 cm.",,,1978,,eng,439634729,Goodyear Publishing Company,Contemporary sociological theories
6,"XII, 368 str. : graf. prikazi ; 24 cm.",,"Waters, Malcolm, 1946-","2000, cop. 1994",,eng,443605828,Sage,Modern sociological theory
7,272 s,,"Tucker, Kenneth H.",2002,,eng,472253792,Blackwell Publishers,Classical social theory : a contemporary approach
8,"XII, 350 Seiten",,"Runciman, Walter Garrison.",1990,[Nachdr.].,eng,614201007,Cambridge Univ. Press,A treatise on social theory / 1. The methodolo...
9,1 online resource (186 pages),,"López, José, 1966-",2003,Social theorizing as a language-borne practice...,eng,632723490,Continuum,"Society and its metaphors : language, social t..."
