* Purpose: Script to extract holding/location information from worldcat api results   
* License: GPLv3 (Free Software) 
* Date: Oct 15, 2018
* Aditional Notes: API Get Request: #http://www.worldcat.org/webservices/catalog/content/libraries/62123162?wskey={built-in-api-key}

In [9]:
from urllib.request import urlopen
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
from geopy.geocoders import Nominatim

In [5]:
# Get holding/location info from the saved API result XML.
oclc_number = "62123162"
root = ET.parse('data/source/location_info.xml')
holding_list = root.findall("./holding")

# Collect one plain dict per <holding> element and build the DataFrame in a
# single step afterwards. DataFrame.append was removed in pandas 2.0, and
# appending row by row copies the whole frame on every iteration.
holding_records = []

for holding in holding_list:
    # findtext returns None when the requested child element is absent.
    institutionIdentifier = holding.findtext("./institutionIdentifier/value")
    physicalLocation = holding.findtext("./physicalLocation")
    physicalAddress = holding.findtext("./physicalAddress/text")
    copiesCount = holding.findtext("./holdingSimple/copiesSummary/copiesCount")
    holding_records.append({
        'oclc_number': oclc_number,
        'institutionIdentifier': institutionIdentifier,
        'physicalLocation': physicalLocation,
        'physicalAddress': physicalAddress,
        'copiesCount': copiesCount,
    })

# Naming the columns explicitly keeps the schema stable even when the file
# contains no <holding> elements; the order matches the previously recorded
# output of this notebook.
location_info_df = pd.DataFrame(
    holding_records,
    columns=['copiesCount', 'institutionIdentifier', 'oclc_number', 'physicalAddress', 'physicalLocation'],
)

In [6]:
# Preview the first two extracted holdings as a quick sanity check.
location_info_df.head(2)

Unnamed: 0,copiesCount,institutionIdentifier,oclc_number,physicalAddress,physicalLocation
0,1,OTC,62123162,"Westerville, OH 43081 United States",Otterbein University
1,1,ANTCH,62123162,"Yellow Springs, OH 45387 United States",Antioch University Library


In [7]:
# Find and add latitude and longitude values using geopy.
geolocator = Nominatim(user_agent="Natkeeran")

latitude_values = []
longitude_values = []

# Coordinates recorded when an address is missing or cannot be geocoded.
missing_coordinates = (np.nan, np.nan)

# Remember the result for each distinct address so that holdings sharing an
# address trigger only one request to the geocoding service.
coordinates_by_address = {}

for physical_address in location_info_df["physicalAddress"]:
    if not isinstance(physical_address, str) or not physical_address.strip():
        # No usable address was extracted for this holding; skip the lookup.
        coordinates = missing_coordinates
    elif physical_address in coordinates_by_address:
        coordinates = coordinates_by_address[physical_address]
    else:
        location = geolocator.geocode(physical_address)
        # geocode returns None when the service finds no match, so guard
        # before reading the coordinates instead of raising AttributeError.
        if location is None:
            coordinates = missing_coordinates
        else:
            coordinates = (location.latitude, location.longitude)
        coordinates_by_address[physical_address] = coordinates
    latitude_values.append(coordinates[0])
    longitude_values.append(coordinates[1])

location_info_df['latitude'] = latitude_values
location_info_df['longitude'] = longitude_values

In [8]:
# Write the enriched holdings to CSV with an explicit column order. The
# DataFrame index is written as well (to_csv default), as an unnamed first column.
output_columns = ["oclc_number", "institutionIdentifier", "physicalLocation", "physicalAddress", "copiesCount", "latitude", "longitude"]
location_info_df.to_csv("data/extracted/location_info.csv", header=True, columns=output_columns)

# Preview the result. Only the last expression of a cell is rendered, so the
# preview has to come after the export rather than before it.
location_info_df.head(10)