In [1]:
# Imports

import os, os.path, sys
import glob
import string
import urllib.request
from itertools import groupby
import pickle

import xml.etree.ElementTree as ET
from xml.dom import minidom

import xmltodict

# `callnumber` needs to be installed manually from the diyclassics fork:
# https://github.com/diyclassics/library-callnumber-lc/tree/master/python
# 
# Follow the installation instructions in the README. But in brief...
# 1. Download the zip file
# 2. Run `python setup.py install` in the 'python' folder
import callnumber as callnumber

from pprint import pprint

In [2]:
# Statics

# Path (relative to the notebook's working directory) of the plain-text file
# that is read line by line into `append_bsns` further down.
# TODO (original note: "Fix in envs"): presumably this should come from
# environment-specific configuration instead of being hard-coded -- confirm.
append_infile = 'data/new-titles/append-bsns.txt'

In [3]:
# Helper functions

def prettify_xml(xml_string):
    """Re-indent an XML document with two spaces and drop blank lines.

    `xml_string` is anything `minidom.parseString` accepts (str or bytes).
    Returns the pretty-printed document as a single newline-joined string.
    """
    pretty = minidom.parseString(xml_string).toprettyxml(indent='  ')
    kept_lines = []
    for line in pretty.split('\n'):
        # toprettyxml leaves whitespace-only lines behind; skip them
        if line.strip():
            kept_lines.append(line)
    return '\n'.join(kept_lines)

def combine_xml(files):
    """Merge every ``*.xml`` file in the directory `files` into one document.

    The root element of the first file (in sorted filename order) is kept and
    the children of every other file's root are appended to it.

    Returns the serialized combined tree as bytes (``ET.tostring``), or None
    when the directory contains no ``*.xml`` files.
    """
    # See https://stackoverflow.com/q/15921642
    combined_root = None
    # glob returns files in arbitrary, filesystem-dependent order; sorting makes
    # the merged row order (and which file supplies the root) reproducible.
    for xml_file in sorted(glob.glob(files + "/*.xml")):
        data = ET.parse(xml_file).getroot()
        if combined_root is None:
            combined_root = data
        else:
            combined_root.extend(data)
    if combined_root is None:
        return None
    return ET.tostring(combined_root)

 ## Append BSNs

In [4]:
# Read a txt file of BSNs to append (one per line)

with open(append_infile, "r") as f:
    # Strip surrounding whitespace and skip blank lines so that a trailing
    # newline or stray space never becomes an empty/invalid BSN entry.
    append_bsns = [line.strip() for line in f.read().splitlines() if line.strip()]

In [5]:
# Create XML
# Builds <printout><ROW><BSN>..</BSN><BARCODE>..</BARCODE></ROW>...</printout>
# with one ROW per entry of `append_bsns`.

root = ET.Element('printout')

for i, item in enumerate(append_bsns):
    row = ET.SubElement(root, 'ROW')
    ET.SubElement(row, 'BSN').text = item
    # The running index is written into BARCODE for these appended rows
    ET.SubElement(row, 'BARCODE').text = str(i)

# pretty string -- serialized once, after the loop, so that `xmlstr` is defined
# even when `append_bsns` is empty and the tree is not re-serialized per row
xmlstr = prettify_xml(ET.tostring(root))


In [6]:
# Write append record to xml file
# (`xmlstr` at this point holds the pretty-printed <printout> document built
# from `append_bsns`; the target directory must already exist)
with open("data/new-titles/append_bsns.xml", "w") as f:
    f.write(xmlstr)

In [7]:
# Merge all XML reports in data/new-titles/ into a single document.
# The directory is given relative to the notebook's working directory, like
# every other path in this notebook, instead of a machine-specific absolute
# path under one user's home directory.
combined_xml = combine_xml('data/new-titles')
xmlstr = prettify_xml(combined_xml)

In [8]:
# Store a temporary version of the XML report with appended BSNs
# (writes the current `xmlstr`; the data/new-titles/temp/ directory must
# already exist, open() does not create it)
with open("data/new-titles/temp/full_report.xml", "w") as f:
    f.write(xmlstr)

## Process New Titles report

In [9]:
# Convert New Titles xml report to dictionary for misc info

with open('data/new-titles/temp/full_report.xml') as f:
    # force_list keeps doc['printout']['ROW'] a list even when the report holds
    # exactly one ROW; without it xmltodict returns a single dict in that case,
    # which breaks both the record count and the row iteration below.
    doc = xmltodict.parse(f.read(), force_list=('ROW',))

In [10]:
# Report how many ROW entries the parsed report contains
print("There are %d records in this month's report." % len(doc['printout']['ROW']))

There are 195 records in this month's report.


In [11]:
# Start report list
# Each entry is a dict with 'barcode' and 'bsn', plus 'volume' and 'imprint'
# when the report row carries non-empty values for them.

report = []

for row in doc['printout']['ROW']:
    item = {'barcode': row['BARCODE'], 'bsn': row['BSN']}

    # xmltodict maps an empty element to None, so optional fields are only
    # stored when they actually carry text (avoids `'(' in None` TypeError and
    # None values that later string handling cannot process).
    volume = row.get('VOLUME_INFO')
    if volume:
        # Insert a space before every opening parenthesis
        item['volume'] = volume.replace('(', ' (')

    imprint = row.get('Z13_IMPRINT')
    if imprint:
        item['imprint'] = imprint

    report.append(item)

barcodes = [item['barcode'] for item in report]
bsns = [item['bsn'] for item in report]

In [12]:
class NewTitle(object):
    """MARC-based description of one title, looked up by its BSN.

    On construction the record is downloaded from the NYU Aleph
    ``publish_avail`` service and parsed; the ``get_*_info`` methods then copy
    selected MARC subfields onto instance attributes.  The ``format_*``
    methods turn those attributes into display strings.
    """

    def __init__(self, bsn):
        """Fetch and parse the record for `bsn` (performs a network request)."""
        self.bsn = bsn
        urlstring = 'http://aleph.library.nyu.edu/X?op=publish_avail&library=nyu01&doc_num=%s' % self.bsn
        # Use the response as a context manager so the connection is closed
        # as soon as the document has been parsed.
        with urllib.request.urlopen(urlstring) as response:
            tree = ET.parse(response)
        self.root = tree.getroot()

        # Get NewTitle info
        # The get_*_info methods work by side effect and return None, so these
        # *_info attributes are always None; they are kept for compatibility.
        self.title_info = self.get_title_info()
        self.contributor_info = self.get_contributor_info()
        self.edition_info = self.get_edition_info()
        self.imprint_info = self.get_imprint_info()
        self.collection_info = self.get_collection_info()
        self.series_info = self.get_series_info()
        self.gift_info = self.get_gift_info()
        self.handle_info = self.get_handle_info()

    def get_element(self, tag, code, nr=True):
        """Return the text of subfield `code` in datafield(s) `tag`.

        With ``nr=True`` only the first match is returned (None when there is
        none); with ``nr=False`` a list of all matches, in document order.
        """
        datastring = ".//{http://www.loc.gov/MARC21/slim}datafield[@tag='%s']/{http://www.loc.gov/MARC21/slim}subfield" % tag
        datafield = self.root.findall(datastring)
        if nr:
            element = next((item.text for item in datafield if item.attrib['code'] == code), None)
        else:
            element = [item.text for item in datafield if item.attrib['code'] == code]
        return element

    # Should abstract this to be useful for getting other XML nodes
    def get_alts(self, tag):
        """Group the subfields of all `tag` datafields by their linked tag.

        Returns a dict mapping the first three characters of each ``$6``
        subfield to the list of ``(code, text)`` pairs that follow it.
        NOTE(review): the pairing relies on every datafield starting with its
        ``$6`` subfield, and a repeated linked tag keeps only the last group --
        confirm against real records.
        """
        # The trailing '/' makes ElementTree select all child elements
        datastring = ".//{http://www.loc.gov/MARC21/slim}datafield[@tag='%s']/" % tag
        nodes = self.root.findall(datastring)

        alts = [(node.attrib['code'], node.text) for node in nodes]

        # a: runs of consecutive non-$6 subfields; b: linked tag of each $6 run
        a = [list(g) for k, g in groupby(alts, lambda x: x[0] != '6') if k]
        b = [list(g)[0][1][:3] for k, g in groupby(alts, lambda x: x[0] == '6') if k]
        c = dict(zip(b, a))

        return c

    def strip_char_(self, s, char):
        """Return `s` without a trailing `char` (which may be longer than one character)."""
        # Slice by len(char) rather than a fixed 1 so multi-character suffixes
        # are removed completely; an empty `char` leaves `s` untouched.
        if char and s.endswith(char):
            return s[:-len(char)]
        return s

    def fix_punctuation_(self, string):
        """Remove the space before ';' and ':' in `string`."""
        string = string.replace(' ;', ';')
        string = string.replace(' :', ':')
        return string

    def alt_exists_(self):
        """Return True when any 880 datafield has a non-empty ``$6`` subfield."""
        return any(self.get_element('880', '6', False))

    def get_title_info(self):
        """Set title attributes from field 245, preferring a linked 880 field."""
        self.title = self.get_element('245', 'a')
        self.remainder_of_title = self.get_element('245', 'b')
        self.section_number = " ".join(self.get_element('245', 'n', False))
        self.section_name = " ".join(self.get_element('245', 'p', False))

        if self.alt_exists_():
            alt = self.get_alts('880')
            if '245' in alt:
                alt_block = dict(alt['245'])

                # Only override when the alternate block really has the
                # subfield, instead of raising KeyError on a missing $a.
                if 'a' in alt_block:
                    self.title = alt_block['a']
                if 'b' in alt_block:
                    self.remainder_of_title = alt_block['b']

    def get_contributor_info(self):
        """Set ``self.contributor`` from 245 $c, preferring a linked 880 field."""
        self.contributor = self.get_element('245', 'c')

        if self.alt_exists_():
            alt = self.get_alts('880')
            if '245' in alt:
                alt_block = dict(alt['245'])
                if 'c' in alt_block:
                    self.contributor = alt_block['c']

    def get_edition_info(self):
        """Set ``self.edition`` / ``self.remainder`` from 250 $a / $b."""
        self.edition = self.get_element('250', 'a')
        self.remainder = self.get_element('250', 'b')

    def get_imprint_info(self):
        """Set place/publisher/date lists from field 264, with 260 as ``*_alt``."""
        self.places = self.get_element('264', 'a', False)
        self.publishers = self.get_element('264', 'b', False)
        self.dates = self.get_element('264', 'c', False)
        self.places_alt = self.get_element('260', 'a', False)
        self.publishers_alt = self.get_element('260', 'b', False)
        self.dates_alt = self.get_element('260', 'c', False)

    def get_collection_info(self):
        """Pick the first 'NISAW' or 'WEB' holding from the AVA fields.

        Sets ``self.library``, ``self.collection`` and ``self.callnumber`` to
        one-element tuples for that holding, or all three to None when no such
        holding exists.
        """
        self.library = self.get_element('AVA', 'b', False)
        self.collection = self.get_element('AVA', 'c', False)
        self.callnumber = self.get_element('AVA', 'd', False)

        # Fix this hack; a node-based solution like the get_alts might be better for collections
        # Pad missing call numbers with the first one.  append() is required:
        # `list += str` would add the string's characters one by one.  With no
        # call number at all there is nothing to pad with, so the zip below
        # comes out empty and the holding is treated as absent.
        while self.callnumber and len(self.callnumber) < len(self.library):
            self.callnumber.append(self.callnumber[0])

        holdings = zip(self.library, self.collection, self.callnumber)
        match = next((item for item in holdings if item[0] in ('NISAW', 'WEB')), None)

        if match is None:
            self.library, self.collection, self.callnumber = None, None, None
        else:
            self.library, self.collection, self.callnumber = [(value,) for value in match]

    def get_series_info(self):
        """Set ``self.series`` / ``self.version`` lists from 490 $a / $v."""
        self.series = self.get_element('490', 'a', False)
        self.version = self.get_element('490', 'v', False)

    def get_gift_info(self):
        """Set ``self.gift`` to the 500 $a notes that start with 'ISAW copy'."""
        self.gift = self.get_element('500', 'a', False)
        # `item and` skips subfields without text (None)
        self.gift = [item for item in self.gift if item and item.startswith('ISAW copy')]

    def get_handle_info(self):
        """Set ``self.handle`` to the first 856 $u whose $3 starts with
        'Ancient World Digital Library', or None.

        NOTE(review): $3 and $u values are paired by position across all 856
        fields, which misaligns if a field lacks one of them -- confirm.
        """
        handle_loc = self.get_element('856', '3', False)
        handle = self.get_element('856', 'u', False)
        handles = [url for loc, url in zip(handle_loc, handle)
                   if loc and loc.startswith('Ancient World Digital Library')]
        self.handle = handles[0] if handles else None

    def format_title(self):
        """Return the display title built from title, remainder and section parts."""
        # A record without 245 $a yields an empty title instead of crashing
        title = self.fix_punctuation_(self.title or '')
        if self.remainder_of_title:
            title += ' ' + self.remainder_of_title
        title = self.strip_char_(title, '/')
        # Join the section number/name with single spaces; plain concatenation
        # glued them onto the title and onto each other.
        parts = [title, self.section_number, self.section_name]
        title = ' '.join(part.strip() for part in parts if part and part.strip())
        title = self.strip_char_(title, '.')

        return title

    def format_contributor(self):
        """Return the contributor statement, capitalized and without a final '.'."""
        contributor = self.contributor
        if contributor:
            # Strip first so the capital lands on the first real character
            contributor = contributor.strip()
        if contributor:
            contributor = contributor[0].capitalize() + contributor[1:]  # Capitalize first letter
            contributor = self.strip_char_(contributor, '.')
        return contributor

    def format_edition(self):
        """Return the edition statement without a final '.', or None if absent."""
        # Either part may be None; join whichever exist with a space (as
        # format_title does for the remainder of title) rather than assuming
        # the edition is present whenever the remainder is.
        parts = [part.strip() for part in (self.edition, self.remainder) if part and part.strip()]
        if parts:
            return self.strip_char_(' '.join(parts), '.')

    def format_imprint(self):
        """Return 'place publisher date' from field 264, falling back to 260."""
        # self.places / publishers / dates were already read from field 264 by
        # get_imprint_info(); they are not fetched again here.
        places = self.places if self.places else self.places_alt
        publishers = self.publishers if self.publishers else self.publishers_alt
        dates = self.dates if self.dates else self.dates_alt

        place = " ".join(self.fix_punctuation_(place) for place in places)
        publisher = " ".join(self.fix_punctuation_(publisher) for publisher in publishers)

        # With exactly two dates only the second one is used
        if len(dates) == 2:
            date = dates[1]
        else:
            date = " ".join(dates)

        # Skip empty parts so a missing place or publisher leaves no double space
        imprint = " ".join(part for part in (place, publisher, date) if part).strip()
        imprint = self.strip_char_(imprint, '.')

        return imprint

    def format_collection(self):
        """Return the stripped collection name, or None when there is no holding."""
        collection = self.collection
        if collection:
            collection = collection[0].strip()
        return collection

    def format_callnumber(self):
        """Return the call number without the ' Non-circulating' marker, or None."""
        callnumber = self.callnumber
        if callnumber:
            callnumber = callnumber[0].strip()
            if callnumber.endswith(' Non-circulating'):
                callnumber = callnumber.replace(' Non-circulating', '')
        return callnumber

    def format_series(self):
        """Return the series statements, each followed by its volume number.

        NOTE(review): series and volume values are paired by position and zip()
        stops at the shorter list, so a series without a $v is dropped --
        confirm whether that is intended.
        """
        version = [item.replace('no. ', '') for item in self.version]
        series = [self.fix_punctuation_(s) for s in self.series]
        return " ".join(" ".join(pair) for pair in zip(series, version))

    def format_gift(self):
        """Return the gift note starting at 'from' (capitalized), or None."""
        if self.gift:
            gift = self.gift[0]
            index = gift.find('from')
            # find() returns -1 when 'from' is absent; in that case keep the
            # whole note instead of indexing from the end of the string.
            if index != -1:
                gift = gift[index].upper() + gift[index+1:]
            gift = self.strip_char_(gift, '.')
            return gift

    def format_handle(self):
        """Return the handle URL, or None when the record has none."""
        if self.handle:
            return self.handle


In [13]:
# http://stackoverflow.com/a/3308844

import unicodedata as ud

# Cache of character -> bool results, filled lazily by is_latin()
latin_letters = {}

def is_latin(uchr):
    """Return True when the Unicode name of `uchr` contains 'LATIN'.

    Results are memoized in the module-level `latin_letters` dict.
    """
    try:
        return latin_letters[uchr]
    except KeyError:
        # ud.name() raises ValueError for characters that have no name; the
        # '' default makes those simply count as non-Latin.
        return latin_letters.setdefault(uchr, 'LATIN' in ud.name(uchr, ''))

def only_roman_chars(unistr):
    """Return True when every alphabetic character in `unistr` is Latin.

    Non-alphabetic characters are ignored, so an empty string or a string of
    digits/punctuation yields True.
    """
    for uchr in unistr:
        # isalpha suggested by John Machin
        if uchr.isalpha() and not is_latin(uchr):
            return False
    return True

In [14]:
# Build one display record per report row whose title has a collection.
records = []
processed = 0

# Iterate over the report rows themselves.  Looking each row up again with
# `barcodes.index(barcode)` was quadratic and, for duplicate barcodes, always
# returned the first matching row (and therefore the wrong volume).
for i, report_item in enumerate(report):
    bsn = report_item['bsn']

    # One network lookup per row
    new_title = NewTitle(bsn)
    if new_title.format_collection():
        print("Processing record %d: %s" % (i+1, bsn))
        processed += 1
        record = {}
        record['bsn'] = bsn
        record['title'] = new_title.format_title()
        record['char'] = only_roman_chars(record['title'])
        record['contributor'] = new_title.format_contributor()
        record['edition'] = new_title.format_edition()

        # NOTE(review): this cell used to take the imprint from the report row
        # first and then unconditionally overwrite it with format_imprint(), so
        # only the MARC-derived value ever took effect.  That effective
        # behavior is kept; confirm whether the report's imprint should win.
        record['imprint'] = new_title.format_imprint()
        record['collection'] = new_title.format_collection()
        record['series'] = new_title.format_series()

        if 'volume' in report_item:
            record['volume'] = report_item['volume'].replace('.', '. ')
        else:
            record['volume'] = ""

        record['callnumber'] = new_title.format_callnumber()
        # Normalize before the volume is appended to the call number
        record['lccn'] = callnumber.LC(record['callnumber']).normalized

        if record['volume']:
            record['callnumber'] += " " + record['volume']

        record['gift'] = new_title.format_gift()
        record['handle'] = new_title.format_handle()

        records.append(record)
    else:
        print("Processing record %d: %s RECORD SKIPPED" % (i+1, bsn))

#print('\nFinished processing %d records.' % processed)
        

Processing record 1: 1246244
Processing record 2: 002615400
Processing record 3: 1204701
Processing record 4: 005684177
Processing record 5: 5684181
Processing record 6: 005684199
Processing record 7: 005684205
Processing record 8: 5684211
Processing record 9: 005684215
Processing record 10: 4068225
Processing record 11: 05684231
Processing record 12: 5684875
Processing record 13: 4068180
Processing record 14: 002452317
Processing record 15: 002628150
Processing record 16: 002866018
Processing record 17: 002614180
Processing record 18: 002175975
Processing record 19: 002580772
Processing record 20: 002639144
Processing record 21: 002068884
Processing record 22: 001779351
Processing record 23: 002551047
Processing record 24: 002038367
Processing record 25: 003750420
Processing record 26: 005675297
Processing record 27: 001856937
Processing record 28: 005065208
Processing record 29: 005063689
Processing record 30: 002207943
Processing record 31: 002207943
Processing record 32: 002207943


In [15]:
## Choose category using call number map

import csv

# Each row is used as: class letters, range start, range end, category.
# newline='' is how the csv module expects its input file to be opened.
with open('data/ref/lc_classes.csv', 'r', newline='') as f:
    # Drop blank rows so a trailing empty line cannot raise IndexError below
    lc_classes = [row for row in csv.reader(f) if row]

for record in records:
    record['category'] = 'other'
    cn = callnumber.LC(record['callnumber'])
    cn_split = cn.components()
    if len(cn_split) > 1:
        for row in lc_classes:
            # The class letters are compared first, so the numeric conversion
            # only runs for rows of the matching class; first match wins.
            if row[0] == cn_split[0] and float(row[1]) <= float(cn_split[1]) <= float(row[2]):
                record['category'] = row[3]
                break

In [16]:
## Guess category

from data.ref.train import train
import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

from nltk.corpus import stopwords
# English, German and French stop words, concatenated in that order
stops = [word for language in ('english', 'german', 'french') for word in stopwords.words(language)]

def preprocess(text):
    """Return `text` with punctuation, '©' and ASCII digits replaced by spaces.

    Every removed character becomes exactly one space, so the length of the
    string is unchanged.
    """
    # The backslash is written as an explicit '\\'; the previous literal
    # relied on the invalid escape sequence "\]" to get the same character.
    punctuation = "\"#$%&'()*+,-/:;<=>@[\\]^_`{|}~.?!"
    symbols = "©"
    digits = "0123456789"

    # One translation table covers all three character classes in one pass
    translator = str.maketrans({key: " " for key in punctuation + symbols + digits})
    return text.translate(translator)

# Each item of `train` is used as (category, text).
data_ = [item for item in train]
# Shuffle with a dedicated, fixed-seed generator so the train/test split --
# and with it the categories predicted below -- is the same on every run.
data_ = random.Random(42).sample(data_, len(data_))

# First 2000 shuffled items train the classifier, the rest are held out.
# Slicing before preprocessing avoids cleaning every text twice.
train_data = [preprocess(item[1]) for item in data_[:2000]]
train_target = [item[0] for item in data_[:2000]]
test_data = [preprocess(item[1]) for item in data_[2000:]]
test_target = [item[0] for item in data_[2000:]]

categories = set([item[0] for item in train])

def predict_categories(titles):
    """Predict a category for each entry of `titles`.

    Fits a bag-of-words -> tf-idf -> multinomial naive Bayes pipeline on the
    module-level `train_data` / `train_target` on every call and returns the
    classifier's predictions for `titles`.
    """
    vectorizer = CountVectorizer(stop_words=stops, min_df=5)
    tfidf = TfidfTransformer()

    # Fit both transformers on the training texts, then the classifier
    train_features = tfidf.fit_transform(vectorizer.fit_transform(train_data))
    classifier = MultinomialNB().fit(train_features, train_target)

    # Push the new titles through the already-fitted transformers
    title_features = tfidf.transform(vectorizer.transform(titles))
    return classifier.predict(title_features)

titles = [record['title'] for record in records]

predicted_categories = predict_categories(titles)

# Only records still marked 'other' take the predicted category; their title
# gets a leading "*".
for record, category in zip(records, predicted_categories):
    if record['category'] != 'other':
        continue
    record['title'] = "*" + record['title']
    record['category'] = category

In [17]:
# Should record sorting be done by Flask? Still need to figure out how to sort Flask by two keys
# See https://stackoverflow.com/a/26825833; had to add '0' to avoid error for no numbers in volume

# Sort by normalized call number, then by the digits of the volume read as one
# integer.  `or ''` guards the comparison against a missing (None) 'lccn',
# which would otherwise raise TypeError when compared with a string.
# NOTE(review): assumes `normalized` can be None for call numbers the
# callnumber library cannot parse -- confirm.
records = sorted(
    records,
    key=lambda k: (k['lccn'] or '', int('0' + ''.join(filter(str.isdigit, k['volume'])))),
)
#pprint(records)

In [18]:
# Pickle dictionary for Flask
# (`records` is the sorted list of per-title dicts built above; the target
# path is relative to the notebook's working directory and must already exist)

with open('../new-titles/app/data/newtitles.p', 'wb') as f:
    pickle.dump(records, f)