In [6]:
# The %... is an iPython thing, and is not part of the Python language.
# In this case we're just telling the plotting library to draw things on
# the notebook, instead of on a separate window.
%matplotlib inline
# See all the "as ..." constructs? They're just aliasing the package names.
# That way we can call methods like plt.plot() instead of matplotlib.pyplot.plot().
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

from bs4 import BeautifulSoup
from collections import OrderedDict # provides the ordered dictionary
import re # for regular expressions used below
import urllib # to read from URLs
import networkx as nx # network analysis
import itertools
import os.path
from datetime import datetime # for time measurement
import sys
import os
import pickle

# OAI
from sickle import Sickle

def printLog(text):
    """Print `text` to stdout, prefixed with the current timestamp.

    The line has the form "[<timestamp>]<TAB><text>".

    Args:
        text: the message to print; it is concatenated to a str, so it must be a string.

    Returns:
        None
    """
    now=str(datetime.now())
    # a single parenthesised argument prints identically as a Python 2 print statement
    # and as a Python 3 print() call
    print("["+now+"]\t"+text)
    # forces to output the result of the print command immediately, see: http://stackoverflow.com/questions/230751/how-to-flush-output-of-python-print
    sys.stdout.flush()

In [100]:
from PIL import Image
# Fetch one TIFF scan directly from its URL and open it as an image.
im = Image.open(urllib.urlopen('http://digital.staatsbibliothek-berlin.de/europeana/PPN730725200/00000001.tif'))
# Copy the pixels onto a fresh RGB canvas of the same size
# (presumably so the result can be written as JPEG whatever mode the TIFF uses).
jpg = Image.new("RGB", im.size)
jpg.paste(im)
# Save the RGB copy; saving `im` here would leave the conversion above unused.
jpg.save('./img/test.jpg', 'jpeg')

KeyboardInterrupt: 

Stabi-URL:

PPN722144857

http://ngcs.staatsbibliothek-berlin.de/?action=metsImage&format=jpg&metsFile=PPN722144857&divID=PHYS_0001&width=800&rotate=0

http://ngcs.staatsbibliothek-berlin.de/?action=metsImage&format=jpg&metsFile=PPN730725200&divID=PHYS_0001&width=800&rotate=0

In [None]:
# Download the 800px wide JPEG rendering of the first physical page (PHYS_0001)
# of PPN722144857 and store it as a local file.
testImageUrl="http://ngcs.staatsbibliothek-berlin.de/?action=metsImage&format=jpg&metsFile=PPN722144857&divID=PHYS_0001&width=800&rotate=0"
testImageTarget="./img/test2.jpg"
urllib.urlretrieve(testImageUrl,testImageTarget)

In [9]:
# open a client for the OAI endpoint of the metadata repository
sickle = Sickle('http://digital.staatsbibliothek-berlin.de/oai')
# ask the data provider which sets it offers
sets = sickle.ListSets()
# headline first ("\n" starts a new line), then one line per set:
# its name and the identifier (setSpec) under which it is accessible
print("Sets provided by data provider\n* * * * * * * * * * * * * * * * * * * * * ")
for oaiSet in sets:
    setLine = "'"+oaiSet.setName+"' accessible via: '"+oaiSet.setSpec+"'"
    print(setLine)

Sets provided by data provider
* * * * * * * * * * * * * * * * * * * * * 
'Historische Drucke' accessible via: 'DC_historische.drucke'
'Theologie' accessible via: 'DC_theologie'
'Rechtswissenschaft' accessible via: 'DC_rechtswissenschaft'
'Geschichte/Ethnographie/Geographie' accessible via: 'DC_geschichte.ethnographie.geographie'
'Landwirtschaft/Forstwirtschaft' accessible via: 'DC_landwirtschaft'
'Politik/Staat/Gesellschaft/Wirtschaft' accessible via: 'DC_politik.staat.gesellschaft.wirtschaft'
'Sprachen/Literaturen' accessible via: 'DC_sprachen.literaturen'
'Aberglaube/Mystische Philosophie' accessible via: 'DC_aberglaube.mystische.philosophie'
'Naturwissenschaften/Mathematik' accessible via: 'DC_naturwissenschaften.mathematik'
'Architektur/Technik' accessible via: 'DC_architektur.technik'
'Einblattdrucke' accessible via: 'DC_einblattmaterialien'
'Philosophie/Psychologie' accessible via: 'DC_philosophie.psychologie'
'Ostasiatica' accessible via: 'DC_ostasiatica'
'Musik' accessible via

In [21]:
# Request the records of the repository's 'DC_all' set in Dublin Core ('oai_dc') format.
# (A thematic set such as 'DC_krieg.1914.1918' could be passed instead to restrict the download.)
oaiSetSpec = 'DC_all'
records = sickle.ListRecords(metadataPrefix='oai_dc', set=oaiSetSpec)

In [5]:
#firstRecord=records.next()
#print type(firstRecord)
#print "* * * * * * * * * * * * * * * * *"
#print "Header:"
#print "* * * * * * * * * * * * * * * * *"
#print firstRecord.header
#print "* * * * * * * * * * * * * * * * *"
#print "Metadata:"
#print "* * * * * * * * * * * * * * * * *"
#print firstRecord.metadata

<class 'sickle.models.Record'>
* * * * * * * * * * * * * * * * *
Header:
* * * * * * * * * * * * * * * * *
<header xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><identifier>oai:digital.staatsbibliothek-berlin.de:PPN749835605</identifier><datestamp>2016-01-06</datestamp><setSpec>krieg.1914.1918</setSpec><setSpec>historische.drucke</setSpec></header>
* * * * * * * * * * * * * * * * *
Metadata:
* * * * * * * * * * * * * * * * *
{'object': ['http://digital.staatsbibliothek-berlin.de/europeana/PPN749835605/00000001.tif'], 'identifier': ['http://resolver.staatsbibliothek-berlin.de/SBB0000D1D200460000', 'PPN749835605'], 'rights': ['Open Access'], 'issued': ['1918'], 'format': ['image/jpeg'], 'source': ['Nachrichtenblatt vom. [s. l.] @]  Berlin 1918'], 'relation': [u'Nachrichtenblatt vom  f\xc3\xbcr die Kameraden im Kampf zwischen Somme u. Oise aus der "Stafette in Champagne und Argonnen". [s. l.] @]  Berlin ; 71.1918'], 'spatial': ['[s. l.]

In [22]:
printLog("Starting OAI record download...")
# upper limit of records to fetch; a previous run took about 2:15 h for 100k records.
# For quick tests use a small value such as 100. ATTENTION! Larger values take more time for reading data.
maxDocs=100000
# number of records stored so far (stays 0 if the repository returns nothing)
savedDocs=0
# metadata of every downloaded record, in download order
savedRecords=[]

# Save the records locally as we don't want to have to rely on a connection to the OAI-PMH server all the time.
# islice stops after exactly maxDocs records, so no extra record is pulled from the
# iterator and thrown away once the limit is reached.
# ATTENTION! `records` is consumed while iterating: re-running this cell continues with the
# records that have not been read yet instead of starting over.
for savedDocs,record in enumerate(itertools.islice(records,maxDocs),1):
    # keep only the metadata of the record, not the record object itself
    savedRecords.append(record.metadata)
    # report progress every 1000 records
    if savedDocs%1000==0:
        printLog("Downloaded %d of %d records."%(savedDocs,maxDocs))

printLog("Finished OAI download of "+str(len(savedRecords))+" records.")

[2016-01-24 09:54:39.370596]	Starting OAI record download...
[2016-01-24 09:54:41.475471]	Downloaded 100 of 100000 records.
[2016-01-24 09:54:45.313401]	Downloaded 200 of 100000 records.
[2016-01-24 09:54:50.126847]	Downloaded 300 of 100000 records.
[2016-01-24 09:54:55.354825]	Downloaded 400 of 100000 records.
[2016-01-24 09:55:00.996481]	Downloaded 500 of 100000 records.
[2016-01-24 09:55:04.888680]	Downloaded 600 of 100000 records.
[2016-01-24 09:55:10.605015]	Downloaded 700 of 100000 records.
[2016-01-24 09:55:15.993761]	Downloaded 800 of 100000 records.
[2016-01-24 09:55:22.091152]	Downloaded 900 of 100000 records.
[2016-01-24 09:55:26.269960]	Downloaded 1000 of 100000 records.
[2016-01-24 09:55:34.970975]	Downloaded 1100 of 100000 records.
[2016-01-24 09:55:38.459564]	Downloaded 1200 of 100000 records.
[2016-01-24 09:55:43.047749]	Downloaded 1300 of 100000 records.
[2016-01-24 09:55:47.678718]	Downloaded 1400 of 100000 records.
[2016-01-24 09:55:51.159252]	Downloaded 1500 of 1000

In [23]:
# Persist the downloaded metadata so later sessions do not have to query the OAI server again.
# The with-block closes the file (and thereby flushes it to disk) even if pickling fails.
with open( "save_100k_dc_all.pickle", "wb" ) as pickleFile:
    pickle.dump( savedRecords, pickleFile )

In [55]:
# To work from the local copy instead of a fresh download, load it first:
#pickledRecords=pickle.load( open( "save_100k_dc_all.pickle", "rb" ) )

# count in how many records each metadata field occurs
availableKeys=dict()
for r in savedRecords:
    for k in r.keys():
        availableKeys[k]=availableKeys.get(k,0)+1

# split the records by position: indices 0,2,4,... and 1,3,5,...
evenRecords=savedRecords[0::2]
oddRecords=savedRecords[1::2]

# for inspection:
#print availableKeys
#print "\n"+str(savedRecords[956])

# store both halves; the with-blocks make sure the files are closed after writing
with open( "even_dc_all.pickle", "wb" ) as evenFile:
    pickle.dump( evenRecords, evenFile )
with open( "odd_dc_all.pickle", "wb" ) as oddFile:
    pickle.dump( oddRecords, oddFile )

In [74]:
# Notes on the metadata records:
# * every field is stored as an array
# * publisher: remove u"Staatsbibliothek zu Berlin \xe2\x80\x93 Preu\xc3\x9fischer Kulturbesitz, Germany"
# * object: image representing the item
# * type: monograph or similar
# * title
# * spatial: place
# * subject: classifiers such as "Theologie"
# * identifier[1] = PPN

import urllib # to read from URLs
import subprocess as subp

# even records: Macbook
# odd records: iMac

#savedRecords=pickle.load( open( "even_dc_all.pickle", "rb" ) )

# directory receiving the temporary TIFFs and the resulting JPEGs
downloadDir="./tmp/"
if not os.path.isdir(downloadDir):
    os.makedirs(downloadDir)

countSavedRecords=len(savedRecords)
printLog("Started image download and processing. This will take a while...")
# the with-block closes the log file even if the loop is interrupted by an error
with open("downloadIssues.txt", "w") as logFile:
    for i,record in enumerate(savedRecords):
        if i%1000==0:
            printLog("Downloading image %d of %d images."%(i,countSavedRecords))
        # the PPN is the second identifier if there is one, otherwise the only identifier is used
        if len(record["identifier"])>1:
            ppn=str(record["identifier"][1])
        else:
            ppn=str(record["identifier"][0])
        ppnTIFF=ppn+".tif"
        ppnJPEGPath=downloadDir+ppn+".jpg"

        # records without an image URL can only be logged
        if "object" not in record:
            logFile.write("[OBJECT key missing]: %s\n\n" % str(record))
            continue
        # prevent downloading of already present files
        if os.path.isfile(ppnJPEGPath):
            continue

        objectUrl=record["object"][0]
        # check for the HTTP error code, maybe the file does not exist.
        # Only the URL is passed: a second positional argument to urlopen is sent as POST data.
        response=urllib.urlopen(objectUrl)
        try:
            httpCode=response.getcode()
            responseHeaders=response.info()
        finally:
            response.close()

        if httpCode!=200:
            print("Problem with accessing "+ppnTIFF+ " due to HTTP code: "+str(httpCode))
            # parentheses are required: % binds tighter than +
            logFile.write("[HTTP]: %s\n\n" % ("Problem with accessing "+ppnTIFF))
            logFile.write("HTTP Code: "+str(httpCode)+"\n")
            # headers of the failed response (the former code read them from an
            # unrelated or undefined urlretrieve result)
            logFile.write(str(responseHeaders)+"\n\n")
            continue

        urllib.urlretrieve(objectUrl,downloadDir+ppnTIFF)
        # mogrify writes a resized JPEG next to the TIFF
        ret=subp.call(["mogrify", "-resize","512x512","-format", "jpg",downloadDir+ppnTIFF])
        if ret!=0:
            print("Problem with mogrifying "+ppnTIFF)
            logFile.write("[MOGRIFY]: %s \n%s\n\n" % (str("Problem with mogrifying "+ppnTIFF),str("Downloaded from: "+record["object"][0])))
        # the TIFF is no longer needed once mogrify has run
        try:
            os.remove(downloadDir+ppnTIFF)
        except OSError:
            print("Problem with removing "+ppnTIFF)
            logFile.write("[REMOVAL]: %s\n\n" % ("Problem with removing "+ppnTIFF))
print("\n")
printLog("Finished image download and processing.")

[2016-01-27 18:21:05.231457]	Started image download and processing. This will take a while...
[2016-01-27 18:21:05.234335]	Downloading image 0 of 50000 images.
[2016-01-27 18:21:06.543482]	Downloading image 1000 of 50000 images.
[2016-01-27 18:21:28.756233]	Downloading image 2000 of 50000 images.
[2016-01-27 18:21:53.311774]	Downloading image 3000 of 50000 images.
[2016-01-27 18:22:54.538869]	Downloading image 4000 of 50000 images.
Problem with accessing PPN746838220.tif due to HTTP code: 404
Problem with accessing PPN746838271.tif due to HTTP code: 404
Problem with accessing PPN746838387.tif due to HTTP code: 404
[2016-01-27 18:23:06.550982]	Downloading image 5000 of 50000 images.
Problem with accessing PPN746578210.tif due to HTTP code: 403
Problem with accessing PPN821908359.tif due to HTTP code: 404
[2016-01-27 18:23:22.070141]	Downloading image 6000 of 50000 images.
[2016-01-27 18:23:26.061865]	Downloading image 7000 of 50000 images.
[2016-01-27 18:23:26.078859]	Downloading image 

Ergibt die Bilder "PPN813124174-0.jpg"/"PPN813124174-1.jpg", wobei eines von schlechter Qualität ist.

Wenn keine Bilder heruntergeladen werden konnten, handelt es sich in der Regel um folgende Typen:

* Periodical
* Multivolume work

An die METS-Daten kommt man über http://digital.staatsbibliothek-berlin.de/metsresolver/?PPN=PPN721220665

In [None]:
# create a dictionary for the records: one list of values per metadata field
values=dict()
# for the sake of simplicity, take the fields' names from the first metadata record
# (we ignore that these fields differ from record to record).
# savedRecords holds the metadata dictionaries themselves, so they are accessed directly.
keys=list(savedRecords[0].keys()) if savedRecords else []
# the image URL column is always needed, even if the first record has no "object" field
if "object" not in keys:
    keys.append("object")
# for every metadata field, create an empty array as the content of the dictionary filed under the key 'k'
for k in keys:
    values[k]=[]

values["PPN"]=[]

# iterate over all saved records
for record in savedRecords:
    # we cannot iterate over the keys of the record directly because not all records contain the same fields
    for k in keys:
        # if the metadata field 'k' is not available (or empty) input "None" instead;
        # only the first value of each field is kept
        fieldValues=record.get(k) or ["None"]
        values[k].append(fieldValues[0].encode('ISO-8859-1'))
    # get the PPN: the second identifier if there is one, otherwise the only one.
    # Exactly one PPN is appended per record so that all columns keep the same length.
    identifiers=record.get("identifier") or []
    if len(identifiers)>1:
        values["PPN"].append(identifiers[1])
    elif identifiers:
        values["PPN"].append(identifiers[0])
    else:
        values["PPN"].append("None")
# create a data frame from the collected values
df=pd.DataFrame(values)
df.shape

In [None]:
# show the first five rows of the data frame
df.head(n=5)

In [None]:
# Download the TIFF of every record that has an image URL.
# The columns are addressed by name so the loop does not depend on the column order.
# NOTE(review): the former code used row[1] and row[6] of df.itertuples(); these are
# assumed to be the "PPN" and "object" columns - confirm against df.columns.
imageDir="./img.krieg/"
for ppn,objectUrl in zip(df["PPN"],df["object"]):
    # "None" marks records without an image URL
    if objectUrl == "None":
        print("Could not download: "+ppn)
        continue
    filePath=imageDir+ppn+".tif"
    # prevent downloading of already present files
    if not os.path.isfile(filePath):
        urllib.urlretrieve(objectUrl,filePath)

print("Downloading finished.")

In [None]:
# export the data frame as UTF-8 encoded, tab-separated text:
# with a header row, without the index column
csvOptions=dict(sep="\t",encoding='utf-8',header=True,index=False)
df.to_csv('krieg.csv',**csvOptions)