In [99]:
# The %... is an IPython "magic" command, and is not part of the Python language.
# In this case we're just telling the plotting library to draw things on
# the notebook, instead of on a separate window.
%matplotlib inline
# See all the "as ..." constructs? They're just aliasing the package names.
# That way we can call methods like plt.plot() instead of matplotlib.pyplot.plot().
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
# pandas display settings: line width of 500 characters, up to 100 columns shown
pd.options.display.width = 500
pd.options.display.max_columns = 100
# enable the HTML representation of data frames inside the notebook
pd.options.display.notebook_repr_html = True
import seaborn as sns
# select seaborn's "whitegrid" style for all following plots
sns.set_style("whitegrid")
# select seaborn's "poster" plotting context
sns.set_context("poster")

from bs4 import BeautifulSoup
from collections import OrderedDict # provides the ordered dictionary
import re # for regular expressions used below
import urllib # to read from URLs
import networkx as nx # network analysis
import itertools
import os.path

# OAI
from sickle import Sickle

In [100]:
from PIL import Image

# open the first page scan (a TIFF file) directly from the digital library
im = Image.open(urllib.urlopen('http://digital.staatsbibliothek-berlin.de/europeana/PPN730725200/00000001.tif'))
# copy the pixels onto a fresh RGB image of the same size
# (presumably because the JPEG writer does not accept every mode a TIFF can have -- confirm)
jpg = Image.new("RGB", im.size)
jpg.paste(im)
# save the RGB copy (not `im`), otherwise the conversion above has no effect
jpg.save('./img/test.jpg', 'jpeg')

KeyboardInterrupt: 

Stabi-URL:

PPN722144857

http://ngcs.staatsbibliothek-berlin.de/?action=metsImage&format=jpg&metsFile=PPN722144857&divID=PHYS_0001&width=800&rotate=0

http://ngcs.staatsbibliothek-berlin.de/?action=metsImage&format=jpg&metsFile=PPN730725200&divID=PHYS_0001&width=800&rotate=0

In [None]:
# fetch the image behind the URL (first argument) and store it locally (second argument)
urllib.urlretrieve(
    "http://ngcs.staatsbibliothek-berlin.de/?action=metsImage&format=jpg&metsFile=PPN722144857&divID=PHYS_0001&width=800&rotate=0",
    "./img/test2.jpg",
)

In [None]:
# open a connection to the OAI interface of the metadata repository
sickle = Sickle('http://digital.staatsbibliothek-berlin.de/oai')
# ask the data provider which sets it offers
sets = sickle.ListSets()
# print each set's name together with the identifier (setSpec) under which it can be requested
print("Sets provided by data provider\n* * * * * * * * * * * * * * * * * * * * * ")  # \n creates a new line
for oai_set in sets:
    print("'" + oai_set.setName + "' accessible via: '" + oai_set.setSpec + "'")

In [None]:
# request the records of the repository's document set 'DC_krieg.1914.1918'
# (documents related to World War I) in Dublin Core format
records = sickle.ListRecords(
    set='DC_krieg.1914.1918',
    metadataPrefix='oai_dc',
)

In [None]:
# take a look at the first record to see how a record is structured
firstRecord = next(records)
# `records` is an iterator, so fetching the first record removes it from the sequence.
# Put it back in front, otherwise the cells below would silently skip this record.
records = itertools.chain([firstRecord], records)

separator = "* * * * * * * * * * * * * * * * *"
print(type(firstRecord))
print(separator)
print("Header:")
print(separator)
print(firstRecord.header)
print(separator)
print("Metadata:")
print(separator)
print(firstRecord.metadata)

In [None]:
# maximum number of metadata records that are fetched and kept locally
maxDocs = 1000  # use a smaller value (e.g. 100) for quick tests. ATTENTION! larger values take more time for reading data.

# save the records locally as we don't want to have to rely on a connection to the OAI-PMH server all the time.
# islice takes at most maxDocs records and stops without pulling (and throwing away) an additional one.
# ATTENTION! if you re-run this cell, the iterator continues where it stopped, so the contents of savedRecords will change!
savedRecords = list(itertools.islice(records, maxDocs))
# number of records that were actually saved (smaller than maxDocs if the set holds fewer records)
savedDocs = len(savedRecords)

In [None]:
def _entry_or_none(entries, position=0):
    """Return entries[position], or the placeholder string "None" if there is no such entry.

    entries  -- list of values of one metadata field, or None if the record lacks the field
    position -- index of the wanted value within that list
    """
    if entries and len(entries) > position:
        return entries[position]
    return "None"


# for the sake of simplicity, take the fields' names from the first metadata record
# (we ignore that these fields differ from record to record)
keys = list(savedRecords[0].metadata.keys())

# the "object" column has to exist even if the first record does not contain such a field
columns = list(keys)
if "object" not in columns:
    columns.append("object")

# create a dictionary for the records: one empty list per column, plus one for the PPN
values = dict((k, []) for k in columns)
values["PPN"] = []

# iterate over all saved records
for record in savedRecords:
    # we cannot iterate over the keys of record.metadata directly because not all records contain the same fields
    for k in columns:
        # take the first value of the field; missing or empty fields become "None".
        # The values are kept as they come from the record instead of being encoded to ISO-8859-1,
        # because byte strings in that encoding do not fit the UTF-8 CSV export at the end of the notebook.
        values[k].append(_entry_or_none(record.metadata.get(k)))
    # the PPN is taken from the second "identifier" entry ("None" if the record has fewer than two).
    # It is appended for every record so that all columns keep the same length.
    values["PPN"].append(_entry_or_none(record.metadata.get("identifier"), 1))

# create a data frame from the collected columns
df = pd.DataFrame(values)
df.shape

In [None]:
# display the first five rows of the data frame
df.head(n=5)

In [None]:
# positions inside the tuples returned by df.itertuples(): position 0 is the index, the columns start at 1
PPN_POS = 1  # row[1] is used as the file name
URL_POS = 6  # row[6] is used as the download URL -- NOTE(review): depends on the column order of df, confirm
targetDir = "./img.krieg/"

# make sure the target directory exists before anything is written into it
if not os.path.isdir(targetDir):
    os.makedirs(targetDir)

for row in df.itertuples():
    if not row[URL_POS] == "None":
        filePath = targetDir + row[PPN_POS] + ".tif"
        # prevent downloading of already present files
        if not os.path.isfile(filePath):
            # download under a temporary name first, so that an interrupted transfer
            # is not mistaken for a complete file when the cell is run again
            partPath = filePath + ".part"
            try:
                urllib.urlretrieve(row[URL_POS], partPath)
                os.rename(partPath, filePath)
            except IOError as error:
                # a single failed download should not stop the remaining ones
                if os.path.isfile(partPath):
                    os.remove(partPath)
                print("Could not download: " + row[PPN_POS] + " (" + repr(error) + ")")
    else:
        print("Could not download: " + row[PPN_POS])

print("Downloading finished.")

In [None]:
# write the data frame to a tab-separated, UTF-8 encoded file with a header row and without the index
df.to_csv(
    'krieg.csv',
    sep="\t",
    encoding='utf-8',
    header=True,
    index=False,
)