In [156]:
# The %... is an iPython thing, and is not part of the Python language.
# In this case we're just telling the plotting library to draw things on
# the notebook, instead of on a separate window.
%matplotlib inline
# See all the "as ..." contructs? They're just aliasing the package names.
# That way we can call methods like plt.plot() instead of matplotlib.pyplot.plot().
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

from bs4 import BeautifulSoup
from collections import OrderedDict # provides the ordered dictionary
import re # for regular expressions used below
import urllib # to read from URLs
import networkx as nx # network analysis
import itertools
import os.path
from datetime import datetime # for time measurement
import sys
import os
import pickle

# OAI
from sickle import Sickle

def printLog(text):
    now=str(datetime.now())
    print "["+now+"]\t"+text
    # forces to output the result of the print command immediately, see: http://stackoverflow.com/questions/230751/how-to-flush-output-of-python-print
    sys.stdout.flush()

In [100]:
from PIL import Image
im = Image.open(urllib.urlopen('http://digital.staatsbibliothek-berlin.de/europeana/PPN730725200/00000001.tif'))
jpg = Image.new("RGB", im.size)
jpg.paste(im)
im.save('./img/test.jpg', 'jpeg')

KeyboardInterrupt: 

Stabi-URL:

PPN722144857

http://ngcs.staatsbibliothek-berlin.de/?action=metsImage&format=jpg&metsFile=PPN722144857&divID=PHYS_0001&width=800&rotate=0

http://ngcs.staatsbibliothek-berlin.de/?action=metsImage&format=jpg&metsFile=PPN730725200&divID=PHYS_0001&width=800&rotate=0

In [None]:
urllib.urlretrieve("http://ngcs.staatsbibliothek-berlin.de/?action=metsImage&format=jpg&metsFile=PPN722144857&divID=PHYS_0001&width=800&rotate=0","./img/test2.jpg")

In [146]:
# connect to a metadata repository
sickle = Sickle('http://digital.staatsbibliothek-berlin.de/oai')
# get the sets from the data provider connected to
sets = sickle.ListSets()
# print the returned sets including their identifiers
print "Sets provided by data provider\n* * * * * * * * * * * * * * * * * * * * * " # \n creates a new line
for s in sets:
    print "'"+s.setName+"' accessible via: '"+s.setSpec+"'"

Sets provided by data provider
* * * * * * * * * * * * * * * * * * * * * 
'Historische Drucke' accessible via: 'DC_historische.drucke'
'Theologie' accessible via: 'DC_theologie'
'Rechtswissenschaft' accessible via: 'DC_rechtswissenschaft'
'Geschichte/Ethnographie/Geographie' accessible via: 'DC_geschichte.ethnographie.geographie'
'Landwirtschaft/Forstwirtschaft' accessible via: 'DC_landwirtschaft'
'Politik/Staat/Gesellschaft/Wirtschaft' accessible via: 'DC_politik.staat.gesellschaft.wirtschaft'
'Sprachen/Literaturen' accessible via: 'DC_sprachen.literaturen'
'Aberglaube/Mystische Philosophie' accessible via: 'DC_aberglaube.mystische.philosophie'
'Naturwissenschaften/Mathematik' accessible via: 'DC_naturwissenschaften.mathematik'
'Architektur/Technik' accessible via: 'DC_architektur.technik'
'Einblattdrucke' accessible via: 'DC_einblattmaterialien'
'Philosophie/Psychologie' accessible via: 'DC_philosophie.psychologie'
'Ostasiatica' accessible via: 'DC_ostasiatica'
'Musik' accessible via

In [147]:
# get the records from this repository's specific document set 'DC_krieg.1914.1918' (documents related to World War I) 
# using Dublin Core format 
records = sickle.ListRecords(metadataPrefix='oai_dc', set='DC_all')

In [5]:
#firstRecord=records.next()
#print type(firstRecord)
#print "* * * * * * * * * * * * * * * * *"
#print "Header:"
#print "* * * * * * * * * * * * * * * * *"
#print firstRecord.header
#print "* * * * * * * * * * * * * * * * *"
#print "Metadata:"
#print "* * * * * * * * * * * * * * * * *"
#print firstRecord.metadata

<class 'sickle.models.Record'>
* * * * * * * * * * * * * * * * *
Header:
* * * * * * * * * * * * * * * * *
<header xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><identifier>oai:digital.staatsbibliothek-berlin.de:PPN749835605</identifier><datestamp>2016-01-06</datestamp><setSpec>krieg.1914.1918</setSpec><setSpec>historische.drucke</setSpec></header>
* * * * * * * * * * * * * * * * *
Metadata:
* * * * * * * * * * * * * * * * *
{'object': ['http://digital.staatsbibliothek-berlin.de/europeana/PPN749835605/00000001.tif'], 'identifier': ['http://resolver.staatsbibliothek-berlin.de/SBB0000D1D200460000', 'PPN749835605'], 'rights': ['Open Access'], 'issued': ['1918'], 'format': ['image/jpeg'], 'source': ['Nachrichtenblatt vom. [s. l.] @]  Berlin 1918'], 'relation': [u'Nachrichtenblatt vom  f\xc3\xbcr die Kameraden im Kampf zwischen Somme u. Oise aus der "Stafette in Champagne und Argonnen". [s. l.] @]  Berlin ; 71.1918'], 'spatial': ['[s. l.]

In [148]:
printLog("Starting OAI record download...")
# initialize some variables for counting and saving the metadata records
savedDocs=0
# 2:15 h for 100k
maxDocs=120000 # 100 is just for testing, for more interesting results increase this value to 1000. ATTENTION! this will also take more time for reading data.
savedRecords=[]

# save the records locally as we don't want to have to rely on a connection to the OAI-PMH server all the time
# iterate over all records until maxDocs is reached
# ATTENTION! if you re-run this cell, the contents of the savedRecords array will be altered!
for record in records:
    # check if we reach the maximum document value
    if savedDocs<maxDocs:
        savedDocs=savedDocs+1
        # save the current record to the "savedRecords" array
        savedRecords.append(record.metadata)
        if savedDocs%1000==0:
            printLog("Downloaded %d of %d records."%(savedDocs,maxDocs))
    # if so, end the processing of the for-loop
    else:
        break # break ends the processing of the loop

printLog("Finished OAI download of "+str(len(savedRecords))+" records.")

[2016-01-28 18:27:42.651279]	Starting OAI record download...
[2016-01-28 18:28:11.857042]	Downloaded 1000 of 120000 records.
[2016-01-28 18:28:40.865639]	Downloaded 2000 of 120000 records.
[2016-01-28 18:29:11.575680]	Downloaded 3000 of 120000 records.
[2016-01-28 18:29:41.792284]	Downloaded 4000 of 120000 records.
[2016-01-28 18:30:26.794156]	Downloaded 5000 of 120000 records.
[2016-01-28 18:31:11.363767]	Downloaded 6000 of 120000 records.
[2016-01-28 18:31:56.732200]	Downloaded 7000 of 120000 records.
[2016-01-28 18:32:33.579985]	Downloaded 8000 of 120000 records.
[2016-01-28 18:33:13.931280]	Downloaded 9000 of 120000 records.
[2016-01-28 18:33:55.102198]	Downloaded 10000 of 120000 records.
[2016-01-28 18:34:40.986906]	Downloaded 11000 of 120000 records.
[2016-01-28 18:35:33.162897]	Downloaded 12000 of 120000 records.
[2016-01-28 18:36:33.102607]	Downloaded 13000 of 120000 records.
[2016-01-28 18:37:48.131282]	Downloaded 14000 of 120000 records.
[2016-01-28 18:38:53.888490]	Downloade

In [150]:
len(savedRecords)

120000

In [149]:
pickle.dump( savedRecords, open( "save_120k_dc_all.pickle", "wb" ) )

In [55]:
#pickledRecords=pickle.load( open( "save_100k_dc_all.pickle", "rb" ) )
availableKeys=dict()

evenRecords=[]
oddRecords=[]
for i,r in enumerate(savedRecords):
    for k in r.keys():
        if not k in availableKeys:
            availableKeys[k]=1
        else:
            availableKeys[k]=availableKeys[k]+1
    if i%2==0:
        evenRecords.append(r)
    else:
        oddRecords.append(r)

#pickle.dump( evenRecords, open( "even_dc_all.pickle", "wb" ) )
#pickle.dump( oddRecords, open( "odd_dc_all.pickle", "wb" ) )

In [152]:
# immer als Array hinterlegt
# bei publisher entfernen: u"Staatsbibliothek zu Berlin \xe2\x80\x93 Preu\xc3\x9fischer Kulturbesitz, Germany"
# object: Bild zur Repräsentation
# type: Monographie o.ä.
# title
# spatial Ort
# subject Klassifikatoren wie Theologie
# identifier[1] = PPN

import urllib # to read from URLs
import subprocess as subp

# even: Macbook
# odd: iMac

# * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
#
# If you set 'allowDownloads' to True, the next steps will take a lot of time
#
# * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
allowDownloads=False

#savedRecords=pickle.load( open( "save_100k_dc_all.pickle", "rb" ) )
countSavedRecords=len(savedRecords)
printLog("Started image download and processing. This will take a while...")
logFile = open("/Volumes/2TB_WD/sbb_images/downloadIssues.txt", "w")
#logFile = open("./downloadIssues.txt", "w")

for i,record in enumerate(savedRecords):
    if i%1000==0:
        printLog("Downloading image %d of %d images."%(i,countSavedRecords))
    downloadDir="/Volumes/2TB_WD/sbb_images/tmp/"
    #downloadDir="./tmp/"
    ppn=""
    if len(record["identifier"])>1:
        ppn=str(record["identifier"][1])
    else:
        ppn=str(record["identifier"][0])
    ppnTIFF=ppn+".tif"
    ppnJPEGPAth=downloadDir+ppn+".jpg"
    if "object" in record.keys():
        # prevent downloading of already present files
        if not os.path.isfile(ppnJPEGPAth) :
            # check for the HTTP error code, maybe the file does not exist
            httpCode=urllib.urlopen(record["object"][0],downloadDir+ppnTIFF).getcode()
            if httpCode==200:
                if allowDownloads:
                    urlinfo=urllib.urlretrieve(record["object"][0],downloadDir+ppnTIFF)
                    ret=subp.call(["mogrify", "-resize","512x512","-format", "jpg",downloadDir+ppnTIFF])
                    if ret!=0:
                        print "Problem with mogrifying "+ppnTIFF
                        logFile.write("[MOGRIFY]: %s \n%s\n\n" % (str("Problem with mogrifying "+ppnTIFF),str("Downloaded from: "+record["object"][0])))
                    ret=subp.call(["rm",downloadDir+ppnTIFF])
                    if ret!=0:
                        print "Problem with removing "+ppnTIFF
                        logFile.write("[REMOVAL]: %s\n\n" % "Problem with removing "+ppnTIFF)
            else:
                print "Problem with accessing "+ppnTIFF+ " due to HTTP code: "+str(httpCode)
                logFile.write("[HTTP]: %s\n\n" % "Problem with accessing "+ppnTIFF)
                logFile.write("HTTP Code: "+str(httpCode)+"\n")
                logFile.write(str(urlinfo[1])+"\n\n")
    else:
        logFile.write("[OBJECT key missing]: %s\n\n" % str(record))
logFile.close()
print "\n"
printLog("Finished image download and processing.")

[2016-01-28 20:48:05.614923]	Started image download and processing. This will take a while...
[2016-01-28 20:48:06.688921]	Downloading image 0 of 120000 images.
[2016-01-28 21:04:49.002119]	Downloading image 1000 of 120000 images.
[2016-01-28 21:04:49.028423]	Downloading image 2000 of 120000 images.
[2016-01-28 21:04:50.377403]	Downloading image 3000 of 120000 images.
[2016-01-28 21:05:41.241220]	Downloading image 4000 of 120000 images.
[2016-01-28 21:05:48.729935]	Downloading image 5000 of 120000 images.
Problem with accessing PPN75097740X.tif due to HTTP code: 403
[2016-01-28 21:05:56.379565]	Downloading image 6000 of 120000 images.
[2016-01-28 21:06:55.477859]	Downloading image 7000 of 120000 images.
[2016-01-28 21:07:34.706122]	Downloading image 8000 of 120000 images.
[2016-01-28 21:09:36.415265]	Downloading image 9000 of 120000 images.
Problem with accessing PPN746838255.tif due to HTTP code: 404
Problem with accessing PPN746838409.tif due to HTTP code: 404
Problem with accessing 

85.566 Bilder

ergibt Bilder "PPN813124174-0.jpg"/"PPN813124174-1.jpg", wobei eins von schlechter Qualitaet ist

wenn kein bilder gedownloadet werden konnten, dann handelt es sich in der regel um folgende types:

* Periodical
* Multivolume work

an die METS-Daten kommt man über http://digital.staatsbibliothek-berlin.de/metsresolver/?PPN=PPN721220665

In [153]:
print availableKeys

{'rights': 100000, 'publisher': 47649, 'object': 85197, 'description': 7588, 'title': 100000, 'issued': 47677, 'format': 100000, 'source': 100000, 'creator': 76774, 'dataProvider': 100000, 'spatial': 81285, 'provider': 100000, 'date': 47677, 'isShownAt': 99994, 'alternative': 24543, 'identifier': 100000, 'type': 100000, 'relation': 23946, 'subject': 100000}


In [107]:
# create a dictionary for the records
values=dict()
# take the keys as they have found within the downloaded OAI records
keys=availableKeys.keys()
# for every metadata field, create an empty array as the content of the dictionary filed under the key 'k'
for k in keys:
    values[k]=[]
# in addition, store the PPN (the SBB's unique identifier for digitized content)    
values["PPN"]=[]

# iterate over all saved records
for record in savedRecords:
    # we cannot iterate over the keys of record.metadata directly because not all records cotain the same fields,...
    for k in keys:
        # thus we check if the metadata field 'k' has been created above
        if k in values:
            # append the metadata fields to the dictionary created above
            # if the metadata field 'k' is not available input "None" instead
            #values[k].append(record.get(k,["None"])[0].encode('ISO-8859-1'))
            if k in record:
                value=record.get(k)[0]
                if value.isdigit():
                    value=int(value)
                else:
                    value=value.encode('ISO-8859-1')
                values[k].append(value)
                # get the PPN
                if k=="identifier":
                    if len(record["identifier"])>1:
                        ppn=str(record.get(k)[1])
                    else:
                        ppn=str(record.get(k)[0])
                    values["PPN"].append(ppn)
            else:
                values[k].append(np.nan)
# create a data frame from the 
df=pd.DataFrame(pd.to_numeric(values,errors='coerce'))
df.shape

(50000, 20)

In [165]:
df.head(100)

Unnamed: 0,PPN,alternative,creator,dataProvider,date,description,format,identifier,isShownAt,issued,object,provider,publisher,relation,rights,source,spatial,subject,title,type
0,PPN716641283,,"Colizzi, Giovanni Andrea",Staatsbibliothek zu Berlin - Preußischer Kultu...,,,image/jpeg,http://resolver.staatsbibliothek-berlin.de/SBB...,http://resolver.staatsbibliothek-berlin.de/SBB...,,http://digital.staatsbibliothek-berlin.de/euro...,Staatsbibliothek zu Berlin - Preußischer Kultu...,,,Open Access,"Colizzi, Giovanni Andrea: Schriftprobe Giovann...",,Schlüsselseiten,Schriftprobe Giovanni Andrea Colizzi (1792),Text
1,PPN83459269X,,"Fleeg, Johann Both, Hans-Ulrich",Staatsbibliothek zu Berlin - Preußischer Kultu...,1711,,image/jpeg,http://resolver.staatsbibliothek-berlin.de/SBB...,http://resolver.staatsbibliothek-berlin.de/SBB...,1711,http://digital.staatsbibliothek-berlin.de/euro...,Staatsbibliothek zu Berlin - Preußischer Kultu...,Struck,,Open Access,"Fleeg, Johann Both, Hans-Ulrich: Den auf Gott...",Lubeck,Historische Drucke,"Den auf Gott sich verlassenden David, Trug aus...",Monograph
2,PPN840970862,,"Wiedeburg, Friedrich August Seidel, Christoph...",Staatsbibliothek zu Berlin - Preußischer Kultu...,1797,,image/jpeg,http://resolver.staatsbibliothek-berlin.de/SBB...,http://resolver.staatsbibliothek-berlin.de/SBB...,1797,http://digital.staatsbibliothek-berlin.de/euro...,Staatsbibliothek zu Berlin - Preußischer Kultu...,Fleckeisen,,Open Access,"Wiedeburg, Friedrich August Seidel, Christoph...",Helmstädt,Historische Drucke,Charakterzüge Herrn Christoph Matthias Seidels...,Monograph
3,PPN743984099,Polizeiblatt für Mecklenburg,,Staatsbibliothek zu Berlin - Preußischer Kultu...,,,image/jpeg,http://resolver.staatsbibliothek-berlin.de/SBB...,http://resolver.staatsbibliothek-berlin.de/SBB...,,,Staatsbibliothek zu Berlin - Preußischer Kultu...,,,Open Access,Der Wächter Polizeiblatt für Mecklenburg. Sch...,Schwerin,Rechtswissenschaft,Der Wächter,Periodical
4,PPN841234620,,"Menzel, Adolph",Staatsbibliothek zu Berlin - Preußischer Kultu...,1890,,image/jpeg,http://resolver.staatsbibliothek-berlin.de/SBB...,http://resolver.staatsbibliothek-berlin.de/SBB...,1890,http://digital.staatsbibliothek-berlin.de/euro...,Staatsbibliothek zu Berlin - Preußischer Kultu...,,,Open Access,"Menzel, Adolph: Adolf v. Menzel. [s. l.] Berl...",[s. l.],Einblattdrucke,Adolf v. Menzel,Monograph
5,PPN837381703,,"A., Eva Maria",Staatsbibliothek zu Berlin - Preußischer Kultu...,1696,,image/jpeg,http://resolver.staatsbibliothek-berlin.de/SBB...,http://resolver.staatsbibliothek-berlin.de/SBB...,1696,http://digital.staatsbibliothek-berlin.de/euro...,Staatsbibliothek zu Berlin - Preußischer Kultu...,Johann Jonathan Felsecker Erben,,Open Access,"A., Eva Maria: Uber dem Verlust Des ersten Pfa...",[Nürnberg],Historische Drucke,Uber dem Verlust Des ersten Pfandes Seiner Ehe...,Monograph
6,PPN843637315,op. 32 ; für Violine und Pianoforte,"Lipinski, Ch.",Staatsbibliothek zu Berlin - Preußischer Kultu...,,,image/jpeg,http://resolver.staatsbibliothek-berlin.de/SBB...,http://resolver.staatsbibliothek-berlin.de/SBB...,,http://digital.staatsbibliothek-berlin.de/euro...,Staatsbibliothek zu Berlin - Preußischer Kultu...,Hofmeister,,Open Access,"Lipinski, Ch.: 4me concerto (A) op. 32 ; für ...",Leipzig,Musiknoten,4me concerto (A),Monograph
7,PPN837374227,,"Götz, Johann Christoph",Staatsbibliothek zu Berlin - Preußischer Kultu...,1687,,image/jpeg,http://resolver.staatsbibliothek-berlin.de/SBB...,http://resolver.staatsbibliothek-berlin.de/SBB...,1687,http://digital.staatsbibliothek-berlin.de/euro...,Staatsbibliothek zu Berlin - Preußischer Kultu...,"Spörlin, Johann Michael",,Open Access,"Götz, Johann Christoph: Zu frühe von dem Tod a...",Nürnberg,Historische Drucke,Zu frühe von dem Tod abgedrungene Klag Welche ...,Monograph
8,PPN843865857,für eine Singstimme mit Begleitung des Pianoforte,"Grimmer, Christian Friedrich Franz, Robert",Staatsbibliothek zu Berlin - Preußischer Kultu...,,,image/jpeg,http://resolver.staatsbibliothek-berlin.de/SBB...,http://resolver.staatsbibliothek-berlin.de/SBB...,,http://digital.staatsbibliothek-berlin.de/euro...,Staatsbibliothek zu Berlin - Preußischer Kultu...,Breitkopf & Härtel,,Open Access,"Grimmer, Christian Friedrich Franz, Robert: 2...",Leipzig,Musiknoten,20 Balladen und Romanzen im Volkston,Monograph
9,PPN827742231,,,Staatsbibliothek zu Berlin - Preußischer Kultu...,1863,,image/jpeg,http://resolver.staatsbibliothek-berlin.de/SBB...,http://resolver.staatsbibliothek-berlin.de/SBB...,1863,http://digital.staatsbibliothek-berlin.de/euro...,Staatsbibliothek zu Berlin - Preußischer Kultu...,,Annalen der Landwirthschaft in den Königlich P...,Open Access,Annalen der Landwirthschaft in den Königlich P...,Berlin,Historische Drucke,Annalen der Landwirthschaft in den Königlich P...,Volume


In [137]:
df[df.PPN.isnull()].count()

PPN             0
alternative     0
creator         0
dataProvider    0
date            0
description     0
format          0
identifier      0
isShownAt       0
issued          0
object          0
provider        0
publisher       0
relation        0
rights          0
source          0
spatial         0
subject         0
title           0
type            0
dtype: int64

In [124]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50000 entries, 0 to 49999
Data columns (total 20 columns):
PPN             50000 non-null object
alternative     12330 non-null object
creator         38376 non-null object
dataProvider    50000 non-null object
date            23793 non-null object
description     3735 non-null object
format          50000 non-null object
identifier      50000 non-null object
isShownAt       49996 non-null object
issued          23793 non-null object
object          42598 non-null object
provider        50000 non-null object
publisher       23798 non-null object
relation        11991 non-null object
rights          50000 non-null object
source          50000 non-null object
spatial         40574 non-null object
subject         50000 non-null object
title           50000 non-null object
type            50000 non-null object
dtypes: object(20)
memory usage: 8.0+ MB


In [174]:
#uniqueNames=len(df.NameCaps.unique())
colNames=df.columns.values.tolist()
for colName in colNames:
    print colName+";\t unique values:\t"+str(len(df[colName].unique()))+ "\t total count: "+str(df[colName].count())

PPN;	 unique values:	49994	 total count: 50000
alternative;	 unique values:	11661	 total count: 12330
creator;	 unique values:	21213	 total count: 38376
dataProvider;	 unique values:	1	 total count: 50000
date;	 unique values:	526	 total count: 23793
description;	 unique values:	3656	 total count: 3735
format;	 unique values:	1	 total count: 50000
identifier;	 unique values:	49993	 total count: 50000
isShownAt;	 unique values:	49990	 total count: 49996
issued;	 unique values:	526	 total count: 23793
object;	 unique values:	42599	 total count: 42598
provider;	 unique values:	1	 total count: 50000
publisher;	 unique values:	8184	 total count: 23798
relation;	 unique values:	8366	 total count: 11991
rights;	 unique values:	1	 total count: 50000
source;	 unique values:	43077	 total count: 50000
spatial;	 unique values:	4669	 total count: 40574
subject;	 unique values:	36	 total count: 50000
title;	 unique values:	36525	 total count: 50000
type;	 unique values:	6	 total count: 50000


In [363]:
p = re.compile('\d\d\d\d')

rawText="Januar x2012x"
m = p.search(rawText)
# deal with missing years of first appearance
if m:
    firstAppearance=m.group()
    print firstAppearance
    
# zum matchen: p.match
# regular expressions taken from: http://stackoverflow.com/questions/1449817/what-are-some-of-the-most-useful-regular-expressions-for-programmers
# extended by me

patterns=dict()

patterns["positiveInteger"]="^\d+$"
patterns["negativeInteger"]="^-\d+$"
patterns["generalInteger"]="^-?\d+$"
patterns["positiveFloat"]="^\d*\.?\d+$"
patterns["negativeFloat"]="^-\d*\.?\d+$"
patterns["generalFloat"]="^-?\d*\.?\d+$"
patterns["positiveGermanFloat"]="^\d*,?\d+$"
patterns["negativeGermanFloat"]="^-\d*,?\d+$"
patterns["generalGermanFloat"]="^-?\d*,?\d+$"
# Date (dd mm yyyy, d/m/yyyy, etc.), in range 1000-2099 without proper February handling
patterns["dateVariant"]="^([1-9]|0[1-9]|[12][0-9]|3[01])\D([1-9]|0[1-9]|1[012])\D(1[0-9][0-9][0-9]|20[0-9][0-9])$"
patterns["year"]="(1[0-9][0-9][0-9]|20[0-9][0-9])"
patterns["email"]="^[_]*([a-z0-9]+(\.|_*)?)+@([a-z][a-z0-9-]+(\.|-*\.))+[a-z]{2,6}$"
patterns["domain"]="^([a-z][a-z0-9-]+(\.|-*\.))+[a-z]{2,6}$"
patterns["url"]="^https?\:\/\/[a-zA-Z0-9.-]+\.[a-zA-Z]{2,3}\/?$"
patterns["ipv4"]="^(?:\d{1,3}\.){3}\d{1,3}$"
patterns["rgbHex"]="^#?([a-fA-F0-9]{6}|[a-fA-F0-9]{3})$"
patterns["generalHex"]="^#?[a-fA-F0-9]*$"
# year range with splitter "- / :", the splitter can be surrounded by an arbitrary amount of whitespaces 
patterns["yearRange"]="^\s*(1[0-9][0-9][0-9]|20[0-9][0-9])\s*(\-|\/|:)\s*(1[0-9][0-9][0-9]|20[0-9][0-9])\s*$"
patterns["dateVariantRange"]="^\s*([1-9]|0[1-9]|[12][0-9]|3[01])\D([1-9]|0[1-9]|1[012])\D(1[0-9][0-9][0-9]|20[0-9][0-9])\s*(\-|\/|:)\s*([1-9]|0[1-9]|[12][0-9]|3[01])\D([1-9]|0[1-9]|1[012])\D(1[0-9][0-9][0-9]|20[0-9][0-9])\s*$"
patterns["isbnPrefix"]="^ISBN(-1(?:(0)|3))?:?\x20(\s)*[0-9]+[- ][0-9]+[- ][0-9]+[- ][0-9]*[- ]*[xX0-9]$"
patterns["isbn"]="^[0-9]+[- ][0-9]+[- ][0-9]+[- ][0-9]*[- ]*[xX0-9]$"
patterns["NaN"]="[Nn][Aa][Nn]"

rawText="2099"

p=re.compile(patterns["year"])
m = p.search(rawText)
if m:
    firstAppearance=m.group()
    print firstAppearance
else:
    print "No match."

2012
2099


In [335]:
#df.to_csv('krieg.csv',index=False,header=True,encoding='utf-8', sep="\t")

In [350]:
df2=df[df.date.notnull()]
df2.head()

Unnamed: 0,PPN,alternative,creator,dataProvider,date,description,format,identifier,isShownAt,issued,object,provider,publisher,relation,rights,source,spatial,subject,title,type
1,PPN83459269X,,"Fleeg, Johann Both, Hans-Ulrich",Staatsbibliothek zu Berlin - Preußischer Kultu...,1711,,image/jpeg,http://resolver.staatsbibliothek-berlin.de/SBB...,http://resolver.staatsbibliothek-berlin.de/SBB...,1711,http://digital.staatsbibliothek-berlin.de/euro...,Staatsbibliothek zu Berlin - Preußischer Kultu...,Struck,,Open Access,"Fleeg, Johann Both, Hans-Ulrich: Den auf Gott...",Lubeck,Historische Drucke,"Den auf Gott sich verlassenden David, Trug aus...",Monograph
2,PPN840970862,,"Wiedeburg, Friedrich August Seidel, Christoph...",Staatsbibliothek zu Berlin - Preußischer Kultu...,1797,,image/jpeg,http://resolver.staatsbibliothek-berlin.de/SBB...,http://resolver.staatsbibliothek-berlin.de/SBB...,1797,http://digital.staatsbibliothek-berlin.de/euro...,Staatsbibliothek zu Berlin - Preußischer Kultu...,Fleckeisen,,Open Access,"Wiedeburg, Friedrich August Seidel, Christoph...",Helmstädt,Historische Drucke,Charakterzüge Herrn Christoph Matthias Seidels...,Monograph
4,PPN841234620,,"Menzel, Adolph",Staatsbibliothek zu Berlin - Preußischer Kultu...,1890,,image/jpeg,http://resolver.staatsbibliothek-berlin.de/SBB...,http://resolver.staatsbibliothek-berlin.de/SBB...,1890,http://digital.staatsbibliothek-berlin.de/euro...,Staatsbibliothek zu Berlin - Preußischer Kultu...,,,Open Access,"Menzel, Adolph: Adolf v. Menzel. [s. l.] Berl...",[s. l.],Einblattdrucke,Adolf v. Menzel,Monograph
5,PPN837381703,,"A., Eva Maria",Staatsbibliothek zu Berlin - Preußischer Kultu...,1696,,image/jpeg,http://resolver.staatsbibliothek-berlin.de/SBB...,http://resolver.staatsbibliothek-berlin.de/SBB...,1696,http://digital.staatsbibliothek-berlin.de/euro...,Staatsbibliothek zu Berlin - Preußischer Kultu...,Johann Jonathan Felsecker Erben,,Open Access,"A., Eva Maria: Uber dem Verlust Des ersten Pfa...",[Nürnberg],Historische Drucke,Uber dem Verlust Des ersten Pfandes Seiner Ehe...,Monograph
7,PPN837374227,,"Götz, Johann Christoph",Staatsbibliothek zu Berlin - Preußischer Kultu...,1687,,image/jpeg,http://resolver.staatsbibliothek-berlin.de/SBB...,http://resolver.staatsbibliothek-berlin.de/SBB...,1687,http://digital.staatsbibliothek-berlin.de/euro...,Staatsbibliothek zu Berlin - Preußischer Kultu...,"Spörlin, Johann Michael",,Open Access,"Götz, Johann Christoph: Zu frühe von dem Tod a...",Nürnberg,Historische Drucke,Zu frühe von dem Tod abgedrungene Klag Welche ...,Monograph


In [366]:
rowCount=0
histogram=dict()
for row in df2.iterrows():
    rowCount=rowCount+1
    readDate=str(row[1]["date"])
    for key in patterns:
        p=re.compile(patterns[key])
        m = p.search(readDate)
        if m:
            if not key in histogram:
                histogram[key]=0
            histogram[key]=histogram[key]+1
        else:
            pass
print "Row count: "+str(rowCount)
print histogram    

Row count: 23793
{'positiveFloat': 23554, 'generalGermanFloat': 23554, 'generalFloat': 23554, 'yearRange': 1, 'generalHex': 23554, 'positiveInteger': 23554, 'rgbHex': 1, 'positiveGermanFloat': 23554, 'year': 23558, 'generalInteger': 23554}


In [343]:
df2.date.astype(int).plot()
# date contains strings like "1817-1819"

ValueError: invalid literal for long() with base 10: '1817-1819'