In [1]:
# The %... is an iPython thing, and is not part of the Python language.
# In this case we're just telling the plotting library to draw things on
# the notebook, instead of on a separate window.
%matplotlib inline
# See all the "as ..." constructs? They're just aliasing the package names.
# That way we can call methods like plt.plot() instead of matplotlib.pyplot.plot().
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

from bs4 import BeautifulSoup
from collections import OrderedDict # provides the ordered dictionary
import re # for regular expressions used below
import urllib # to read from URLs
import networkx as nx # network analysis
import itertools
import os.path
from datetime import datetime # for time measurement
import sys
import os
import pickle

# OAI
from sickle import Sickle

def printLog(text):
    """Print ``text`` to stdout, prefixed with the current timestamp.

    The emitted line has the form ``[<timestamp>]<TAB><text>``. ``text`` is
    rendered with ``%s`` formatting, so non-string arguments (numbers, lists,
    ...) are accepted instead of raising a TypeError on concatenation.
    """
    now=str(datetime.now())
    # a single parenthesised argument prints identically under the Python 2
    # print statement and the Python 3 print function
    print("[%s]\t%s" % (now, text))
    # forces to output the result of the print command immediately, see: http://stackoverflow.com/questions/230751/how-to-flush-output-of-python-print
    sys.stdout.flush()



In [398]:
!pip install jellyfish

Collecting jellyfish
  Downloading jellyfish-0.5.1.tar.gz
Building wheels for collected packages: jellyfish
  Running setup.py bdist_wheel for jellyfish
  Stored in directory: /Users/david/Library/Caches/pip/wheels/f5/bc/a0/bd291f78df7f71a18628b771df11d0086357579b1484db21d5
Successfully built jellyfish
Installing collected packages: jellyfish
Successfully installed jellyfish-0.5.1
[33mYou are using pip version 7.1.2, however version 8.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [100]:
from PIL import Image
# fetch the first page scan (TIFF) of one document and store it as a JPEG
im = Image.open(urllib.urlopen('http://digital.staatsbibliothek-berlin.de/europeana/PPN730725200/00000001.tif'))
# JPEG cannot hold every pixel mode a TIFF may use, so the scan is pasted
# onto an RGB canvas of the same size first
jpg = Image.new("RGB", im.size)
jpg.paste(im)
# save the RGB copy; the cell used to save `im`, which threw the conversion away
jpg.save('./img/test.jpg', 'jpeg')

KeyboardInterrupt: 

Stabi-URL:

PPN722144857

http://ngcs.staatsbibliothek-berlin.de/?action=metsImage&format=jpg&metsFile=PPN722144857&divID=PHYS_0001&width=800&rotate=0

http://ngcs.staatsbibliothek-berlin.de/?action=metsImage&format=jpg&metsFile=PPN730725200&divID=PHYS_0001&width=800&rotate=0

In [None]:
# download the 800 pixel wide JPEG rendering (width=800) of the page addressed
# by divID=PHYS_0001 of PPN722144857 into the local image folder
metsImageURL="http://ngcs.staatsbibliothek-berlin.de/?action=metsImage&format=jpg&metsFile=PPN722144857&divID=PHYS_0001&width=800&rotate=0"
testImagePath="./img/test2.jpg"
urllib.urlretrieve(metsImageURL,testImagePath)

In [146]:
# open a client for the OAI endpoint at digital.staatsbibliothek-berlin.de
sickle = Sickle('http://digital.staatsbibliothek-berlin.de/oai')
# ask the data provider which sets it offers
sets = sickle.ListSets()
# headline, then (after the "\n" line break) a separator row of asterisks
print("Sets provided by data provider\n* * * * * * * * * * * * * * * * * * * * * ")
# one line per set: its display name and the setSpec under which it can be requested
for oaiSet in sets:
    setLine="'"+oaiSet.setName+"' accessible via: '"+oaiSet.setSpec+"'"
    print(setLine)

Sets provided by data provider
* * * * * * * * * * * * * * * * * * * * * 
'Historische Drucke' accessible via: 'DC_historische.drucke'
'Theologie' accessible via: 'DC_theologie'
'Rechtswissenschaft' accessible via: 'DC_rechtswissenschaft'
'Geschichte/Ethnographie/Geographie' accessible via: 'DC_geschichte.ethnographie.geographie'
'Landwirtschaft/Forstwirtschaft' accessible via: 'DC_landwirtschaft'
'Politik/Staat/Gesellschaft/Wirtschaft' accessible via: 'DC_politik.staat.gesellschaft.wirtschaft'
'Sprachen/Literaturen' accessible via: 'DC_sprachen.literaturen'
'Aberglaube/Mystische Philosophie' accessible via: 'DC_aberglaube.mystische.philosophie'
'Naturwissenschaften/Mathematik' accessible via: 'DC_naturwissenschaften.mathematik'
'Architektur/Technik' accessible via: 'DC_architektur.technik'
'Einblattdrucke' accessible via: 'DC_einblattmaterialien'
'Philosophie/Psychologie' accessible via: 'DC_philosophie.psychologie'
'Ostasiatica' accessible via: 'DC_ostasiatica'
'Musik' accessible via

In [147]:
# get the records of the set 'DC_all' from this repository using the Dublin Core format ('oai_dc').
# NOTE: this comment used to name the World War I set 'DC_krieg.1914.1918'; the call below requests 'DC_all'.
records = sickle.ListRecords(metadataPrefix='oai_dc', set='DC_all')

In [5]:
#firstRecord=records.next()
#print type(firstRecord)
#print "* * * * * * * * * * * * * * * * *"
#print "Header:"
#print "* * * * * * * * * * * * * * * * *"
#print firstRecord.header
#print "* * * * * * * * * * * * * * * * *"
#print "Metadata:"
#print "* * * * * * * * * * * * * * * * *"
#print firstRecord.metadata

<class 'sickle.models.Record'>
* * * * * * * * * * * * * * * * *
Header:
* * * * * * * * * * * * * * * * *
<header xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><identifier>oai:digital.staatsbibliothek-berlin.de:PPN749835605</identifier><datestamp>2016-01-06</datestamp><setSpec>krieg.1914.1918</setSpec><setSpec>historische.drucke</setSpec></header>
* * * * * * * * * * * * * * * * *
Metadata:
* * * * * * * * * * * * * * * * *
{'object': ['http://digital.staatsbibliothek-berlin.de/europeana/PPN749835605/00000001.tif'], 'identifier': ['http://resolver.staatsbibliothek-berlin.de/SBB0000D1D200460000', 'PPN749835605'], 'rights': ['Open Access'], 'issued': ['1918'], 'format': ['image/jpeg'], 'source': ['Nachrichtenblatt vom. [s. l.] @]  Berlin 1918'], 'relation': [u'Nachrichtenblatt vom  f\xc3\xbcr die Kameraden im Kampf zwischen Somme u. Oise aus der "Stafette in Champagne und Argonnen". [s. l.] @]  Berlin ; 71.1918'], 'spatial': ['[s. l.]

In [148]:
printLog("Starting OAI record download...")
# upper bound on the number of records to keep. The recorded run needed about 2:15 h for 100k records.
# ATTENTION! Use a small value (e.g. 100) for testing; larger values take correspondingly more time for reading data.
maxDocs=120000
# number of records stored so far
savedDocs=0
savedRecords=[]

# save the records locally as we don't want to have to rely on a connection to the OAI-PMH server all the time.
# islice stops pulling from `records` once maxDocs items were taken; the previous
# for/else-break loop pulled one additional record before it noticed the limit.
# ATTENTION! `records` is presumably a one-shot iterator: re-running this cell would then
# continue after the last record read instead of starting over, so savedRecords will differ.
for record in itertools.islice(records,maxDocs):
    savedDocs=savedDocs+1
    # keep only the metadata dictionary of the current record
    savedRecords.append(record.metadata)
    if savedDocs%1000==0:
        printLog("Downloaded %d of %d records."%(savedDocs,maxDocs))

printLog("Finished OAI download of "+str(len(savedRecords))+" records.")

[2016-01-28 18:27:42.651279]	Starting OAI record download...
[2016-01-28 18:28:11.857042]	Downloaded 1000 of 120000 records.
[2016-01-28 18:28:40.865639]	Downloaded 2000 of 120000 records.
[2016-01-28 18:29:11.575680]	Downloaded 3000 of 120000 records.
[2016-01-28 18:29:41.792284]	Downloaded 4000 of 120000 records.
[2016-01-28 18:30:26.794156]	Downloaded 5000 of 120000 records.
[2016-01-28 18:31:11.363767]	Downloaded 6000 of 120000 records.
[2016-01-28 18:31:56.732200]	Downloaded 7000 of 120000 records.
[2016-01-28 18:32:33.579985]	Downloaded 8000 of 120000 records.
[2016-01-28 18:33:13.931280]	Downloaded 9000 of 120000 records.
[2016-01-28 18:33:55.102198]	Downloaded 10000 of 120000 records.
[2016-01-28 18:34:40.986906]	Downloaded 11000 of 120000 records.
[2016-01-28 18:35:33.162897]	Downloaded 12000 of 120000 records.
[2016-01-28 18:36:33.102607]	Downloaded 13000 of 120000 records.
[2016-01-28 18:37:48.131282]	Downloaded 14000 of 120000 records.
[2016-01-28 18:38:53.888490]	Downloade

In [150]:
# number of metadata records collected in savedRecords
len(savedRecords)

120000

In [149]:
# persist the downloaded metadata so that it can be reloaded without querying the OAI server again;
# the with-block closes the file (flushing it to disk) even if pickling fails
with open( "save_120k_dc_all.pickle", "wb" ) as pickleFile:
    pickle.dump( savedRecords, pickleFile )

In [55]:
#pickledRecords=pickle.load( open( "save_100k_dc_all.pickle", "rb" ) )
# count in how many records each metadata key occurs
availableKeys=dict()
for r in savedRecords:
    for k in r.keys():
        availableKeys[k]=availableKeys.get(k,0)+1

# split the records by their position in the list:
# indices 0, 2, 4, ... go to evenRecords, indices 1, 3, 5, ... to oddRecords
evenRecords=savedRecords[0::2]
oddRecords=savedRecords[1::2]

#pickle.dump( evenRecords, open( "even_dc_all.pickle", "wb" ) )
#pickle.dump( oddRecords, open( "odd_dc_all.pickle", "wb" ) )

In [152]:
# Notes on the Dublin Core records:
# - every field is stored as an array
# - for publisher, remove: u"Staatsbibliothek zu Berlin \xe2\x80\x93 Preu\xc3\x9fischer Kulturbesitz, Germany"
# - object: image representing the item
# - type: monograph or similar
# - title
# - spatial: place
# - subject: classifiers such as "Theologie"
# - identifier[1] = PPN

import urllib # to read from URLs
import subprocess as subp

# even: Macbook
# odd: iMac

# * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
#
# If you set 'allowDownloads' to True, the next steps will take a lot of time
#
# * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
allowDownloads=False

#savedRecords=pickle.load( open( "save_100k_dc_all.pickle", "rb" ) )
countSavedRecords=len(savedRecords)
# the target directory is the same for every record, so it is set once
downloadDir="/Volumes/2TB_WD/sbb_images/tmp/"
#downloadDir="./tmp/"
printLog("Started image download and processing. This will take a while...")
logFile = open("/Volumes/2TB_WD/sbb_images/downloadIssues.txt", "w")
#logFile = open("./downloadIssues.txt", "w")

# try/finally closes the issue log even if a request or a file operation raises
try:
    for i,record in enumerate(savedRecords):
        if i%1000==0:
            printLog("Downloading image %d of %d images."%(i,countSavedRecords))
        # the PPN is the second identifier if there is more than one, otherwise the only one
        identifiers=record["identifier"]
        if len(identifiers)>1:
            ppn=str(identifiers[1])
        else:
            ppn=str(identifiers[0])
        ppnTIFF=ppn+".tif"
        ppnJPEGPAth=downloadDir+ppn+".jpg"
        if "object" not in record:
            # without an "object" entry there is no image URL to fetch
            logFile.write("[OBJECT key missing]: %s\n\n" % str(record))
            continue
        # prevent downloading of already present files
        if os.path.isfile(ppnJPEGPAth):
            continue
        imageURL=record["object"][0]
        # check for the HTTP error code, maybe the file does not exist.
        # Only the URL is passed: a second positional argument is the request body
        # and would turn the check into a POST.
        response=urllib.urlopen(imageURL)
        try:
            httpCode=response.getcode()
            responseHeaders=response.info()
        finally:
            response.close()
        if httpCode!=200:
            print("Problem with accessing "+ppnTIFF+ " due to HTTP code: "+str(httpCode))
            # parentheses are required: % binds tighter than +, so the file name used to be appended after the blank lines
            logFile.write("[HTTP]: %s\n\n" % ("Problem with accessing "+ppnTIFF))
            logFile.write("HTTP Code: "+str(httpCode)+"\n")
            # headers of the failed response; they used to be read from `urlinfo`,
            # which only exists after a successful download
            logFile.write(str(responseHeaders)+"\n\n")
            continue
        if not allowDownloads:
            continue
        urllib.urlretrieve(imageURL,downloadDir+ppnTIFF)
        # let ImageMagick write a JPEG of at most 512x512 pixels next to the TIFF
        ret=subp.call(["mogrify", "-resize","512x512","-format", "jpg",downloadDir+ppnTIFF])
        if ret!=0:
            print("Problem with mogrifying "+ppnTIFF)
            logFile.write("[MOGRIFY]: %s \n%s\n\n" % (str("Problem with mogrifying "+ppnTIFF),str("Downloaded from: "+record["object"][0])))
        # the TIFF is only an intermediate file; delete it without spawning a shell command
        try:
            os.remove(downloadDir+ppnTIFF)
        except OSError:
            print("Problem with removing "+ppnTIFF)
            logFile.write("[REMOVAL]: %s\n\n" % ("Problem with removing "+ppnTIFF))
finally:
    logFile.close()
print("\n")
printLog("Finished image download and processing.")

[2016-01-28 20:48:05.614923]	Started image download and processing. This will take a while...
[2016-01-28 20:48:06.688921]	Downloading image 0 of 120000 images.
[2016-01-28 21:04:49.002119]	Downloading image 1000 of 120000 images.
[2016-01-28 21:04:49.028423]	Downloading image 2000 of 120000 images.
[2016-01-28 21:04:50.377403]	Downloading image 3000 of 120000 images.
[2016-01-28 21:05:41.241220]	Downloading image 4000 of 120000 images.
[2016-01-28 21:05:48.729935]	Downloading image 5000 of 120000 images.
Problem with accessing PPN75097740X.tif due to HTTP code: 403
[2016-01-28 21:05:56.379565]	Downloading image 6000 of 120000 images.
[2016-01-28 21:06:55.477859]	Downloading image 7000 of 120000 images.
[2016-01-28 21:07:34.706122]	Downloading image 8000 of 120000 images.
[2016-01-28 21:09:36.415265]	Downloading image 9000 of 120000 images.
Problem with accessing PPN746838255.tif due to HTTP code: 404
Problem with accessing PPN746838409.tif due to HTTP code: 404
Problem with accessing 

85.566 Bilder

ergibt Bilder "PPN813124174-0.jpg"/"PPN813124174-1.jpg", wobei eins von schlechter Qualitaet ist

Wenn keine Bilder heruntergeladen werden konnten, dann handelt es sich in der Regel um folgende Typen:

* Periodical
* Multivolume work

an die METS-Daten kommt man über http://digital.staatsbibliothek-berlin.de/metsresolver/?PPN=PPN721220665

In [2]:
# load the records
printLog("Loading pickled records...")
# NOTE: pickle.load can execute code embedded in the file - only load pickles written by this notebook.
# The with-block closes the file once the records are read.
with open( "save_120k_dc_all.pickle", "rb" ) as pickleFile:
    savedRecords=pickle.load( pickleFile )
printLog("Finished loading pickled records.")

# count in how many records each metadata key occurs
availableKeys=dict()
for r in savedRecords:
    for k in r.keys():
        availableKeys[k]=availableKeys.get(k,0)+1

print(availableKeys)

# take the keys as they have been found within the downloaded OAI records
keys=list(availableKeys.keys())
# create a dictionary for the records: one (initially empty) list per metadata field
values=dict()
for k in keys:
    values[k]=[]
# in addition, store the PPN (the SBB's unique identifier for digitized content)
values["PPN"]=[]

# iterate over all saved records
for record in savedRecords:
    # we cannot iterate over the keys of the record directly because not all records contain the same fields
    for k in keys:
        if k in record:
            # only the first entry of a field is kept
            value=record.get(k)[0]
            if value.isdigit():
                value=int(value)
            else:
                # presumably undoes a wrong decoding of UTF-8 bytes in the harvested text - TODO confirm
                value=value.encode('ISO-8859-1')
            values[k].append(value)
        else:
            # the field is not available in this record
            values[k].append(np.nan)
    # get the PPN: the second identifier if there is more than one, otherwise the first.
    # A record without identifier gets NaN so that all columns keep the same length.
    identifiers=record.get("identifier")
    if identifiers:
        if len(identifiers)>1:
            ppn=str(identifiers[1])
        else:
            ppn=str(identifiers[0])
        values["PPN"].append(ppn)
    else:
        values["PPN"].append(np.nan)
# create a data frame from the collected columns.
# The former pd.to_numeric(values, errors='coerce') wrapper was dropped: to_numeric works on
# lists/arrays/Series, not on a dict of columns, and the recorded df.info() output (all columns
# of dtype object) shows it never converted anything.
df=pd.DataFrame(values)
df.shape

[2016-01-31 21:09:45.429186]	Loading pickled records...
[2016-01-31 21:10:29.096854]	Finished loading pickled records.
{'rights': 120000, 'publisher': 60862, 'object': 102054, 'description': 9794, 'identifier': 120000, 'title': 120000, 'type': 120000, 'format': 120000, 'source': 120000, 'issued': 64751, 'spatial': 98511, 'provider': 120000, 'date': 64751, 'isShownAt': 119994, 'alternative': 32089, 'dataProvider': 120000, 'creator': 92532, 'relation': 27570, 'subject': 120000}


(120000, 20)

In [390]:
# preview the first rows of the record frame
df.head()

Unnamed: 0,PPN,alternative,creator,dataProvider,date,description,format,identifier,isShownAt,issued,object,provider,publisher,relation,rights,source,spatial,subject,title,type
0,PPN818409886,,"Averroes,",Staatsbibliothek zu Berlin - Preußischer Kultu...,,,image/jpeg,http://resolver.staatsbibliothek-berlin.de/SBB...,http://resolver.staatsbibliothek-berlin.de/SBB...,,http://digital.staatsbibliothek-berlin.de/euro...,Staatsbibliothek zu Berlin - Preußischer Kultu...,,,Open Access,"Averroes,: Großer Kommentar über die Metaphysi...",,Hebräische Handschriften,Großer Kommentar über die Metaphysik des Arist...,Text
1,PPN743983386,,,Staatsbibliothek zu Berlin - Preußischer Kultu...,,,image/jpeg,http://www.bibliothek.uni-regensburg.de/ezeit/...,http://www.bibliothek.uni-regensburg.de/ezeit/...,,,Staatsbibliothek zu Berlin - Preußischer Kultu...,,,Open Access,Allgemeines Polizei-Archiv für Preussen. Berli...,Berlin,Rechtswissenschaft,Allgemeines Polizei-Archiv für Preussen,Periodical
2,PPN839758545,Auff eine anmutige bekandte Melodey appliciret...,"Jordano, Alberto Läger, Joachimus",Staatsbibliothek zu Berlin - Preußischer Kultu...,1650.0,,image/jpeg,http://resolver.staatsbibliothek-berlin.de/SBB...,http://resolver.staatsbibliothek-berlin.de/SBB...,1650.0,http://digital.staatsbibliothek-berlin.de/euro...,Staatsbibliothek zu Berlin - Preußischer Kultu...,,,Open Access,"Jordano, Alberto Läger, Joachimus: Der XLII. ...",[S.l.],Einblattdrucke,Der XLII. Psalm Davids,Monograph
3,PPN828810257,Enthaltend verschiedene geheime Nachrichten vo...,"Maubert de Gouvest, Jean Henri",Staatsbibliothek zu Berlin - Preußischer Kultu...,1755.0,,image/jpeg,http://resolver.staatsbibliothek-berlin.de/SBB...,http://resolver.staatsbibliothek-berlin.de/SBB...,1755.0,http://digital.staatsbibliothek-berlin.de/euro...,Staatsbibliothek zu Berlin - Preußischer Kultu...,Haude und Spener,,Open Access,"Maubert de Gouvest, Jean Henri: Der erlauchte ...",Berlin,Historische Drucke,Der erlauchte Bauer oder Lebensgeschichte und ...,Monograph
4,PPN743984099,Polizeiblatt für Mecklenburg,,Staatsbibliothek zu Berlin - Preußischer Kultu...,,,image/jpeg,http://resolver.staatsbibliothek-berlin.de/SBB...,http://resolver.staatsbibliothek-berlin.de/SBB...,,,Staatsbibliothek zu Berlin - Preußischer Kultu...,,,Open Access,Der Wächter Polizeiblatt für Mecklenburg. Sch...,Schwerin,Rechtswissenschaft,Der Wächter,Periodical


In [72]:
# select the rows without a PPN; count() then reports the non-null entries per column
# among those rows, so all zeros means that no record lacks a PPN
df[df.PPN.isnull()].count()

PPN             0
alternative     0
creator         0
dataProvider    0
date            0
description     0
format          0
identifier      0
isShownAt       0
issued          0
object          0
provider        0
publisher       0
relation        0
rights          0
source          0
spatial         0
subject         0
title           0
type            0
spatialClean    0
dtype: int64

In [392]:
# column overview: non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 120000 entries, 0 to 119999
Data columns (total 20 columns):
PPN             120000 non-null object
alternative     32089 non-null object
creator         92532 non-null object
dataProvider    120000 non-null object
date            64751 non-null object
description     9794 non-null object
format          120000 non-null object
identifier      120000 non-null object
isShownAt       119994 non-null object
issued          64751 non-null object
object          102054 non-null object
provider        120000 non-null object
publisher       60862 non-null object
relation        27570 non-null object
rights          120000 non-null object
source          120000 non-null object
spatial         98511 non-null object
subject         120000 non-null object
title           120000 non-null object
type            120000 non-null object
dtypes: object(20)
memory usage: 19.2+ MB


In [71]:
def uniqueValues(currentDataFrame):
    """Print one summary line per column of ``currentDataFrame``.

    Each line shows the column label, the number of distinct values and the
    number of non-null entries. The distinct count is taken from
    ``Series.unique()`` and therefore includes a missing value (NaN/None) as
    one value, whereas the total from ``Series.count()`` skips missing values.
    """
    for colName in currentDataFrame.columns.values.tolist():
        column=currentDataFrame[colName]
        # %-formatting also copes with non-string column labels; the single
        # parenthesised argument prints identically on Python 2 and 3
        print("%s;\t\t unique values:\t%d\t total count: %d" % (colName,len(column.unique()),column.count()))

# report distinct and non-null counts for every column of the record frame
uniqueValues(df)

PPN;		 unique values:	119956	 total count: 120000
alternative;		 unique values:	29238	 total count: 32089
creator;		 unique values:	44469	 total count: 92532
dataProvider;		 unique values:	1	 total count: 120000
date;		 unique values:	571	 total count: 64751
description;		 unique values:	9500	 total count: 9794
format;		 unique values:	1	 total count: 120000
identifier;		 unique values:	119938	 total count: 120000
isShownAt;		 unique values:	119933	 total count: 119994
issued;		 unique values:	571	 total count: 64751
object;		 unique values:	102053	 total count: 102054
provider;		 unique values:	1	 total count: 120000
publisher;		 unique values:	14755	 total count: 60862
relation;		 unique values:	16791	 total count: 27570
rights;		 unique values:	1	 total count: 120000
source;		 unique values:	101804	 total count: 120000
spatial;		 unique values:	8054	 total count: 98511
subject;		 unique values:	44	 total count: 120000
title;		 unique values:	85471	 total count: 120000
type;		 unique

* https://www.maxmind.com/en/free-world-cities-database
* http://www.geonames.org/export/
* http://www.opengeocode.org/download.php#cities
* https://en.wikipedia.org/wiki/Lists_of_cities_by_country

In [85]:
# for matching use: p.match
# regular expressions taken from: http://stackoverflow.com/questions/1449817/what-are-some-of-the-most-useful-regular-expressions-for-programmers
# extended by me
#
# All patterns are raw strings so that regex escapes such as \d or \. reach the re module
# unchanged instead of relying on Python passing unknown string escapes through.

patterns=dict()

patterns["positiveInteger"]=r"^\d+$"
patterns["negativeInteger"]=r"^-\d+$"
patterns["generalInteger"]=r"^-?\d+$"
patterns["positiveFloat"]=r"^\d*\.\d+$"
patterns["negativeFloat"]=r"^-\d*\.\d+$"
patterns["generalFloat"]=r"^-?\d*\.\d+$"
# German notation: decimal comma instead of decimal point
patterns["positiveGermanFloat"]=r"^\d*,\d+$"
patterns["negativeGermanFloat"]=r"^-\d*,\d+$"
patterns["generalGermanFloat"]=r"^-?\d*,\d+$"
# Date (dd mm yyyy, d/m/yyyy, etc.), in range 1000-2099 without proper February handling
patterns["dateVariant"]=r"^([1-9]|0[1-9]|[12][0-9]|3[01])\D([1-9]|0[1-9]|1[012])\D(1[0-9][0-9][0-9]|20[0-9][0-9])$"
# "year"/"century"/"decade" cover 1000-2099; the "ancient" variants also accept three-digit years
patterns["year"]=r"^(1[0-9][0-9][0-9]|20[0-9][0-9])$"
patterns["ancientYear"]=r"^([0-1]?[0-9][0-9][0-9]|20[0-9][0-9])$"
patterns["century"]=r"^(1[0-9][Xx][Xx]|20[Xx][Xx])$"
patterns["ancientCentury"]=r"^([0-1]?[0-9][Xx][Xx]|20[Xx][Xx])$"
patterns["decade"]=r"^(1[0-9][0-9][Xx]|20[0-9][Xx])$"
patterns["ancientDecade"]=r"^([0-1]?[0-9][0-9][Xx]|20[0-9][Xx])$"
# year range with splitter "- / :", the splitter can be surrounded by an arbitrary amount of whitespaces
patterns["rangeYear"]=r"^\s*(1[0-9][0-9][0-9]|20[0-9][0-9])\s*(\-|\/|:)\s*(1[0-9][0-9][0-9]|20[0-9][0-9])\s*$"
patterns["rangeCentury"]=r"^\s*(1[0-9][Xx][Xx]|20[Xx][Xx])\s*(\-|\/|:)\s*(1[0-9][Xx][Xx]|20[Xx][Xx])\s*$"
patterns["rangeAncientYear"]=r"^\s*([0-1]?[0-9][0-9][0-9]|20[0-9][0-9])\s*(\-|\/|:)\s*(1[0-9][0-9][0-9]|20[0-9][0-9])\s*$"
patterns["rangeAncientCentury"]=r"^\s*([0-1]?[0-9][Xx][Xx]|20[Xx][Xx])\s*(\-|\/|:)\s*(1[0-9][Xx][Xx]|20[Xx][Xx])\s*$"
patterns["rangeYear2Digit"]=r"^\s*(1[0-9][0-9][0-9]|20[0-9][0-9])\s*(\-|\/|:)\s*([0-9][0-9])\s*$"
patterns["rangeDateVariant"]=r"^\s*([1-9]|0[1-9]|[12][0-9]|3[01])\D([1-9]|0[1-9]|1[012])\D(1[0-9][0-9][0-9]|20[0-9][0-9])\s*(\-|\/|:)\s*([1-9]|0[1-9]|[12][0-9]|3[01])\D([1-9]|0[1-9]|1[012])\D(1[0-9][0-9][0-9]|20[0-9][0-9])\s*$"


patterns["email"]=r"^[_]*([a-z0-9]+(\.|_*)?)+@([a-z][a-z0-9-]+(\.|-*\.))+[a-z]{2,6}$"
patterns["domain"]=r"^([a-z][a-z0-9-]+(\.|-*\.))+[a-z]{2,6}$"
patterns["url"]=r"^https?\:\/\/[a-zA-Z0-9.-]+\.[a-zA-Z]{2,3}\/?$"
patterns["ipv4"]=r"^(?:\d{1,3}\.){3}\d{1,3}$"
patterns["rgbHex"]=r"^#([a-fA-F0-9]{6}|[a-fA-F0-9]{3})$"
patterns["generalHex"]=r"^#[a-fA-F0-9]*$"

# in a raw string \x20 is resolved by the re module (not by Python) and still matches a single space
patterns["isbnPrefix"]=r"^ISBN(-1(?:(0)|3))?:?\x20(\s)*[0-9]+[- ][0-9]+[- ][0-9]+[- ][0-9]*[- ]*[xX0-9]$"
patterns["isbn"]=r"^[0-9]+[- ][0-9]+[- ][0-9]+[- ][0-9]*[- ]*[xX0-9]$"
patterns["NaN"]=r"^[Nn][Aa][Nn]$"

# quick check of one pattern against a sample value
rawText="923,0"

p=re.compile(patterns["positiveGermanFloat"])
m = p.search(rawText)
if m:
    firstAppearance=m.group()
    # a single parenthesised argument prints identically on Python 2 and 3
    print(firstAppearance)
else:
    print("No match.")

2012
923,0


In [395]:
#df.to_csv('krieg.csv',index=False,header=True,encoding='utf-8', sep="\t")

In [6]:
# keep only the rows that carry a value in the "date" column and preview the first of them
df2=df[df.date.notnull()]
df2.head()

Unnamed: 0,PPN,alternative,creator,dataProvider,date,description,format,identifier,isShownAt,issued,object,provider,publisher,relation,rights,source,spatial,subject,title,type
2,PPN839758545,Auff eine anmutige bekandte Melodey appliciret...,"Jordano, Alberto Läger, Joachimus",Staatsbibliothek zu Berlin - Preußischer Kultu...,1650,,image/jpeg,http://resolver.staatsbibliothek-berlin.de/SBB...,http://resolver.staatsbibliothek-berlin.de/SBB...,1650,http://digital.staatsbibliothek-berlin.de/euro...,Staatsbibliothek zu Berlin - Preußischer Kultu...,,,Open Access,"Jordano, Alberto Läger, Joachimus: Der XLII. ...",[S.l.],Einblattdrucke,Der XLII. Psalm Davids,Monograph
3,PPN828810257,Enthaltend verschiedene geheime Nachrichten vo...,"Maubert de Gouvest, Jean Henri",Staatsbibliothek zu Berlin - Preußischer Kultu...,1755,,image/jpeg,http://resolver.staatsbibliothek-berlin.de/SBB...,http://resolver.staatsbibliothek-berlin.de/SBB...,1755,http://digital.staatsbibliothek-berlin.de/euro...,Staatsbibliothek zu Berlin - Preußischer Kultu...,Haude und Spener,,Open Access,"Maubert de Gouvest, Jean Henri: Der erlauchte ...",Berlin,Historische Drucke,Der erlauchte Bauer oder Lebensgeschichte und ...,Monograph
5,PPN839755570,Als Er den 23 Decemb. Anno 1649 von seiner ......,"Wolcken, Johann Meier, Johannes",Staatsbibliothek zu Berlin - Preußischer Kultu...,1650,,image/jpeg,http://resolver.staatsbibliothek-berlin.de/SBB...,http://resolver.staatsbibliothek-berlin.de/SBB...,1650,http://digital.staatsbibliothek-berlin.de/euro...,Staatsbibliothek zu Berlin - Preußischer Kultu...,"Rebenlein, Jacob",,Open Access,"Wolcken, Johann Meier, Johannes: Scheid-Gedic...",Hamburg,Historische Drucke,Scheid-Gedicht/ über Den Bitter-Süssen Abzuge ...,Monograph
6,PPN839796323,So gehalten in Hamburg Anno 1645. den 27. Janua:,"Lithoecaeus, Georgius Röver, Johann Overbeck...",Staatsbibliothek zu Berlin - Preußischer Kultu...,1645,,image/jpeg,http://resolver.staatsbibliothek-berlin.de/SBB...,http://resolver.staatsbibliothek-berlin.de/SBB...,1645,http://digital.staatsbibliothek-berlin.de/euro...,Staatsbibliothek zu Berlin - Preußischer Kultu...,"Rebenlein, Jacob",,Open Access,"Lithoecaeus, Georgius Röver, Johann Overbeck...",Hamburg,Historische Drucke,Hochzeitliches Ehren-Gedicht Auff das Ehliche ...,Monograph
7,PPN840788037,,"Chesterfield, Philipp Dormer Stanhope",Staatsbibliothek zu Berlin - Preußischer Kultu...,1780,,image/jpeg,http://resolver.staatsbibliothek-berlin.de/SBB...,http://resolver.staatsbibliothek-berlin.de/SBB...,1780,http://digital.staatsbibliothek-berlin.de/euro...,Staatsbibliothek zu Berlin - Preußischer Kultu...,Weidmann & Reich,"Chesterfield, Philipp Dormer Stanhope: Vermisc...",Open Access,"Chesterfield, Philipp Dormer Stanhope: Aufsätz...",Leipzig,Historische Drucke,Aufsätze aus Wochenschriften und freundschaftl...,Volume


In [86]:
# Count, for every pattern, how many raw "date" values it matches and
# report the values that none of the patterns matches.

# compile each pattern once instead of once per row and pattern
compiledPatterns=dict((key,re.compile(patterns[key])) for key in patterns)

rowCount=0
histogram=dict()
# walking the two needed columns in parallel avoids building a Series per row as iterrows() does
for rowPPN,rawDate in zip(df["PPN"],df["date"]):
    rowCount=rowCount+1
    # missing dates are NaN and become the text "nan", which the "NaN" pattern picks up
    readDate=str(rawDate)
    matchedOnce=False
    for key in compiledPatterns:
        if compiledPatterns[key].search(readDate):
            histogram[key]=histogram.get(key,0)+1
            matchedOnce=True
    if not matchedOnce:
        print("No matches at all: "+str(rowPPN)+"\t for: "+str(readDate))
print("Row count: "+str(rowCount))
print(histogram)

No matches at all: PPN3303600473	 for: 16XX-1867
No matches at all: PPN670389129	 for: 1756 $ [ca. 1756]
No matches at all: PPN670201561	 for: 1739 $ [ca. 1739]
No matches at all: PPN666097402	 for: um 1100
Row count: 120000
{'rangeCentury': 3, 'rangeYear2Digit': 2, 'rangeAncientCentury': 3, 'century': 577, 'NaN': 55249, 'rangeYear': 4, 'positiveInteger': 64126, 'ancientDecade': 30, 'ancientCentury': 582, 'ancientYear': 64126, 'rangeAncientYear': 4, 'year': 64125, 'generalInteger': 64126, 'decade': 30}


In [95]:
class DataCleaner:
    """Normalises raw metadata values: publication years and place names."""

    # matches alphanumeric character and the underscore at the beginning of the string
    # Unicode flag is needed because of Asian character sets otherwise such signs would be considered as non-alphanumeric
    regEx_AlphaNum=re.compile(r"^\w",re.UNICODE)
    # checks for a leading [...] group; will match almost everything but Asian characters
    regEx_BracketText=re.compile(r"^\[[\w\?\.,\sßÄäÖöÜü]*\]",re.UNICODE)
    # checks for typical spellings of the "sine loco" abbreviation "s. l."
    # NOTE(review): the pattern is not anchored, so it also fires inside longer text
    # such as "Paris. Lyon" ("s. L") - confirm whether that is intended
    regEx_SineLoco=re.compile(r"[sSoO]\s?\.\s?[lLoO]\s?\.?\s?",re.UNICODE)

    def __init__(self):
        pass

    def cleanAncientYearStrict(self,readData):
        """Return the year in ``readData`` if it matches the "ancientYear" pattern.

        Floats (missing values are stored as NaN floats) are returned unchanged.
        Any other value is converted to text and matched against
        ``patterns["ancientYear"]`` from the module-level ``patterns`` dictionary;
        the matched text is returned, or NaN if there is no match.
        """
        if isinstance(readData,float):
            # this used to return `readDate`, a global left over from another cell (typo)
            return readData
        yearMatch=re.search(patterns["ancientYear"],str(readData))
        if yearMatch:
            return yearMatch.group()
        return np.nan

    def cleanSpatialText(self,readData):
        """Return a cleaned place name for ``readData``, or NaN if there is none.

        * floats (missing values) give NaN
        * text starting with an alphanumeric character is returned unchanged,
          unless it contains a "sine loco" marker, which gives NaN
        * other text must start with a [...] group: the group's content is
          returned without the brackets, or NaN if it is a "sine loco" marker
        * text that starts neither way gives NaN (this path used to fall through
          and return None implicitly)
        """
        # just in case we did not get a string, we use brute force and return NaN
        if isinstance(readData,float):
            return np.nan
        if self.regEx_AlphaNum.search(readData):
            # plain text; it may still be a "sine loco"
            if self.regEx_SineLoco.search(readData):
                return np.nan
            return readData
        # the string does not start with an alphanumeric character: expect a leading [...] group
        bracketMatch=self.regEx_BracketText.search(readData)
        if not bracketMatch:
            return np.nan
        matchedGroup=bracketMatch.group()
        if self.regEx_SineLoco.search(matchedGroup):
            return np.nan
        return matchedGroup.replace("[","").replace("]","")

1) wenn nicht alphanumerisch, dann alles zwischen dem ersten [] selektieren
2) prüfen, ob das != s.l. ist
3) ergebnis speichern
* [s. l.] = sine loco (Latin: ohne Ortsangabe), Groß- und Kleinschreibung variiert, mit oder ohne Leerzeichen; ebenso: o.O.
* "[S.l.]  Cölln an der Spree" 
* [Halle, Saale]  Hall
* [Frankfurt, Oder]  [Frankfurt, Oder]
* [Frankfurt, Oder?]
* [Antwerpen?]
* [Wittenberg]  Lipsiae  Lipsiae
* [Stendal]  Leipzig
* [Bando, Japan]
* [Berlin-Lichterfelde]
* [Köln]  Düsseldorf [u.a.]
* [S.l.]  [Berlin?]
* [Neuchâtel  Lausanne]
* À Paris [usw.]
* [London u.a.]
* [China]

In [65]:
# create the cleaner that provides the normalisation methods
dc=DataCleaner()

#for row in df.iterrows():
#    print dc.cleanSpatialText(str(row[1]["spatial"]))
    
# derive the cleaned place name column from the raw "spatial" column
df['spatialClean'] = df.spatial.apply(dc.cleanSpatialText)

In [96]:
# create a cleaner and derive the "dateClean" column from the raw "date" column
dc=DataCleaner()
df['dateClean'] = df.date.apply(dc.cleanAncientYearStrict)

In [99]:
# display the frame ordered by the raw "date" column; the sorted result is not stored
df.sort_values(by="date")

Unnamed: 0,PPN,alternative,creator,dataProvider,date,description,format,identifier,isShownAt,issued,object,provider,publisher,relation,rights,source,spatial,subject,title,type,spatialClean,dateClean
69058,PPN3308101424,,,Staatsbibliothek zu Berlin - Preußischer Kultu...,764,558000 ROA,image/jpeg,http://resolver.staatsbibliothek-berlin.de/SBB...,http://resolver.staatsbibliothek-berlin.de/SBB...,764,http://digital.staatsbibliothek-berlin.de/euro...,Staatsbibliothek zu Berlin - Preußischer Kultu...,,,Open Access,無垢浄光経自心印陀羅尼. 奈良 Berlin 0764,奈良,Historische Drucke,無垢浄光経自心印陀羅尼,Monograph,奈良,764
87065,PPN655634029,,"Astarābāḏī, Abū-ʽAlī Ibn-al-Ḥusain Ibn-Aḥmad",Staatsbibliothek zu Berlin - Preußischer Kultu...,1071,,image/jpeg,http://resolver.staatsbibliothek-berlin.de/SBB...,http://resolver.staatsbibliothek-berlin.de/SBB...,1071,http://digital.staatsbibliothek-berlin.de/euro...,Staatsbibliothek zu Berlin - Preußischer Kultu...,,,Open Access,"Astarābāḏī, Abū-ʽAlī Ibn-al-Ḥusain Ibn-Aḥmad: ...",,Orientalische Handschriften,Muḫtaṣar ġarīb al-ḥadīṯ,Monograph,,1071
90114,PPN627401597,,"Sunnī, Aḥmad Ibn-Muḥammad Ibn-Isḥāq",Staatsbibliothek zu Berlin - Preußischer Kultu...,1145,,image/jpeg,http://resolver.staatsbibliothek-berlin.de/SBB...,http://resolver.staatsbibliothek-berlin.de/SBB...,1145,http://digital.staatsbibliothek-berlin.de/euro...,Staatsbibliothek zu Berlin - Preußischer Kultu...,,,Open Access,"Sunnī, Aḥmad Ibn-Muḥammad Ibn-Isḥāq: Kitāb ʿam...",,Orientalische Handschriften,Kitāb ʿamal al-yaum wa-'l-laila,Monograph,,1145
88653,PPN645217638,Kommentar,,Staatsbibliothek zu Berlin - Preußischer Kultu...,1203,,image/jpeg,http://resolver.staatsbibliothek-berlin.de/SBB...,http://resolver.staatsbibliothek-berlin.de/SBB...,1203,http://digital.staatsbibliothek-berlin.de/euro...,Staatsbibliothek zu Berlin - Preußischer Kultu...,,,Open Access,al-Qurʿān Kommentar. 1203,,Orientalische Handschriften,al-Qurʿān,Monograph,,1203
108079,PPN662350499,,"Banū-Mūsā Ibn-Šākir,",Staatsbibliothek zu Berlin - Preußischer Kultu...,1210,,image/jpeg,http://resolver.staatsbibliothek-berlin.de/SBB...,http://resolver.staatsbibliothek-berlin.de/SBB...,1210,http://digital.staatsbibliothek-berlin.de/euro...,Staatsbibliothek zu Berlin - Preußischer Kultu...,,,Open Access,"Banū-Mūsā Ibn-Šākir,: Kitāb al-Ḥiyal. 1210",,Orientalische Handschriften,Kitāb al-Ḥiyal,Monograph,,1210
109152,PPN616194641,al-ǧuzʾ aṯ-ṯāliṯ,"Ḥarāšī, Sulaimān Ibn-ʿAbdallāh",Staatsbibliothek zu Berlin - Preußischer Kultu...,1214,,image/jpeg,http://digital.staatsbibliothek-berlin.de/SBB0...,http://digital.staatsbibliothek-berlin.de/SBB0...,1214,http://digital.staatsbibliothek-berlin.de/euro...,Staatsbibliothek zu Berlin - Preußischer Kultu...,,,Open Access,"Ḥarāšī, Sulaimān Ibn-ʿAbdallāh: at-Tafṣīl li-ǧ...",,Orientalische Handschriften,at-Tafṣīl li-ǧumal at-taḥṣīl,Monograph,,1214
81127,PPN635598744,,"Ibn-Razīn, Abū 'l-Ḥasan ʿAlī-",Staatsbibliothek zu Berlin - Preußischer Kultu...,1233,,image/jpeg,http://resolver.staatsbibliothek-berlin.de/SBB...,http://resolver.staatsbibliothek-berlin.de/SBB...,1233,http://digital.staatsbibliothek-berlin.de/euro...,Staatsbibliothek zu Berlin - Preußischer Kultu...,,,Open Access,"Ibn-Razīn, Abū 'l-Ḥasan ʿAlī-: Adāb al-mulūk. ...",,Orientalische Handschriften,Adāb al-mulūk,Monograph,,1233
109674,PPN670595624,,,Staatsbibliothek zu Berlin - Preußischer Kultu...,1233,,image/jpeg,http://resolver.staatsbibliothek-berlin.de/SBB...,http://resolver.staatsbibliothek-berlin.de/SBB...,1233,http://digital.staatsbibliothek-berlin.de/euro...,Staatsbibliothek zu Berlin - Preußischer Kultu...,,,Open Access,Torah mit großer Masora. Rouen 1233,Rouen,Orientalische Handschriften,Torah mit großer Masora,Monograph,Rouen,1233
94865,PPN66400010X,,,Staatsbibliothek zu Berlin - Preußischer Kultu...,1241,,image/jpeg,http://resolver.staatsbibliothek-berlin.de/SBB...,http://resolver.staatsbibliothek-berlin.de/SBB...,1241,http://digital.staatsbibliothek-berlin.de/euro...,Staatsbibliothek zu Berlin - Preußischer Kultu...,,,Open Access,"Evangeliarium nach der Ḥarḳlensis, zum Teil au...",Ṭûr-ʿAbdîn,Orientalische Handschriften,"Evangeliarium nach der Ḥarḳlensis, zum Teil au...",Monograph,Ṭûr-ʿAbdîn,1241
106126,PPN669225819,,ʿAbd-al-Karīm Ibn-ʿAbd-aṣ-Ṣamad aṭ-Ṭabarī Abū-...,Staatsbibliothek zu Berlin - Preußischer Kultu...,1252,,image/jpeg,http://resolver.staatsbibliothek-berlin.de/SBB...,http://resolver.staatsbibliothek-berlin.de/SBB...,1252,http://digital.staatsbibliothek-berlin.de/euro...,Staatsbibliothek zu Berlin - Preußischer Kultu...,,,Open Access,ʿAbd-al-Karīm Ibn-ʿAbd-aṣ-Ṣamad aṭ-Ṭabarī Abū-...,,Orientalische Handschriften,K. al- Ǧāmiʿ al-maʿrūf bi-Sūq al-ʿarūs,Monograph,,1252


# Ideen

* Timeline und Grafisches Aussehen, x-Achse: Zeit, y-Achse. Farbe? Brightness? Entropy? Abweichung vom Referenzbild (Distanz zum QBE)? https://www.slideshare.net/formalist/how-and-why-study-big-cultural-data-v2-15552598 #43
* 