In [43]:
# The %... is an iPython thing, and is not part of the Python language.
# In this case we're just telling the plotting library to draw things on
# the notebook, instead of on a separate window.
%matplotlib inline

# See all the "as ..." constructs? They're just aliasing the package names.
# That way we can call methods like plt.plot() instead of matplotlib.pyplot.plot().
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 40)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("notebook")

from bs4 import BeautifulSoup
from collections import OrderedDict # provides the ordered dictionary
import re # for regular expressions used below
import urllib # to read from URLs
import json
import networkx as nx # network analysis
from networkx.readwrite import json_graph
import itertools
import os.path
from datetime import datetime # for time measurement
import sys
import os
import pickle
import subprocess as subp
import gzip
import math
import codecs

from jellyfish import jaro_distance, jaro_winkler, hamming_distance, levenshtein_distance
import scipy.cluster.hierarchy as scipycluster
from sklearn.feature_extraction.text import TfidfVectorizer

from skimage import io, exposure
from scipy.spatial import distance
# import the k-means algorithm
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics import pairwise_distances_argmin,pairwise_distances_argmin_min, pairwise_distances
from sklearn.metrics.pairwise import euclidean_distances



# image handling
from PIL import Image

# geo stuff
from geopy.distance import vincenty
import geojson as gj

import csv

def printLog(text):
    """Print a log line of the form ``[<timestamp>]<TAB><text>`` to stdout.

    ``text`` has to be a string; it is joined with the string form of
    ``datetime.now()``. stdout is flushed right after printing so the message
    shows up immediately instead of waiting in the output buffer
    (see http://stackoverflow.com/questions/230751/how-to-flush-output-of-python-print).
    """
    timestamp = str(datetime.now())
    line = "".join(("[", timestamp, "]\t", text))
    print(line)
    sys.stdout.flush()
    
def pickleCompress(fileName,pickledObject):
    """Pickle ``pickledObject`` into the gzip-compressed file ``fileName``.

    Parameters:
        fileName: path of the file to write; it is opened in binary write mode.
        pickledObject: the object handed to ``pickle.dump``.

    Progress messages are written with printLog before and after the dump.
    Errors raised while opening the file or pickling propagate to the caller.
    """
    printLog("Pickling to '%s'" %fileName)
    # the context manager closes the file even if pickle.dump raises
    with gzip.open(fileName,'wb') as f:
        pickle.dump(pickledObject,f)
    printLog("Pickling done.")
    
def pickleDecompress(fileName):
    """Restore and return the object stored in the gzip-compressed pickle ``fileName``.

    Parameters:
        fileName: path of the file to read; it is opened in binary read mode.

    Returns:
        The object produced by ``pickle.load``.

    Progress messages are written with printLog before and after loading.
    Errors raised while opening the file or unpickling propagate to the caller.
    """
    printLog("Depickling from '%s'" %fileName)
    # SECURITY: unpickling can execute arbitrary code - only load files you created yourself.
    # the context manager closes the file even if pickle.load raises
    with gzip.open(fileName,'rb') as f:
        pickledObject = pickle.load(f)
    printLog("Depickling done.")
    return pickledObject

# Make sure the output directory for the serialized graphs exists.
# exist_ok=True makes the call idempotent, so no separate existence check
# (with its check-then-create gap) is needed.
os.makedirs("./analysis/", exist_ok=True)


In [44]:
def createGraph(data):
    """Build a directed graph of authors, publishers, publisher locations and titles.

    Parameters:
        data: mapping of record identifier (PPN) to a dict with the keys
            "author", "authorID", "publisher", "publisherLocation" and "title".
            Falsy values (None, empty string) mean "not available".

    Returns:
        A networkx DiGraph. The value of each field is used directly as the
        node key, and every node carries the attributes 'name' (the node key,
        helpful for D3.js visualizations) and 'type' ("author", "publisher",
        "publisherLocation" or "title"). Author nodes additionally get an
        'id' attribute if the record in which the author is first seen has
        an authorID.

        Edges per record: author -> title, publisher -> title,
        author -> publisher and publisher -> publisherLocation, each only if
        both ends are present.

    Because node keys are the plain field values, identical strings in
    different roles end up as one node; its 'type' is set by whichever role
    wrote the attributes last.
    """
    # create an empty graph from the nx (networkx) package imported above
    G=nx.DiGraph()

    # sets give constant-time membership tests (the former lists made the
    # loop quadratic in the number of distinct names)
    seenAuthors=set()
    seenLocations=set()
    seenPublishers=set()

    for ppn in data:
        record=data[ppn]
        author=record["author"]
        authorID=record["authorID"]
        publisher=record["publisher"]
        publisherLocation=record["publisherLocation"]
        title=record["title"]

        # Attributes are passed to add_node() instead of writing to G.node[...],
        # which is no longer available in newer networkx releases.
        if author and author not in seenAuthors:
            seenAuthors.add(author)
            authorAttributes={"name":author,"type":"author"}
            if authorID:
                # NOTE(review): 'id' may collide with the key some export
                # formats reserve for the node identifier - verify that the
                # authorID survives serialization.
                authorAttributes["id"]=authorID
            G.add_node(author,**authorAttributes)

        if publisher and publisher not in seenPublishers:
            seenPublishers.add(publisher)
            G.add_node(publisher,name=publisher,type="publisher")

        if publisherLocation and publisherLocation not in seenLocations:
            seenLocations.add(publisherLocation)
            G.add_node(publisherLocation,name=publisherLocation,type="publisherLocation")

        # titles are not de-duplicated: the attributes are (re)written for every record
        if title:
            G.add_node(title,name=title,type="title")

        if author and title:
            G.add_edge(author,title)
        if publisher and title:
            G.add_edge(publisher,title)

        if author and publisher:
            G.add_edge(author,publisher)
        if publisher and publisherLocation:
            G.add_edge(publisher,publisherLocation)

    return G

In [45]:
def createGraph_Author_Publisher_Location(data):
    """Build a directed graph of authors, publishers and publisher locations (no titles).

    Parameters:
        data: mapping of record identifier (PPN) to a dict with at least the
            keys "author", "authorID", "publisher" and "publisherLocation".
            Falsy values (None, empty string) mean "not available".

    Returns:
        A networkx DiGraph. The value of each field is used directly as the
        node key, and every node carries the attributes 'name' (the node key,
        helpful for D3.js visualizations) and 'type' ("author", "publisher"
        or "publisherLocation"). Author nodes additionally get an 'id'
        attribute if the record in which the author is first seen has an
        authorID.

        Edges per record: author -> publisher and
        publisher -> publisherLocation, each only if both ends are present.

    Because node keys are the plain field values, identical strings in
    different roles end up as one node; its 'type' is set by whichever role
    wrote the attributes last.
    """
    # create an empty graph from the nx (networkx) package imported above
    G=nx.DiGraph()

    # sets give constant-time membership tests (the former lists made the
    # loop quadratic in the number of distinct names)
    seenAuthors=set()
    seenLocations=set()
    seenPublishers=set()

    for ppn in data:
        record=data[ppn]
        author=record["author"]
        authorID=record["authorID"]
        publisher=record["publisher"]
        publisherLocation=record["publisherLocation"]

        # Attributes are passed to add_node() instead of writing to G.node[...],
        # which is no longer available in newer networkx releases.
        if author and author not in seenAuthors:
            seenAuthors.add(author)
            authorAttributes={"name":author,"type":"author"}
            if authorID:
                # NOTE(review): 'id' may collide with the key some export
                # formats reserve for the node identifier - verify that the
                # authorID survives serialization.
                authorAttributes["id"]=authorID
            G.add_node(author,**authorAttributes)

        if publisher and publisher not in seenPublishers:
            seenPublishers.add(publisher)
            G.add_node(publisher,name=publisher,type="publisher")

        if publisherLocation and publisherLocation not in seenLocations:
            seenLocations.add(publisherLocation)
            G.add_node(publisherLocation,name=publisherLocation,type="publisherLocation")

        if author and publisher:
            G.add_edge(author,publisher)
        if publisher and publisherLocation:
            G.add_edge(publisher,publisherLocation)

    return G

The node-link JSON output written below is compatible with d3.js; see https://networkx.github.io/documentation/stable/reference/readwrite/json_graph.html

In [None]:
# NOTE(review): absolute, machine-specific path - adjust it (or make it configurable)
# before running the notebook on another machine.
baseDir="/Users/david/src/__datasets/cbs_analysis/"
# the following catalog files are sorted by file size
files=["fry_out.txt","ice_out.txt","por_out.txt","nor_out.txt","dan_out.txt","swe_out.txt","spa_out.txt","dut_out.txt","ita_out.txt","lat_out.txt","fre_out.txt","eng_out.txt","ger_out.txt"]

# cumulative dictionary of all records read so far (PPN -> record values);
# the graphs themselves are built per language, see the loop below
records=dict()

# one entry per created graph in each list; turned into a DataFrame in a later cell
dataFrameDict={"language":[],"graph_type":[],"nodes":[],"edges":[],"creation_duration":[],"records":[]}

# allow arbitrarily long fields in the catalog files
csv.field_size_limit(sys.maxsize)


def _readCatalogRecords(path):
    """Read one tab-separated catalog file and return a dict mapping PPN -> record values.

    A record is a run of non-empty rows terminated by an empty separator line.
    The first column of the first row of a record is taken as its PPN, the
    second column is the field tag and the third column the field value.
    """
    catalogRecords=dict()
    ppn=None
    with open(path, newline='') as csvfile:
        for row in csv.reader(csvfile, delimiter='\t'):
            # in case of an empty line, prepare for a new record
            if not row:
                ppn=None
                continue
            # if we haven't seen a PPN before, it is a new record we have to deal with
            if not ppn:
                ppn=row[0]
                # an empty dict for the record values
                catalogRecords[ppn]={"title":None,"author":None,"authorID":None,"publisher":None,"publisherLocation":None}
            # rows without a tag and a value column carry nothing we could store
            if len(row)<3:
                continue
            fieldTag=row[1]
            fieldValue=row[2]
            record=catalogRecords[ppn]
            # title
            if fieldTag=="021A":
                record["title"]=fieldValue
            # publisher, optionally followed by "@" and the location
            if fieldTag=="033A":
                tokens=fieldValue.split("@")
                if len(tokens)>=2:
                    record["publisher"]=tokens[0]
                    record["publisherLocation"]=tokens[1].replace("[","").replace("]","")
                else:
                    record["publisher"]=fieldValue
            # author, optionally followed by "@" and the author ID
            if fieldTag=="028A":
                tokens=fieldValue.split("@")
                if len(tokens)>=2:
                    record["author"]=tokens[0]
                    record["authorID"]=tokens[1]
                else:
                    record["author"]=fieldValue
    return catalogRecords


def _buildAndSerializeGraph(graphBuilder,graphType,language,languageRecords,statistics):
    """Create one graph, append its key figures to ``statistics`` and write it to ./analysis/.

    ``graphBuilder`` is called with ``languageRecords`` and has to return the graph.
    The graph is written as GML, node-link JSON and GraphML to files named
    "analysis/<language>_<graphType>.<extension>". Returns the created graph.
    """
    statistics["language"].append(language)
    statistics["graph_type"].append(graphType)
    printLog("Processing catalog language: %s"%language)
    printLog("Creating graph...")
    startTime = datetime.now()
    graph=graphBuilder(languageRecords)
    endTime = datetime.now()
    statistics["nodes"].append(len(graph.nodes()))
    statistics["edges"].append(len(graph.edges()))
    statistics["records"].append(len(languageRecords))
    # append (not assign), so that the column stays a list with one duration per graph
    statistics["creation_duration"].append(endTime-startTime)

    outputStem="analysis/"+language+"_"+graphType
    nx.write_gml(graph,outputStem+".gml")
    jsonData = json_graph.node_link_data(graph, {'link': 'edges', 'source': 'from', 'target': 'to'})
    with open(outputStem+".json", "w") as write_file:
        # no `default=` argument: json.dump expects a callable there, not a dict
        json.dump(jsonData, write_file)
    nx.write_graphml_lxml(graph, outputStem+".graphml")
    return graph


for file in files:
    currentLanguage=file.split("_")[0]
    # Each language gets graphs built from its own records only; previously the
    # shared dict made every graph also contain all languages processed before it.
    languageRecords=_readCatalogRecords(baseDir+file)
    records.update(languageRecords)
    noRecords=len(languageRecords)

    # * * * * * * * * * * * * * * *
    # process the found records
    # * * * * * * * * * * * * * * *
    returnedGraph=_buildAndSerializeGraph(createGraph,"author_publisher_location_title",currentLanguage,languageRecords,dataFrameDict)
    returnedGraph=_buildAndSerializeGraph(createGraph_Author_Publisher_Location,"author_publisher_location",currentLanguage,languageRecords,dataFrameDict)

printLog("\nDone.")

[2019-03-04 17:23:55.834287]	Processing catalog language: fry
[2019-03-04 17:23:55.834911]	Creating graph...
[2019-03-04 17:23:55.894831]	Processing catalog language: fry
[2019-03-04 17:23:55.895478]	Creating graph...
[2019-03-04 17:23:55.924303]	Processing catalog language: ice
[2019-03-04 17:23:55.924814]	Creating graph...
[2019-03-04 17:23:56.071901]	Processing catalog language: ice
[2019-03-04 17:23:56.072512]	Creating graph...
[2019-03-04 17:23:56.187140]	Processing catalog language: por
[2019-03-04 17:23:56.187641]	Creating graph...
[2019-03-04 17:23:57.352159]	Processing catalog language: por
[2019-03-04 17:23:57.352663]	Creating graph...
[2019-03-04 17:23:57.971977]	Processing catalog language: nor
[2019-03-04 17:23:57.972766]	Creating graph...
[2019-03-04 17:24:00.885687]	Processing catalog language: nor
[2019-03-04 17:24:00.886498]	Creating graph...
[2019-03-04 17:24:02.523848]	Processing catalog language: dan
[2019-03-04 17:24:02.524672]	Creating graph...
[2019-03-04 17:24:0

In [None]:
# Turn the statistics collected in dataFrameDict into a table with one column per key.
df = pd.DataFrame(dataFrameDict)
# A bare expression on the last line of a cell makes the notebook render the frame.
df

Timing for the German-language graph (deutschsprachiger Graph):
[2019-03-03 21:10:04.826571]	Creating graph...
[2019-03-04 00:49:56.174232]	Done.

https://networkx.github.io/documentation/stable/reference/algorithms/approximation.html

In [11]:
# Degree-based centrality measures for the graph currently bound to returnedGraph.
# NOTE(review): returnedGraph is whatever graph was assigned last in an earlier
# cell - confirm which graph is meant before interpreting the numbers.
from networkx.algorithms import centrality
printLog("Computing graph properties...")
# degree centrality of the graph's nodes (presumably a dict keyed by node - see the networkx docs)
degreeCentralities=centrality.degree_centrality(returnedGraph)
# in-degree centrality of the graph's nodes
inDegrees=centrality.in_degree_centrality(returnedGraph)
printLog("Done.")

[2019-03-04 15:12:35.226562]	Computing graph properties...
[2019-03-04 15:12:35.238867]	Done.


In [7]:
# The max clique approximation is very slow on this graph, so the call below is
# commented out; as it stands, the cell only imports the module and logs two messages.
from networkx.algorithms import approximation
printLog("Computing graph properties...")
#s=approximation.max_clique(returnedGraph)
printLog("Done.")

[2019-03-04 15:09:25.855886]	Computing graph properties...
[2019-03-04 15:09:25.856747]	Done.
