In [299]:
import getpass
import gzip
import io
import math
import os
import urllib.request

import numpy as np
import pandas as pd
import pymysql.cursors
import rdflib
from rdflib import Namespace

import matplotlib.pyplot as plt
# Apply matplotlib's 'ggplot' style sheet to figures drawn in this notebook.
plt.style.use('ggplot')

%matplotlib inline

# Connect to the database
#
# The password is deliberately NOT stored in the notebook: it is read from the
# ADSQ_DB_PASSWORD environment variable and, when that is unset or empty,
# requested interactively so it never ends up in the saved .ipynb file.
# Rows are returned as dicts because of DictCursor.

connection = pymysql.connect(host='hosting.nyu.edu',
                             user='cmrougha_adsq',
                             password=(os.environ.get('ADSQ_DB_PASSWORD')
                                       or getpass.getpass('MySQL password for cmrougha_adsq: ')),
                             db='cmrougha_adsq2017',
                             charset='utf8mb4',
                             cursorclass=pymysql.cursors.DictCursor)

In [300]:
# Prefixes used when minting IRIs.
# NOTE(review): resourceKey and verbKey carry no "http://" scheme whereas
# classKey does. They are kept as-is because the test queries in this notebook
# declare the scheme-less prefix -- confirm before harmonising them.
resourceKey = 'www.astronomoumenos.com/id/'
verbKey = 'www.astronomoumenos.com/ontologies/astr.owl#'
classKey = "http://www.astronomoumenos.com/ontologies/astr.owl#"

def convertToRDF(g,df,convType):
    """Add one triple per usable row of a two-column DataFrame to graph ``g``.

    The first column supplies the subject identifier (appended to
    ``resourceKey``). The *name* of the second column supplies the predicate
    (``rdfs:label`` and ``rdf:type`` map to their W3C IRIs, anything else is
    appended to ``verbKey``) and the *values* of the second column supply the
    object. The name of the first column is ignored.

    Parameters
    ----------
    g : graph-like
        Object exposing ``add((s, p, o))``; it is mutated in place.
    df : pandas.DataFrame
        Frame whose first two columns are subject and object.
    convType : str
        How the object value is encoded:
        ``"resource-resource"`` -> URIRef under ``resourceKey``;
        ``"resource-string"``   -> string Literal;
        ``"resource-numeric"``  -> Literal of ``int(value)``;
        ``"resource-class"``    -> URIRef under ``classKey``.

    Returns
    -------
    The same graph ``g`` that was passed in.

    Raises
    ------
    ValueError
        If ``convType`` is not one of the four supported values.

    Notes
    -----
    Rows whose subject or object is missing (``None``/NaN, e.g. SQL NULL) or
    whose object is blank after stripping whitespace are skipped, so no
    ``.../id/None`` resources or ``"None"`` literals are created.
    """
    supported = ("resource-resource", "resource-string",
                 "resource-numeric", "resource-class")
    if convType not in supported:
        raise ValueError("unknown convType %r; expected one of %s"
                         % (convType, ", ".join(supported)))

    # Nothing to convert; also avoids indexing columns of an empty frame.
    if df.empty:
        return g

    cols = list(df)
    predicate_name = cols[1]

    # The predicate depends only on the column name, so build it once.
    # p might have different prefixes when using ontologies external to the project.
    if predicate_name == "rdfs:label":
        p = rdflib.URIRef("http://www.w3.org/2000/01/rdf-schema#label")
    elif predicate_name == "rdf:type":
        p = rdflib.URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")
    else:
        p = rdflib.URIRef(verbKey + predicate_name)

    # Iterate column-wise (by position) so values keep their own dtype instead
    # of being upcast to a common row dtype as iterrows() would do.
    for subject_value, object_value in zip(df.iloc[:, 0], df.iloc[:, 1]):
        # Skip SQL NULLs / NaN rather than stringifying them to 'None' / 'nan'.
        if pd.isna(subject_value) or pd.isna(object_value):
            continue

        object_text = str(object_value)
        if object_text.strip() == '':
            continue

        # s will always be a resource
        s = rdflib.URIRef(resourceKey + str(subject_value))

        # o can be a resource, a literal or an ontology class
        if convType == "resource-resource":
            o = rdflib.URIRef(resourceKey + object_text)
        elif convType == "resource-string":
            o = rdflib.Literal(object_text)
        elif convType == "resource-numeric":
            o = rdflib.Literal(int(object_value))
        else:  # "resource-class"
            o = rdflib.URIRef(classKey + object_text)

        g.add((s, p, o))

    return g


In [301]:
def getDF(query, params=None):
    """Run a SQL statement on the notebook's shared ``connection`` and return the rows as a DataFrame.

    Parameters
    ----------
    query : str
        SQL text to execute.
    params : tuple, list or dict, optional
        Values for placeholders in ``query``; forwarded to ``cursor.execute``
        so the driver performs the escaping. With the default ``None`` the
        query is executed exactly as written.

    Returns
    -------
    pandas.DataFrame
        One row per fetched record. Columns are named, in order, after the
        first element of each entry in ``cursor.description``; a statement
        that produces no result set yields an empty, column-less frame.
    """
    with connection.cursor() as cursor:
        cursor.execute(query, params)
        # description is None for statements that return no result set.
        names = [column[0] for column in (cursor.description or ())]
        rows = cursor.fetchall()
    return pd.DataFrame(rows, columns=names)

# Converting

Note: the updated code does not handle classes automatically -- each class must be specified manually as the second column of the SQL query (aliased `rdf:type`).

Other updates: claims about resources are no longer handled with blank nodes. Instead, each claim is itself a resource, named by appending \_[insert source here] to the identifier of the thing about which the claim is being made. E.g.: `wi7922730_pinakes`.

In [302]:
# Start from an empty rdflib graph bound to the notebook-level name `g`.
g = rdflib.Graph()
# Report the current number of statements in the graph.
print("graph has %s statements." % len(g))

graph has 0 statements.


In [303]:
# Manuscript locatedIn Institution
# Subject: "ms" + astrID_ms. Object: Depot with double quotes, parentheses,
# single quotes, periods, commas and spaces removed by the REPLACE chain.
# convertToRDF takes the predicate from the second column's alias (locatedIn);
# the first column's alias (contains) is not used by it.
sql = """
SELECT DISTINCT CONCAT("ms",astrID_ms) AS contains, REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(Depot,'"',''),')',''),'(',''),"'",""),'.',''),',',''),' ','') AS locatedIn 
FROM `17.4.12_pinAstr_all`
 """
df = getDF(sql)
# df.head()  # uncomment to preview the query result
# Both columns are resources, hence 'resource-resource'.
g = convertToRDF(g,df,'resource-resource')
print("graph has %s statements." % len(g))

graph has 4111 statements.


In [304]:
# Institution locatedIn City
# Subject: Depot, object: Ville -- each with quotes, parentheses, periods,
# commas and spaces removed by the same REPLACE chain.
# The predicate comes from the second column's alias (locatedIn).
sql = """
SELECT DISTINCT REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(Depot,'"',''),')',''),'(',''),"'",""),'.',''),',',''),' ','') AS contains, REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(Ville,'"',''),')',''),'(',''),"'",""),'.',''),',',''),' ','') AS locatedIn 
FROM `17.4.12_pinAstr_all`
"""
df = getDF(sql)
# df.head()  # uncomment to preview the query result
g = convertToRDF(g,df,'resource-resource')
print("graph has %s statements." % len(g))

graph has 4395 statements.


In [305]:
# City locatedIn Country
# Subject: Ville, object: Pays -- each with quotes, parentheses, periods,
# commas and spaces removed by the same REPLACE chain.
# The predicate comes from the second column's alias (locatedIn).
sql = """
SELECT DISTINCT REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(Ville,'"',''),')',''),'(',''),"'",""),'.',''),',',''),' ','') AS contains, REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(Pays,'"',''),')',''),'(',''),"'",""),'.',''),',',''),' ','') AS locatedIn 
FROM `17.4.12_pinAstr_all`
"""
df = getDF(sql)
# df.head()  # uncomment to preview the query result
g = convertToRDF(g,df,'resource-resource')
print("graph has %s statements." % len(g))

graph has 4575 statements.


In [306]:
# Manuscript rdf:type Manuscript
# The second column is the constant "Manuscript" aliased `rdf:type`; with
# 'resource-class' convertToRDF appends that value to the ontology namespace.
# The first column's alias is intentionally empty: only its values are used.
sql = """
SELECT DISTINCT CONCAT("ms",astrID_ms) AS ``, "Manuscript" AS `rdf:type` 
FROM `17.4.12_pinAstr_all`
 """
df = getDF(sql)
# df.head()  # uncomment to preview the query result
g = convertToRDF(g,df,'resource-class')
print("graph has %s statements." % len(g))

graph has 8686 statements.


In [307]:
# Institution rdf:type Institution
# Subject: Depot stripped by the REPLACE chain (same identifier form as in the
# locatedIn cells). Object: the constant class name "Institution".
sql = """
SELECT DISTINCT REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(Depot,'"',''),')',''),'(',''),"'",""),'.',''),',',''),' ','') AS ``, "Institution" AS `rdf:type` 
FROM `17.4.12_pinAstr_all`
 """
df = getDF(sql)
# df.head()  # uncomment to preview the query result
g = convertToRDF(g,df,'resource-class')
print("graph has %s statements." % len(g))

graph has 8944 statements.


In [308]:
# City rdf:type City
# Subject: Ville stripped by the REPLACE chain. Object: the constant class
# name "City".
sql = """
SELECT DISTINCT REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(Ville,'"',''),')',''),'(',''),"'",""),'.',''),',',''),' ','') AS ``, "City" AS `rdf:type` 
FROM `17.4.12_pinAstr_all`
 """
df = getDF(sql)
# df.head()  # uncomment to preview the query result
g = convertToRDF(g,df,'resource-class')
print("graph has %s statements." % len(g))

graph has 9127 statements.


In [309]:
# Country rdf:type Country
# Subject: Pays stripped by the REPLACE chain. Object: the constant class
# name "Country".
sql = """
SELECT DISTINCT REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(Pays,'"',''),')',''),'(',''),"'",""),'.',''),',',''),' ','') AS ``, "Country" AS `rdf:type` 
FROM `17.4.12_pinAstr_all`
 """
df = getDF(sql)
# df.head()  # uncomment to preview the query result
g = convertToRDF(g,df,'resource-class')
print("graph has %s statements." % len(g))

graph has 9159 statements.


In [310]:
# Manuscript rdfs:label Shelfmark
# The label is Ville, Depot and FondsCote joined with single spaces; it is
# stored as a string literal ('resource-string').
sql = """
SELECT DISTINCT CONCAT("ms",astrID_ms) AS ``, CONCAT(Ville," ",Depot," ",FondsCote) AS `rdfs:label`
FROM `17.4.12_pinAstr_all`
 """
df = getDF(sql)
# df.head()  # uncomment to preview the query result
g = convertToRDF(g,df,'resource-string')
print("graph has %s statements." % len(g))

graph has 13270 statements.


In [311]:
# Institution rdfs:label Name
# Subject: Depot stripped by the REPLACE chain; label: the unmodified Depot
# value, stored as a string literal.
sql = """
SELECT DISTINCT REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(Depot,'"',''),')',''),'(',''),"'",""),'.',''),',',''),' ','') AS ``, Depot AS `rdfs:label` 
FROM `17.4.12_pinAstr_all`
 """
df = getDF(sql)
# df.head()  # uncomment to preview the query result
g = convertToRDF(g,df,'resource-string')
print("graph has %s statements." % len(g))

graph has 13528 statements.


In [312]:
# City rdfs:label Name
# Subject: Ville stripped by the REPLACE chain; label: the unmodified Ville
# value, stored as a string literal.
sql = """
SELECT DISTINCT REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(Ville,'"',''),')',''),'(',''),"'",""),'.',''),',',''),' ','') AS ``, Ville AS `rdfs:label` 
FROM `17.4.12_pinAstr_all`
 """
df = getDF(sql)
# df.head()  # uncomment to preview the query result
g = convertToRDF(g,df,'resource-string')
print("graph has %s statements." % len(g))

graph has 13711 statements.


In [313]:
# Country rdfs:label Name
# Subject: Pays stripped by the REPLACE chain; label: the unmodified Pays
# value, stored as a string literal.
sql = """
SELECT DISTINCT REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(REPLACE(Pays,'"',''),')',''),'(',''),"'",""),'.',''),',',''),' ','') AS ``, Pays AS `rdfs:label` 
FROM `17.4.12_pinAstr_all`
 """
df = getDF(sql)
# df.head()  # uncomment to preview the query result
g = convertToRDF(g,df,'resource-string')
print("graph has %s statements." % len(g))

graph has 13741 statements.


In [314]:
# Manuscript msContains Witness
# Subject: "ms" + astrID_ms, object: "wi" + astrID_wi. The predicate comes
# from the second column's alias (msContains); the alias inMS is not used.
sql = """
SELECT DISTINCT CONCAT("ms",astrID_ms) AS inMS, CONCAT("wi",astrID_wi) AS msContains 
FROM `17.4.12_pinAstr_all` """
df = getDF(sql)
# df.head()  # uncomment to preview the query result
g = convertToRDF(g,df,'resource-resource')
print("graph has %s statements." % len(g))

graph has 49783 statements.


In [315]:
# Witness rdf:type Witness
# Subject: "wi" + astrID_wi. Object: the constant class name "Witness".
sql = """
SELECT DISTINCT CONCAT("wi",astrID_wi) AS ``, "Witness" AS `rdf:type` 
FROM `17.4.12_pinAstr_all`
 """
df = getDF(sql)
# df.head()  # uncomment to preview the query result
g = convertToRDF(g,df,'resource-class')
print("graph has %s statements." % len(g))

graph has 85825 statements.


In [316]:
# Witness witnessOf Work
# Subject: "wi" + astrID_wi, object: "wo" + astrID_wo. The predicate comes
# from the second column's alias (witnessOf); the alias hasWitness is not used.
sql = """
SELECT DISTINCT CONCAT("wi",astrID_wi) AS hasWitness, CONCAT("wo",astrID_wo) AS witnessOf
FROM `17.4.12_pinAstr_all`
"""
df = getDF(sql)
# df.head()  # uncomment to preview the query result
g = convertToRDF(g,df,'resource-resource')
print("graph has %s statements." % len(g))

graph has 121867 statements.


In [317]:
# Work rdf:type Work
# Subject: "wo" + astrID_wo. Object: the constant class name "Work".
sql = """
SELECT DISTINCT CONCAT("wo",astrID_wo) AS ``, "Work" AS `rdf:type` 
FROM `17.4.12_pinAstr_all`
 """
df = getDF(sql)
# df.head()  # uncomment to preview the query result
g = convertToRDF(g,df,'resource-class')
print("graph has %s statements." % len(g))

graph has 127881 statements.


In [318]:
# Work rdfs:label Title
# TODO: decide whether dc:title should be used here instead of rdfs:label.
# Subject: "wo" + astrID_wo; label: the Oeuvre column as a string literal.
sql = """
SELECT DISTINCT CONCAT("wo",astrID_wo) AS ``, Oeuvre AS `rdfs:label` 
FROM `17.4.12_pinAstr_all`
 """
df = getDF(sql)
# df.head()  # uncomment to preview the query result
g = convertToRDF(g,df,'resource-string')
print("graph has %s statements." % len(g))

graph has 133895 statements.


In [319]:
# Work hasContributor Contributor
# TODO: decide whether a Dublin Core property should be used here.
# NOTE(review): the original note suggested "dc:author"; Dublin Core appears to
# define dc:creator / dc:contributor rather than an "author" term -- confirm.
# Subject: "wo" + astrID_wo, object: "au" + astrID_au (both resources).
sql = """
SELECT DISTINCT CONCAT("wo",astrID_wo) AS ``, CONCAT("au",astrID_au) AS hasContributor
FROM `17.4.12_pinAstr_all`
 """
df = getDF(sql)
# df.head()  # uncomment to preview the query result
g = convertToRDF(g,df,'resource-resource')
print("graph has %s statements." % len(g))

graph has 139909 statements.


In [320]:
# Contributor rdfs:label Name
# Subject: "au" + astrID_au; label: the Auteur column as a string literal.
sql = """
SELECT DISTINCT CONCAT("au",astrID_au) AS ``, Auteur AS `rdfs:label`
FROM `17.4.12_pinAstr_all`
 """
df = getDF(sql)
# df.head()  # uncomment to preview the query result
g = convertToRDF(g,df,'resource-string')
print("graph has %s statements." % len(g))

graph has 141401 statements.


In [321]:
# Manuscript hasClaim Claim
# The claim resource is named by appending "_pinakes" to the manuscript
# identifier. The predicate comes from the second column's alias (hasClaim);
# the alias claimAbout is not used.
sql = """
SELECT DISTINCT CONCAT("ms",astrID_ms) AS claimAbout, CONCAT("ms",astrID_ms,"_pinakes") AS hasClaim
FROM `17.4.12_pinAstr_all`
 """
df = getDF(sql)
# df.head()  # uncomment to preview the query result
g = convertToRDF(g,df,'resource-resource')
print("graph has %s statements." % len(g))

graph has 145512 statements.


In [322]:
# Claim rdf:type Claim (manuscript-level claims)
# Subject: "ms" + astrID_ms + "_pinakes". Object: the constant class name "Claim".
sql = """
SELECT DISTINCT CONCAT("ms",astrID_ms,"_pinakes") AS ``, "Claim" as `rdf:type`
FROM `17.4.12_pinAstr_all`
 """
df = getDF(sql)
# df.head()  # uncomment to preview the query result
g = convertToRDF(g,df,'resource-class')
print("graph has %s statements." % len(g))

graph has 149623 statements.


In [323]:
# Claim claimedBy Source (manuscript-level claims)
# Every manuscript claim points at the single resource "pinakes". The
# predicate comes from the second column's alias (claimedBy); the alias
# claims is not used.
sql = """
SELECT DISTINCT CONCAT("ms",astrID_ms,"_pinakes") AS claims, "pinakes" as claimedBy
FROM `17.4.12_pinAstr_all`
 """
df = getDF(sql)
# df.head()  # uncomment to preview the query result
g = convertToRDF(g,df,'resource-resource')
print("graph has %s statements." % len(g))

graph has 153734 statements.


In [324]:
# Claim hasID ID (manuscript-level claims)
# The MS column is attached to the claim as a string literal under hasID.
# NOTE(review): presumably MS is the source's own manuscript identifier -- confirm.
sql = """
SELECT DISTINCT CONCAT("ms",astrID_ms,"_pinakes") AS ``, MS AS hasID
FROM `17.4.12_pinAstr_all`
 """
df = getDF(sql)
# df.head()  # uncomment to preview the query result
g = convertToRDF(g,df,'resource-string')
print("graph has %s statements." % len(g))

graph has 157845 statements.


In [325]:
# Witness hasClaim Claim
# The claim resource is named by appending "_pinakes" to the witness
# identifier. The predicate comes from the second column's alias (hasClaim);
# the alias claimAbout is not used.
sql = """
SELECT DISTINCT CONCAT("wi",astrID_wi) AS claimAbout, CONCAT("wi",astrID_wi,"_pinakes") AS hasClaim
FROM `17.4.12_pinAstr_all`
 """
df = getDF(sql)
# df.head()  # uncomment to preview the query result
g = convertToRDF(g,df,'resource-resource')
print("graph has %s statements." % len(g))

graph has 193887 statements.


In [326]:
# Claim rdf:type Claim (witness-level claims)
# Subject: "wi" + astrID_wi + "_pinakes". Object: the constant class name "Claim".
sql = """
SELECT DISTINCT CONCAT("wi",astrID_wi,"_pinakes") AS ``, "Claim" as `rdf:type`
FROM `17.4.12_pinAstr_all`
 """
df = getDF(sql)
# df.head()  # uncomment to preview the query result
g = convertToRDF(g,df,'resource-class')
print("graph has %s statements." % len(g))

graph has 229929 statements.


In [327]:
# Claim claimedBy Source (witness-level claims)
# Every witness claim points at the single resource "pinakes". The predicate
# comes from the second column's alias (claimedBy); the alias claims is not used.
sql = """
SELECT DISTINCT CONCAT("wi",astrID_wi,"_pinakes") AS claims, "pinakes" as claimedBy
FROM `17.4.12_pinAstr_all`
 """
df = getDF(sql)
# df.head()  # uncomment to preview the query result
g = convertToRDF(g,df,'resource-resource')
print("graph has %s statements." % len(g))

graph has 265971 statements.


In [328]:
# Claim pinakesFolios Folios
# The Folios column is attached to the witness claim as a string literal.
sql = """
SELECT DISTINCT CONCAT("wi",astrID_wi,"_pinakes") AS ``, Folios AS pinakesFolios
FROM `17.4.12_pinAstr_all`
 """
df = getDF(sql)
# df.head()  # uncomment to preview the query result
g = convertToRDF(g,df,'resource-string')
print("graph has %s statements." % len(g))

graph has 302712 statements.


In [329]:
# Claim pinakesSiecle Century
# The Siecle column is attached to the witness claim as a string literal
# ('resource-string'), not as a number.
sql = """
SELECT DISTINCT CONCAT("wi",astrID_wi,"_pinakes") AS ``, Siecle AS pinakesSiecle
FROM `17.4.12_pinAstr_all`
 """
df = getDF(sql)
# df.head()  # uncomment to preview the query result
g = convertToRDF(g,df,'resource-string')
print("graph has %s statements." % len(g))

graph has 336822 statements.


In [330]:
# Claim immediatelyFollowedBy Claim (Witness-Witness)
# Self-join: each row x is paired with the row y of the same manuscript
# (astrID_ms) whose Order is exactly x.Order + 1, so x's claim is the subject
# and y's claim the object. The predicate comes from the second column's alias
# (immediatelyFollowedBy); the alias immediatelyPrecededBy is not used.
# Unlike the other cells, this query has no DISTINCT.
sql = """
SELECT CONCAT("wi",x.astrID_wi,"_pinakes") AS immediatelyPrecededBy, CONCAT("wi",y.astrID_wi,"_pinakes") AS immediatelyFollowedBy
FROM `17.4.12_pinAstr_all` AS x
 INNER JOIN `17.4.12_pinAstr_all` AS y
 ON ( x.astrID_ms = y.astrID_ms ) AND ( y.Order = x.Order + 1 )
 """
df = getDF(sql)
# df.head()  # uncomment to preview the query result
g = convertToRDF(g,df,'resource-resource')
print("graph has %s statements." % len(g))

graph has 371278 statements.


In [331]:
# Write the accumulated graph to astr.ttl (relative to the working directory)
# in Turtle format.
g.serialize(destination='astr.ttl', format='turtle')

.

.

.

.

.

.

.

.

.

# Test Queries

In [161]:
# Sanity check: pull up to 20 randomly ordered (resource, label) pairs from the graph.
# The rdf: prefix previously ended in "#type", so rdf:type would have expanded
# to "...#typetype"; it now names the RDF namespace itself.
result = g.query(
        """
        PREFIX astr: <www.astronomoumenos.com/ontologies/astr.owl#>
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        SELECT * WHERE {
            ?1s rdfs:label ?3o .
            }
            ORDER BY RAND() LIMIT 20
            """)

# One row per solution; columns are the bound variables (1s = subject, 3o = label).
pd.DataFrame(result.bindings)

Unnamed: 0,1s,3o
0,www.astronomoumenos.com/id/Zittau,Zittau
1,www.astronomoumenos.com/id/wo3976291,In Adam
2,www.astronomoumenos.com/id/wo2953998,Allegoria Anagogica In Iliadem 4:1-4
3,www.astronomoumenos.com/id/wo3668550,De Statu Spiritali
4,www.astronomoumenos.com/id/ms9227276,Mutilene Monê tou Leimônos fonds principal 335
5,www.astronomoumenos.com/id/ms3418540,Wien Österreichische Nationalbibliothek (ÖNB) ...
6,www.astronomoumenos.com/id/ms7327659,Venezia Biblioteca Nazionale Marciana gr. V. 0...
7,www.astronomoumenos.com/id/ms5272119,Vaticano Biblioteca Apostolica Vaticana Vat. g...
8,www.astronomoumenos.com/id/au2510363,Polybius hagiographus episcopus
9,www.astronomoumenos.com/id/ms6774029,"Cambridge Corpus Christi College, Parker Libra..."
