In [1]:
import pandas as pd

In [2]:
# Load every input table used by this notebook.

# The DAS dataset: only its 'ship' sheet is needed. The workbook is opened in a
# `with` block so the underlying file handle is released right after parsing
# (the name `das` stays bound, but refers to a closed workbook afterwards).
with pd.ExcelFile('das.xlsx') as das:
    dasships = das.parse('ship')

# The results of matching BGB shipvoyages to DAS ship IDs (see separate script 'BGB-voyages to DAS ship IDs')
with pd.ExcelFile('Matching_results.xlsx') as daslinks:
    daslinkset = daslinks.parse('Matches')

# NOTE(review): unpickling can execute arbitrary code; the two pickles below
# should only ever be files produced by this project's own scripts.

# The results of matching GZM musterings to DAS ship IDs (see separate script 'Remaining BGB and GZM ships')
zeemons = pd.read_pickle('zeemons.pkl')

# The preliminary linkset of BGB and GZM ships that were not matched to a DAS ship (see separate script 'Remaining BGB and GZM ships')
prel_linkset = pd.read_pickle('prel_linkset.pkl')

In [15]:
# Inspect the parsed 'ship' sheet of the DAS workbook (rendered as the cell output)
dasships

Unnamed: 0,shipID,voyTonnageMin,voyTonnageMax,voyTypeOfShipID,voyBuilt,voyBuiltRemark,voyBuiltY,voyYardYardID
0,DAS_ship0001,600.0,,,1724,,1724.0,DAS_yard006
1,DAS_ship0002,880.0,,,1762,,1762.0,DAS_yard001
2,DAS_ship0003,,,DAS_type031,,,,DAS_yard001
3,DAS_ship0004,600.0,,,1722,,1722.0,DAS_yard001
4,DAS_ship0005,300.0,,,,,,DAS_yard001
...,...,...,...,...,...,...,...,...
1850,DAS_ship1852,90.0,,DAS_type031,1690,built in Indies,1690.0,
1851,DAS_ship1853,,,,,hired,,
1852,DAS_ship1854,850.0,,,1745,bought (Originally the TOEVALLIGHEID was a Fr...,,
1853,DAS_ship1855,525.0,,,1735,built in Indies,,


In [3]:
# Collect, for every DAS ship, the BGB shipvoyages and GZM ships matched to it.
# Result: `allmatches`, indexed by DAS ship ID, with list-valued columns 'BGB' and 'GZM'.

# Index the BGB matches by DAS ship ID in a single pass over the match table
# (instead of re-filtering the whole table once per DAS ship).
# Each entry is "<BGB shipvoyage ID> (<rule>)", where the rule is the one the
# match was based on (see original matching script).
bgb_by_das = {}
for linked_das_id, voyage_id, rule in zip(daslinkset['DAS shipname ID'],
                                          daslinkset['BGB Shipvoyage ID'],
                                          daslinkset['Matched based on rule']):
    bgb_by_das.setdefault(linked_das_id, []).append(str(voyage_id) + " (" + str(rule) + ")")

# Do the same for the GZM (look up matches per DAS ship ID)
gzm_by_das = {}
for linked_das_id, gzm_id in zip(zeemons['DAS SHIP ID'], zeemons['GZM SHIP ID']):
    gzm_by_das.setdefault(linked_das_id, []).append(gzm_id)

# One entry per DAS ship; ships without any match keep empty lists
matching_dict = {}
for das_id in dasships['shipID']:
    matching_dict[das_id] = {
        'BGB': list(bgb_by_das.get(das_id, [])),
        'GZM': list(gzm_by_das.get(das_id, [])),
    }

# Store results in dataframe
allmatches = pd.DataFrame.from_dict(matching_dict, orient='index')

In [4]:
# Re-key the DAS matches by GLOB ship ID.
# The GLOB ship ID is the integer formed by the last four characters of the DAS ship ID.
complete_linkset = {}

for das_ship_id, match_row in allmatches.iterrows():
    glob_id = int(str(das_ship_id)[-4:])

    # Copy both link lists so the new structure does not share them with `allmatches`
    complete_linkset[glob_id] = {
        'GZM': list(match_row['GZM']),
        'BGB': list(match_row['BGB']),
    }

# Store results in a dataframe (index: GLOB ship ID; columns: 'GZM', 'BGB')
newmatches = pd.DataFrame.from_dict(complete_linkset, orient='index')

In [60]:
# Combine the DAS-based links with the preliminary linkset from a previous script
# (see separate script 'Remaining BGB and GZM ships'). That preliminary set left out
# the ships with a DAS match, which are the ones collected in `newmatches`.

# One record per GLOB ship: its ID plus the GZM and BGB links, each flattened to a ';'-separated string
linklist = [
    [glob_ship, ';'.join(map(str, link_row['GZM'])), ';'.join(map(str, link_row['BGB']))]
    for glob_ship, link_row in newmatches.iterrows()
]

# Turn the records into a dataframe
das_linkset = pd.DataFrame.from_records(
    linklist,
    columns=['GLOB SHIP ID', 'GZM SHIP IDS', 'BGB SHIPVOYAGES'],
)

# Final linkset: DAS-based rows first, followed by the preliminary rows, renumbered from 0
linkset = pd.concat([das_linkset, prel_linkset], ignore_index=True)

In [63]:
# Add links to DAS IDs
# A GLOB ship only receives a DAS ID when that ID really occurs in the DAS ship table.
# This replaces a hard-coded numeric cut-off on the GLOB ship ID, which could hand a
# non-existent DAS ID to a ship coming from the preliminary linkset.
known_das_ids = set(dasships['shipID'])

das_id_column = []
for raw_glob_id in linkset['GLOB SHIP ID']:
    # DAS ship IDs look like 'DAS_ship0001': the GLOB ship ID zero-padded to four digits
    candidate = 'DAS_ship' + str(int(raw_glob_id)).zfill(4)
    das_id_column.append(candidate if candidate in known_das_ids else None)

# Rows without a DAS ship keep None, so later truthiness checks on 'DAS ID' skip them
linkset['DAS ID'] = das_id_column
    

In [65]:
# Inspect the merged linkset, including the 'DAS ID' column (rendered as the cell output)
linkset

Unnamed: 0,GLOB SHIP ID,GZM SHIP IDS,BGB SHIPVOYAGES,DAS ID
0,1,2903;2996;3344;3381;3457;3863;3881;4024,7574 (2);7754 (2);7770 (2);7804 (2);7881 (2);7...,DAS_ship0001
1,2,4662,3699 (1);3943 (1);4024 (2);4225 (2);4273 (1);4...,DAS_ship0002
2,3,,,DAS_ship0003
3,4,2704;2952,13772 (1);14137 (2);14520 (2);15347 (1);15446 ...,DAS_ship0004
4,5,,,DAS_ship0005
...,...,...,...,...
2851,2853,,18537;18543,
2852,2854,1302,,
2853,2855,3403,,
2854,2856,419;496;887;1034;1139;1279,,


In [7]:
# Import necessary modules for serializing to RDF linkset
import rdflib
from rdflib import Graph, Namespace, RDF, RDFS, URIRef, Literal, BNode, OWL, XSD, FOAF, PROV
from rdflib.resource import Resource

In [74]:
# Namespaces used when minting URIs for the linkset

# GLOB ships: the target of every owl:sameAs link
SHIP = Namespace("https://data.globalise.huygens.knaw.nl/id/ship/")

# Ships as identified in each of the source datasets
DAS_SHIP = Namespace("https://data.globalise.huygens.knaw.nl/id/das/ship/")
GZM_SHIP = Namespace("https://data.globalise.huygens.knaw.nl/id/gzm/ship/")
BGB_SHIP = Namespace("https://data.globalise.huygens.knaw.nl/id/bgb/ship/")

# Base namespace for linkset (graph) identifiers
LINKSETS = Namespace("https://data.globalise.huygens.knaw.nl/id/linksets/")

In [76]:
# Build the RDF linkset: every source ship (DAS, GZM, BGB) is asserted owl:sameAs its GLOB ship.
# GZM and BGB links additionally get a reified rdf:Statement carrying prov:wasDerivedFrom,
# pointing at the notebook (pinned to one commit) that produced the link.

PROVENANCE_BASE = "https://github.com/globalise-huygens/enrich_from_das/blob/4c34d29560ab644c1aa1a029ca640446aff92dcc/"
GZM_SOURCE = PROVENANCE_BASE + "Remaining%20BGB%20and%20GZM%20ships.ipynb"
BGB_SOURCE = PROVENANCE_BASE + "BGB-voyages%20to%20DAS%20ship%20IDs.ipynb"

# Anchor appended to BGB_SOURCE per matching rule; any other rule (or none) gets no anchor
BGB_RULE_ANCHORS = {'1': "#rule_1", '2': "#rule_2"}


def _split_ids(cell):
    """Return the non-empty ';'-separated entries of a linkset cell ([] for a missing value)."""
    if pd.isna(cell):
        return []
    return [entry for entry in str(cell).split(';') if entry]


def _add_same_as_link(graph, source_uri, glob_uri, derived_from):
    """Assert `source_uri owl:sameAs glob_uri` and reify it with its provenance.

    graph        -- the rdflib graph to add the triples to
    source_uri   -- URIRef of the ship in the source dataset
    glob_uri     -- URIRef of the GLOB ship
    derived_from -- URL (string) of the script the assertion comes from
    """
    graph.add((source_uri, OWL.sameAs, glob_uri))

    # A fresh blank node per assertion, so provenance is never attached to
    # a statement that belongs to another link
    statement = BNode()
    graph.add((statement, RDF.type, RDF.Statement))
    graph.add((statement, RDF.subject, source_uri))
    graph.add((statement, RDF.predicate, OWL.sameAs))
    graph.add((statement, RDF.object, glob_uri))
    graph.add((statement, PROV.wasDerivedFrom, URIRef(derived_from)))


# Create an empty graph
g = Graph(identifier=LINKSETS.enrich_from_das + "/")

# Loop over every ship in our linkset
for glob_ship_id, gzm_ids, bgb_ids, das_id in zip(linkset['GLOB SHIP ID'],
                                                  linkset['GZM SHIP IDS'],
                                                  linkset['BGB SHIPVOYAGES'],
                                                  linkset['DAS ID']):
    # Create GLOB ID resource
    glob_ship_uri = URIRef(SHIP + str(glob_ship_id))

    # If the ship has a DAS ID, add it to the graph (None, NaN and '' all count as "no DAS ID")
    if pd.notna(das_id) and das_id:
        g.add((URIRef(DAS_SHIP + str(das_id)), OWL.sameAs, glob_ship_uri))

    # Add the links to GZM ships; provenance is only written for links that actually exist
    for gzm_id in _split_ids(gzm_ids):
        _add_same_as_link(g, URIRef(GZM_SHIP + gzm_id), glob_ship_uri, GZM_SOURCE)

    # Add the links to BGB ships
    for bgb_entry in _split_ids(bgb_ids):
        # An entry looks like '7574 (2)' when it records the rule the match was based on
        # (see previous script), or just '18537' when it does not
        bgb_id, _sep, rule = bgb_entry.partition(' (')
        rule = rule.rstrip(')')

        # Exactly one provenance URI per link: rule-specific when the rule is known, generic otherwise
        _add_same_as_link(g, URIRef(BGB_SHIP + bgb_id), glob_ship_uri,
                          BGB_SOURCE + BGB_RULE_ANCHORS.get(rule, ''))



In [79]:
# Register readable prefixes for the ship namespaces, then write the graph out as TriG
for prefix, namespace in (
    ("ship", SHIP),
    ("gzm_ship", GZM_SHIP),
    ("bgb_ship", BGB_SHIP),
    ("das_ship", DAS_SHIP),
):
    g.bind(prefix, namespace)

g.serialize(destination="enrich_from_das.trig", format="trig")

<Graph identifier=https://data.globalise.huygens.knaw.nl/id/linksets/enrich_from_das/ (<class 'rdflib.graph.Graph'>)>