# RDF-   NLS - 

This notebook is going to create the RDF triples to generate our RDLIB GRAPH

Before we need to run **Metadata_Pages_NLS.ipynb** to get the **collection dataframe**. 

The collection dataframes will have this shape:


- MMSID                                                   9937033633804341
- serieTitle                            Chapbooks printed in Scotland
- editor                                                       Milne, John
- editor_date                                                    1792-1871
- genre                              Chapbooks-Scotland-Aberdeen-1801-1900
- language                                                             eng
- metsXML                                               104184105-mets.xml
- termsOfAddress                                                      None
- numberOfPages                                                          8
- numberOfWords                                                         53
- permanentURL                            https://digital.nls.uk/104184105
- physicalDescription                                        8 p. ; 18 cm.
- place                                                           Aberdeen
- publisher                             Printed by A. Imlay, 22, Long Acre
- referencedBy                                                        None
- shelfLocator                                               L.C.2786.A(1)
- altoXML                                  104184105/alto/107134030.34.xml
- serieSubTitle                             to the tune of Johnny Cop
- text                   A SONG JRAISB OP THE ^ HIGHLAND LADS. To the T...
- pageNum                                                            Page1
- volumeTitle                          song in praise of the highland lads
- volumeId                                                       104184105
- year                                                                1826
- collectionNum                                                          0
- part                                                                   0
- publisherPersons                                                      []
- numberOfVolumes                                                     3080
- volumeNum                                                             1

### Loading the necessary libraries

In [1]:
import yaml
import numpy as np
import collections
import string
import copy
from datetime import datetime
import re

In [2]:
import pandas as pd
from yaml import safe_load
from pandas.io.json import json_normalize
from difflib import SequenceMatcher

In [3]:
NON_AZ_REGEXP = re.compile('[^a-zA-Z]')
NON_AZ_19_REGEXP = re.compile('[^a-z0-9]')

### Functions

In [4]:
def serie2rdf(data, g, nls):

    serie = URIRef("https://w3id.org/nls/i/Serie/"+str(data["MMSID"]))
    serie_title= data["serieTitle"]+"," +str(data["year"])
    g.add((serie, RDF.type, nls.Serie))
    g.add((serie, nls.number, Literal(data["serieNum"], datatype=XSD.integer)))
    g.add((serie, nls.title, Literal(serie_title, datatype=XSD.string)))
    g.add((serie, nls.subtitle, Literal(data["serieSubTitle"], datatype=XSD.string)))
    g.add((serie, nls.collection, Literal(data["collectionName"], datatype=XSD.string)))
    g.add((serie, nls.publicationYear, Literal(data["year"], datatype=XSD.integer)))
    g.add((serie, nls.printedAt, Literal(data["place"], datatype=XSD.string)))
    g.add((serie, nls.mmsid, Literal(str(data["MMSID"]), datatype=XSD.string)))
    g.add((serie, nls.physicalDescription, Literal(data["physicalDescription"], datatype=XSD.string)))
    g.add((serie, nls.genre, Literal(data["genre"], datatype=XSD.string)))
    g.add((serie, nls.language, Literal(data["language"], datatype=XSD.string)))
    g.add((serie, nls.shelfLocator, Literal(data["shelfLocator"], datatype=XSD.string)))
    g.add((serie, nls.numberOfVolumes, Literal(data["numberOfVolumes"], datatype=XSD.integer)))

    #### Editor 
    

    if data["editor"]!=0:
        name=data["editor"].replace(" ", "")
        name=name.replace("]", "")
        name=name.replace(",", "")
        name=name.replace(".", "")
        name=name.replace(":-", "")
        name=re.sub(NON_AZ_REGEXP, '', name)
        
        
        editor = URIRef("https://w3id.org/nls/i/Person/"+str(name))
        g.add((editor, RDF.type, nls.Person))
        g.add((editor, nls.name, Literal(data["editor"], datatype=XSD.string)))

        try:
            if data["editor_date"]!=0:
                tmpDate=data["editor_date"].split("-")
                
                if "?" in tmpDate[0]:
                    tmpDate[0]=tmpDate[0].replace("?", "")
                if "?" in tmpDate[1]:
                    tmpDate[1]=tmpDate[1].replace("?", "")
                
                birthDate=datetime.strptime(tmpDate[0], '%Y')
                deathDate=datetime.strptime(tmpDate[1], '%Y')
                g.add((editor, nls.birthDate, Literal(birthDate, datatype=XSD.dateTime)))
                g.add((editor, nls.deathDate, Literal(deathDate, datatype=XSD.dateTime)))
        except:
            pass
    
        if data["termsOfAddress"] != 0:
            g.add((editor, nls.termsOfAddress, Literal(data["termsOfAddress"], datatype=XSD.string)))

        g.add((serie, nls.editor, editor))

    #### Publishers Persons 

    #This was the result to pass entity recognition to publisher

    if data["publisherPersons"] != 0:
        publisherPersons=name=data["publisherPersons"]
        for p in publisherPersons: 
            name=p.replace(" ", "")
            name=name.replace("]", "")
            name=name.replace(",", "_")
            name=name.replace(".", "")
            name=name.replace(":-", "")
            name=re.sub(NON_AZ_REGEXP, '', name)
            publisher = URIRef("https://w3id.org/nls/i/Person/"+name)
            #print("---- Publisher %s" %publisher)
            #if "RobertDrummond" in name:
            #    print("------ WARNING!!! %s" %name)
            g.add((publisher, RDF.type, nls.Person))
            g.add((publisher, nls.name, Literal(p, datatype=XSD.string)))
            g.add((serie, nls.publisher, publisher))
        
    #### Is Referenced by  

    if data["referencedBy"] != 0:
        references=data["referencedBy"]
        for r in references: 
            name=r.replace(" ", "")
            book = URIRef("https://w3id.org/nls/i/Book/"+name)
            g.add((book, RDF.type, nls.Book))
            g.add((book, nls.title, Literal(r, datatype=XSD.string)))
            g.add((serie, nls.referencedBy, book))
            
    return g, serie
        

### 1. Loading the final dataframe

In [5]:
!ls ./results/

README                                gazetters_scotland.ttl
README.md                             ladiesDebating.ttl
chapbooks_dataframe                   ladiesDebating_dataframe
chapbooks_metadata_pages.yml          ladiesDebating_dataframe.ttl
chapbooks_scotland.ttl                ladiesDebating_metadata_pages.yml
gazetterOfScotland_dataframe          ladies_debating.ttl
gazetterOfScotland_metadata_pages.yml


In [6]:
df= pd.read_json('results/chapbooks_dataframe', orient="index") 
#df= pd.read_json('results/ladiesDebating_dataframe', orient="index") 
#df= pd.read_json('results/gazetterOfScotland_dataframe', orient="index") 

In [7]:
df=df.fillna(0)

In [8]:
df.loc[3]

MMSID                                                   9937033633804340
edition                                                                0
editor                                                       Milne, John
editor_date                                                    1792-1871
genre                              Chapbooks-Scotland-Aberdeen-1801-1900
language                                                             eng
metsXML                                               104184105-mets.xml
termsOfAddress                                                         0
numberOfPages                                                          8
numberOfWords                                                        296
permanentURL                            https://digital.nls.uk/104184105
physicalDescription                                        8 p. ; 18 cm.
place                                                           Aberdeen
publisher                             Printed by A.

In [9]:
#tmp["pageNum"] = df['pageNum'].apply(lambda s:s.split('Page')[1])
#df['pageNum']= tmp["pageNum"]

In [10]:
list_MMSID=df["MMSID"].unique()
len(list_MMSID)

2728

### 2. Create a Graph and import the information of the collection

In [11]:
from rdflib import Graph, URIRef, Literal, Namespace, XSD
from rdflib.namespace import RDF, RDFS


# Create a Graph
g = Graph()

g.namespace_manager.bind('nls', Namespace("https://w3id.org/nls#"), override="False")
nls = Namespace("https://w3id.org/nls#")


list_MMSID=df["MMSID"].unique()
for s in range(0, len(list_MMSID)):
    
    ### SERIE
    #print("Serie: %s" %list_MMSID[s])
    
    df_serie=df[df['MMSID'] == list_MMSID[s]].reset_index(drop=True)
    serie_data = df_serie.loc[0]
    g, serie = serie2rdf(serie_data,g, nls)
    
    ### VOLUMES 
    list_vols = df_serie["volumeNum"].unique()
    
    for v in range(0,len(list_vols)):
        
        
    #print("Vol %s" % list_vols[v])
        df_vl=df_serie[df_serie["volumeNum"] == list_vols[v]].reset_index(drop=True)
        volume_data=df_vl.loc[0]
        volume_id=volume_data["volumeId"]
        volume = URIRef("https://w3id.org/nls/i/Volume/"+str(volume_data["MMSID"])+"_"+str(volume_data["volumeId"]))
        #print(volume)
    
        g.add((volume, RDF.type, nls.Volume))
        g.add((volume, nls.volumeId, Literal(volume_data["volumeId"], datatype=XSD.int)))
        g.add((volume, nls.number, Literal(volume_data["volumeNum"], datatype=XSD.int)))
        g.add((volume, nls.title, Literal(volume_data["volumeTitle"], datatype=XSD.string)))
        if volume_data["part"]!=0:
            g.add((volume, nls.part, Literal(volume_data["part"], datatype=XSD.string)))
    
        g.add((volume, nls.metsXML, Literal(volume_data["metsXML"], datatype=XSD.string)))
        g.add((volume, nls.permanentURL, Literal(volume_data["permanentURL"], datatype=XSD.string)))
        g.add((volume, nls.numberOfPages, Literal(volume_data["numberOfPages"], datatype=XSD.string)))
    
        #print("Volume is %s" % volume)
        g.add((serie, nls.hasPart, volume))
    
        list_pages = df_vl["pageNum"].unique()
    
                        
        #### PAGES
        for p in range(0, len(list_pages)):
            df_p=df_vl[df_vl["pageNum"] == list_pages[p]].reset_index(drop=True)
            df_page=df_p.loc[0]
            page= URIRef("https://w3id.org/nls/i/Page/"+ str(df_page["MMSID"])+"_"+str(df_page["volumeId"])+"_"+str(df_page["pageNum"]))
            #print("Page is %s" %page)
        
            g.add((page, RDF.type, nls.Page))
            
            #df_p=df_page["pageNum"].split('Page')[0]
            
            g.add((page, nls.number, Literal(df_page["pageNum"], datatype=XSD.int)))
            g.add((page, nls.numberOfWords, Literal(df_page["numberOfWords"], datatype=XSD.int)))
            g.add((page, nls.text,  Literal(df_page["text"], datatype=XSD.string)))
            g.add((page, nls.altoXML, Literal(df_page["altoXML"], datatype=XSD.string)))
            g.add((volume, nls.hasPart, page))
      
            


In [12]:
# Save the Graph in the RDF Turtle format
g.serialize(format="turtle", destination="results/chapbooks_scotland.ttl")
#g.serialize(format="turtle", destination="results/ladies_debating.ttl")
#g.serialize(format="turtle", destination="results/gazetters_scotland.ttl")


<Graph identifier=Nff51762754174855b9bbf132ea4fd904 (<class 'rdflib.graph.Graph'>)>

List all the elements that we have added for the last Edition added in the graph

In [13]:
for s,p,o in g.triples((serie, None, None)):
  print(s,p,o)

https://w3id.org/nls/i/Serie/9937393453804340 http://www.w3.org/1999/02/22-rdf-syntax-ns#type https://w3id.org/nls#Serie
https://w3id.org/nls/i/Serie/9937393453804340 https://w3id.org/nls#number 2727
https://w3id.org/nls/i/Serie/9937393453804340 https://w3id.org/nls#title old Scottish ballad of Andrew Lammie; or, Mill of Tifty's Annie,1850
https://w3id.org/nls/i/Serie/9937393453804340 https://w3id.org/nls#subtitle 0
https://w3id.org/nls/i/Serie/9937393453804340 https://w3id.org/nls#collection Chapbooks printed in Scotland
https://w3id.org/nls/i/Serie/9937393453804340 https://w3id.org/nls#publicationYear 1850
https://w3id.org/nls/i/Serie/9937393453804340 https://w3id.org/nls#printedAt Glasgow
https://w3id.org/nls/i/Serie/9937393453804340 https://w3id.org/nls#mmsid 9937393453804340
https://w3id.org/nls/i/Serie/9937393453804340 https://w3id.org/nls#physicalDescription 8 p. ; 16 cm.
https://w3id.org/nls/i/Serie/9937393453804340 https://w3id.org/nls#genre Chapbooks-Scotland-Glasgow-1801-190

In [14]:
for s,p,o in g.triples((None, nls.editor, None)):
  print(s,p,o)

https://w3id.org/nls/i/Serie/9937033633804340 https://w3id.org/nls#editor https://w3id.org/nls/i/Person/MilneJohn
https://w3id.org/nls/i/Serie/9937038123804340 https://w3id.org/nls#editor https://w3id.org/nls/i/Person/FultonJohn
https://w3id.org/nls/i/Serie/9937038533804340 https://w3id.org/nls#editor https://w3id.org/nls/i/Person/AitkenA
https://w3id.org/nls/i/Serie/9937038873804340 https://w3id.org/nls#editor https://w3id.org/nls/i/Person/BurnessJohn
https://w3id.org/nls/i/Serie/9937049243804340 https://w3id.org/nls#editor https://w3id.org/nls/i/Person/BurnessJohn
https://w3id.org/nls/i/Serie/9937068603804340 https://w3id.org/nls#editor https://w3id.org/nls/i/Person/BurnessJohn
https://w3id.org/nls/i/Serie/9937286833804340 https://w3id.org/nls#editor https://w3id.org/nls/i/Person/BurnessJohn
https://w3id.org/nls/i/Serie/9937410113804340 https://w3id.org/nls#editor https://w3id.org/nls/i/Person/BurnessJohn
https://w3id.org/nls/i/Serie/9937429513804340 https://w3id.org/nls#editor https