In [39]:
# Check for existing libraries in environment
! pip install rdflib lxml



In [40]:
from lxml import etree
import os

# this set up globals for some useful URIs
globals = {
    "xml_id":"{http://www.w3.org/XML/1998/namespace}id",
    "base_data_URI": "https://github.com/falaimo99/sam/data/",
    }

# this function extract the tree from the well formed xml
def extract_tree(path):
    tree = etree.parse(path)
    root = tree.getroot()  
    return root

root = extract_tree("./short_stories/Novellino_II.xml")


In [None]:
from rdflib import Graph, Literal, BNode, Namespace, URIRef
from rdflib.namespace import RDF, RDFS, XSD, DCTERMS, OWL
from urllib.parse import urlparse

# Namespaces used by SAM
sam = Namespace("https://purl.org/samcore#")
wdt = Namespace("https://www.wikidata.org/wiki/")
wdp = Namespace("https://www.wikidata.org/wiki/Property:")
cwrc = Namespace("https://sparql.cwrc.ca/ontologies/cwrc.html#")

# Graph Instantiation
def setting_the_graph():
    g = Graph()

    g.bind("sam", sam)
    g.bind("wdt", wdt)
    g.bind("wdp", wdp)
    g.bind("cwrc", cwrc)
    
    return g

# utils dictionary that get filled as the blocks connect themselves
utils_URI = {}

# Function to set up the story element and the mandatory related items
# CharacterList and GroupOfScenes/Scenes
def set_base_story ():
    
    story_URI = URIRef(
        globals["base_data_URI"] + root.find("Story").attrib[globals["xml_id"]]
        )
    utils_URI['story'] = story_URI
    character_list_URI = URIRef(
        story_URI + "_CharacterList"
    )
    utils_URI['CharacterList'] = character_list_URI

    if root.findall("SequenceOfScenes"):
        for SoS in root.findall("SequenceOfScenes"):
            SoS_URI = story_URI + "_" + SoS.attrib[globals["xml_id"]]
            g.add( (story_URI, wdp.P527, SoS_URI) )
            print(f"{SoS_URI} added to Story via wdp:P527")
    
    for scene in root.findall("Scene"):
        scene_URI = story_URI + "_" + scene.attrib[globals["xml_id"]]
        g.add( (story_URI, wdp.P527, scene_URI) )
        g.add( (scene_URI, RDF.type, sam.Scene))
        g.add( (scene_URI, RDF.type, sam.Scene))
        print(f"{scene_URI} added to Story via wdp:P527")

    g.add ( (story_URI, RDF.type, sam.Story) )
    g.add ( (character_list_URI, RDF.type, sam.CharacterList) )

    print(
        f"""{story_URI} added as sam:Story;
{character_list_URI} added as sam:CharacterList;
        """
        )

    return g

# This function finds all the characters add them to the CharacterList
#  and adds all the relevant information to the graph
def set_characters():
    character_list = root.find('CharacterList')

    for character in character_list.findall('Character'):
        character_URI = (
            utils_URI['story']+"_"+ character.attrib[globals['xml_id']]
            )
        g.add((utils_URI['CharacterList'], wdp.P527, character_URI))
        g.add((character_URI, RDF.type, wdt.Q95074))
        character_properties(character, character_URI)
        

    for character_group in character_list.findall('CharacterGroup'):
        character_group_URI = (
            utils_URI['story']+"_"+ character_group.attrib[globals['xml_id']]
        )
        g.add((utils_URI['CharacterList'], wdp.P527, character_group_URI))
        
    return g

# Ancillary functions that attaches the properties to the characters
def character_properties(character, character_URI):
    # attributes analyzer. Some attributes are a shorthand for the literal part
    # of a character class

    if 'name' in character.attrib:
            character_name = character_URI + "_name"
            g.add((character_name, RDF.type, wdt.Q82799))
            g.add((character_URI, wdp.P2561, character_name))
            g.add((
                character_name, sam.hasName, Literal(character.attrib['name']
                )))

    if 'occupation' in character.attrib:
        character_occupation = character_URI + "_occupation"
        g.add((character_occupation, RDF.type, wdt.Q12737077))
        g.add((character_URI, wdp.P106, character_occupation))
        g.add((
            character_occupation,
            sam.hasTitle,
            Literal(character.attrib['occupation'])
            ))

    if 'descriptor' in character.attrib:
        g.add((
            character_URI,
            sam.descriptor,
            Literal(character.attrib['descriptor'])
            ))

    if 'gender' in character.attrib:
        try:
            url_parsed = urlparse(character.attrib['gender'])
            if url_parsed.netloc == "sparql.cwrc.ca":
                g.add((
                    character_URI, wdp.P21, character.attrib['gender']
                    ))
        except:
            print('not a valid URL from sparql.cwrc.ca')

    if 'narratorName' in character.attrib:
        g.add((
            character_URI,
            sam.narratorName,
            Literal(character.attrib['narratorName'])
            ))

    if 'partOf' in character.attrib:
        g.add((
            character_URI,
            wdp.P361,
            (utils_URI['story']+ f"_{character.attrib['partOf']}")
            ))

    # nested elements analyzer. Full reference where attributes are only used
    # when describing a data property


    for child in character.iterchildren():
        if child.tag.lower() == "trope":
            try:    
                url_parsed = url_parsed(child.attrib['url'])
                if url_parsed.netloc == "tvtrope.org":
                    trope_URI = character_URI + "_"
                    g.add((character_URI, sam.hasTrope, trope_URI))
                    g.add((trope_URI, RDF.type, sam.Trope))
                    g.add((trope_URI, sam.tropeURI, child.attrib['url']))
            except:
                print(f"""{
                    character.attrib[globals['xml_id']]
                    } lacks a valid url from tvtrope""")

    


In [42]:
# Testing block
g: Graph = setting_the_graph()

set_base_story()
set_characters()

https://github.com/falaimo99/sam/data/novellino_II_anonimo_scene1 added to Story via wdp:P527
https://github.com/falaimo99/sam/data/novellino_II_anonimo_scene2 added to Story via wdp:P527
https://github.com/falaimo99/sam/data/novellino_II_anonimo_scene3 added to Story via wdp:P527
https://github.com/falaimo99/sam/data/novellino_II_anonimo_scene4 added to Story via wdp:P527
https://github.com/falaimo99/sam/data/novellino_II_anonimo_scene5 added to Story via wdp:P527
https://github.com/falaimo99/sam/data/novellino_II_anonimo_scene6 added to Story via wdp:P527
https://github.com/falaimo99/sam/data/novellino_II_anonimo added as sam:Story;
https://github.com/falaimo99/sam/data/novellino_II_anonimo_CharacterList added as sam:CharacterList;
        
True
not a valid URL from sparql.cwrc.ca
federigoImperadore lacks a valid url from tvtrope
True
not a valid URL from sparql.cwrc.ca
True
not a valid URL from sparql.cwrc.ca


<Graph identifier=Nee211a8953dc473f9e1298846f030fef (<class 'rdflib.graph.Graph'>)>

In [43]:
g.print()

@prefix sam: <https://purl.org/samcore#> .
@prefix wdp: <https://www.wikidata.org/wiki/Property:> .
@prefix wdt: <https://www.wikidata.org/wiki/> .

<https://github.com/falaimo99/sam/data/novellino_II_anonimo> a sam:Story ;
    wdp:P527 <https://github.com/falaimo99/sam/data/novellino_II_anonimo_scene1>,
        <https://github.com/falaimo99/sam/data/novellino_II_anonimo_scene2>,
        <https://github.com/falaimo99/sam/data/novellino_II_anonimo_scene3>,
        <https://github.com/falaimo99/sam/data/novellino_II_anonimo_scene4>,
        <https://github.com/falaimo99/sam/data/novellino_II_anonimo_scene5>,
        <https://github.com/falaimo99/sam/data/novellino_II_anonimo_scene6> .

<https://github.com/falaimo99/sam/data/novellino_II_anonimo_CharacterList> a sam:CharacterList ;
    wdp:P527 <https://github.com/falaimo99/sam/data/novellino_II_anonimo_ambasciatori>,
        <https://github.com/falaimo99/sam/data/novellino_II_anonimo_federigoImperadore>,
        <https://github.com/falai