Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

basic parsing working

  • Loading branch information...
commit 31a83dc2a4552cbf3927b27a882295973b4d19d2 1 parent 5e45756
@edsu authored
View
2  .gitignore
@@ -1,2 +1,4 @@
*.pyc
*egg-info
+*egg
+ENV
View
12 example.html
@@ -1,11 +1,12 @@
<!doctype html>
<html>
+ <!-- shamelessly stolen from http://schema.org/Person -->
<head>
<title>person example</title>
</head>
<body>
- <div itemscope itemtype="http://schema.org/Person">
+ <div itemid="http://www.xyz.edu/~jane" itemscope itemtype="http://schema.org/Person">
<span itemprop="name">Jane Doe</span>
<img src="janedoe.jpg" itemprop="image" />
@@ -20,16 +21,15 @@
<span itemprop="postalCode">98052</span>
</div>
<span itemprop="telephone">(425) 123-4567</span>
- <a href="mailto:jane-doe@xyz.edu" itemprop="email">
- jane-doe@xyz.edu</a>
+ <a href="mailto:jane-doe@xyz.edu" itemprop="email">jane-doe@xyz.edu</a>
Jane's home page:
- <a href="www.janedoe.com" itemprop="url">janedoe.com</a>
+ <a href="http://www.janedoe.com" itemprop="url">janedoe.com</a>
Graduate students:
- <a href="www.xyz.edu/students/alicejones.html" itemprop="colleagues">
+ <a href="http://www.xyz.edu/students/alicejones.html" itemprop="colleagues">
Alice Jones</a>
- <a href="www.xyz.edu/students/bobsmith.html" itemprop="colleagues">
+ <a href="http://www.xyz.edu/students/bobsmith.html" itemprop="colleagues">
Bob Smith</a>
</div>
</body>
View
46 rdflib_microdata.py
@@ -1,6 +1,6 @@
import microdata
-from rdflib import URIRef
+from rdflib import URIRef, Literal, BNode, Namespace, RDF
from rdflib.plugin import register
from rdflib.parser import Parser
@@ -9,6 +9,44 @@
class MicrodataParser(Parser):
def parse(self, source, sink, **kwargs):
- # TODO: something not right here with source :-(
- items = microdata.get_items(source)
- # build the graph now :-)
+ """
+ Pass in a file or file-like object containing html5 microdata
+ and populate the sink graph with triples.
+ """
+ for item in microdata.get_items(source.getByteStream()):
+ self._add_item(item, sink)
+
+ def _add_item(self, item, sink):
+ # the URI to hang our assertions off of
+ if item.itemid:
+ s = URIRef(item.itemid.string)
+ else:
+ s = BNode()
+
+ # must have a profile for the item to convert to rdf data model
+ if not item.itemtype:
+ return
+
+ # create a vocab namespace, appending a # as necessary
+ ns = str(item.itemtype)
+ if ns.endswith("#") or ns.endswith("/"):
+ ns = Namespace(item.itemtype)
+ else:
+ ns = Namespace(ns + "#")
+
+ sink.add((s, RDF.type, str(item.itemtype)))
+
+ # go through each property/value and add triples to the graph
+ for item_property, item_values in item.props.items():
+ p = ns[item_property]
+ for v in item_values:
+ if isinstance(v, microdata.Item):
+ o = self._add_item(v, sink)
+ elif isinstance(v, microdata.URI):
+ o = URIRef(v.string)
+ else:
+ o = Literal(v)
+ # TODO: handle dates
+ sink.add((s, p, o))
+
+ return s
View
2  setup.py
@@ -9,5 +9,5 @@
url = "http://github.com/edsu/rdflib-microdata",
py_modules = ["rdflib_microdata"],
test_suite = "test",
- install_requires = ["html5lib", "microdata", "rdflib"],
+ install_requires = ["html5lib", "microdata", "rdflib>=3.0"],
)
View
22 test.py
@@ -1,12 +1,28 @@
from unittest import TestCase
-import rdflib
+from rdflib import Graph, URIRef, Namespace, RDF, BNode
+
import rdflib_microdata
class MicrodataParserTest(TestCase):
def test_parse(self):
- g = rdflib.Graph()
+ g = Graph()
g.parse(open("example.html"), format="microdata")
- self.assertTrue(len(g) > 0)
+
+ # does the graph contain the expected number of assertions (triples)?
+ self.assertEqual(len(g), 15)
+
+ # is there a person?
+ s = URIRef("http://www.xyz.edu/~jane")
+ person = Namespace("http://schema.org/Person#")
+ self.assertEqual(g.value(s, RDF.type), "http://schema.org/Person")
+ self.assertEqual(g.value(s, person.telephone), "(425) 123-4567")
+
+ # is the person attached to an address?
+ addr = Namespace("http://schema.org/PostalAddress#")
+ a = BNode(g.value(s, person.address))
+ self.assertEqual(g.value(a, RDF.type), "http://schema.org/PostalAddress")
+ self.assertEqual(g.value(a, addr.postalCode), "98052")
+ # TODO: test dates?
Please sign in to comment.
Something went wrong with that request. Please try again.