Permalink
Browse files

first complete CSV parse for Schema.org terms

  • Loading branch information...
mhausenblas
mhausenblas committed Jun 11, 2011
1 parent 0e3d411 commit 7190933df13ab639976851d68a67251ae38312e9
@@ -17,37 +17,79 @@
from rdflib.plugin import register
from rdflib.parser import Parser
-
register("schemaorg_csv", Parser, "rdflib_schemaorg_csv", "SchemaOrgCSVParser")
class SchemaOrgCSVParser(Parser):
    """
    rdflib parser plugin that turns CSV with Schema.org column headers
    into RDF triples.

    The table, every row and every cell get their own URI, all derived from
    the ``csv_file_URI`` keyword argument given to ``parse``:
    ``<uri>#table``, ``<uri>#row:N`` and ``<uri>#row:N,col:M``. Row and
    column numbers start at 1, and row 1 is treated as the header row.
    """

    NAMESPACES = {
        'schema': Namespace('http://schema.org/'),
        'scsv': Namespace('http://purl.org/NET/schema-org-csv#'),
        'dcterms': Namespace('http://purl.org/dc/terms/'),
    }

    def parse(self, source, sink, **kwargs):
        """
        Read CSV with Schema.org column headers from ``source`` and populate
        the ``sink`` graph with triples.

        ``source`` must provide ``getByteStream()``; the returned stream is
        handed to ``csv.reader`` and is always closed before returning.
        ``sink`` must provide ``add((s, p, o))``.

        Keyword arguments:
        csv_file_URI -- base URI used to mint the table, row and cell URIs
                        (defaults to the empty string).

        A ``csv.Error`` terminates the process through ``sys.exit`` with the
        error text as the exit message.
        """
        fURI = kwargs.get("csv_file_URI", "")
        self._add_table(fURI, sink)
        columns = []
        # NOTE(review): the stream from getByteStream() is passed to
        # csv.reader as-is; on Python 3 the csv module expects text rather
        # than bytes - confirm what the caller supplies.
        f = source.getByteStream()
        try:
            rows = csv.reader(f, delimiter=',', quoting=csv.QUOTE_ALL)
            for row_num, row in enumerate(rows, 1):
                if row_num == 1:
                    columns = self._add_header(fURI, row, row_num, sink)
                else:
                    self._add_row(fURI, columns, row, row_num, sink)
        except csv.Error as e:
            # NOTE(review): exiting the interpreter from a parser plugin is
            # heavy-handed; kept because callers may rely on SystemExit.
            sys.exit('%s' % e)
        finally:
            # Close the stream on every path, including parse errors.
            f.close()

    def _add_table(self, fURI, sink):
        """
        Add the triples describing the table itself: its type, the source
        document and a title taken from the last path segment of ``fURI``.
        """
        scsv = SchemaOrgCSVParser.NAMESPACES['scsv']
        dcterms = SchemaOrgCSVParser.NAMESPACES['dcterms']
        t = URIRef(fURI + '#table')
        sink.add((t, RDF.type, URIRef(scsv['Table'])))
        sink.add((t, dcterms['source'], URIRef(fURI)))
        sink.add((t, dcterms['title'], Literal(fURI.split('/')[-1])))

    def _add_header(self, fURI, row, row_num, sink):
        """
        Add the header row and one titled cell per header field.

        Returns a list with one Schema.org term URI (a string) per column,
        in column order, as produced by ``_lookup_schemaorg_term``.
        """
        scsv = SchemaOrgCSVParser.NAMESPACES['scsv']
        dcterms = SchemaOrgCSVParser.NAMESPACES['dcterms']
        t = URIRef(fURI + '#table')
        r = URIRef(fURI + '#row:' + str(row_num))
        # row-level:
        sink.add((t, URIRef(scsv['row']), r))
        sink.add((r, RDF.type, URIRef(scsv['HeaderRow'])))
        sink.add((r, dcterms['title'], Literal('header')))
        # cell-level:
        columns = []
        for col_num, cell in enumerate(row, 1):
            c = URIRef(fURI + '#row:' + str(row_num) + ',' + 'col:' + str(col_num))
            sink.add((r, URIRef(scsv['cell']), c))
            sink.add((c, dcterms['title'], Literal(cell)))
            columns.append(self._lookup_schemaorg_term(cell))
        return columns

    def _add_row(self, fURI, columns, row, row_num, sink):
        """
        Add one data row and its cells.

        Each cell is typed with the term of its column (``columns`` is the
        list returned by ``_add_header``) and carries its content as
        ``rdf:value``: a resource when the content starts with ``http://``
        or ``https://``, a literal otherwise. Cells beyond the last header
        column are still emitted but get no ``rdf:type``.

        Returns the URI of the row.
        """
        scsv = SchemaOrgCSVParser.NAMESPACES['scsv']
        dcterms = SchemaOrgCSVParser.NAMESPACES['dcterms']
        t = URIRef(fURI + '#table')
        r = URIRef(fURI + '#row:' + str(row_num))
        # row-level:
        sink.add((t, URIRef(scsv['row']), r))
        sink.add((r, RDF.type, URIRef(scsv['Row'])))
        sink.add((r, dcterms['title'], Literal('row ' + str(row_num))))
        # cell-level:
        for col_num, cell in enumerate(row, 1):
            c = URIRef(fURI + '#row:' + str(row_num) + ',' + 'col:' + str(col_num))
            sink.add((r, URIRef(scsv['cell']), c))
            # A row wider than the header has no term for the extra cells.
            if col_num <= len(columns):
                sink.add((c, RDF.type, URIRef(columns[col_num - 1])))
            if cell.startswith(('http://', 'https://')):
                sink.add((c, RDF.value, URIRef(cell)))
            else:
                sink.add((c, RDF.value, Literal(cell)))
        return r

    def _lookup_schemaorg_term(self, cell):
        """
        Map a header cell to a Schema.org term URI (returned as a string).

        Surrounding whitespace is dropped so that a header such as
        ``Thing, Mass`` does not yield a URI containing a space.
        """
        # TODO: look up cell value in http://schema.rdfs.org/all-classes.csv
        return 'http://schema.org/' + cell.strip()
@@ -70,7 +70,7 @@ def dump_data(self):
self.g.bind('schema', 'http://schema.org/', True)
self.g.bind('scsv', 'http://purl.org/NET/schema-org-csv#', True)
self.g.bind('dcterms', 'http://purl.org/dc/terms/', True)
- print(self.g.serialize()) #format='n3')) ... doesn't work - TODO: ask Ed
+ print(self.g.serialize()) #format='n3')) ... doesn't work
else:
print('Sorry, nothing to show - use parse_str() or parse_URL() to parse data with Schema.org terms ...')
@@ -1,4 +1,4 @@
-Thing,Mass
-Earth,"5.9721986×10^24 kg"
-Mars,"6.4191×10^23 kg"
-Sun,"1.988435×10^30 kg"
+Thing,Mass,URL
+Earth,"5.9721986×10^24 kg",http://dbpedia.org/resource/Earth
+Mars,"6.4191×10^23 kg",http://dbpedia.org/resource/Mars
+Sun,"1.988435×10^30 kg",http://dbpedia.org/resource/Sun
@@ -0,0 +1,89 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<rdf:RDF
+ xmlns:dcterms="http://purl.org/dc/terms/"
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ xmlns:scsv="http://purl.org/NET/schema-org-csv#"
+>
+ <rdf:Description rdf:about="https://raw.github.com/mhausenblas/schema-org-rdf/master/tools/schema-gateway/test/solar-system.csv#row:2">
+ <scsv:cell rdf:resource="https://raw.github.com/mhausenblas/schema-org-rdf/master/tools/schema-gateway/test/solar-system.csv#row:2,col:1"/>
+ <scsv:cell rdf:resource="https://raw.github.com/mhausenblas/schema-org-rdf/master/tools/schema-gateway/test/solar-system.csv#row:2,col:2"/>
+ <scsv:cell rdf:resource="https://raw.github.com/mhausenblas/schema-org-rdf/master/tools/schema-gateway/test/solar-system.csv#row:2,col:3"/>
+ <dcterms:title>row 2</dcterms:title>
+ <rdf:type rdf:resource="http://purl.org/NET/schema-org-csv#Row"/>
+ </rdf:Description>
+ <rdf:Description rdf:about="https://raw.github.com/mhausenblas/schema-org-rdf/master/tools/schema-gateway/test/solar-system.csv#row:4,col:2">
+ <rdf:value>1.988435×10^30 kg</rdf:value>
+ <rdf:type rdf:resource="http://schema.org/Mass"/>
+ </rdf:Description>
+ <rdf:Description rdf:about="https://raw.github.com/mhausenblas/schema-org-rdf/master/tools/schema-gateway/test/solar-system.csv#row:2,col:1">
+ <rdf:value>Earth</rdf:value>
+ <rdf:type rdf:resource="http://schema.org/Thing"/>
+ </rdf:Description>
+ <rdf:Description rdf:about="https://raw.github.com/mhausenblas/schema-org-rdf/master/tools/schema-gateway/test/solar-system.csv#row:2,col:2">
+ <rdf:value>5.9721986×10^24 kg</rdf:value>
+ <rdf:type rdf:resource="http://schema.org/Mass"/>
+ </rdf:Description>
+ <rdf:Description rdf:about="https://raw.github.com/mhausenblas/schema-org-rdf/master/tools/schema-gateway/test/solar-system.csv#row:4">
+ <scsv:cell rdf:resource="https://raw.github.com/mhausenblas/schema-org-rdf/master/tools/schema-gateway/test/solar-system.csv#row:4,col:3"/>
+ <scsv:cell rdf:resource="https://raw.github.com/mhausenblas/schema-org-rdf/master/tools/schema-gateway/test/solar-system.csv#row:4,col:2"/>
+ <scsv:cell rdf:resource="https://raw.github.com/mhausenblas/schema-org-rdf/master/tools/schema-gateway/test/solar-system.csv#row:4,col:1"/>
+ <dcterms:title>row 4</dcterms:title>
+ <rdf:type rdf:resource="http://purl.org/NET/schema-org-csv#Row"/>
+ </rdf:Description>
+ <rdf:Description rdf:about="https://raw.github.com/mhausenblas/schema-org-rdf/master/tools/schema-gateway/test/solar-system.csv#row:3">
+ <scsv:cell rdf:resource="https://raw.github.com/mhausenblas/schema-org-rdf/master/tools/schema-gateway/test/solar-system.csv#row:3,col:1"/>
+ <scsv:cell rdf:resource="https://raw.github.com/mhausenblas/schema-org-rdf/master/tools/schema-gateway/test/solar-system.csv#row:3,col:2"/>
+ <scsv:cell rdf:resource="https://raw.github.com/mhausenblas/schema-org-rdf/master/tools/schema-gateway/test/solar-system.csv#row:3,col:3"/>
+ <dcterms:title>row 3</dcterms:title>
+ <rdf:type rdf:resource="http://purl.org/NET/schema-org-csv#Row"/>
+ </rdf:Description>
+ <rdf:Description rdf:about="https://raw.github.com/mhausenblas/schema-org-rdf/master/tools/schema-gateway/test/solar-system.csv#row:1">
+ <scsv:cell rdf:resource="https://raw.github.com/mhausenblas/schema-org-rdf/master/tools/schema-gateway/test/solar-system.csv#row:1,col:1"/>
+ <scsv:cell rdf:resource="https://raw.github.com/mhausenblas/schema-org-rdf/master/tools/schema-gateway/test/solar-system.csv#row:1,col:3"/>
+ <scsv:cell rdf:resource="https://raw.github.com/mhausenblas/schema-org-rdf/master/tools/schema-gateway/test/solar-system.csv#row:1,col:2"/>
+ <dcterms:title>header</dcterms:title>
+ <rdf:type rdf:resource="http://purl.org/NET/schema-org-csv#HeaderRow"/>
+ </rdf:Description>
+ <rdf:Description rdf:about="https://raw.github.com/mhausenblas/schema-org-rdf/master/tools/schema-gateway/test/solar-system.csv#row:1,col:1">
+ <dcterms:title>Thing</dcterms:title>
+ </rdf:Description>
+ <rdf:Description rdf:about="https://raw.github.com/mhausenblas/schema-org-rdf/master/tools/schema-gateway/test/solar-system.csv#row:4,col:1">
+ <rdf:value>Sun</rdf:value>
+ <rdf:type rdf:resource="http://schema.org/Thing"/>
+ </rdf:Description>
+ <rdf:Description rdf:about="https://raw.github.com/mhausenblas/schema-org-rdf/master/tools/schema-gateway/test/solar-system.csv#row:3,col:3">
+ <rdf:value rdf:resource="http://dbpedia.org/resource/Mars"/>
+ <rdf:type rdf:resource="http://schema.org/URL"/>
+ </rdf:Description>
+ <rdf:Description rdf:about="https://raw.github.com/mhausenblas/schema-org-rdf/master/tools/schema-gateway/test/solar-system.csv#row:1,col:2">
+ <dcterms:title>Mass</dcterms:title>
+ </rdf:Description>
+ <rdf:Description rdf:about="https://raw.github.com/mhausenblas/schema-org-rdf/master/tools/schema-gateway/test/solar-system.csv#row:3,col:1">
+ <rdf:value>Mars</rdf:value>
+ <rdf:type rdf:resource="http://schema.org/Thing"/>
+ </rdf:Description>
+ <rdf:Description rdf:about="https://raw.github.com/mhausenblas/schema-org-rdf/master/tools/schema-gateway/test/solar-system.csv#row:4,col:3">
+ <rdf:value rdf:resource="http://dbpedia.org/resource/Sun"/>
+ <rdf:type rdf:resource="http://schema.org/URL"/>
+ </rdf:Description>
+ <rdf:Description rdf:about="https://raw.github.com/mhausenblas/schema-org-rdf/master/tools/schema-gateway/test/solar-system.csv#table">
+ <dcterms:source rdf:resource="https://raw.github.com/mhausenblas/schema-org-rdf/master/tools/schema-gateway/test/solar-system.csv"/>
+ <scsv:row rdf:resource="https://raw.github.com/mhausenblas/schema-org-rdf/master/tools/schema-gateway/test/solar-system.csv#row:3"/>
+ <scsv:row rdf:resource="https://raw.github.com/mhausenblas/schema-org-rdf/master/tools/schema-gateway/test/solar-system.csv#row:4"/>
+ <scsv:row rdf:resource="https://raw.github.com/mhausenblas/schema-org-rdf/master/tools/schema-gateway/test/solar-system.csv#row:1"/>
+ <scsv:row rdf:resource="https://raw.github.com/mhausenblas/schema-org-rdf/master/tools/schema-gateway/test/solar-system.csv#row:2"/>
+ <dcterms:title>solar-system.csv</dcterms:title>
+ <rdf:type rdf:resource="http://purl.org/NET/schema-org-csv#Table"/>
+ </rdf:Description>
+ <rdf:Description rdf:about="https://raw.github.com/mhausenblas/schema-org-rdf/master/tools/schema-gateway/test/solar-system.csv#row:1,col:3">
+ <dcterms:title>URL</dcterms:title>
+ </rdf:Description>
+ <rdf:Description rdf:about="https://raw.github.com/mhausenblas/schema-org-rdf/master/tools/schema-gateway/test/solar-system.csv#row:2,col:3">
+ <rdf:value rdf:resource="http://dbpedia.org/resource/Earth"/>
+ <rdf:type rdf:resource="http://schema.org/URL"/>
+ </rdf:Description>
+ <rdf:Description rdf:about="https://raw.github.com/mhausenblas/schema-org-rdf/master/tools/schema-gateway/test/solar-system.csv#row:3,col:2">
+ <rdf:value>6.4191×10^23 kg</rdf:value>
+ <rdf:type rdf:resource="http://schema.org/Mass"/>
+ </rdf:Description>
+</rdf:RDF>

0 comments on commit 7190933

Please sign in to comment.