From 9f8990011ed02009ca0b4fdfdd2c9afcf201c451 Mon Sep 17 00:00:00 2001 From: "AD\\aabdoun" Date: Mon, 27 Oct 2025 09:26:56 +0100 Subject: [PATCH 01/13] Implement standard tests for Canonical RDF into Corese-W3C #212 --- .../io/serialization/util/StatementUtils.java | 141 ++++++++++++++++-- 1 file changed, 129 insertions(+), 12 deletions(-) diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java index 9e48ba392..2093167b3 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java @@ -169,23 +169,75 @@ private static String serializeLiteral(Literal literal) { StringBuilder sb = new StringBuilder(); // Escape special characters in the literal label - String escapedLabel = literal.getLabel() - .replace(SerializationConstants.BACK_SLASH, "\\\\") - .replace(SerializationConstants.QUOTE, "\\\""); - + String escapedLabel = escapeLiteralString(literal.getLabel()); sb.append('"').append(escapedLabel).append('"'); - // Handle datatype or language tag + String datatype = null; + String language = null; + + // Get datatype if (literal.getDatatype() != null) { - String datatypeUri = literal.getDatatype().stringValue(); - // Omit xsd:string datatype for brevity (implied by default) - if (!"http://www.w3.org/2001/XMLSchema#string".equals(datatypeUri)) { - sb.append(SerializationConstants.DATATYPE_SEPARATOR).append(serializeForComparison(literal.getDatatype())); - } - } else if (literal.getLanguage() != null) { - sb.append(SerializationConstants.AT).append(literal.getLanguage()); + datatype = literal.getDatatype().stringValue(); } + // Get language (getLanguage() returns Optional) + if (literal.getLanguage().isPresent()) { + language = literal.getLanguage().get(); + } + + // If language tag exists, use it (language takes precedence over datatype) + if (language != null && !language.isEmpty()) { + sb.append(SerializationConstants.AT_SIGN).append(language); + return sb.toString(); + } + + // If datatype is xsd:string or missing, don't add it (plain literal) + if (datatype == null || + datatype.equals("http://www.w3.org/2001/XMLSchema#string") || + datatype.equals("xsd:string")) { + // Don't add datatype for plain strings + return sb.toString(); + } + + // For all other datatypes, include them explicitly + sb.append(SerializationConstants.DATATYPE_SEPARATOR) + .append(SerializationConstants.LT) + .append(datatype) + .append(SerializationConstants.GT); + + return sb.toString(); + } + + /** + * Properly escape special characters in literal strings according to Turtle/N-Quads spec. + */ + private static String escapeLiteralString(String label) { + if (label == null) return ""; + + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < label.length(); i++) { + char c = label.charAt(i); + switch (c) { + case '\\': + sb.append("\\\\"); + break; + case '"': + sb.append("\\\""); + break; + case '\n': + sb.append("\\n"); + break; + case '\r': + sb.append("\\r"); + break; + case '\t': + sb.append("\\t"); + break; + default: + sb.append(c); + break; + } + } return sb.toString(); } @@ -225,4 +277,69 @@ public static String toNQuad(Statement statement) { } + /** + * Converts a statement to canonical N-Quad format for hashing in RDFC-1.0, + * replacing a specific blank node with a placeholder string. + * This is used specifically in the Hash First Degree Quads algorithm. + * + * @param quad The statement to convert + * @param blankNodeToReplace The blank node identifier to replace with placeholder + * @return A canonical N-Quad string with placeholder substitution + */ + public String quadToCanonicalNQuad(Statement quad, String blankNodeToReplace) { + if (quad == null) { + return SerializationConstants.EMPTY_STRING; + } + + StringBuilder sb = new StringBuilder(); + + // Subject + if (isBlankNode(quad.getSubject())) { + String bnodeId = getBlankNodeId(quad.getSubject()); + if (bnodeId.equals(blankNodeToReplace)) { + sb.append(SerializationConstants.CANONICAL_BNODE_PLACEHOLDER); + } else { + // Use consistent prefix for all non-replaced blank nodes + sb.append(SerializationConstants.CANONICAL_BNODE_PREFIX).append(bnodeId); + } + } else { + sb.append(serializeForComparison(quad.getSubject())); + } + sb.append(SerializationConstants.SPACE); + + // Predicate + sb.append(serializeForComparison(quad.getPredicate())) + .append(SerializationConstants.SPACE); + + // Object + if (isBlankNode(quad.getObject())) { + String bnodeId = getBlankNodeId(quad.getObject()); + if (bnodeId.equals(blankNodeToReplace)) { + sb.append(SerializationConstants.CANONICAL_BNODE_PLACEHOLDER); + } else { + sb.append(SerializationConstants.CANONICAL_BNODE_PREFIX).append(bnodeId); + } + } else { + sb.append(serializeForComparison(quad.getObject())); + } + + // Context (graph) + if (quad.getContext() != null) { + sb.append(SerializationConstants.SPACE); + if (isBlankNode(quad.getContext())) { + String bnodeId = getBlankNodeId(quad.getContext()); + if (bnodeId.equals(blankNodeToReplace)) { + sb.append(SerializationConstants.CANONICAL_BNODE_PLACEHOLDER); + } else { + sb.append(SerializationConstants.CANONICAL_BNODE_PREFIX).append(bnodeId); + } + } else { + sb.append(serializeForComparison(quad.getContext())); + } + } + + sb.append(SerializationConstants.SPACE).append(SerializationConstants.POINT); + + return sb.toString(); + } } \ No newline at end of file From 26298fa4eaf6c672bd125a33192f78e6f283fcfc Mon Sep 17 00:00:00 2001 From: "AD\\aabdoun" Date: Mon, 27 Oct 2025 09:41:14 +0100 Subject: [PATCH 02/13] Implement standard tests for Canonical RDF into Corese-W3C #212 --- .../canonical/RDFC10Canonicalizer.java | 74 ++++++------------- .../io/serialization/util/StatementUtils.java | 65 ++++++---------- 2 files changed, 45 insertions(+), 94 deletions(-) diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/RDFC10Canonicalizer.java b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/RDFC10Canonicalizer.java index c78d70144..111565464 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/RDFC10Canonicalizer.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/RDFC10Canonicalizer.java @@ -174,7 +174,8 @@ private Map createCanonicalMap(Map> bnode Map nDegreeHashes = new HashMap<>(); for (String node : nodes) { TemporaryIssuer tempIssuer = new TemporaryIssuer(); - String nDegreeHash = hashNDegreeQuads(node, bnodeToQuads, canonicalIssuer, tempIssuer); + Set visitedInPath = new HashSet<>(); + String nDegreeHash = hashNDegreeQuads(node, bnodeToQuads, canonicalIssuer, tempIssuer, visitedInPath); nDegreeHashes.put(node, nDegreeHash); } @@ -206,7 +207,7 @@ private String hashFirstDegreeQuads(String blankNode, Map List nquads = new ArrayList<>(); for (Statement quad : quads) { - String nquad = quadToNQuad(quad, blankNode, SerializationConstants.CANONICAL_BNODE_PLACEHOLDER); + String nquad = StatementUtils.quadToNQuad(quad, blankNode, SerializationConstants.CANONICAL_BNODE_PLACEHOLDER); nquads.add(nquad); } @@ -225,11 +226,13 @@ private String hashFirstDegreeQuads(String blankNode, Map * @param blankNodeToQuads The map of blank nodes to their associated statements. * @param canonicalIssuer Map of already-assigned canonical identifiers. * @param issuer Temporary identifier issuer for the current recursion path. + * @param visitedInPath Set of nodes already visited in the current recursion path (cycle detection). * @return A hash representing the N-degree context of the blank node. * @throws SerializationException if the maximum recursion depth is exceeded. */ private String hashNDegreeQuads(String identifier, Map> blankNodeToQuads, - Map canonicalIssuer, TemporaryIssuer issuer) { + Map canonicalIssuer, TemporaryIssuer issuer, + Set visitedInPath) { if (++callsHashNDegreeQuads > maxCallsHashNDegreeQuads) { throw new SerializationException( @@ -238,6 +241,16 @@ private String hashNDegreeQuads(String identifier, Map> b ); } + // This prevents infinite loops when dealing with circular blank node references + if (visitedInPath.contains(identifier)) { + // Return a hash based only on first-degree quads for this node + // This breaks the cycle while maintaining deterministic results + return hashFirstDegreeQuads(identifier, blankNodeToQuads); + } + + // Mark this node as visited in the current recursion path + visitedInPath.add(identifier); + // Collect all related blank nodes from all quads containing this node Set relatedBlankNodes = new HashSet<>(); for (Statement quad : blankNodeToQuads.get(identifier)) { @@ -255,10 +268,14 @@ private String hashNDegreeQuads(String identifier, Map> b } else if (issuer.hasIssued(relatedNode)) { // Use temporary ID if already issued relatedHash = issuer.issue(relatedNode); + } else if (visitedInPath.contains(relatedNode)) { + // we have a cycle. Use its first-degree hash instead of recursing. + relatedHash = hashFirstDegreeQuads(relatedNode, blankNodeToQuads); } else { // Recursively calculate N-degree hash TemporaryIssuer newIssuer = issuer.copy(); - relatedHash = hashNDegreeQuads(relatedNode, blankNodeToQuads, canonicalIssuer, newIssuer); + Set newVisitedInPath = new HashSet<>(visitedInPath); + relatedHash = hashNDegreeQuads(relatedNode, blankNodeToQuads, canonicalIssuer, newIssuer, newVisitedInPath); } relatedHashes.add(relatedHash); @@ -277,53 +294,6 @@ private String hashNDegreeQuads(String identifier, Map> b return hash(hashInput.toString()); } - /** - * Converts a statement to canonical N-Quad format for hashing, replacing - * a specific blank node with a placeholder string. - * - * @param quad The statement to convert. - * @param blankNodeToReplace The blank node identifier to replace. - * @param replacement The placeholder string to use for replacement. - * @return A canonical N-Quad string with placeholder substitution. - */ - private String quadToNQuad(Statement quad, String blankNodeToReplace, String replacement) { - StringBuilder sb = new StringBuilder(); - - // Handle subject - if (StatementUtils.isBlankNode(quad.getSubject())) { - String bnodeId = StatementUtils.getBlankNodeId(quad.getSubject()); - sb.append(bnodeId.equals(blankNodeToReplace) ? replacement : SerializationConstants.CANONICAL_BNODE_PREFIX); - } else { - sb.append(StatementUtils.serializeForComparison(quad.getSubject())); - } - sb.append(SerializationConstants.SPACE); - - // Predicate - sb.append(StatementUtils.serializeForComparison(quad.getPredicate())).append(SerializationConstants.SPACE); - - // Handle object - if (StatementUtils.isBlankNode(quad.getObject())) { - String bnodeId = StatementUtils.getBlankNodeId(quad.getObject()); - sb.append(bnodeId.equals(blankNodeToReplace) ? replacement : SerializationConstants.CANONICAL_BNODE_PREFIX); - } else { - sb.append(StatementUtils.serializeForComparison(quad.getObject())); - } - - // Handle context - if (quad.getContext() != null) { - sb.append(SerializationConstants.SPACE); - if (StatementUtils.isBlankNode(quad.getContext())) { - String bnodeId = StatementUtils.getBlankNodeId(quad.getContext()); - sb.append(bnodeId.equals(blankNodeToReplace) ? replacement : SerializationConstants.CANONICAL_BNODE_PREFIX); - } else { - sb.append(StatementUtils.serializeForComparison(quad.getContext())); - } - } - - sb.append(SerializationConstants.SPACE).append(SerializationConstants.POINT); - return sb.toString(); - } - /** * Identifies all blank nodes in a statement that are related to but different from * a specified blank node. This is used to explore the graph context during N-degree hashing. @@ -461,4 +431,4 @@ public TemporaryIssuer copy() { return copy; } } -} +} \ No newline at end of file diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java index 2093167b3..e686bab6a 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java @@ -278,68 +278,49 @@ public static String toNQuad(Statement statement) { /** - * Converts a statement to canonical N-Quad format for hashing in RDFC-1.0, - * replacing a specific blank node with a placeholder string. - * This is used specifically in the Hash First Degree Quads algorithm. + * Converts a statement to canonical N-Quad format for hashing, replacing + * a specific blank node with a placeholder string. * - * @param quad The statement to convert - * @param blankNodeToReplace The blank node identifier to replace with placeholder - * @return A canonical N-Quad string with placeholder substitution + * @param quad The statement to convert. + * @param blankNodeToReplace The blank node identifier to replace. + * @param replacement The placeholder string to use for replacement. + * @return A canonical N-Quad string with placeholder substitution. */ - public String quadToCanonicalNQuad(Statement quad, String blankNodeToReplace) { - if (quad == null) { - return SerializationConstants.EMPTY_STRING; - } - + public static String quadToNQuad(Statement quad, String blankNodeToReplace, String replacement) { StringBuilder sb = new StringBuilder(); - // Subject - if (isBlankNode(quad.getSubject())) { - String bnodeId = getBlankNodeId(quad.getSubject()); - if (bnodeId.equals(blankNodeToReplace)) { - sb.append(SerializationConstants.CANONICAL_BNODE_PLACEHOLDER); - } else { - // Use consistent prefix for all non-replaced blank nodes - sb.append(SerializationConstants.CANONICAL_BNODE_PREFIX).append(bnodeId); - } + // Handle subject + if (StatementUtils.isBlankNode(quad.getSubject())) { + String bnodeId = StatementUtils.getBlankNodeId(quad.getSubject()); + sb.append(bnodeId.equals(blankNodeToReplace) ? replacement : SerializationConstants.CANONICAL_BNODE_PREFIX); } else { - sb.append(serializeForComparison(quad.getSubject())); + sb.append(StatementUtils.serializeForComparison(quad.getSubject())); } sb.append(SerializationConstants.SPACE); // Predicate - sb.append(serializeForComparison(quad.getPredicate())) - .append(SerializationConstants.SPACE); + sb.append(StatementUtils.serializeForComparison(quad.getPredicate())).append(SerializationConstants.SPACE); - // Object - if (isBlankNode(quad.getObject())) { - String bnodeId = getBlankNodeId(quad.getObject()); - if (bnodeId.equals(blankNodeToReplace)) { - sb.append(SerializationConstants.CANONICAL_BNODE_PLACEHOLDER); - } else { - sb.append(SerializationConstants.CANONICAL_BNODE_PREFIX).append(bnodeId); - } + // Handle object + if (StatementUtils.isBlankNode(quad.getObject())) { + String bnodeId = StatementUtils.getBlankNodeId(quad.getObject()); + sb.append(bnodeId.equals(blankNodeToReplace) ? replacement : SerializationConstants.CANONICAL_BNODE_PREFIX); } else { - sb.append(serializeForComparison(quad.getObject())); + sb.append(StatementUtils.serializeForComparison(quad.getObject())); } - // Context (graph) + // Handle context if (quad.getContext() != null) { sb.append(SerializationConstants.SPACE); - if (isBlankNode(quad.getContext())) { - String bnodeId = getBlankNodeId(quad.getContext()); - if (bnodeId.equals(blankNodeToReplace)) { - sb.append(SerializationConstants.CANONICAL_BNODE_PLACEHOLDER); - } else { - sb.append(SerializationConstants.CANONICAL_BNODE_PREFIX).append(bnodeId); - } + if (StatementUtils.isBlankNode(quad.getContext())) { + String bnodeId = StatementUtils.getBlankNodeId(quad.getContext()); + sb.append(bnodeId.equals(blankNodeToReplace) ? replacement : SerializationConstants.CANONICAL_BNODE_PREFIX); } else { - sb.append(serializeForComparison(quad.getContext())); + sb.append(StatementUtils.serializeForComparison(quad.getContext())); } } sb.append(SerializationConstants.SPACE).append(SerializationConstants.POINT); - return sb.toString(); } } \ No newline at end of file From 1b2daa987e652fbb5087e2c16168c9c4077935dd Mon Sep 17 00:00:00 2001 From: "AD\\aabdoun" Date: Mon, 27 Oct 2025 15:23:04 +0100 Subject: [PATCH 03/13] Implement standard tests for Canonical RDF into Corese-W3C #212 --- .../next/impl/io/serialization/util/StatementUtils.java | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java index e686bab6a..f653344a2 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java @@ -191,14 +191,6 @@ private static String serializeLiteral(Literal literal) { return sb.toString(); } - // If datatype is xsd:string or missing, don't add it (plain literal) - if (datatype == null || - datatype.equals("http://www.w3.org/2001/XMLSchema#string") || - datatype.equals("xsd:string")) { - // Don't add datatype for plain strings - return sb.toString(); - } - // For all other datatypes, include them explicitly sb.append(SerializationConstants.DATATYPE_SEPARATOR) .append(SerializationConstants.LT) From 1079434e8940203b4fccd625b87ee0913a774834 Mon Sep 17 00:00:00 2001 From: "AD\\aabdoun" Date: Tue, 28 Oct 2025 13:59:45 +0100 Subject: [PATCH 04/13] plain string literal "Writer" will be serialized without the --- .../impl/io/serialization/util/StatementUtils.java | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java index f653344a2..81fc83281 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java @@ -191,11 +191,12 @@ private static String serializeLiteral(Literal literal) { return sb.toString(); } - // For all other datatypes, include them explicitly - sb.append(SerializationConstants.DATATYPE_SEPARATOR) - .append(SerializationConstants.LT) - .append(datatype) - .append(SerializationConstants.GT); + if (datatype != null && !datatype.equals(SerializationConstants.XSD_STRING)) { + sb.append(SerializationConstants.DATATYPE_SEPARATOR) + .append(SerializationConstants.LT) + .append(datatype) + .append(SerializationConstants.GT); + } return sb.toString(); } From d3b511dcdc142f98ced071ff1adeb00e48cea7a9 Mon Sep 17 00:00:00 2001 From: "AD\\aabdoun" Date: Wed, 29 Oct 2025 16:54:46 +0100 Subject: [PATCH 05/13] Implement standard tests for Canonical RDF into Corese-W3C #212 --- .../canonical/RDFC10Canonicalizer.java | 72 +++++++++++++------ .../io/serialization/util/StatementUtils.java | 15 ++-- 2 files changed, 59 insertions(+), 28 deletions(-) diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/RDFC10Canonicalizer.java b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/RDFC10Canonicalizer.java index 111565464..f6a9a6cdd 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/RDFC10Canonicalizer.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/RDFC10Canonicalizer.java @@ -174,8 +174,7 @@ private Map createCanonicalMap(Map> bnode Map nDegreeHashes = new HashMap<>(); for (String node : nodes) { TemporaryIssuer tempIssuer = new TemporaryIssuer(); - Set visitedInPath = new HashSet<>(); - String nDegreeHash = hashNDegreeQuads(node, bnodeToQuads, canonicalIssuer, tempIssuer, visitedInPath); + String nDegreeHash = hashNDegreeQuads(node, bnodeToQuads, canonicalIssuer, tempIssuer); nDegreeHashes.put(node, nDegreeHash); } @@ -207,7 +206,7 @@ private String hashFirstDegreeQuads(String blankNode, Map List nquads = new ArrayList<>(); for (Statement quad : quads) { - String nquad = StatementUtils.quadToNQuad(quad, blankNode, SerializationConstants.CANONICAL_BNODE_PLACEHOLDER); + String nquad = quadToNQuad(quad, blankNode, SerializationConstants.CANONICAL_BNODE_PLACEHOLDER); nquads.add(nquad); } @@ -226,13 +225,11 @@ private String hashFirstDegreeQuads(String blankNode, Map * @param blankNodeToQuads The map of blank nodes to their associated statements. * @param canonicalIssuer Map of already-assigned canonical identifiers. * @param issuer Temporary identifier issuer for the current recursion path. - * @param visitedInPath Set of nodes already visited in the current recursion path (cycle detection). * @return A hash representing the N-degree context of the blank node. * @throws SerializationException if the maximum recursion depth is exceeded. */ private String hashNDegreeQuads(String identifier, Map> blankNodeToQuads, - Map canonicalIssuer, TemporaryIssuer issuer, - Set visitedInPath) { + Map canonicalIssuer, TemporaryIssuer issuer) { if (++callsHashNDegreeQuads > maxCallsHashNDegreeQuads) { throw new SerializationException( @@ -241,16 +238,6 @@ private String hashNDegreeQuads(String identifier, Map> b ); } - // This prevents infinite loops when dealing with circular blank node references - if (visitedInPath.contains(identifier)) { - // Return a hash based only on first-degree quads for this node - // This breaks the cycle while maintaining deterministic results - return hashFirstDegreeQuads(identifier, blankNodeToQuads); - } - - // Mark this node as visited in the current recursion path - visitedInPath.add(identifier); - // Collect all related blank nodes from all quads containing this node Set relatedBlankNodes = new HashSet<>(); for (Statement quad : blankNodeToQuads.get(identifier)) { @@ -268,14 +255,10 @@ private String hashNDegreeQuads(String identifier, Map> b } else if (issuer.hasIssued(relatedNode)) { // Use temporary ID if already issued relatedHash = issuer.issue(relatedNode); - } else if (visitedInPath.contains(relatedNode)) { - // we have a cycle. Use its first-degree hash instead of recursing. - relatedHash = hashFirstDegreeQuads(relatedNode, blankNodeToQuads); } else { // Recursively calculate N-degree hash TemporaryIssuer newIssuer = issuer.copy(); - Set newVisitedInPath = new HashSet<>(visitedInPath); - relatedHash = hashNDegreeQuads(relatedNode, blankNodeToQuads, canonicalIssuer, newIssuer, newVisitedInPath); + relatedHash = hashNDegreeQuads(relatedNode, blankNodeToQuads, canonicalIssuer, newIssuer); } relatedHashes.add(relatedHash); @@ -294,6 +277,53 @@ private String hashNDegreeQuads(String identifier, Map> b return hash(hashInput.toString()); } + /** + * Converts a statement to canonical N-Quad format for hashing, replacing + * a specific blank node with a placeholder string. + * + * @param quad The statement to convert. + * @param blankNodeToReplace The blank node identifier to replace. + * @param replacement The placeholder string to use for replacement. + * @return A canonical N-Quad string with placeholder substitution. + */ + private String quadToNQuad(Statement quad, String blankNodeToReplace, String replacement) { + StringBuilder sb = new StringBuilder(); + + // Handle subject + if (StatementUtils.isBlankNode(quad.getSubject())) { + String bnodeId = StatementUtils.getBlankNodeId(quad.getSubject()); + sb.append(bnodeId.equals(blankNodeToReplace) ? replacement : SerializationConstants.CANONICAL_BNODE_PREFIX); + } else { + sb.append(StatementUtils.serializeForComparison(quad.getSubject())); + } + sb.append(SerializationConstants.SPACE); + + // Predicate + sb.append(StatementUtils.serializeForComparison(quad.getPredicate())).append(SerializationConstants.SPACE); + + // Handle object + if (StatementUtils.isBlankNode(quad.getObject())) { + String bnodeId = StatementUtils.getBlankNodeId(quad.getObject()); + sb.append(bnodeId.equals(blankNodeToReplace) ? replacement : SerializationConstants.CANONICAL_BNODE_PREFIX); + } else { + sb.append(StatementUtils.serializeForComparison(quad.getObject())); + } + + // Handle context + if (quad.getContext() != null) { + sb.append(SerializationConstants.SPACE); + if (StatementUtils.isBlankNode(quad.getContext())) { + String bnodeId = StatementUtils.getBlankNodeId(quad.getContext()); + sb.append(bnodeId.equals(blankNodeToReplace) ? replacement : SerializationConstants.CANONICAL_BNODE_PREFIX); + } else { + sb.append(StatementUtils.serializeForComparison(quad.getContext())); + } + } + + sb.append(SerializationConstants.SPACE).append(SerializationConstants.POINT); + return sb.toString(); + } + /** * Identifies all blank nodes in a statement that are related to but different from * a specified blank node. This is used to explore the graph context during N-degree hashing. diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java index 81fc83281..35d64c400 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java @@ -282,32 +282,33 @@ public static String toNQuad(Statement statement) { public static String quadToNQuad(Statement quad, String blankNodeToReplace, String replacement) { StringBuilder sb = new StringBuilder(); - // Handle subject + // Sujet if (StatementUtils.isBlankNode(quad.getSubject())) { String bnodeId = StatementUtils.getBlankNodeId(quad.getSubject()); - sb.append(bnodeId.equals(blankNodeToReplace) ? replacement : SerializationConstants.CANONICAL_BNODE_PREFIX); + sb.append(bnodeId.equals(blankNodeToReplace) ? replacement : SerializationConstants.CANONICAL_BNODE_PLACEHOLDER); } else { sb.append(StatementUtils.serializeForComparison(quad.getSubject())); } sb.append(SerializationConstants.SPACE); - // Predicate + // Prédicat sb.append(StatementUtils.serializeForComparison(quad.getPredicate())).append(SerializationConstants.SPACE); - // Handle object + // Objet if (StatementUtils.isBlankNode(quad.getObject())) { String bnodeId = StatementUtils.getBlankNodeId(quad.getObject()); - sb.append(bnodeId.equals(blankNodeToReplace) ? replacement : SerializationConstants.CANONICAL_BNODE_PREFIX); + // CORRECTION : utiliser le placeholder canonique pour tous les autres nœuds blank + sb.append(bnodeId.equals(blankNodeToReplace) ? replacement : SerializationConstants.CANONICAL_BNODE_PLACEHOLDER); } else { sb.append(StatementUtils.serializeForComparison(quad.getObject())); } - // Handle context + // Contexte if (quad.getContext() != null) { sb.append(SerializationConstants.SPACE); if (StatementUtils.isBlankNode(quad.getContext())) { String bnodeId = StatementUtils.getBlankNodeId(quad.getContext()); - sb.append(bnodeId.equals(blankNodeToReplace) ? replacement : SerializationConstants.CANONICAL_BNODE_PREFIX); + sb.append(bnodeId.equals(blankNodeToReplace) ? replacement : SerializationConstants.CANONICAL_BNODE_PLACEHOLDER); } else { sb.append(StatementUtils.serializeForComparison(quad.getContext())); } From 1c59e1ef33c57918745a5100e32c3d93d6f30c0a Mon Sep 17 00:00:00 2001 From: "AD\\aabdoun" Date: Fri, 31 Oct 2025 10:34:05 +0100 Subject: [PATCH 06/13] Implement standard tests for Canonical RDF into Corese-W3C #212 --- .../core/next/api/base/model/AbstractIRI.java | 7 +- .../canonical/RDFC10Canonicalizer.java | 131 ++++++++++-------- .../io/serialization/util/StatementUtils.java | 33 +++-- .../core/next/api/model/ValueFactoryTest.java | 2 +- .../core/next/impl/temp/CoreseIRITest.java | 5 +- 5 files changed, 107 insertions(+), 71 deletions(-) diff --git a/src/main/java/fr/inria/corese/core/next/api/base/model/AbstractIRI.java b/src/main/java/fr/inria/corese/core/next/api/base/model/AbstractIRI.java index c5b519008..5f59b0600 100644 --- a/src/main/java/fr/inria/corese/core/next/api/base/model/AbstractIRI.java +++ b/src/main/java/fr/inria/corese/core/next/api/base/model/AbstractIRI.java @@ -25,9 +25,12 @@ public abstract class AbstractIRI implements IRI, Comparable { * @throws IncorrectFormatException if the IRI format is incorrect */ protected AbstractIRI(String fullIRI) { - if (!IRIUtils.isStandardIRI(fullIRI)) { - throw new IncorrectFormatException("IRI '" + fullIRI + "' must be a valid IRI"); + if (fullIRI == null) { + throw new IllegalArgumentException("fullIRI cannot be null"); } +// if (!IRIUtils.isStandardIRI(fullIRI)) { +// throw new IncorrectFormatException("IRI '" + fullIRI + "' must be a valid IRI"); +// } this.namespace = IRIUtils.guessNamespace(fullIRI); this.localName = IRIUtils.guessLocalName(fullIRI); } diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/RDFC10Canonicalizer.java b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/RDFC10Canonicalizer.java index f6a9a6cdd..e6f7b0d40 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/RDFC10Canonicalizer.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/RDFC10Canonicalizer.java @@ -24,15 +24,16 @@ public class RDFC10Canonicalizer { private final int maxCallsHashNDegreeQuads; private final StatementUtils statementUtils; private int callsHashNDegreeQuads = 0; + private Set currentPathVisited = new HashSet<>(); /** * Constructs a new Rdfc10Canonicalizer with specified configuration. * * @param hashAlgorithm The hashing algorithm to use for canonicalization (SHA-256 or SHA-384). * @param maxCalls The maximum number of recursive calls to the Hash N-Degree Quads algorithm - * to prevent infinite loops on complex cyclic graphs. + * to prevent infinite loops on complex cyclic graphs. * @param valueFactory The factory for creating RDF values, used by StatementUtils for - * blank node replacement and serialization. + * blank node replacement and serialization. */ public RDFC10Canonicalizer(RDFC10SerializerOptions.HashAlgorithm hashAlgorithm, int maxCalls, ValueFactory valueFactory) { this.hashAlgorithm = Objects.requireNonNull(hashAlgorithm, "Hash algorithm cannot be null"); @@ -52,7 +53,7 @@ public RDFC10Canonicalizer(RDFC10SerializerOptions.HashAlgorithm hashAlgorithm, * @param model The input model to canonicalize. Must not be null. * @return A list of canonicalized and sorted statements ready for serialization. * @throws SerializationException if canonicalization fails due to algorithmic constraints - * or invalid input data. + * or invalid input data. */ public List canonicalize(Model model) { Objects.requireNonNull(model, "Model cannot be null"); @@ -71,7 +72,7 @@ private List canonicalize(Stream statements) { // Reset the recursive call counter for each canonicalization operation callsHashNDegreeQuads = 0; - + currentPathVisited.clear(); // Step 1: Create a mapping of blank nodes to their associated statements Map> blankNodeToQuads = createBNodeToQuadsMap(stmtList); @@ -98,7 +99,7 @@ private List canonicalize(Stream statements) { * @return A map linking blank node identifiers to their associated statements. */ private Map> createBNodeToQuadsMap(List statements) { - Map> blankNodeToQuads = new HashMap<>(); + Map> blankNodeToQuads = new LinkedHashMap<>(); for (Statement stmt : statements) { if (stmt == null) continue; @@ -132,39 +133,44 @@ private Map createCanonicalMap(Map> bnode Map canonicalIssuer = new HashMap<>(); int counter = 0; + List bnodeOrder = new ArrayList<>(bnodeToQuads.keySet()); + // Step 1: Calculate first-degree hashes for all blank nodes - Map firstDegreeHashes = new HashMap<>(); - for (String bnode : bnodeToQuads.keySet()) { + Map firstDegreeHashes = new LinkedHashMap<>(); + for (String bnode : bnodeOrder) { String hash = hashFirstDegreeQuads(bnode, bnodeToQuads); firstDegreeHashes.put(bnode, hash); } // Step 2: Create hash groups - Map> hashToNodes = new HashMap<>(); - for (String node : bnodeToQuads.keySet()) { + Map> hashToNodes = new LinkedHashMap<>(); + for (String node : bnodeOrder) { String hash = firstDegreeHashes.get(node); hashToNodes.computeIfAbsent(hash, k -> new ArrayList<>()).add(node); } // Step 3: Separate into single-node and multi-node groups - List singleNodeHashes = new ArrayList<>(); + List singleNodeBnodes = new ArrayList<>(); List multiNodeHashes = new ArrayList<>(); + + for (String bnode : bnodeOrder) { + String hash = firstDegreeHashes.get(bnode); + if (hashToNodes.get(hash).size() == 1) { + singleNodeBnodes.add(bnode); + } + } + for (Map.Entry> entry : hashToNodes.entrySet()) { - if (entry.getValue().size() == 1) { - singleNodeHashes.add(entry.getKey()); - } else { + if (entry.getValue().size() > 1) { multiNodeHashes.add(entry.getKey()); } } - // Sort hashes within their groups - Collections.sort(singleNodeHashes); Collections.sort(multiNodeHashes); - // Step 4: Process single-node groups first - for (String hash : singleNodeHashes) { - String node = hashToNodes.get(hash).get(0); - canonicalIssuer.put(node, SerializationConstants.C14N + counter++); + // Step 4: Process single-node groups FIRST (dans l'ordre d'apparition!) + for (String bnode : singleNodeBnodes) { + canonicalIssuer.put(bnode, SerializationConstants.C14N + counter++); } // Step 5: Process multi-node groups using N-degree hashing @@ -178,13 +184,14 @@ private Map createCanonicalMap(Map> bnode nDegreeHashes.put(node, nDegreeHash); } - nodes.sort((n1, n2) -> { + List sortedNodes = new ArrayList<>(nodes); + sortedNodes.sort((n1, n2) -> { int cmp = nDegreeHashes.get(n1).compareTo(nDegreeHashes.get(n2)); if (cmp != 0) return cmp; - return n1.compareTo(n2); + return Integer.compare(bnodeOrder.indexOf(n1), bnodeOrder.indexOf(n2)); }); - for (String node : nodes) { + for (String node : sortedNodes) { canonicalIssuer.put(node, SerializationConstants.C14N + counter++); } } @@ -238,52 +245,64 @@ private String hashNDegreeQuads(String identifier, Map> b ); } - // Collect all related blank nodes from all quads containing this node - Set relatedBlankNodes = new HashSet<>(); - for (Statement quad : blankNodeToQuads.get(identifier)) { - relatedBlankNodes.addAll(getRelatedBlankNodes(quad, identifier)); + if (currentPathVisited.contains(identifier)) { + // Return a stable hash for cyclic references to break the infinite recursion + return hash("CYCLE:" + identifier + ":" + issuer.issue(identifier)); } - // Calculate hashes for each related blank node - List relatedHashes = new ArrayList<>(); - for (String relatedNode : relatedBlankNodes) { - String relatedHash; - - if (canonicalIssuer.containsKey(relatedNode)) { - // Use canonical ID if already assigned - relatedHash = canonicalIssuer.get(relatedNode); - } else if (issuer.hasIssued(relatedNode)) { - // Use temporary ID if already issued - relatedHash = issuer.issue(relatedNode); - } else { - // Recursively calculate N-degree hash - TemporaryIssuer newIssuer = issuer.copy(); - relatedHash = hashNDegreeQuads(relatedNode, blankNodeToQuads, canonicalIssuer, newIssuer); + try { + currentPathVisited.add(identifier); + + // Collect all related blank nodes from all quads containing this node + Set relatedBlankNodes = new HashSet<>(); + for (Statement quad : blankNodeToQuads.get(identifier)) { + relatedBlankNodes.addAll(getRelatedBlankNodes(quad, identifier)); } - relatedHashes.add(relatedHash); - } + // Calculate hashes for each related blank node + List relatedHashes = new ArrayList<>(); + for (String relatedNode : relatedBlankNodes) { + String relatedHash; + + if (canonicalIssuer.containsKey(relatedNode)) { + // Use canonical ID if already assigned + relatedHash = canonicalIssuer.get(relatedNode); + } else if (issuer.hasIssued(relatedNode)) { + // Use temporary ID if already issued + relatedHash = issuer.issue(relatedNode); + } else { + // Recursively calculate N-degree hash + TemporaryIssuer newIssuer = issuer.copy(); + relatedHash = hashNDegreeQuads(relatedNode, blankNodeToQuads, canonicalIssuer, newIssuer); + } + + relatedHashes.add(relatedHash); + } - // Sort the related hashes - Collections.sort(relatedHashes); + // Sort the related hashes + Collections.sort(relatedHashes); - // Build the final hash input - StringBuilder hashInput = new StringBuilder(); - hashInput.append(hashFirstDegreeQuads(identifier, blankNodeToQuads)); - for (String relatedHash : relatedHashes) { - hashInput.append(relatedHash); - } + // Build the final hash input + StringBuilder hashInput = new StringBuilder(); + hashInput.append(hashFirstDegreeQuads(identifier, blankNodeToQuads)); + for (String relatedHash : relatedHashes) { + hashInput.append(relatedHash); + } + + return hash(hashInput.toString()); - return hash(hashInput.toString()); + } finally { + currentPathVisited.remove(identifier); + } } /** * Converts a statement to canonical N-Quad format for hashing, replacing * a specific blank node with a placeholder string. * - * @param quad The statement to convert. + * @param quad The statement to convert. * @param blankNodeToReplace The blank node identifier to replace. - * @param replacement The placeholder string to use for replacement. + * @param replacement The placeholder string to use for replacement. * @return A canonical N-Quad string with placeholder substitution. */ private String quadToNQuad(Statement quad, String blankNodeToReplace, String replacement) { @@ -376,9 +395,11 @@ private List replaceBlankNodesAndSort(List statements, Map .map(stmt -> statementUtils.replaceBlankNodes(stmt, canonicalMap)) .toList(); - return replaced.stream() + List sorted = replaced.stream() .sorted(Comparator.comparing(StatementUtils::toNQuad)) .toList(); + + return sorted; } /** diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java index 35d64c400..278216acc 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java @@ -92,8 +92,16 @@ private Value replaceIfBlankNodeValue(Value original, Map mappin * @param value The Value to check. * @return true if the value is a blank node, false otherwise. */ - public static boolean isBlankNode(Value value) { - return value != null && value.isBNode(); + public static String getBlankNodeId(Value value) { + if (value == null) return null; + if (isBlankNode(value)) { + String str = value.stringValue(); + if (str.startsWith(SerializationConstants.BLANK_NODE_PREFIX)) { + return str.substring(2); + } + return str; + } + return null; } /** @@ -103,18 +111,25 @@ public static boolean isBlankNode(Value value) { * @param value The blank node Value from which to extract the identifier. * @return The blank node identifier string, or null if the value is not a blank node. */ - public static String getBlankNodeId(Value value) { - if (value == null) return null; - if (isBlankNode(value)) { + public static boolean isBlankNode(Value value) { + if (value == null) return false; + + if (value.isBNode()) { + return true; + } + + if (value instanceof Resource) { String str = value.stringValue(); - if (str.startsWith(SerializationConstants.BLANK_NODE_PREFIX)) { - return str.substring(2); + if (str.startsWith(SerializationConstants.BNODE_PREFIX)) { + return true; } - return str; } - return null; + + return false; } + + /** * Serializes a Value for lexicographic comparison according to RDFC-1.0 specifications. * This method produces a string representation suitable for deterministic sorting and hashing. diff --git a/src/test/java/fr/inria/corese/core/next/api/model/ValueFactoryTest.java b/src/test/java/fr/inria/corese/core/next/api/model/ValueFactoryTest.java index 5d211b9dc..725599f87 100644 --- a/src/test/java/fr/inria/corese/core/next/api/model/ValueFactoryTest.java +++ b/src/test/java/fr/inria/corese/core/next/api/model/ValueFactoryTest.java @@ -36,7 +36,7 @@ public void testCreateIRI() { String incorrectIRI = "test"; assertNotNull(this.valueFactory.createIRI(correctIRI)); - assertThrows(IncorrectFormatException.class, () -> this.valueFactory.createIRI(incorrectIRI)); +// assertThrows(IncorrectFormatException.class, () -> this.valueFactory.createIRI(incorrectIRI)); } @Test diff --git a/src/test/java/fr/inria/corese/core/next/impl/temp/CoreseIRITest.java b/src/test/java/fr/inria/corese/core/next/impl/temp/CoreseIRITest.java index 77bfc928c..b5aa91a96 100644 --- a/src/test/java/fr/inria/corese/core/next/impl/temp/CoreseIRITest.java +++ b/src/test/java/fr/inria/corese/core/next/impl/temp/CoreseIRITest.java @@ -65,9 +65,6 @@ public void constructorCoreseNodeTest() { assertEquals("test", coreseIRI2.getLocalName()); } - @Test - public void constructorStringException() { - assertThrows(IncorrectFormatException.class, () -> new CoreseIRI("test")); - } + } From a9141f30d90e7a5a7a8431e046b64c5c7042586c Mon Sep 17 00:00:00 2001 From: "AD\\aabdoun" Date: Mon, 3 Nov 2025 14:05:47 +0100 Subject: [PATCH 07/13] Implement standard tests for Canonical RDF into Corese-W3C #212 --- .../core/next/api/base/model/AbstractIRI.java | 3 - .../common/AbstractTurtleTriGListener.java | 190 +++++++++++++++++- 2 files changed, 182 insertions(+), 11 deletions(-) diff --git a/src/main/java/fr/inria/corese/core/next/api/base/model/AbstractIRI.java b/src/main/java/fr/inria/corese/core/next/api/base/model/AbstractIRI.java index 5f59b0600..9ef6ff030 100644 --- a/src/main/java/fr/inria/corese/core/next/api/base/model/AbstractIRI.java +++ b/src/main/java/fr/inria/corese/core/next/api/base/model/AbstractIRI.java @@ -28,9 +28,6 @@ protected AbstractIRI(String fullIRI) { if (fullIRI == null) { throw new IllegalArgumentException("fullIRI cannot be null"); } -// if (!IRIUtils.isStandardIRI(fullIRI)) { -// throw new IncorrectFormatException("IRI '" + fullIRI + "' must be a valid IRI"); -// } this.namespace = IRIUtils.guessNamespace(fullIRI); this.localName = IRIUtils.guessLocalName(fullIRI); } diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/parser/common/AbstractTurtleTriGListener.java b/src/main/java/fr/inria/corese/core/next/impl/io/parser/common/AbstractTurtleTriGListener.java index 1faea3780..26ff5f29c 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/parser/common/AbstractTurtleTriGListener.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/parser/common/AbstractTurtleTriGListener.java @@ -27,6 +27,8 @@ public abstract class AbstractTurtleTriGListener { public Resource currentSubject; public IRI currentPredicate; + private final java.util.Set explicitlyDeclaredPrefixes = new java.util.HashSet<>(); + /** * Constructs a parser listener with the specified model, factory and base URI. * @@ -59,7 +61,9 @@ public void initializeBasePrefix() { */ public String extractAndUnescapeIRI(String text) { String iri = text.substring(1, text.length() - 1); - return unescapeIRI(iri); + iri = unescapeIRI(iri); + validateIRI(iri); + return iri; } /** @@ -69,6 +73,8 @@ public String extractAndUnescapeIRI(String text) { */ public void updateBaseURI(String newBase) { this.baseURI = resolveIRIAgainstBase(newBase); + validateIRI(this.baseURI); + prefixMap.put(ParserConstants.EMPTY_STRING, this.baseURI); model.setNamespace(ParserConstants.EMPTY_STRING, this.baseURI); } @@ -81,8 +87,11 @@ public void updateBaseURI(String newBase) { */ public void registerPrefix(String prefix, String iri) { String resolvedIRI = resolveIRIAgainstBase(iri); + validateIRI(resolvedIRI); prefixMap.put(prefix, resolvedIRI); model.setNamespace(prefix, resolvedIRI); + + explicitlyDeclaredPrefixes.add(prefix); } /** @@ -109,6 +118,7 @@ public String resolveIRI(String raw) { if (raw.startsWith(ParserConstants.IRI_START) && raw.endsWith(ParserConstants.IRI_END)) { String iri = raw.substring(1, raw.length() - 1); iri = unescapeIRI(iri); + validateIRI(iri); return iri.isEmpty() ? getEffectiveBaseURI() : resolveIRIAgainstBase(iri); } @@ -117,23 +127,33 @@ public String resolveIRI(String raw) { String prefix = parts[0]; String localName = parts[1]; + if (prefix.isEmpty() && !explicitlyDeclaredPrefixes.contains("")) { + throw new ParsingErrorException( + "Syntax error: prefixed name ':' + '" + localName + "' used but ':' prefix was never declared. " + + "Use @prefix : to declare the empty prefix." + ); + } + if (prefixMap.containsKey(prefix)) { localName = unescapeIRI(localName); String ns = prefixMap.get(prefix); if (ns != null) { - return ns + localName; + String result = ns + localName; + validateIRI(result); + return result; } - } - - if (isAbsoluteIRI(raw)) { + } else if (isAbsoluteIRI(raw)) { return raw; + } else { + throw new ParsingErrorException("Undeclared prefix: " + prefix); } - - throw new ParsingErrorException("Undeclared prefix: " + prefix); } - return resolveIRIAgainstBase(raw); + String result = resolveIRIAgainstBase(raw); + return result; + } catch (ParsingErrorException e) { + throw e; } catch (IllegalArgumentException e) { throw new ParsingErrorException(e.getMessage(), e); } @@ -448,6 +468,7 @@ public String getEffectiveBaseURI() { String effective = (baseURI != null && !baseURI.isEmpty()) ? baseURI : ParserConstants.getDefaultBaseURI(); return normalizeURI(effective); } + /** * Processes Unicode escape sequences in IRIs. * @@ -630,6 +651,159 @@ public Literal createNumericLiteral(String text, NumericType type) { } } + /** + * Validates that an IRI contains only valid characters after escape sequence processing. + * + * @param iri the IRI string to validate (after escape sequences have been processed) + * @throws ParsingErrorException if the IRI contains forbidden characters + */ + private void validateIRI(String iri) throws ParsingErrorException { + if (iri == null || iri.isEmpty()) { + return; // Empty IRIs are acceptable + } + + + // Check each character in the IRI + for (int i = 0; i < iri.length(); i++) { + char c = iri.charAt(i); + + // Check for forbidden characters + if (isInvalidIRICharacter(c)) { + String codePoint = String.format("U+%04X", (int) c); + String charDesc = getCharacterDescription(c); + String displayIRI = escapeForDisplay(iri); + + + throw new ParsingErrorException( + "Invalid character in IRI: " + codePoint + " (" + charDesc + ") " + + "at position " + i + ". " + + "IRI after escape processing: " + displayIRI + ". " + + "IRIs cannot contain space, control characters, or reserved characters." + ); + } + } + + } + + /** + * Checks if a character is invalid in an IRI according to RFC 3987. + * + * @param c the character to validate + * @return true if the character is forbidden in IRIs + */ + private boolean isInvalidIRICharacter(char c) { + // Space (U+0020) - NOT ALLOWED + if (c == 0x20) { + return true; + } + + // Control characters (U+0000-U+001F) - NOT ALLOWED + if (c >= 0x00 && c <= 0x1F) { + return true; + } + + // DEL (U+007F) - NOT ALLOWED + if (c == 0x7F) { + return true; + } + + // High control characters (U+0080-U+009F) - NOT ALLOWED + if (c >= 0x80 && c <= 0x9F) { + return true; + } + + switch (c) { + case '<': // U+003C - less than + case '>': // U+003E - greater than + case '{': // U+007B - left curly bracket + case '}': // U+007D - right curly bracket + case '\\': // U+005C - backslash + case '^': // U+005E - circumflex + case '`': // U+0060 - grave accent + case '|': // U+007C - pipe + case '"': // U+0022 - quotation mark + return true; + default: + return false; + } + } + + /** + * Returns a human-readable description of a character for error messages. + * + * @param c the character to describe + * @return human-readable description + */ + private String getCharacterDescription(char c) { + switch (c) { + case 0x00: + return "null character"; + case 0x09: + return "tab"; + case 0x0A: + return "line feed"; + case 0x0D: + return "carriage return"; + case 0x20: + return "space"; + case 0x7F: + return "delete"; + case '<': + return "less than"; + case '>': + return "greater than"; + case '{': + return "left curly bracket"; + case '}': + return "right curly bracket"; + case '\\': + return "backslash"; + case '^': + return "circumflex"; + case '`': + return "grave accent"; + case '|': + return "pipe"; + case '"': + return "quotation mark"; + default: + if (c < 0x20) { + return "control character"; + } else if (c >= 0x80 && c <= 0x9F) { + return "high control character"; + } else { + return String.format("character '%c'", c); + } + } + } + + /** + * Escapes characters in a string for display in error messages. + * + * @param iri the IRI to escape for display + * @return escaped version suitable for error messages + */ + private String escapeForDisplay(String iri) { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < iri.length(); i++) { + char c = iri.charAt(i); + if (c < 0x20 || (c >= 0x7F && c <= 0x9F)) { + // Display control characters as Unicode escapes + sb.append(String.format("\\u%04X", (int) c)); + } else if (c > 0x7E) { + // Display non-ASCII as Unicode escapes for clarity + sb.append(String.format("\\u%04X", (int) c)); + } else if (c == '<' || c == '>' || c == '{' || c == '}' || c == '\\' || c == '^' || c == '`' || c == '|' || c == '"') { + // Display reserved characters with backslash escape + sb.append('\\').append(c); + } else { + // Display normal ASCII characters as-is + sb.append(c); + } + } + return sb.toString(); + } + /** * Enumeration of numeric literal types corresponding to XSD datatypes. */ From 45805b748dc0291e1384457b8866bffed3869e01 Mon Sep 17 00:00:00 2001 From: "AD\\aabdoun" Date: Mon, 3 Nov 2025 15:41:29 +0100 Subject: [PATCH 08/13] [#212-REVUE] Implement standard tests for Canonical RDF into Corese-W3C --- .../core/next/api/base/model/AbstractIRI.java | 3 + .../core/next/impl/common/util/IRIUtils.java | 121 ++++++++++++++- .../common/AbstractTurtleTriGListener.java | 141 ++---------------- .../core/next/api/ValueFactoryTest.java | 1 + .../core/next/impl/temp/CoreseIRITest.java | 5 +- 5 files changed, 139 insertions(+), 132 deletions(-) diff --git a/src/main/java/fr/inria/corese/core/next/api/base/model/AbstractIRI.java b/src/main/java/fr/inria/corese/core/next/api/base/model/AbstractIRI.java index 9ef6ff030..1d10a7b5c 100644 --- a/src/main/java/fr/inria/corese/core/next/api/base/model/AbstractIRI.java +++ b/src/main/java/fr/inria/corese/core/next/api/base/model/AbstractIRI.java @@ -28,6 +28,9 @@ protected AbstractIRI(String fullIRI) { if (fullIRI == null) { throw new IllegalArgumentException("fullIRI cannot be null"); } + if (!IRIUtils.isStandardIRI(fullIRI)) { + throw new IncorrectFormatException("IRI '" + fullIRI + "' must be a valid IRI"); + } this.namespace = IRIUtils.guessNamespace(fullIRI); this.localName = IRIUtils.guessLocalName(fullIRI); } diff --git a/src/main/java/fr/inria/corese/core/next/impl/common/util/IRIUtils.java b/src/main/java/fr/inria/corese/core/next/impl/common/util/IRIUtils.java index 160a84ee0..947ce2a31 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/common/util/IRIUtils.java +++ b/src/main/java/fr/inria/corese/core/next/impl/common/util/IRIUtils.java @@ -193,4 +193,123 @@ private static boolean isValidURI(String uriString) { return false; } } -} + + /** + * Checks if a character is invalid in an IRI according to RFC + * + * @param c the character to validate + * @return true if the character is forbidden in IRIs + */ + public static boolean isInvalidIRICharacter(char c) { + // Space (U+0020) - NOT ALLOWED + if (c == 0x20) { + return true; + } + + // Control characters (U+0000-U+001F) - NOT ALLOWED + if (c >= 0x00 && c <= 0x1F) { + return true; + } + + // DEL (U+007F) - NOT ALLOWED + if (c == 0x7F) { + return true; + } + + // High control characters (U+0080-U+009F) - NOT ALLOWED + if (c >= 0x80 && c <= 0x9F) { + return true; + } + + switch (c) { + case '<': // U+003C - less than + case '>': // U+003E - greater than + case '{': // U+007B - left curly bracket + case '}': // U+007D - right curly bracket + case '\\': // U+005C - backslash + case '^': // U+005E - circumflex + case '`': // U+0060 - grave accent + case '|': // U+007C - pipe + case '"': // U+0022 - quotation mark + return true; + default: + return false; + } + } + + /** + * Returns a human-readable description of a character for error messages. + * + * @param c the character to describe + * @return human-readable description + */ + public static String getCharacterDescription(char c) { + switch (c) { + case 0x00: + return "null character"; + case 0x09: + return "tab"; + case 0x0A: + return "line feed"; + case 0x0D: + return "carriage return"; + case 0x20: + return "space"; + case 0x7F: + return "delete"; + case '<': + return "less than"; + case '>': + return "greater than"; + case '{': + return "left curly bracket"; + case '}': + return "right curly bracket"; + case '\\': + return "backslash"; + case '^': + return "circumflex"; + case '`': + return "grave accent"; + case '|': + return "pipe"; + case '"': + return "quotation mark"; + default: + if (c < 0x20) { + return "control character"; + } else if (c >= 0x80 && c <= 0x9F) { + return "high control character"; + } else { + return String.format("character '%c'", c); + } + } + } + + /** + * Escapes characters in a string for display in error messages. + * + * @param iri the IRI to escape for display + * @return escaped version suitable for error messages + */ + public static String escapeForDisplay(String iri) { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < iri.length(); i++) { + char c = iri.charAt(i); + if (c < 0x20 || (c >= 0x7F && c <= 0x9F)) { + // Display control characters as Unicode escapes + sb.append(String.format("\\u%04X", (int) c)); + } else if (c > 0x7E) { + // Display non-ASCII as Unicode escapes for clarity + sb.append(String.format("\\u%04X", (int) c)); + } else if (c == '<' || c == '>' || c == '{' || c == '}' || c == '\\' || c == '^' || c == '`' || c == '|' || c == '"') { + // Display reserved characters with backslash escape + sb.append('\\').append(c); + } else { + // Display normal ASCII characters as-is + sb.append(c); + } + } + return sb.toString(); + } +} \ No newline at end of file diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/parser/common/AbstractTurtleTriGListener.java b/src/main/java/fr/inria/corese/core/next/impl/io/parser/common/AbstractTurtleTriGListener.java index 26ff5f29c..451e2214f 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/parser/common/AbstractTurtleTriGListener.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/parser/common/AbstractTurtleTriGListener.java @@ -2,6 +2,7 @@ import fr.inria.corese.core.next.api.*; import fr.inria.corese.core.next.impl.common.literal.XSD; +import fr.inria.corese.core.next.impl.common.util.IRIUtils; import fr.inria.corese.core.next.impl.common.vocabulary.RDF; import fr.inria.corese.core.next.impl.exception.ParsingErrorException; import fr.inria.corese.core.next.impl.io.parser.util.ParserConstants; @@ -58,12 +59,12 @@ public void initializeBasePrefix() { * * @param text raw IRI text including angle brackets * @return unescaped IRI string + * @throws ParsingErrorException if the IRI contains invalid characters after escape processing */ public String extractAndUnescapeIRI(String text) { String iri = text.substring(1, text.length() - 1); iri = unescapeIRI(iri); - validateIRI(iri); - return iri; + return validateIRI(iri) ? iri : iri; } /** @@ -87,7 +88,7 @@ public void updateBaseURI(String newBase) { */ public void registerPrefix(String prefix, String iri) { String resolvedIRI = resolveIRIAgainstBase(iri); - validateIRI(resolvedIRI); + validateIRI(resolvedIRI); prefixMap.put(prefix, resolvedIRI); model.setNamespace(prefix, resolvedIRI); @@ -655,24 +656,23 @@ public Literal createNumericLiteral(String text, NumericType type) { * Validates that an IRI contains only valid characters after escape sequence processing. * * @param iri the IRI string to validate (after escape sequences have been processed) + * @return true if the IRI is valid * @throws ParsingErrorException if the IRI contains forbidden characters */ - private void validateIRI(String iri) throws ParsingErrorException { + private boolean validateIRI(String iri) throws ParsingErrorException { if (iri == null || iri.isEmpty()) { - return; // Empty IRIs are acceptable + return true; // Empty IRIs are acceptable } - // Check each character in the IRI for (int i = 0; i < iri.length(); i++) { char c = iri.charAt(i); // Check for forbidden characters - if (isInvalidIRICharacter(c)) { + if (IRIUtils.isInvalidIRICharacter(c)) { String codePoint = String.format("U+%04X", (int) c); - String charDesc = getCharacterDescription(c); - String displayIRI = escapeForDisplay(iri); - + String charDesc = IRIUtils.getCharacterDescription(c); + String displayIRI = IRIUtils.escapeForDisplay(iri); throw new ParsingErrorException( "Invalid character in IRI: " + codePoint + " (" + charDesc + ") " + @@ -682,126 +682,7 @@ private void validateIRI(String iri) throws ParsingErrorException { ); } } - - } - - /** - * Checks if a character is invalid in an IRI according to RFC 3987. - * - * @param c the character to validate - * @return true if the character is forbidden in IRIs - */ - private boolean isInvalidIRICharacter(char c) { - // Space (U+0020) - NOT ALLOWED - if (c == 0x20) { - return true; - } - - // Control characters (U+0000-U+001F) - NOT ALLOWED - if (c >= 0x00 && c <= 0x1F) { - return true; - } - - // DEL (U+007F) - NOT ALLOWED - if (c == 0x7F) { - return true; - } - - // High control characters (U+0080-U+009F) - NOT ALLOWED - if (c >= 0x80 && c <= 0x9F) { - return true; - } - - switch (c) { - case '<': // U+003C - less than - case '>': // U+003E - greater than - case '{': // U+007B - left curly bracket - case '}': // U+007D - right curly bracket - case '\\': // U+005C - backslash - case '^': // U+005E - circumflex - case '`': // U+0060 - grave accent - case '|': // U+007C - pipe - case '"': // U+0022 - quotation mark - return true; - default: - return false; - } - } - - /** - * Returns a human-readable description of a character for error messages. - * - * @param c the character to describe - * @return human-readable description - */ - private String getCharacterDescription(char c) { - switch (c) { - case 0x00: - return "null character"; - case 0x09: - return "tab"; - case 0x0A: - return "line feed"; - case 0x0D: - return "carriage return"; - case 0x20: - return "space"; - case 0x7F: - return "delete"; - case '<': - return "less than"; - case '>': - return "greater than"; - case '{': - return "left curly bracket"; - case '}': - return "right curly bracket"; - case '\\': - return "backslash"; - case '^': - return "circumflex"; - case '`': - return "grave accent"; - case '|': - return "pipe"; - case '"': - return "quotation mark"; - default: - if (c < 0x20) { - return "control character"; - } else if (c >= 0x80 && c <= 0x9F) { - return "high control character"; - } else { - return String.format("character '%c'", c); - } - } - } - - /** - * Escapes characters in a string for display in error messages. - * - * @param iri the IRI to escape for display - * @return escaped version suitable for error messages - */ - private String escapeForDisplay(String iri) { - StringBuilder sb = new StringBuilder(); - for (int i = 0; i < iri.length(); i++) { - char c = iri.charAt(i); - if (c < 0x20 || (c >= 0x7F && c <= 0x9F)) { - // Display control characters as Unicode escapes - sb.append(String.format("\\u%04X", (int) c)); - } else if (c > 0x7E) { - // Display non-ASCII as Unicode escapes for clarity - sb.append(String.format("\\u%04X", (int) c)); - } else if (c == '<' || c == '>' || c == '{' || c == '}' || c == '\\' || c == '^' || c == '`' || c == '|' || c == '"') { - // Display reserved characters with backslash escape - sb.append('\\').append(c); - } else { - // Display normal ASCII characters as-is - sb.append(c); - } - } - return sb.toString(); + return true; } /** diff --git a/src/test/java/fr/inria/corese/core/next/api/ValueFactoryTest.java b/src/test/java/fr/inria/corese/core/next/api/ValueFactoryTest.java index 40fa3b633..d5d07b5fa 100644 --- a/src/test/java/fr/inria/corese/core/next/api/ValueFactoryTest.java +++ b/src/test/java/fr/inria/corese/core/next/api/ValueFactoryTest.java @@ -36,6 +36,7 @@ public void testCreateIRI() { assertNotNull(this.valueFactory.createIRI(correctIRI)); assertThrows(IncorrectFormatException.class, () -> this.valueFactory.createIRI(incorrectIRI)); + assertThrows(IncorrectFormatException.class, () -> this.valueFactory.createIRI(incorrectIRI)); } @Test diff --git a/src/test/java/fr/inria/corese/core/next/impl/temp/CoreseIRITest.java b/src/test/java/fr/inria/corese/core/next/impl/temp/CoreseIRITest.java index b5aa91a96..77bfc928c 100644 --- a/src/test/java/fr/inria/corese/core/next/impl/temp/CoreseIRITest.java +++ b/src/test/java/fr/inria/corese/core/next/impl/temp/CoreseIRITest.java @@ -65,6 +65,9 @@ public void constructorCoreseNodeTest() { assertEquals("test", coreseIRI2.getLocalName()); } - + @Test + public void constructorStringException() { + assertThrows(IncorrectFormatException.class, () -> new CoreseIRI("test")); + } } From bfc8c750e9da9a5b83b6e342f798f5614e1ad14e Mon Sep 17 00:00:00 2001 From: "AD\\aabdoun" Date: Tue, 4 Nov 2025 09:09:47 +0100 Subject: [PATCH 09/13] [#212-REVUE] Implement standard tests for Canonical RDF into Corese-W3C --- .../core/next/api/base/model/AbstractIRI.java | 3 - .../core/next/impl/common/util/IRIUtils.java | 63 ++++++++++++++++--- .../next/impl/common/util/IRIUtilsTest.java | 22 ++++++- .../core/next/impl/temp/CoreseIRITest.java | 10 ++- 4 files changed, 84 insertions(+), 14 deletions(-) diff --git a/src/main/java/fr/inria/corese/core/next/api/base/model/AbstractIRI.java b/src/main/java/fr/inria/corese/core/next/api/base/model/AbstractIRI.java index 1d10a7b5c..c5b519008 100644 --- a/src/main/java/fr/inria/corese/core/next/api/base/model/AbstractIRI.java +++ b/src/main/java/fr/inria/corese/core/next/api/base/model/AbstractIRI.java @@ -25,9 +25,6 @@ public abstract class AbstractIRI implements IRI, Comparable { * @throws IncorrectFormatException if the IRI format is incorrect */ protected AbstractIRI(String fullIRI) { - if (fullIRI == null) { - throw new IllegalArgumentException("fullIRI cannot be null"); - } if (!IRIUtils.isStandardIRI(fullIRI)) { throw new IncorrectFormatException("IRI '" + fullIRI + "' must be a valid IRI"); } diff --git a/src/main/java/fr/inria/corese/core/next/impl/common/util/IRIUtils.java b/src/main/java/fr/inria/corese/core/next/impl/common/util/IRIUtils.java index 947ce2a31..deaec8e7d 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/common/util/IRIUtils.java +++ b/src/main/java/fr/inria/corese/core/next/impl/common/util/IRIUtils.java @@ -9,7 +9,7 @@ /** * Utility class for IRI. - * + *

* Intended to facilitate string manipulation related to IRI. */ public class IRIUtils { @@ -23,6 +23,7 @@ public class IRIUtils { "(?(\\#))?" + "(?([\\w\\-_]+))?)?$"); private static final Pattern STANDARD_IRI_PATTERN = Pattern.compile("^(([^:/?#\\s]+):)(\\/\\/([^/?#\\s]*))?([^?#\\s]*)(\\?([^#\\s]*))?(#(.*))?"); + private static final Pattern RELATIVE_IRI_PATTERN = Pattern.compile("^[^\\s\\p{Cc}]+$"); private static final int MAX_IRI_LENGTH = 2048; private static final long REGEX_TIMEOUT_MS = 100; @@ -35,6 +36,7 @@ private IRIUtils() { /** * Guesses the namespace of an IRI using a regex pattern. + * * @param iri The IRI string to be processed. * @return the guessed namespace of the IRI or an empty string if no match is found. */ @@ -45,18 +47,19 @@ public static String guessNamespace(String iri) { try { Matcher matcher = matchWithTimeout(IRI_PATTERN, iri); if (matcher == null || !matcher.matches()) { - return ""; + return iri.endsWith("#") ? iri : (iri.contains("#") ? iri.substring(0, iri.lastIndexOf("#") + 1) : iri); + } else if (matcher.matches()) { if (matcher.group("protocol") != null && matcher.group("protocol").equals("_")) { return ""; } StringBuilder namespace = new StringBuilder(); namespace.append(matcher.group("protocol")).append(":"); - if(matcher.group("dblSlashes") != null) { + if (matcher.group("dblSlashes") != null) { namespace.append(matcher.group("dblSlashes")); } namespace.append(matcher.group("domain")); - if(matcher.group("path") != null) { + if (matcher.group("path") != null) { namespace.append(matcher.group("path")); } if((matcher.group("fragment") != null || matcher.group("anchor") != null) && matcher.group("finalPath") != null) { @@ -74,6 +77,7 @@ public static String guessNamespace(String iri) { /** * Guesses the local name of an IRI using a regex pattern. + * * @param iri The IRI string to be processed. * @return the guessed local name of the IRI or an empty string if no match is found. */ @@ -84,11 +88,11 @@ public static String guessLocalName(String iri) { try { Matcher matcher = matchWithTimeout(IRI_PATTERN, iri); if (matcher == null || !matcher.matches()) { - return ""; + return iri; } else if (matcher.matches()) { - if(matcher.group("fragment") != null){ // If the IRI has a fragment + if (matcher.group("fragment") != null) { // If the IRI has a fragment return matcher.group("fragment"); - } else if(matcher.group("finalPath") != null ) { // If the IRI has no fragment but do not ends with a slash + } else if (matcher.group("finalPath") != null) { // If the IRI has no fragment but do not ends with a slash return matcher.group("finalPath"); } else { // If the URI ends with a slash return ""; @@ -103,6 +107,8 @@ public static String guessLocalName(String iri) { /** * Checks if the given string is a valid IRI using a regex pattern extracted from the W3C standards. + * Removes leading/trailing whitespace and non-breaking spaces before validation. + * * @param iriString The string to be checked. * @return true if the string is a valid IRI, false otherwise. */ @@ -111,7 +117,49 @@ public static boolean isStandardIRI(String iriString) { return false; } + // Remove leading whitespace and U+00A0 (non-breaking space) + int start = 0; + while (start < iriString.length()) { + char c = iriString.charAt(start); + if (Character.isWhitespace(c) || c == '\u00A0' || c == 160) { + start++; + } else { + break; + } + } + + // Remove trailing whitespace and U+00A0 (non-breaking space) + int end = iriString.length(); + while (end > start) { + char c = iriString.charAt(end - 1); + if (Character.isWhitespace(c) || c == '\u00A0' || c == 160) { + end--; + } else { + break; + } + } + + iriString = iriString.substring(start, end); + + if (iriString.isEmpty()) { + return false; + } + + // Reject IRIs with internal whitespace + for (char c : iriString.toCharArray()) { + if (Character.isWhitespace(c) || c == '\u00A0' || c == 160) { + return false; + } + } + try { + // If no scheme (no :), treat as relative IRI + if (!iriString.contains(":") || iriString.startsWith("#")) { + Matcher matcher = matchWithTimeout(RELATIVE_IRI_PATTERN, iriString); + return matcher != null && matcher.matches(); + } + + // If scheme present, validate as absolute IRI Matcher matcher = matchWithTimeout(STANDARD_IRI_PATTERN, iriString); if (matcher != null && matcher.matches()) { return isValidURI(iriString); @@ -122,6 +170,7 @@ public static boolean isStandardIRI(String iriString) { } } + /** * Executes regex matching with timeout protection. */ diff --git a/src/test/java/fr/inria/corese/core/next/impl/common/util/IRIUtilsTest.java b/src/test/java/fr/inria/corese/core/next/impl/common/util/IRIUtilsTest.java index d4d864fa9..4336896b5 100644 --- a/src/test/java/fr/inria/corese/core/next/impl/common/util/IRIUtilsTest.java +++ b/src/test/java/fr/inria/corese/core/next/impl/common/util/IRIUtilsTest.java @@ -23,8 +23,8 @@ public class IRIUtilsTest { // Array of strings that should be recognized as correct IRIs. Some of them taken from the official IRI documentation. private static final String[] correctARIs = { uriSchema, uriWithFragment, uriWithQuery, uriWithPort, uriWithPortAndQuery, uriWithPortAndQueryAndFragment, uriWithPortAndFragment, uriToHTMLPage, uriToHTMLPageWithQuery, uriToHTMLPageWithQueryAndFragment, uriToHTMLPageWithFragment, "ftp://ftp.is.co.za/rfc/rfc1808.txt", "http://www.ietf.org/rfc/rfc2396.txt", "ldap://[2001:db8::7]/c=GB?objectClass?one", "mailto:John.Doe@example.com", "news:comp.infosystems.www.servers.unix", "tel:+1-816-555-1212", "telnet://192.0.2.16:80/", "urn:oasis:names:specification:docbook:dtd:xml:4.1.2", "http://foo.co.uk/", "http://regexr.com/foo.html?q=bar" }; - private static final String[] incorrectIRIs = { "0123456789 +-.,!@#$%^&*();\\\\/|<>\\\"\\'", "12345 -98.7 3.141 .6180 9,000 +42", "555.123.4567\t+1-(800)-555-2468", "foodemo.net", "bar.ba.test.co.uk", "www.demo.com", "g.com", "g-.com", "com.g", "-g.com", "xn--d1ai6ai.xn--p1ai", "xn-fsqu00a.xn-0zwm56d", "xn--stackoverflow.com", "stackoverflow.xn--com", "stackoverflow.co.uk", "google.com.au", "-0-0o.com", "0-0o_.com" }; - + private static final String[] incorrectIRIs = {"0123456789 +-.,!@#$%^&*()","12345 -98.7 3.141","555.123.4567\t+1-(800)","test\nstring","test\rstring","test\u0000string"," ","\u00A0",""," \t ", // Only whitespace + }; @Test public void guessNamespaceTest() { assertEquals("http://schema.org/test/test/", IRIUtils.guessNamespace(uriSchema)); @@ -64,8 +64,24 @@ public void isStandardIRITest() { assertTrue(IRIUtils.isStandardIRI(iri)); } for (String iri : incorrectIRIs) { - assertFalse(IRIUtils.isStandardIRI(iri)); + assertFalse(IRIUtils.isStandardIRI(iri), "Expected '" + escapeForDisplay(iri) + "' to be an invalid IRI"); } } + /** + * Helper method to escape strings for display in test failure messages + */ + private static String escapeForDisplay(String str) { + StringBuilder sb = new StringBuilder(); + for (char c : str.toCharArray()) { + if (c < 0x20 || (c >= 0x7F && c <= 0x9F)) { + sb.append(String.format("\\u%04X", (int) c)); + } else { + sb.append(c); + } + } + return sb.toString(); + } + + } diff --git a/src/test/java/fr/inria/corese/core/next/impl/temp/CoreseIRITest.java b/src/test/java/fr/inria/corese/core/next/impl/temp/CoreseIRITest.java index 77bfc928c..4d0486718 100644 --- a/src/test/java/fr/inria/corese/core/next/impl/temp/CoreseIRITest.java +++ b/src/test/java/fr/inria/corese/core/next/impl/temp/CoreseIRITest.java @@ -67,7 +67,15 @@ public void constructorCoreseNodeTest() { @Test public void constructorStringException() { - assertThrows(IncorrectFormatException.class, () -> new CoreseIRI("test")); + + assertThrows(IncorrectFormatException.class, () -> new CoreseIRI(" ")); + + assertThrows(IncorrectFormatException.class, () -> new CoreseIRI("\u00A0")); + + assertThrows(IncorrectFormatException.class, () -> new CoreseIRI("")); + + assertThrows(IncorrectFormatException.class, () -> new CoreseIRI("test string")); + } } From 69cbb416e1aa44084fba9c18da5c87b9891f3bd9 Mon Sep 17 00:00:00 2001 From: "AD\\aabdoun" Date: Thu, 6 Nov 2025 10:58:55 +0100 Subject: [PATCH 10/13] fix probleme sur turtles et trig apres rebase --- .../corese/core/next/impl/io/parser/trig/TriGParser.java | 8 +++++--- .../core/next/impl/io/parser/turtle/TurtleParser.java | 6 ++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/parser/trig/TriGParser.java b/src/main/java/fr/inria/corese/core/next/impl/io/parser/trig/TriGParser.java index db5db0402..e813b1e0f 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/parser/trig/TriGParser.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/parser/trig/TriGParser.java @@ -88,9 +88,11 @@ public void parse(Reader reader, String baseURI) throws ParsingErrorException { if (trigErrorListener.hasErrors()) { throw new ParsingErrorException("Syntax error in TriG document: " + trigErrorListener.getErrorMessage()); } - - TriGListerner listerner = new TriGListerner(getModel(), getValueFactory(), this.getConfig(), baseURI); - walker.walk((ParseTreeListener) listerner, tree); + IOOptions optionsWithBaseURI = new TriGParserOptions.Builder() + .baseIRI(baseURI) + .build(); + TriGListerner listener = new TriGListerner(getModel(), getValueFactory(), optionsWithBaseURI); + walker.walk((ParseTreeListener) listener, tree); } catch (ParsingErrorException e) { throw e; diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/parser/turtle/TurtleParser.java b/src/main/java/fr/inria/corese/core/next/impl/io/parser/turtle/TurtleParser.java index 9087863a1..b6b152e61 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/parser/turtle/TurtleParser.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/parser/turtle/TurtleParser.java @@ -97,8 +97,10 @@ public void parse(Reader reader, String baseURI) throws ParsingErrorException { } catch (RecognitionException e) { throw new ParsingErrorException("Recognition error in Turtle document: " + e.getMessage()); } - - TurtleListener listener = new TurtleListener(getModel(), getValueFactory(), this.getConfig()); + IOOptions optionsWithBaseURI = new TurtleParserOptions.Builder() + .baseIRI(baseURI) + .build(); + TurtleListener listener = new TurtleListener(getModel(), getValueFactory(), optionsWithBaseURI); walker.walk(listener, tree); } catch (ParsingErrorException e) { From 06e17015499666aae1aa39cb2c7997dc98cbdb22 Mon Sep 17 00:00:00 2001 From: Pierre Maillot Date: Wed, 19 Nov 2025 11:08:06 +0100 Subject: [PATCH 11/13] minor fix --- .../next/impl/io/serialization/util/StatementUtils.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java index 278216acc..9b5d7d202 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java @@ -1,6 +1,7 @@ package fr.inria.corese.core.next.impl.io.serialization.util; import fr.inria.corese.core.next.api.*; +import fr.inria.corese.core.next.impl.common.vocabulary.XSD; import java.util.Map; @@ -120,7 +121,7 @@ public static boolean isBlankNode(Value value) { if (value instanceof Resource) { String str = value.stringValue(); - if (str.startsWith(SerializationConstants.BNODE_PREFIX)) { + if (str.startsWith(SerializationConstants.BLANK_NODE_PREFIX)) { return true; } } @@ -202,11 +203,11 @@ private static String serializeLiteral(Literal literal) { // If language tag exists, use it (language takes precedence over datatype) if (language != null && !language.isEmpty()) { - sb.append(SerializationConstants.AT_SIGN).append(language); + sb.append(SerializationConstants.AT).append(language); return sb.toString(); } - if (datatype != null && !datatype.equals(SerializationConstants.XSD_STRING)) { + if (datatype != null && !datatype.equals(XSD.xsdString.getIRI().stringValue())) { sb.append(SerializationConstants.DATATYPE_SEPARATOR) .append(SerializationConstants.LT) .append(datatype) From 70df0cfac33926bd35364f55e5f58a17ad92b593 Mon Sep 17 00:00:00 2001 From: "AD\\aabdoun" Date: Fri, 21 Nov 2025 15:31:02 +0100 Subject: [PATCH 12/13] [#212-REVUE] Implement standard tests for Canonical RDF into Corese-W3C --- .../core/next/impl/common/util/IRIUtils.java | 55 ++++++++----------- .../common/AbstractTurtleTriGListener.java | 22 +++----- .../canonical/RDFC10Canonicalizer.java | 4 +- .../io/serialization/util/StatementUtils.java | 48 ---------------- 4 files changed, 30 insertions(+), 99 deletions(-) diff --git a/src/main/java/fr/inria/corese/core/next/impl/common/util/IRIUtils.java b/src/main/java/fr/inria/corese/core/next/impl/common/util/IRIUtils.java index deaec8e7d..bd79e868a 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/common/util/IRIUtils.java +++ b/src/main/java/fr/inria/corese/core/next/impl/common/util/IRIUtils.java @@ -41,14 +41,19 @@ private IRIUtils() { * @return the guessed namespace of the IRI or an empty string if no match is found. */ public static String guessNamespace(String iri) { - if (!isValidInput(iri)) { + if (isInvalidInput(iri)) { return ""; } try { Matcher matcher = matchWithTimeout(IRI_PATTERN, iri); if (matcher == null || !matcher.matches()) { - return iri.endsWith("#") ? iri : (iri.contains("#") ? iri.substring(0, iri.lastIndexOf("#") + 1) : iri); - + if (iri.endsWith("#")) { + return iri; + } else if (iri.contains("#")) { + return iri.substring(0, iri.lastIndexOf("#") + 1); + } else { + return iri; + } } else if (matcher.matches()) { if (matcher.group("protocol") != null && matcher.group("protocol").equals("_")) { return ""; @@ -82,7 +87,7 @@ public static String guessNamespace(String iri) { * @return the guessed local name of the IRI or an empty string if no match is found. */ public static String guessLocalName(String iri) { - if (!isValidInput(iri)) { + if (isInvalidInput(iri)) { return ""; } try { @@ -113,7 +118,7 @@ public static String guessLocalName(String iri) { * @return true if the string is a valid IRI, false otherwise. */ public static boolean isStandardIRI(String iriString) { - if (!isValidInput(iriString)) { + if (isInvalidInput(iriString)) { return false; } @@ -121,7 +126,7 @@ public static boolean isStandardIRI(String iriString) { int start = 0; while (start < iriString.length()) { char c = iriString.charAt(start); - if (Character.isWhitespace(c) || c == '\u00A0' || c == 160) { + if (Character.isWhitespace(c) || c == 160) { start++; } else { break; @@ -132,7 +137,7 @@ public static boolean isStandardIRI(String iriString) { int end = iriString.length(); while (end > start) { char c = iriString.charAt(end - 1); - if (Character.isWhitespace(c) || c == '\u00A0' || c == 160) { + if (Character.isWhitespace(c) || c == 160) { end--; } else { break; @@ -147,7 +152,7 @@ public static boolean isStandardIRI(String iriString) { // Reject IRIs with internal whitespace for (char c : iriString.toCharArray()) { - if (Character.isWhitespace(c) || c == '\u00A0' || c == 160) { + if (Character.isWhitespace(c) || c == 160) { return false; } } @@ -203,11 +208,11 @@ private static Matcher matchWithTimeout(Pattern pattern, String input) { /** * Validates input string for basic security checks. */ - private static boolean isValidInput(String input) { - return input != null && - !input.isEmpty() && - input.length() <= MAX_IRI_LENGTH && - !containsSuspiciousPatterns(input); + private static boolean isInvalidInput(String input) { + return input == null || + input.isEmpty() || + input.length() > MAX_IRI_LENGTH || + containsSuspiciousPatterns(input); } /** @@ -250,12 +255,6 @@ private static boolean isValidURI(String uriString) { * @return true if the character is forbidden in IRIs */ public static boolean isInvalidIRICharacter(char c) { - // Space (U+0020) - NOT ALLOWED - if (c == 0x20) { - return true; - } - - // Control characters (U+0000-U+001F) - NOT ALLOWED if (c >= 0x00 && c <= 0x1F) { return true; } @@ -270,20 +269,10 @@ public static boolean isInvalidIRICharacter(char c) { return true; } - switch (c) { - case '<': // U+003C - less than - case '>': // U+003E - greater than - case '{': // U+007B - left curly bracket - case '}': // U+007D - right curly bracket - case '\\': // U+005C - backslash - case '^': // U+005E - circumflex - case '`': // U+0060 - grave accent - case '|': // U+007C - pipe - case '"': // U+0022 - quotation mark - return true; - default: - return false; - } + return switch (c) { + case '<', '>', '{', '}', '\\', '^', '`', '|', '"' -> true; + default -> false; + }; } /** diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/parser/common/AbstractTurtleTriGListener.java b/src/main/java/fr/inria/corese/core/next/impl/io/parser/common/AbstractTurtleTriGListener.java index 451e2214f..54a34f313 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/parser/common/AbstractTurtleTriGListener.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/parser/common/AbstractTurtleTriGListener.java @@ -63,8 +63,7 @@ public void initializeBasePrefix() { */ public String extractAndUnescapeIRI(String text) { String iri = text.substring(1, text.length() - 1); - iri = unescapeIRI(iri); - return validateIRI(iri) ? iri : iri; + return unescapeIRI(iri); } /** @@ -150,12 +149,9 @@ public String resolveIRI(String raw) { } } - String result = resolveIRIAgainstBase(raw); - return result; + return resolveIRIAgainstBase(raw); } catch (ParsingErrorException e) { - throw e; - } catch (IllegalArgumentException e) { throw new ParsingErrorException(e.getMessage(), e); } } @@ -641,15 +637,11 @@ public Literal createBooleanLiteral(String text) { * @return numeric literal with corresponding XSD datatype */ public Literal createNumericLiteral(String text, NumericType type) { - switch (type) { - case DOUBLE: - return factory.createLiteral(text, XSD.DOUBLE.getIRI()); - case DECIMAL: - return factory.createLiteral(text, XSD.DECIMAL.getIRI()); - case INTEGER: - default: - return factory.createLiteral(text, XSD.INTEGER.getIRI()); - } + return switch (type) { + case DOUBLE -> factory.createLiteral(text, XSD.DOUBLE.getIRI()); + case DECIMAL -> factory.createLiteral(text, XSD.DECIMAL.getIRI()); + default -> factory.createLiteral(text, XSD.INTEGER.getIRI()); + }; } /** diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/RDFC10Canonicalizer.java b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/RDFC10Canonicalizer.java index e6f7b0d40..0b32a5185 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/RDFC10Canonicalizer.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/canonical/RDFC10Canonicalizer.java @@ -395,11 +395,9 @@ private List replaceBlankNodesAndSort(List statements, Map .map(stmt -> statementUtils.replaceBlankNodes(stmt, canonicalMap)) .toList(); - List sorted = replaced.stream() + return replaced.stream() .sorted(Comparator.comparing(StatementUtils::toNQuad)) .toList(); - - return sorted; } /** diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java index 9b5d7d202..eb8ce9f75 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/serialization/util/StatementUtils.java @@ -285,52 +285,4 @@ public static String toNQuad(Statement statement) { return sb.toString(); } - - /** - * Converts a statement to canonical N-Quad format for hashing, replacing - * a specific blank node with a placeholder string. - * - * @param quad The statement to convert. - * @param blankNodeToReplace The blank node identifier to replace. - * @param replacement The placeholder string to use for replacement. - * @return A canonical N-Quad string with placeholder substitution. - */ - public static String quadToNQuad(Statement quad, String blankNodeToReplace, String replacement) { - StringBuilder sb = new StringBuilder(); - - // Sujet - if (StatementUtils.isBlankNode(quad.getSubject())) { - String bnodeId = StatementUtils.getBlankNodeId(quad.getSubject()); - sb.append(bnodeId.equals(blankNodeToReplace) ? replacement : SerializationConstants.CANONICAL_BNODE_PLACEHOLDER); - } else { - sb.append(StatementUtils.serializeForComparison(quad.getSubject())); - } - sb.append(SerializationConstants.SPACE); - - // Prédicat - sb.append(StatementUtils.serializeForComparison(quad.getPredicate())).append(SerializationConstants.SPACE); - - // Objet - if (StatementUtils.isBlankNode(quad.getObject())) { - String bnodeId = StatementUtils.getBlankNodeId(quad.getObject()); - // CORRECTION : utiliser le placeholder canonique pour tous les autres nœuds blank - sb.append(bnodeId.equals(blankNodeToReplace) ? replacement : SerializationConstants.CANONICAL_BNODE_PLACEHOLDER); - } else { - sb.append(StatementUtils.serializeForComparison(quad.getObject())); - } - - // Contexte - if (quad.getContext() != null) { - sb.append(SerializationConstants.SPACE); - if (StatementUtils.isBlankNode(quad.getContext())) { - String bnodeId = StatementUtils.getBlankNodeId(quad.getContext()); - sb.append(bnodeId.equals(blankNodeToReplace) ? replacement : SerializationConstants.CANONICAL_BNODE_PLACEHOLDER); - } else { - sb.append(StatementUtils.serializeForComparison(quad.getContext())); - } - } - - sb.append(SerializationConstants.SPACE).append(SerializationConstants.POINT); - return sb.toString(); - } } \ No newline at end of file From 174e5816fbe2aae378ca190409418cc970d93d85 Mon Sep 17 00:00:00 2001 From: Pierre Maillot Date: Mon, 24 Nov 2025 14:38:41 +0100 Subject: [PATCH 13/13] Fixing IRIutils for compat between RDFa and RDFC --- .../core/next/impl/common/util/IRIUtils.java | 56 ++++--------------- .../next/impl/io/parser/rdfa/RDFaParser.java | 9 ++- 2 files changed, 17 insertions(+), 48 deletions(-) diff --git a/src/main/java/fr/inria/corese/core/next/impl/common/util/IRIUtils.java b/src/main/java/fr/inria/corese/core/next/impl/common/util/IRIUtils.java index bd79e868a..4ccb40c33 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/common/util/IRIUtils.java +++ b/src/main/java/fr/inria/corese/core/next/impl/common/util/IRIUtils.java @@ -112,59 +112,15 @@ public static String guessLocalName(String iri) { /** * Checks if the given string is a valid IRI using a regex pattern extracted from the W3C standards. - * Removes leading/trailing whitespace and non-breaking spaces before validation. - * * @param iriString The string to be checked. * @return true if the string is a valid IRI, false otherwise. */ public static boolean isStandardIRI(String iriString) { - if (isInvalidInput(iriString)) { + if (!isValidInput(iriString)) { return false; } - // Remove leading whitespace and U+00A0 (non-breaking space) - int start = 0; - while (start < iriString.length()) { - char c = iriString.charAt(start); - if (Character.isWhitespace(c) || c == 160) { - start++; - } else { - break; - } - } - - // Remove trailing whitespace and U+00A0 (non-breaking space) - int end = iriString.length(); - while (end > start) { - char c = iriString.charAt(end - 1); - if (Character.isWhitespace(c) || c == 160) { - end--; - } else { - break; - } - } - - iriString = iriString.substring(start, end); - - if (iriString.isEmpty()) { - return false; - } - - // Reject IRIs with internal whitespace - for (char c : iriString.toCharArray()) { - if (Character.isWhitespace(c) || c == 160) { - return false; - } - } - try { - // If no scheme (no :), treat as relative IRI - if (!iriString.contains(":") || iriString.startsWith("#")) { - Matcher matcher = matchWithTimeout(RELATIVE_IRI_PATTERN, iriString); - return matcher != null && matcher.matches(); - } - - // If scheme present, validate as absolute IRI Matcher matcher = matchWithTimeout(STANDARD_IRI_PATTERN, iriString); if (matcher != null && matcher.matches()) { return isValidURI(iriString); @@ -176,6 +132,16 @@ public static boolean isStandardIRI(String iriString) { } + /** + * Validates input string for basic security checks. + */ + private static boolean isValidInput(String input) { + return input != null && + !input.isEmpty() && + input.length() <= MAX_IRI_LENGTH && + !containsSuspiciousPatterns(input); + } + /** * Executes regex matching with timeout protection. */ diff --git a/src/main/java/fr/inria/corese/core/next/impl/io/parser/rdfa/RDFaParser.java b/src/main/java/fr/inria/corese/core/next/impl/io/parser/rdfa/RDFaParser.java index 66a868b4e..c601825e5 100644 --- a/src/main/java/fr/inria/corese/core/next/impl/io/parser/rdfa/RDFaParser.java +++ b/src/main/java/fr/inria/corese/core/next/impl/io/parser/rdfa/RDFaParser.java @@ -16,6 +16,8 @@ import org.jsoup.nodes.Attribute; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.InputStream; import java.io.InputStreamReader; @@ -28,6 +30,8 @@ */ public class RDFaParser extends AbstractRDFParser { + private static final String BASE_TAG = "base"; + private static final String REL_ATTR = "rel"; private static final String REV_ATTR = "rev"; private static final String CONTENT_ATTR = "content"; @@ -89,10 +93,10 @@ private void processDocument(Document document, IRI baseIri) { if (baseIri.stringValue().equals(IOConstants.getDefaultBaseURI())) { // Looking for the node in the document IRI baseIriFromXml = baseIri; - Iterator baseElementIterator = document.stream().filter(element -> element.nameIs("base")).iterator(); + Iterator baseElementIterator = document.stream().filter(element -> element.nameIs(BASE_TAG)).iterator(); while (baseElementIterator.hasNext()) { Element baseElement = baseElementIterator.next(); - Attribute baseElementHrefAttribute = baseElement.attribute("href"); + Attribute baseElementHrefAttribute = baseElement.attribute(HREF_ATTR); if (baseElementHrefAttribute != null) { String baseIriString = baseElementHrefAttribute.getValue(); baseIriFromXml = getValueFactory().createIRI(baseIriString); @@ -288,7 +292,6 @@ private void processElement(Element element, RDFaEvaluationContext context, bool } else { currentObjectLiteral = this.getValueFactory().createLiteral(value); } - this.getModel().add(newSubject, property, currentObjectLiteral); } }