Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
153 changes: 138 additions & 15 deletions src/main/java/fr/inria/corese/core/next/impl/common/util/IRIUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

/**
* Utility class for IRI.
*
* <p>
* Intended to facilitate string manipulation related to IRI.
*/
public class IRIUtils {
Expand All @@ -23,6 +23,7 @@ public class IRIUtils {
"(?<anchor>(\\#))?" +
"(?<fragment>([\\w\\-_]+))?)?$");
private static final Pattern STANDARD_IRI_PATTERN = Pattern.compile("^(([^:/?#\\s]+):)(\\/\\/([^/?#\\s]*))?([^?#\\s]*)(\\?([^#\\s]*))?(#(.*))?");
private static final Pattern RELATIVE_IRI_PATTERN = Pattern.compile("^[^\\s\\p{Cc}]+$");
private static final int MAX_IRI_LENGTH = 2048;
private static final long REGEX_TIMEOUT_MS = 100;

Expand All @@ -35,28 +36,35 @@ private IRIUtils() {

/**
* Guesses the namespace of an IRI using a regex pattern.
*
* @param iri The IRI string to be processed.
* @return the guessed namespace of the IRI or an empty string if no match is found.
*/
public static String guessNamespace(String iri) {
if (!isValidInput(iri)) {
if (isInvalidInput(iri)) {
return "";
}
try {
Matcher matcher = matchWithTimeout(IRI_PATTERN, iri);
if (matcher == null || !matcher.matches()) {
return "";
if (iri.endsWith("#")) {
return iri;
} else if (iri.contains("#")) {
return iri.substring(0, iri.lastIndexOf("#") + 1);
} else {
return iri;
}
} else if (matcher.matches()) {
if (matcher.group("protocol") != null && matcher.group("protocol").equals("_")) {
return "";
}
StringBuilder namespace = new StringBuilder();
namespace.append(matcher.group("protocol")).append(":");
if(matcher.group("dblSlashes") != null) {
if (matcher.group("dblSlashes") != null) {
namespace.append(matcher.group("dblSlashes"));
}
namespace.append(matcher.group("domain"));
if(matcher.group("path") != null) {
if (matcher.group("path") != null) {
namespace.append(matcher.group("path"));
}
if((matcher.group("fragment") != null || matcher.group("anchor") != null) && matcher.group("finalPath") != null) {
Expand All @@ -74,21 +82,22 @@ public static String guessNamespace(String iri) {

/**
* Guesses the local name of an IRI using a regex pattern.
*
* @param iri The IRI string to be processed.
* @return the guessed local name of the IRI or an empty string if no match is found.
*/
public static String guessLocalName(String iri) {
if (!isValidInput(iri)) {
if (isInvalidInput(iri)) {
return "";
}
try {
Matcher matcher = matchWithTimeout(IRI_PATTERN, iri);
if (matcher == null || !matcher.matches()) {
return "";
return iri;
} else if (matcher.matches()) {
if(matcher.group("fragment") != null){ // If the IRI has a fragment
if (matcher.group("fragment") != null) { // If the IRI has a fragment
return matcher.group("fragment");
} else if(matcher.group("finalPath") != null ) { // If the IRI has no fragment but do not ends with a slash
} else if (matcher.group("finalPath") != null) { // If the IRI has no fragment but do not ends with a slash
return matcher.group("finalPath");
} else { // If the URI ends with a slash
return "";
Expand Down Expand Up @@ -122,6 +131,17 @@ public static boolean isStandardIRI(String iriString) {
}
}


/**
 * Tells whether a string passes the basic pre-checks applied before IRI processing.
 *
 * <p>A string passes when it is non-null, non-empty, no longer than
 * {@code MAX_IRI_LENGTH} characters, and {@code containsSuspiciousPatterns} does not
 * flag it.
 *
 * @param input the candidate string, may be {@code null}
 * @return {@code true} if every check passes, {@code false} otherwise
 */
private static boolean isValidInput(String input) {
    // Cheap structural checks first; each failure short-circuits the rest.
    if (input == null || input.isEmpty()) {
        return false;
    }
    if (input.length() > MAX_IRI_LENGTH) {
        return false;
    }
    return !containsSuspiciousPatterns(input);
}

/**
* Executes regex matching with timeout protection.
*/
Expand Down Expand Up @@ -154,11 +174,11 @@ private static Matcher matchWithTimeout(Pattern pattern, String input) {
/**
* Validates input string for basic security checks.
*/
private static boolean isValidInput(String input) {
return input != null &&
!input.isEmpty() &&
input.length() <= MAX_IRI_LENGTH &&
!containsSuspiciousPatterns(input);
/**
 * Tells whether a string must be rejected before any IRI processing takes place.
 *
 * @param input the candidate string, may be {@code null}
 * @return {@code true} if the string is null, empty, longer than
 *         {@code MAX_IRI_LENGTH} characters, or flagged by
 *         {@code containsSuspiciousPatterns}; {@code false} otherwise
 */
private static boolean isInvalidInput(String input) {
    // Null and empty strings are rejected before anything else is evaluated.
    if (input == null || input.isEmpty()) {
        return true;
    }
    // The length limit is checked before the pattern scan, as in a short-circuit chain.
    if (input.length() > MAX_IRI_LENGTH) {
        return true;
    }
    return containsSuspiciousPatterns(input);
}

/**
Expand Down Expand Up @@ -193,4 +213,107 @@ private static boolean isValidURI(String uriString) {
return false;
}
}
}

/**
 * Checks whether a character is forbidden inside an IRI.
 *
 * <p>The rejected characters are:
 * <ul>
 *   <li>C0 control characters and space (U+0000 to U+0020), which the Turtle
 *       {@code IRIREF} production excludes;</li>
 *   <li>DEL (U+007F) and the C1 control characters (U+0080 to U+009F), which are
 *       outside the characters RFC 3987 allows in an IRI;</li>
 *   <li>the delimiters less-than, greater-than, double quote, curly brackets,
 *       pipe, backslash, circumflex and grave accent.</li>
 * </ul>
 *
 * @param c the character to validate
 * @return {@code true} if the character is forbidden in IRIs, {@code false} otherwise
 */
public static boolean isInvalidIRICharacter(char c) {
    // C0 controls and space. char is unsigned, so no lower-bound check is needed.
    // Space (U+0020) is included: it is excluded by the Turtle IRIREF production and
    // can reach this point through an escape sequence that decodes to a space.
    if (c <= 0x20) {
        return true;
    }

    // DEL (U+007F) and C1 control characters (U+0080 to U+009F)
    if (c >= 0x7F && c <= 0x9F) {
        return true;
    }

    // Delimiters that may never appear unescaped in an IRI
    return switch (c) {
        case '<', '>', '{', '}', '\\', '^', '`', '|', '"' -> true;
        default -> false;
    };
}

/**
 * Produces a short, human-readable label for a character, for use in error messages.
 *
 * <p>Well-known control characters, space, DEL and the IRI delimiters get a dedicated
 * name. Any other character below U+0020 is labelled "control character", characters
 * in U+0080 to U+009F are labelled "high control character", and everything else is
 * rendered as {@code character 'x'}.
 *
 * @param c the character to describe
 * @return the descriptive label for {@code c}
 */
public static String getCharacterDescription(char c) {
    return switch (c) {
        case '\0' -> "null character";
        case '\t' -> "tab";
        case '\n' -> "line feed";
        case '\r' -> "carriage return";
        case ' ' -> "space";
        case 0x7F -> "delete";
        case '<' -> "less than";
        case '>' -> "greater than";
        case '{' -> "left curly bracket";
        case '}' -> "right curly bracket";
        case '\\' -> "backslash";
        case '^' -> "circumflex";
        case '`' -> "grave accent";
        case '|' -> "pipe";
        case '"' -> "quotation mark";
        default -> {
            // No dedicated name: fall back to a generic label based on the range.
            if (c < 0x20) {
                yield "control character";
            }
            if (c >= 0x80 && c <= 0x9F) {
                yield "high control character";
            }
            yield String.format("character '%c'", c);
        }
    };
}

/**
 * Renders an IRI so that it can be embedded safely in an error message.
 *
 * <p>Every character outside printable ASCII (below U+0020 or above U+007E) is written
 * as a four-digit uppercase hexadecimal Unicode escape. The reserved delimiters
 * (angle brackets, curly brackets, backslash, circumflex, grave accent, pipe and
 * double quote) are prefixed with a backslash. All other characters are copied as-is.
 *
 * @param iri the IRI to escape for display; must not be {@code null}
 * @return the escaped text
 */
public static String escapeForDisplay(String iri) {
    final String reservedChars = "<>{}\\^`|\"";
    StringBuilder escaped = new StringBuilder();
    for (char ch : iri.toCharArray()) {
        boolean printableAscii = ch >= 0x20 && ch <= 0x7E;
        if (!printableAscii) {
            // Control characters, DEL and all non-ASCII characters become hex escapes
            escaped.append(String.format("\\u%04X", (int) ch));
        } else if (reservedChars.indexOf(ch) >= 0) {
            // Reserved delimiters are shown with a leading backslash
            escaped.append('\\').append(ch);
        } else {
            escaped.append(ch);
        }
    }
    return escaped.toString();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import fr.inria.corese.core.next.api.*;
import fr.inria.corese.core.next.impl.common.literal.XSD;
import fr.inria.corese.core.next.impl.common.util.IRIUtils;
import fr.inria.corese.core.next.impl.common.vocabulary.RDF;
import fr.inria.corese.core.next.impl.exception.ParsingErrorException;
import fr.inria.corese.core.next.impl.io.parser.util.ParserConstants;
Expand All @@ -27,6 +28,8 @@ public abstract class AbstractTurtleTriGListener {
public Resource currentSubject;
public IRI currentPredicate;

private final java.util.Set<String> explicitlyDeclaredPrefixes = new java.util.HashSet<>();

/**
* Constructs a parser listener with the specified model, factory and base URI.
*
Expand Down Expand Up @@ -56,6 +59,7 @@ public void initializeBasePrefix() {
*
* @param text raw IRI text including angle brackets
* @return unescaped IRI string
* @throws ParsingErrorException if the IRI contains invalid characters after escape processing
*/
public String extractAndUnescapeIRI(String text) {
String iri = text.substring(1, text.length() - 1);
Expand All @@ -69,6 +73,8 @@ public String extractAndUnescapeIRI(String text) {
*/
public void updateBaseURI(String newBase) {
this.baseURI = resolveIRIAgainstBase(newBase);
validateIRI(this.baseURI);

prefixMap.put(ParserConstants.EMPTY_STRING, this.baseURI);
model.setNamespace(ParserConstants.EMPTY_STRING, this.baseURI);
}
Expand All @@ -81,8 +87,11 @@ public void updateBaseURI(String newBase) {
*/
public void registerPrefix(String prefix, String iri) {
String resolvedIRI = resolveIRIAgainstBase(iri);
validateIRI(resolvedIRI);
prefixMap.put(prefix, resolvedIRI);
model.setNamespace(prefix, resolvedIRI);

explicitlyDeclaredPrefixes.add(prefix);
}

/**
Expand All @@ -109,6 +118,7 @@ public String resolveIRI(String raw) {
if (raw.startsWith(ParserConstants.IRI_START) && raw.endsWith(ParserConstants.IRI_END)) {
String iri = raw.substring(1, raw.length() - 1);
iri = unescapeIRI(iri);
validateIRI(iri);
return iri.isEmpty() ? getEffectiveBaseURI() : resolveIRIAgainstBase(iri);
}

Expand All @@ -117,24 +127,31 @@ public String resolveIRI(String raw) {
String prefix = parts[0];
String localName = parts[1];

if (prefix.isEmpty() && !explicitlyDeclaredPrefixes.contains("")) {
throw new ParsingErrorException(
"Syntax error: prefixed name ':' + '" + localName + "' used but ':' prefix was never declared. " +
"Use @prefix : <baseURI> to declare the empty prefix."
);
}

if (prefixMap.containsKey(prefix)) {
localName = unescapeIRI(localName);
String ns = prefixMap.get(prefix);
if (ns != null) {
return ns + localName;
String result = ns + localName;
validateIRI(result);
return result;
}
}

if (isAbsoluteIRI(raw)) {
} else if (isAbsoluteIRI(raw)) {
return raw;
} else {
throw new ParsingErrorException("Undeclared prefix: " + prefix);
}

throw new ParsingErrorException("Undeclared prefix: " + prefix);
}

return resolveIRIAgainstBase(raw);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Minor: Put back the return in inline form here


} catch (IllegalArgumentException e) {
} catch (ParsingErrorException e) {
throw new ParsingErrorException(e.getMessage(), e);
}
}
Expand Down Expand Up @@ -448,6 +465,7 @@ public String getEffectiveBaseURI() {
String effective = (baseURI != null && !baseURI.isEmpty()) ? baseURI : ParserConstants.getDefaultBaseURI();
return normalizeURI(effective);
}

/**
* Processes Unicode escape sequences in IRIs.
*
Expand Down Expand Up @@ -619,15 +637,44 @@ public Literal createBooleanLiteral(String text) {
* @return numeric literal with corresponding XSD datatype
*/
public Literal createNumericLiteral(String text, NumericType type) {
switch (type) {
case DOUBLE:
return factory.createLiteral(text, XSD.DOUBLE.getIRI());
case DECIMAL:
return factory.createLiteral(text, XSD.DECIMAL.getIRI());
case INTEGER:
default:
return factory.createLiteral(text, XSD.INTEGER.getIRI());
return switch (type) {
case DOUBLE -> factory.createLiteral(text, XSD.DOUBLE.getIRI());
case DECIMAL -> factory.createLiteral(text, XSD.DECIMAL.getIRI());
default -> factory.createLiteral(text, XSD.INTEGER.getIRI());
};
}

/**
 * Scans an IRI, after escape sequences have been processed, for forbidden characters.
 *
 * <p>A {@code null} or empty IRI is accepted without inspection. Otherwise each
 * character is checked with {@code IRIUtils.isInvalidIRICharacter}; the first offending
 * character aborts the scan with an exception describing its code point, a readable
 * name, its position and the escaped IRI.
 *
 * @param iri the IRI string to validate (after escape sequences have been processed)
 * @return always {@code true} when the method returns normally
 * @throws ParsingErrorException if the IRI contains a forbidden character
 */
private boolean validateIRI(String iri) throws ParsingErrorException {
    // A null or empty IRI yields a zero-length scan and is therefore accepted.
    final int length = (iri == null) ? 0 : iri.length();

    for (int position = 0; position < length; position++) {
        char candidate = iri.charAt(position);
        if (!IRIUtils.isInvalidIRICharacter(candidate)) {
            continue;
        }

        // Build the diagnostic pieces only once an offending character is found.
        String codePoint = String.format("U+%04X", (int) candidate);
        String description = IRIUtils.getCharacterDescription(candidate);
        String escapedIri = IRIUtils.escapeForDisplay(iri);

        String message = "Invalid character in IRI: " + codePoint + " (" + description + ") "
                + "at position " + position + ". "
                + "IRI after escape processing: " + escapedIri + ". "
                + "IRIs cannot contain space, control characters, or reserved characters.";
        throw new ParsingErrorException(message);
    }
    return true;
}

/**
Expand Down
Loading