Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ public void handleTagOpen(TagNode tag) {
attrName = attrName.toLowerCase(Locale.ROOT);
if (globalHrefAttributes.contains(attrName)) {
attrValue = decodeCharEnt(attrValue);
attrValue = trimDataUrl(attrValue);
data.addHref(PATH,makePath(name,attrName),"url",attrValue);
}
}
Expand Down Expand Up @@ -382,24 +383,36 @@ private static void addBasicHrefs(HTMLMetaData data, TagNode node, String... att
String val = node.getAttribute(attr);
if(val != null) {
val = decodeCharEnt(val);
val = trimDataUrl(val);
data.addHref(PATH,makePath(node.getTagName(),attr),"url",val);
}
}
}

/**
 * Collect the requested attributes of a tag as a flat list of alternating
 * attribute names and (entity-decoded) attribute values.
 *
 * If the tag carries <code>property="og:image"</code>, the value of its
 * <code>content</code> attribute is passed through
 * {@link #trimDataUrl(String)} so that embedded data URLs do not bloat the
 * extracted metadata.
 *
 * @param node
 *            tag to read the attributes from
 * @param attrs
 *            names of the attributes to collect, in output order
 * @return list of the form <code>[name1, value1, name2, value2, ...]</code>
 *         containing only the attributes present on the tag, or
 *         <code>null</code> if none of them is present
 */
private static ArrayList<String> getAttrList(TagNode node, String... attrs) {
    ArrayList<String> l = new ArrayList<String>();
    boolean isOgImage = false;
    for (String attr : attrs) {
        String val = node.getAttribute(attr);
        if (val != null) {
            val = decodeCharEnt(val);
            l.add(attr);
            l.add(val);
            if (attr.equals("property") && val.equals("og:image")) {
                isOgImage = true;
            }
        }
    }
    if (l.isEmpty()) {
        return null;
    }
    if (isOgImage) {
        // trim data: URLs in og:image metadata
        //
        // Attribute names sit at even positions, values at odd positions.
        // Only the name positions are inspected: a plain indexOf("content")
        // could hit an attribute *value* equal to "content" first and then
        // miss the real "content" attribute.
        for (int i = 0; i + 1 < l.size(); i += 2) {
            if ("content".equals(l.get(i))) {
                l.set(i + 1, trimDataUrl(l.get(i + 1)));
                break;
            }
        }
    }
    return l;
}

Expand All @@ -409,6 +422,7 @@ private static ArrayList<String> getAttrListUrl(TagNode node,
ArrayList<String> l = null;
if(url != null) {
url = decodeCharEnt(url);
url = trimDataUrl(url);
l = new ArrayList<String>();
l.add(PATH);
l.add(makePath(node.getTagName(),urlAttr));
Expand Down Expand Up @@ -442,6 +456,7 @@ private static void addHrefsOnclick(HTMLMetaData data, TagNode node) {
for (Pattern pattern : jsOnClickUrlPatterns) {
String url = patternJSExtract(pattern, onclick);
if (url != null) {
url = trimDataUrl(url);
data.addHref(PATH, path, "url", url);
}
}
Expand Down Expand Up @@ -483,6 +498,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
if(url != null) {
// got data:
url = decodeCharEnt(url);
url = trimDataUrl(url);
l.add(PATH);
l.add(makePath("A","href"));
l.add("url");
Expand Down Expand Up @@ -520,6 +536,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
String url = node.getAttribute("href");
if(url != null) {
url = decodeCharEnt(url);
url = trimDataUrl(url);
ArrayList<String> l = new ArrayList<String>();
l.add(PATH);
l.add(makePath("AREA","href"));
Expand Down Expand Up @@ -583,6 +600,7 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
String url = node.getAttribute("action");
if(url != null) {
url = decodeCharEnt(url);
url = trimDataUrl(url);
// got data:
l.add(PATH);
l.add(makePath("FORM","action"));
Expand Down Expand Up @@ -728,7 +746,8 @@ private void patternCSSExtract(HTMLMetaData data, Pattern pattern, String conten
String url = m.group(1);
url = cssUrlTrimPattern.matcher(url).replaceAll("");
if (!url.isEmpty()) {
data.addHref("path","STYLE/#text","href", url);
url = trimDataUrl(url);
data.addHref("path", "STYLE/#text", "href", url);
}
}
}
Expand Down Expand Up @@ -757,4 +776,36 @@ public static String decodeCharEnt(String text, boolean inAttribute) {
return text;
}
}

/**
 * Trim data from
 * <a href="https://www.rfc-editor.org/rfc/rfc2397#section-2">data URLs</a>.
 *
 * Any data (after the comma) is trimmed from a data URL, the comma itself
 * is kept: <code>data:image/png;base64,iVBOR...</code> becomes
 * <code>data:image/png;base64,</code> and <code>data:,Hello</code> becomes
 * <code>data:,</code>. If no comma is found within the first 128 characters
 * of the URL, the URL is trimmed to 128 characters. The result of trimming
 * a data URL is therefore never longer than 128 characters.
 *
 * The <code>data:</code> scheme is matched case-insensitively. URLs using
 * any other scheme are returned unchanged.
 *
 * @param url
 *            URL to be trimmed, may be <code>null</code>
 * @return the data URL without its data part, or the unchanged URL (also
 *         <code>null</code>) if it is not a data URL
 */
public static String trimDataUrl(String url) {
    final String scheme = "data:";
    final int maxLength = 128;
    if (url == null || !url.regionMatches(true, 0, scheme, 0, scheme.length())) {
        return url;
    }
    // The media type may be empty ("data:,..."), so the comma can directly
    // follow the scheme: start searching right after "data:".
    int posComma = url.indexOf(',', scheme.length());
    if (posComma >= 0 && posComma < maxLength) {
        // keep everything up to and including the comma
        return url.substring(0, posComma + 1);
    }
    // no comma within the first maxLength characters
    if (url.length() > maxLength) {
        return url.substring(0, maxLength);
    }
    return url;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -60,18 +60,24 @@ public void testHandleStyleNodeExceptions() throws Exception {
}

public void testHandleStyleNode() throws Exception {
String[][] tests = {
{""},
{"url(foo.gif)","foo.gif"},
{"url('foo.gif')","foo.gif"},
{"url(\"foo.gif\")","foo.gif"},
{"url(\\\"foo.gif\\\")","foo.gif"},
{"url(\\'foo.gif\\')","foo.gif"},
{"url(''foo.gif'')","foo.gif"},
{"url( foo.gif )","foo.gif"},
{"url('''')"},
{"url('foo.gif'')","foo.gif"},
};
String[][] tests = { //
{""}, //
{"url(foo.gif)","foo.gif"}, //
{"url('foo.gif')","foo.gif"}, //
{"url(\"foo.gif\")","foo.gif"}, //
{"url(\\\"foo.gif\\\")","foo.gif"}, //
{"url(\\'foo.gif\\')","foo.gif"}, //
{"url(''foo.gif'')","foo.gif"}, //
{"url( foo.gif )","foo.gif"}, //
{"url('''')"}, //
{"url('foo.gif'')","foo.gif"}, //
{"url('')","data:image/png;base64,"}, //
{"url(\"data:image/svg+xml,%3Csvg%20xmlns=%22http://www.w3.org/2000/svg%22%20viewBox=%220%200%2080%2080%22%3E%3C/svg%3E\")",
"data:image/svg+xml," },
// would fail: the pattern extractor stops at the first white space in the data URL
// {"background-image: url('data:image/svg+xml,%3Csvg xmlns=\"http://www.w3.org/2000/svg\" viewBox=\"0 0 40 40\"%3E%3Ccircle r=\"18\" cx=\"20\" cy=\"20\" fill=\"red\" /%3E%3C/svg%3E');\n",
// "data:image/svg+xml," },
};
for(String[] testa : tests) {
checkExtract(testa);
}
Expand Down Expand Up @@ -125,7 +131,7 @@ private void checkExtract(String[] data) throws JSONException {
}
JSONArray a = md.optJSONArray("Links");
if(data.length > 1) {
assertNotNull(a);
assertNotNull("CSS link extraction failed for <" + css + ">", a);
assertEquals(data.length-1,a.length());
for(int i = 1; i < data.length; i++) {
Object o = a.optJSONObject(i-1);
Expand Down Expand Up @@ -531,4 +537,22 @@ public void testHtmlParserEntityDecoding() {
}
}

/**
 * Verify that {@link ExtractingParseObserver#trimDataUrl(String)} strips the
 * data part of data URLs (keeping everything up to and including the first
 * comma) and leaves other URLs untouched.
 *
 * Each test case is a pair of <code>{ input URL, expected result }</code>.
 */
public void testTrimDataURLs() {
    String[][] urls = { //
            // base64-encoded 1x1 pixel PNG
            { "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg==",
                    "data:image/png;base64," }, //
            { "data:image/svg+xml,%3Csvg%20xmlns=%22http://www.w3.org/2000/svg%22%20viewBox=%220%200%2080%2080%22%3E%3C/svg%3E",
                    "data:image/svg+xml," }, //
            { "data:image/svg+xml,%3Csvg xmlns=\"http://www.w3.org/2000/svg\" viewBox=\"0 0 40 40\"%3E%3Ccircle r=\"18\" cx=\"20\" cy=\"20\" fill=\"red\" /%3E%3C/svg%3E",
                    "data:image/svg+xml," }, //
            { "data:image/svg+xml;utf9,<svg%20version='1.1'%20xmlns='http://www.w3.org/2000/svg'><filter%20id='blur'><feGaussianBlur%20stdDeviation='10'%20/></filter></svg>#blur",
                    "data:image/svg+xml;utf9," }, //
            { "data:application/font-woff;charset=utf-8;base64,d09GRgABAAAAAAUQAA0AAAAA",
                    "data:application/font-woff;charset=utf-8;base64," }, //
            { "data:text/plain;charset=iso-8859-7,%be%fg%be", "data:text/plain;charset=iso-8859-7," }, //
            // not a data URL: must be returned unchanged
            { "https://www.example.com/image.png?size=1,2", "https://www.example.com/image.png?size=1,2" }, //
    };
    for (String[] url : urls) {
        String u = ExtractingParseObserver.trimDataUrl(url[0]);
        assertEquals("Data URL <" + url[0] + "> not properly trimmed", url[1], u);
    }
}
}
Loading