Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
Expand Down Expand Up @@ -110,6 +111,8 @@ public class ExtractingParseObserver implements ParseObserver {
extractors.put("AUDIO", new EmbedTagExtractor());
extractors.put("TRACK", new EmbedTagExtractor());
extractors.put("SOURCE", new EmbedTagExtractor());
// language from HTML root element
extractors.put("HTML", new HTMLTagExtractor());

globalHrefAttributes = new HashSet<String>();
globalHrefAttributes.add("background");
Expand Down Expand Up @@ -604,6 +607,23 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
}

/**
 * Tag extractor registered for the HTML root element: records the values of
 * its "lang" and "xml:lang" attributes as meta entries.
 */
private static class HTMLTagExtractor implements TagExtractor {
    @Override
    public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
        ArrayList<String> attrs = getAttrList(node, "lang", "xml:lang");
        if (attrs == null) {
            return;
        }
        // The list is consumed two entries at a time: the first entry of each
        // pair becomes part of the meta path, the second the meta content
        // (presumably attribute name followed by attribute value -- confirm
        // against getAttrList). A trailing unpaired entry is ignored.
        for (int i = 0; i + 1 < attrs.size(); i += 2) {
            data.addMeta("name", makePath("HTML", attrs.get(i)), "content", attrs.get(i + 1));
        }
    }
}

private static class IFrameTagExtractor implements TagExtractor {
@Override
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.logging.Logger;

import org.archive.extract.ExtractingResourceFactoryMapper;
Expand Down Expand Up @@ -240,6 +241,20 @@ private void checkLinks(Resource resource, String[][] expectedLinks) {
}
}

/**
 * Verifies that the first entry of the "Metas" array under "Head" in the
 * resource's metadata contains the given key/value pairs.
 *
 * @param resource       the extracted resource; must be an HTMLResource
 * @param langAttributes alternating keys and expected values, e.g.
 *                       "name", "HTML@/lang", "content", "en"
 * @throws JSONException if the metadata does not have the expected structure
 */
private void checkExtractHtmlLangAttribute(Resource resource, String... langAttributes)
        throws JSONException {
    assertNotNull(resource);
    assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource);
    // Fail with a clear message instead of an ArrayIndexOutOfBoundsException
    // when a key is passed without its expected value.
    assertEquals("langAttributes must be given as key/value pairs", 0, langAttributes.length % 2);
    JSONArray metas = resource.getMetaData().getJSONObject("Head").getJSONArray("Metas");
    assertNotNull(metas);
    // Only the first meta entry is inspected.
    JSONObject meta = metas.getJSONObject(0);
    for (int i = 0; i < langAttributes.length; i += 2) {
        String key = langAttributes[i];
        String expected = langAttributes[i + 1];
        assertNotNull("Missing meta key: " + key, meta.get(key));
        // Expected value first, actual second, so failure messages read correctly.
        assertEquals("Unexpected value for meta key " + key, expected, meta.get(key));
    }
}

public void testLinkExtraction() throws ResourceParseException, IOException {
String testFileName = "link-extraction-test.warc";
ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
Expand Down Expand Up @@ -414,6 +429,18 @@ public void testTitleExtraction() throws ResourceParseException, IOException {
checkTitle(resource, "Testing title extraction with embedded SVG");
}

/**
 * Reads the records of html-lang-attribute.warc in order and checks, for
 * each one, the meta name and content extracted from the language attribute
 * of the HTML element.
 */
public void testHtmlLanguageAttributeExtraction() throws ResourceParseException, IOException {
    String warcPath = getClass().getResource("html-lang-attribute.warc").getPath();
    ResourceProducer warcProducer = ProducerUtils.getProducer(warcPath);
    ResourceFactoryMapper factoryMapper = new ExtractingResourceFactoryMapper();
    ExtractingResourceProducer extractor = new ExtractingResourceProducer(warcProducer, factoryMapper);

    // One row per WARC record, in file order: { expected meta name, expected content }.
    String[][] expectedPerRecord = { //
            { "HTML@/lang", "en" }, //
            { "HTML@/lang", "zh-CN" }, //
            { "HTML@/lang", "cs-cz" }, //
            { "HTML@/lang", "en" }, //
            { "HTML@/xml:lang", "es-MX" }, //
    };
    for (String[] expected : expectedPerRecord) {
        checkExtractHtmlLangAttribute(extractor.getNext(), "name", expected[0], "content", expected[1]);
    }
}

public void testHtmlParserEntityDecoding() {
String[][] entities = { //
/* ampersand */
Expand Down
106 changes: 106 additions & 0 deletions src/test/resources/org/archive/resource/html/html-lang-attribute.warc
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
WARC/1.0
WARC-Type: response
WARC-Date: 2024-12-05T10:47:02Z
Content-Length: 169
Content-Type: application/http; msgtype=response
WARC-Target-URI: https://www.example.org/1
WARC-Identified-Payload-Type: text/html

HTTP/1.1 200
content-type: text/html; charset=UTF-8

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Test</title>
</head>
<body/>
</html>



WARC/1.0
WARC-Type: response
WARC-Date: 2024-12-05T10:47:02Z
Content-Length: 185
Content-Type: application/http; msgtype=response
WARC-Target-URI: https://www.example.org/2
WARC-Identified-Payload-Type: text/html

HTTP/1.1 200
content-type: text/html; charset=UTF-8

<!DOCTYPE html>
<html lang="zh-CN" xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Test</title>
</head>
<body/>
</html>



WARC/1.0
WARC-Type: response
WARC-Date: 2024-12-05T10:47:02Z
Content-Length: 158
Content-Type: application/http; msgtype=response
WARC-Target-URI: https://www.example.org/3
WARC-Identified-Payload-Type: text/html

HTTP/1.1 200
content-type: text/html; charset=UTF-8

<!DOCTYPE html>
<html dir="ltr" lang="cs-cz">
<head>
<title>Test</title>
</head>
<body/>
</html>



WARC/1.0
WARC-Type: response
WARC-Date: 2024-12-05T10:47:02Z
Content-Length: 319
Content-Type: application/http; msgtype=response
WARC-Target-URI: https://www.example.org/4
WARC-Identified-Payload-Type: text/html

HTTP/1.1 200
content-type: text/html; charset=UTF-8

<!DOCTYPE html>
<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#" xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" dir="ltr" style="overflow-x: hidden !important;">
<head>
<title>Test</title>
</head>
<body/>
</html>



WARC/1.0
WARC-Type: response
WARC-Date: 2024-12-05T10:47:02Z
Content-Length: 189
Content-Type: application/http; msgtype=response
WARC-Target-URI: https://www.example.org/5
WARC-Identified-Payload-Type: text/html

HTTP/1.1 200
content-type: text/html; charset=UTF-8

<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="es-MX">
<head>
<title>Test</title>
</head>
<body/>
</html>



Loading