From 581b43a07138c220852609bb984c001868995787 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Thu, 5 Dec 2024 19:26:50 +0100
Subject: [PATCH 1/2] WAT extractor: add attributes of the <html> element as
metadata, fixes #35 - add lang attributes from root element as
metadata { "name": "HTML@/lang", "content": "es-MX" }
---
.../html/ExtractingParseObserver.java | 20 ++++
.../html/ExtractingParseObserverTest.java | 26 +++++
.../resource/html/html-lang-attribute.warc | 106 ++++++++++++++++++
3 files changed, 152 insertions(+)
create mode 100644 src/test/resources/org/archive/resource/html/html-lang-attribute.warc
diff --git a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
index ad3ad463..b0b37f4a 100644
--- a/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
+++ b/src/main/java/org/archive/resource/html/ExtractingParseObserver.java
@@ -3,6 +3,7 @@
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
+import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
@@ -110,6 +111,8 @@ public class ExtractingParseObserver implements ParseObserver {
extractors.put("AUDIO", new EmbedTagExtractor());
extractors.put("TRACK", new EmbedTagExtractor());
extractors.put("SOURCE", new EmbedTagExtractor());
+ // language from HTML root element
+ extractors.put("HTML", new HTMLTagExtractor());
globalHrefAttributes = new HashSet();
globalHrefAttributes.add("background");
@@ -604,6 +607,23 @@ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs
}
}
+ private static class HTMLTagExtractor implements TagExtractor { // records lang / xml:lang of the HTML element as metadata
+ @Override
+ public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
+ ArrayList<String> l = getAttrList(node, "lang", "xml:lang"); // read below as alternating attribute name, value entries
+ if(l != null) {
+ Iterator<String> it = l.iterator();
+ while (it.hasNext()) {
+ String name = it.next();
+ if (it.hasNext()) { // a trailing name without a value is skipped
+ String lang = it.next();
+ data.addMeta("name", makePath("HTML", name), "content", lang);
+ }
+ }
+ }
+ }
+ }
+
private static class IFrameTagExtractor implements TagExtractor {
@Override
public void extract(HTMLMetaData data, TagNode node, ExtractingParseObserver obs) {
diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
index 15098011..18f35767 100644
--- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
+++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
@@ -3,6 +3,7 @@
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
+import java.util.Map;
import java.util.logging.Logger;
import org.archive.extract.ExtractingResourceFactoryMapper;
@@ -240,6 +241,19 @@ private void checkLinks(Resource resource, String[][] expectedLinks) {
}
}
+ private void checkExtractHtmlLangAttribute(Resource resource, Map langAttributes)
+ throws JSONException {
+ assertNotNull(resource);
+ assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource);
+ JSONArray metas = resource.getMetaData().getJSONObject("Head").getJSONArray("Metas");
+ assertNotNull(metas);
+ JSONObject meta = metas.getJSONObject(0);
+ for (String key : langAttributes.keySet()) {
+ assertNotNull(meta.get(key));
+ assertEquals(meta.get(key), langAttributes.get(key));
+ }
+ }
+
public void testLinkExtraction() throws ResourceParseException, IOException {
String testFileName = "link-extraction-test.warc";
ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
@@ -414,6 +428,18 @@ public void testTitleExtraction() throws ResourceParseException, IOException {
checkTitle(resource, "Testing title extraction with embedded SVG");
}
+ public void testHtmlLanguageAttributeExtraction() throws ResourceParseException, IOException {
+ String testFileName = "html-lang-attribute.warc";
+ ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
+ ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
+ ExtractingResourceProducer extractor = new ExtractingResourceProducer(producer, mapper);
+ checkExtractHtmlLangAttribute(extractor.getNext(), Map.of("name", "HTML@/lang", "content", "en"));
+ checkExtractHtmlLangAttribute(extractor.getNext(), Map.of("name", "HTML@/lang", "content", "zh-CN"));
+ checkExtractHtmlLangAttribute(extractor.getNext(), Map.of("name", "HTML@/lang", "content", "cs-cz"));
+ checkExtractHtmlLangAttribute(extractor.getNext(), Map.of("name", "HTML@/lang", "content", "en"));
+ checkExtractHtmlLangAttribute(extractor.getNext(), Map.of("name", "HTML@/xml:lang", "content", "es-MX"));
+ }
+
public void testHtmlParserEntityDecoding() {
String[][] entities = { //
/* ampersand */
diff --git a/src/test/resources/org/archive/resource/html/html-lang-attribute.warc b/src/test/resources/org/archive/resource/html/html-lang-attribute.warc
new file mode 100644
index 00000000..b74e5c18
--- /dev/null
+++ b/src/test/resources/org/archive/resource/html/html-lang-attribute.warc
@@ -0,0 +1,106 @@
+WARC/1.0
+WARC-Type: response
+WARC-Date: 2024-12-05T10:47:02Z
+Content-Length: 169
+Content-Type: application/http; msgtype=response
+WARC-Target-URI: https://www.example.org/1
+WARC-Identified-Payload-Type: text/html
+
+HTTP/1.1 200
+content-type: text/html; charset=UTF-8
+
+
+
+
+
+ Test
+
+
+
+
+
+
+WARC/1.0
+WARC-Type: response
+WARC-Date: 2024-12-05T10:47:02Z
+Content-Length: 185
+Content-Type: application/http; msgtype=response
+WARC-Target-URI: https://www.example.org/2
+WARC-Identified-Payload-Type: text/html
+
+HTTP/1.1 200
+content-type: text/html; charset=UTF-8
+
+
+
+
+ Test
+
+
+
+
+
+
+WARC/1.0
+WARC-Type: response
+WARC-Date: 2024-12-05T10:47:02Z
+Content-Length: 158
+Content-Type: application/http; msgtype=response
+WARC-Target-URI: https://www.example.org/3
+WARC-Identified-Payload-Type: text/html
+
+HTTP/1.1 200
+content-type: text/html; charset=UTF-8
+
+
+
+
+ Test
+
+
+
+
+
+
+WARC/1.0
+WARC-Type: response
+WARC-Date: 2024-12-05T10:47:02Z
+Content-Length: 319
+Content-Type: application/http; msgtype=response
+WARC-Target-URI: https://www.example.org/4
+WARC-Identified-Payload-Type: text/html
+
+HTTP/1.1 200
+content-type: text/html; charset=UTF-8
+
+
+
+
+ Test
+
+
+
+
+
+
+WARC/1.0
+WARC-Type: response
+WARC-Date: 2024-12-05T10:47:02Z
+Content-Length: 189
+Content-Type: application/http; msgtype=response
+WARC-Target-URI: https://www.example.org/5
+WARC-Identified-Payload-Type: text/html
+
+HTTP/1.1 200
+content-type: text/html; charset=UTF-8
+
+
+
+
+ Test
+
+
+
+
+
+
From 8627773477bd206a4f3a0ded1f91ce06e78b7c52 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Thu, 5 Dec 2024 19:43:16 +0100
Subject: [PATCH 2/2] WAT extractor: add attributes of the <html> element as
metadata
- make tests run also on JDK 8
---
.../html/ExtractingParseObserverTest.java | 17 +++++++++--------
1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
index 18f35767..d6e5e802 100644
--- a/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
+++ b/src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java
@@ -241,16 +241,17 @@ private void checkLinks(Resource resource, String[][] expectedLinks) {
}
}
- private void checkExtractHtmlLangAttribute(Resource resource, Map langAttributes)
+ private void checkExtractHtmlLangAttribute(Resource resource, String... langAttributes)
throws JSONException {
assertNotNull(resource);
assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource);
JSONArray metas = resource.getMetaData().getJSONObject("Head").getJSONArray("Metas");
assertNotNull(metas);
JSONObject meta = metas.getJSONObject(0);
- for (String key : langAttributes.keySet()) {
+ for (int i = 0; i < langAttributes.length; i += 2) {
+ String key = langAttributes[i];
assertNotNull(meta.get(key));
- assertEquals(meta.get(key), langAttributes.get(key));
+ assertEquals(meta.get(key), langAttributes[i+1]);
}
}
@@ -433,11 +434,11 @@ public void testHtmlLanguageAttributeExtraction() throws ResourceParseException,
ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
ExtractingResourceProducer extractor = new ExtractingResourceProducer(producer, mapper);
- checkExtractHtmlLangAttribute(extractor.getNext(), Map.of("name", "HTML@/lang", "content", "en"));
- checkExtractHtmlLangAttribute(extractor.getNext(), Map.of("name", "HTML@/lang", "content", "zh-CN"));
- checkExtractHtmlLangAttribute(extractor.getNext(), Map.of("name", "HTML@/lang", "content", "cs-cz"));
- checkExtractHtmlLangAttribute(extractor.getNext(), Map.of("name", "HTML@/lang", "content", "en"));
- checkExtractHtmlLangAttribute(extractor.getNext(), Map.of("name", "HTML@/xml:lang", "content", "es-MX"));
+ checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/lang", "content", "en");
+ checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/lang", "content", "zh-CN");
+ checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/lang", "content", "cs-cz");
+ checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/lang", "content", "en");
+ checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/xml:lang", "content", "es-MX");
}
public void testHtmlParserEntityDecoding() {