Skip to content

Commit feae6d4

Browse files
WAT extractor: do not add <meta itemprop="..." > from body as metadata
- rebase to recent head / master
- unit test: merge methods to verify metadata attributes
1 parent febb13f commit feae6d4

File tree

1 file changed

+5
-19
lines changed

1 file changed

+5
-19
lines changed

src/test/java/org/archive/resource/html/ExtractingParseObserverTest.java

Lines changed: 5 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -255,20 +255,6 @@ private void checkLinks(Resource resource, String[][] expectedLinks) {
255255
}
256256
}
257257

258-
private void checkExtractHtmlLangAttribute(Resource resource, String... langAttributes)
259-
throws JSONException {
260-
assertNotNull(resource);
261-
assertTrue("Wrong instance type of Resource: " + resource.getClass(), resource instanceof HTMLResource);
262-
JSONArray metas = resource.getMetaData().getJSONObject("Head").getJSONArray("Metas");
263-
assertNotNull(metas);
264-
JSONObject meta = metas.getJSONObject(0);
265-
for (int i = 0; i < langAttributes.length; i += 2) {
266-
String key = langAttributes[i];
267-
assertNotNull(meta.get(key));
268-
assertEquals(meta.get(key), langAttributes[i+1]);
269-
}
270-
}
271-
272258
public void testLinkExtraction() throws ResourceParseException, IOException {
273259
String testFileName = "link-extraction-test.warc";
274260
ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
@@ -448,11 +434,11 @@ public void testHtmlLanguageAttributeExtraction() throws ResourceParseException,
448434
ResourceProducer producer = ProducerUtils.getProducer(getClass().getResource(testFileName).getPath());
449435
ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
450436
ExtractingResourceProducer extractor = new ExtractingResourceProducer(producer, mapper);
451-
checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/lang", "content", "en");
452-
checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/lang", "content", "zh-CN");
453-
checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/lang", "content", "cs-cz");
454-
checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/lang", "content", "en");
455-
checkExtractHtmlLangAttribute(extractor.getNext(), "name", "HTML@/xml:lang", "content", "es-MX");
437+
checkExtractedAttributes(extractor.getNext(), "name", "HTML@/lang", "content", "en");
438+
checkExtractedAttributes(extractor.getNext(), "name", "HTML@/lang", "content", "zh-CN");
439+
checkExtractedAttributes(extractor.getNext(), "name", "HTML@/lang", "content", "cs-cz");
440+
checkExtractedAttributes(extractor.getNext(), "name", "HTML@/lang", "content", "en");
441+
checkExtractedAttributes(extractor.getNext(), "name", "HTML@/xml:lang", "content", "es-MX");
456442
}
457443

458444
public void testBodyMetaElements() throws ResourceParseException, IOException {

0 commit comments

Comments
 (0)