Skip to content

Commit

Permalink
#23934 Including keys from tika 1.x that were missing in tika 2.x
Browse files Browse the repository at this point in the history
  • Loading branch information
nollymar authored and nollymar committed May 30, 2023
1 parent d508c01 commit de70fba
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 5 deletions.
5 changes: 5 additions & 0 deletions dotCMS/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -755,6 +755,11 @@ war {
from(configurations.felixsystem) {
into felixSystemFolder.concat("/bundle")
}

from(configurations.runtimeClasspath) {
into felixSystemFolder.concat("/bundle")
include '**/org.apache.felix.http.api*.jar'
}
//}

from (configurations.starter) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ public enum BasicMetadataFields {
VERSION_KEY ("version"),
NAME_META_KEY ("name"),
TITLE_META_KEY ("title"),
DC_TITLE_META_KEY ("dcTitle"),
PATH_META_KEY ("path"),
CONTENT_TYPE_META_KEY ("contentType"),
SHA256_META_KEY ("sha256"),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
package com.dotcms.storage.model;

import com.dotcms.util.CollectionsUtils;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
 * Translation table between metadata keys produced by Tika 2.x and the legacy key names
 * (the "old ones") that the same information used to be published under.
 *
 * <p>Each constant holds one Tika 2.x key ({@link #key()}) together with the list of legacy
 * key names ({@link #possibleValues()}) that should receive the same value when they are
 * missing from a metadata map.</p>
 */
public enum ExtendedMetadataFields {

    // Each legacy name must be its own argument: a single "author, metaAuthor" string would
    // register one bogus key instead of the two intended aliases.
    DC_CREATOR("dcCreator", "author", "metaAuthor"),
    META_LAST_AUTHOR("metaLastAuthor", "lastAuthor"),
    DC_TITLE("dcTitle", "title"),
    DC_TERMS_CREATED("dctermsCreated", "date", "creationDate"),
    DC_TERMS_MODIFIED("dctermsModified", "lastModified", "modified"),

    META_SAVE_DATE("metaSaveDate", "lastSaveDate"),
    EXTENDED_PROPERTIES_APPLICATION("extendedPropertiesApplication", "applicationName"),
    META_CHARACTER_COUNT("metaCharacterCount", "characterCount"),
    EXTENDED_PROPERTIES_COMPANY("extendedPropertiesCompany", "company"),
    EXTENDED_PROPERTIES_TOTAL_TIME("extendedPropertiesTotalTime", "editTime"),
    META_KEYWORD("metaKeyword", "keywords", "dcSubject"),
    META_PAGE_COUNT("metaPageCount", "pageCount"),
    REVISION_NUMBER("cpRevision", "revisionNumber"),
    DC_SUBJECT("dcSubject", "subject", "cpSubject", "metaKeyword", "keywords"),
    EXTENDED_TEMPLATE("extendedPropertiesTemplate", "template"),
    WORD_COUNT("metaWordCount", "wordCount"),
    DC_IDENTIFIER("dcIdentifier", "identifier"),
    DC_PUBLISHER("dcPublisher", "publisher");

    private final String key;
    private final List<String> possibleValues;

    /**
     * @param key            the Tika 2.x metadata key
     * @param possibleValues the legacy key names that mirror {@code key}, one per argument
     */
    ExtendedMetadataFields(final String key, final String... possibleValues) {
        this.key = key;
        // Enum constants are shared by every caller, so the list is made read-only to keep
        // one caller from altering the mapping seen by all the others.
        this.possibleValues = Collections.unmodifiableList(Arrays.asList(possibleValues));
    }

    /**
     * @return the Tika 2.x metadata key this constant translates from
     */
    public String key() {
        return key;
    }

    /**
     * @return the legacy key names associated with {@link #key()}, as an unmodifiable list
     */
    public List<String> possibleValues() {
        return possibleValues;
    }

    /**
     * Collects every constant into a map of {@link #key()} to {@link #possibleValues()}.
     * A new map is collected on each call; its values are the constants' unmodifiable lists.
     * The collector does not promise any particular iteration order.
     *
     * @return map from Tika 2.x key to its legacy key names
     */
    public static Map<String, List<String>> keyMap() {
        return Stream.of(values()).collect(
                Collectors.toMap(ExtendedMetadataFields::key, ExtendedMetadataFields::possibleValues));
    }
}
24 changes: 20 additions & 4 deletions dotCMS/src/main/java/com/dotcms/tika/TikaUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import com.dotcms.contenttype.model.field.Field;
import com.dotcms.contenttype.model.type.BaseContentType;
import com.dotcms.osgi.OSGIConstants;
import com.dotcms.storage.model.ExtendedMetadataFields;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import com.dotcms.rest.api.v1.DotObjectMapperProvider;
Expand Down Expand Up @@ -388,10 +389,8 @@ public Map<String, Object> getForcedMetaDataMap(final File binFile,
metaMap.putAll(this.buildMetaDataMap());
metaMap.put(FileAssetAPI.CONTENT_FIELD, content);

//From Tika 2.x the title is mapped to dcTitle (https://cwiki.apache.org/confluence/display/TIKA/Migrating+to+Tika+2.0.0)
if(!metaMap.containsKey(TITLE_META_KEY.key()) && metaMap.containsKey(DC_TITLE_META_KEY.key())){
metaMap.put(TITLE_META_KEY.key(), metaMap.get(DC_TITLE_META_KEY.key()));
}
//Adding missing keys that were excluded in Tika 2.0
includeMissingKeys(metaMap);
} catch (IOException ioExc) {
if (this.isZeroByteFileException(ioExc.getCause())) {
logWarning(binFile, ioExc.getCause());
Expand All @@ -411,6 +410,23 @@ public Map<String, Object> getForcedMetaDataMap(final File binFile,
return metaMap;
}

/**
 * Back-fills the legacy metadata keys listed in {@link ExtendedMetadataFields}. For every
 * mapped key that is present in the given map, each of its legacy key names that is not yet
 * present receives the same value (for example: keywords and title). Entries that already
 * exist are never overwritten.
 * For further details, please visit https://cwiki.apache.org/confluence/display/TIKA/Migrating+to+Tika+2.0.0
 *
 * @param metaMap metadata map to complete; it is modified in place
 */
private static void includeMissingKeys(Map<String, Object> metaMap) {
    ExtendedMetadataFields.keyMap().forEach((sourceKey, legacyKeys) -> {
        // Nothing to copy when the source key was not extracted for this file.
        if (!metaMap.containsKey(sourceKey)) {
            return;
        }
        for (final String legacyKey : legacyKeys) {
            final boolean alreadyPresent = metaMap.containsKey(legacyKey);
            if (!alreadyPresent) {
                metaMap.put(legacyKey, metaMap.get(sourceKey));
            }
        }
    });
}

private void parseFallbackAsPlainText(final File binFile, final Map<String, Object> metaMap, final IOException ioExc) {
try {
//On error lets try a fallback operation
Expand Down

0 comments on commit de70fba

Please sign in to comment.