Merge branch 'master' into feature/issue1190-maui

* master:
  #1041 - Add parameter to enable lower-cased lookup of first word in sentence in SfstAnnotator
  #1362 - NifWriter does not write out NE identifier
  #1362 - NifWriter does not write out NE identifier
  #1152 - Introduce "order" feature on tokens
  #1366 - Added support in CONLL-U reader for document and paragraph IDs
  #1041 - Add parameter to enable lower-cased lookup of first word in sentence in SfstAnnotator
  #1366 - Added support in CONLL-U reader for document and paragraph IDs
  #1367 - Support TCF orthography via SofaChangeAnnotations
  #1041 - Add parameter to enable lower-cased lookup of first word in sentence in SfstAnnotator
  #1327 - Update LIF support
  #1366 - Added support in CONLL-U reader for document and paragraph IDs
  #1367 - Support TCF orthography via SofaChangeAnnotations
  Forgot to commit the list declaration
  Warn if CONLL-U file contains multiple documents
  Added support in CONLL-U reader for document and paragraph IDs
  #186 - Change artifactId to "dkpro-core-XXX"
  #1299 - Update to CoreNLP 3.9.2
  #1337 - Connl2012 writer uses WordSense, but does not declare it
  #1299 - Update to CoreNLP 3.9.2
  Added parameter to enable lower-cased lookup of first word in sentence.
dkpro · Jun 4, 2019 · b9dfaf3 · b9dfaf3
2 parents 6507c15 + b702a66
commit b9dfaf3
Show file tree

Hide file tree

Showing 41 changed files with 3,397 additions and 812 deletions.
diff --git a/...-resources-asl/src/main/java/org/dkpro/core/api/resources/ResourceObjectProviderBase.java b/...-resources-asl/src/main/java/org/dkpro/core/api/resources/ResourceObjectProviderBase.java
@@ -142,6 +142,7 @@ public abstract class ResourceObjectProviderBase<M>
      * resolved when {@link #configure()} is called. (optional)
      */
     public static final String GROUP_ID = "groupId";
+    public static final String COMPONENT_GROUP_ID = "componentGroupId";
 
     /**
      * The artifact ID of the Maven artifact containing a resource. Variables in the location are
@@ -212,6 +213,7 @@ public abstract class ResourceObjectProviderBase<M>
     protected void init()
     {
         setDefault(GROUP_ID, "de.tudarmstadt.ukp.dkpro.core");
+        setDefault(COMPONENT_GROUP_ID, "org.dkpro.core");
         setDefault(ARTIFACT_URI,
                 "mvn:${" + GROUP_ID + "}:${" + ARTIFACT_ID + "}:${" + VERSION + "}");
     }
@@ -374,7 +376,7 @@ public void applyAutoOverrides(Object aObject)
         }
     }
 
-    protected List<URL> getPomUrlsForClass(String aModelGroup, String aModelArtifact,
+    protected List<URL> getPomUrlsForClass(String aComponentGroupId, String aModelArtifactId,
             Class<?> aClass)
         throws IOException
     {
@@ -418,7 +420,7 @@ protected List<URL> getPomUrlsForClass(String aModelGroup, String aModelArtifact
             Matcher matcher = pattern.matcher(base);
             if (matcher.matches()) {
                 String artifactIdAndVersion = matcher.group("ID");
-                String pomPattern = base + "META-INF/maven/" + aModelGroup + "/"
+                String pomPattern = base + "META-INF/maven/" + aComponentGroupId + "/"
                         + artifactIdAndVersion + "/pom.xml";
                 lookupPatterns.add(pomPattern);
                 ResourcePatternResolver resolver = new PathMatchingResourcePatternResolver();
@@ -434,9 +436,9 @@ protected List<URL> getPomUrlsForClass(String aModelGroup, String aModelArtifact
         // models from the StanfordNLP module).
         if (urls.isEmpty()) {
             // This is the default strategy supposed to look in the JAR
-            String moduleArtifactId = aModelArtifact.split("-")[0];
-            String pomPattern = base + "META-INF/maven/" + aModelGroup + "/" + moduleArtifactId +
-                    "*/pom.xml";
+            String moduleArtifactId = aModelArtifactId.split("-")[0];
+            String pomPattern = base + "META-INF/maven/" + aComponentGroupId + "/"
+                    + moduleArtifactId + "*/pom.xml";
             lookupPatterns.add(pomPattern);
             ResourcePatternResolver resolver = new PathMatchingResourcePatternResolver();
             Resource[] resources = resolver.getResources(pomPattern);
@@ -468,11 +470,11 @@ protected List<URL> getPomUrlsForClass(String aModelGroup, String aModelArtifact
      *             the POM, or if no context object was set.
      * @return the version of the required model.
      */
-    protected String getModelVersionFromMavenPom(String aModelGroup, String aModelArtifact,
-            Class<?> aClass)
+    protected String getModelVersionFromMavenPom(String aComponentGroupId, String aModelGroupId,
+            String aModelArtifactId, Class<?> aClass)
         throws IOException
     {
-        List<URL> urls = getPomUrlsForClass(aModelGroup, aModelArtifact, contextClass);
+        List<URL> urls = getPomUrlsForClass(aComponentGroupId, aModelArtifactId, contextClass);
 
         for (URL pomUrl : urls) {
             // Parse the POM
@@ -492,8 +494,8 @@ protected String getModelVersionFromMavenPom(String aModelGroup, String aModelAr
                 List<Dependency> deps = model.getDependencyManagement().getDependencies();
                 for (Dependency dep : deps) {
                     if (
-                            StringUtils.equals(dep.getGroupId(), aModelGroup) && 
-                            StringUtils.equals(dep.getArtifactId(), aModelArtifact)
+                            StringUtils.equals(dep.getGroupId(), aModelGroupId) && 
+                            StringUtils.equals(dep.getArtifactId(), aModelArtifactId)
                     ) {
                         return dep.getVersion();
                     }
@@ -790,12 +792,22 @@ private Properties resolveDependency(Properties aProps)
                 resolved.getProperty(ARTIFACT_URI, "").contains("${" + VERSION + "}") && 
                 isNull(resolved.getProperty(VERSION))
         ) {
-            String groupId = pph.replacePlaceholders(aProps.getProperty(GROUP_ID), resolved);
+            String modelGroupId = pph.replacePlaceholders(aProps.getProperty(GROUP_ID), resolved);
+            String componentGroupId;
+
+            if (aProps.getProperty(COMPONENT_GROUP_ID) != null) {
+                componentGroupId = pph.replacePlaceholders(aProps.getProperty(COMPONENT_GROUP_ID),
+                        resolved);
+            }
+            else {
+                componentGroupId = modelGroupId;
+            }
+
             String artifactId = pph.replacePlaceholders(aProps.getProperty(ARTIFACT_ID), resolved);
             try {
                 // If the version is to be auto-detected, then we must have a groupId and artifactId
-                resolved.put(VERSION,
-                        getModelVersionFromMavenPom(groupId, artifactId, contextClass));
+                resolved.put(VERSION, getModelVersionFromMavenPom(componentGroupId, modelGroupId,
+                        artifactId, contextClass));
             }
             catch (Throwable e) {
                 log.error("Unable to obtain version from POM", e);

diff --git a/dkpro-core-api-segmentation-asl/src/main/resources/desc/type/LexicalUnits_customized.xml b/dkpro-core-api-segmentation-asl/src/main/resources/desc/type/LexicalUnits_customized.xml
@@ -1,72 +1,151 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <typeSystemDescription xmlns="http://uima.apache.org/resourceSpecifier">
+
   <name>Segmentation</name>
+
   <description/>
+
   <version>${version}</version>
+
   <vendor>Ubiquitous Knowledge Processing (UKP) Lab, Technische Universität Darmstadt</vendor>
+
   <imports>
+
     <import name="desc.type.LexicalUnits"/>
+
   </imports>
+
   <types>
+
     <typeDescription>
+
       <name>de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Compound</name>
+
       <description>This type represents a decompounding word, i.e.: flowerpot. Each Compound one have at least two Splits.</description>
+
       <supertypeName>uima.tcas.Annotation</supertypeName>
+
       <features>
+
         <featureDescription>
+
           <name>splits</name>
+
           <description>A word that can be decomposed into different parts.</description>
+
           <rangeTypeName>uima.cas.FSArray</rangeTypeName>
+
           <elementType>de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split</elementType>
+
         </featureDescription>
+
       </features>
+
     </typeDescription>
+
     <typeDescription>
+
       <name>de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token</name>
+
       <description>&lt;p&gt;Token is one of the two types commonly produced by a segmenter (the other being Sentence). A Token usually represents a word, although it may be used to represent multiple tightly connected words (e.g. "New York") or parts of a word (e.g. the possessive "'s"). One may choose to split compound words into multiple tokens, e.g. ("CamelCase" -&amp;gt; "Camel", "Case"; "Zauberstab" -&amp;gt; "Zauber", "stab"). Most processing components operate on Tokens, usually within the limits of the surrounding Sentence. E.g. a part-of-speech tagger analyses each Token in a Sentence and assigns a part-of-speech to each Token.&lt;/p&gt;</description>
+
       <supertypeName>uima.tcas.Annotation</supertypeName>
+
       <features>
+
         <featureDescription>
+
           <name>parent</name>
+
           <description>the parent of this token. This feature is meant to be used in when the token participates in a constituency parse and then refers to a constituent containing this token. The type of this feature is {@link Annotation} to avoid adding a dependency on the syntax API module.</description>
+
           <rangeTypeName>uima.tcas.Annotation</rangeTypeName>
+
         </featureDescription>
+
         <featureDescription>
+
           <name>lemma</name>
+
           <description/>
+
           <rangeTypeName>de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma</rangeTypeName>
+
         </featureDescription>
+
         <featureDescription>
+
           <name>stem</name>
+
           <description/>
+
           <rangeTypeName>de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem</rangeTypeName>
+
         </featureDescription>
+
         <featureDescription>
+
           <name>pos</name>
+
           <description/>
+
           <rangeTypeName>de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS</rangeTypeName>
+
         </featureDescription>
+
         <featureDescription>
+
           <name>morph</name>
+
           <description>The morphological feature associated with this token.</description>
+
           <rangeTypeName>de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures</rangeTypeName>
+
         </featureDescription>
+
         <featureDescription>
+
           <name>id</name>
+
           <description>If this unit had an ID in the source format from which it was imported, it may be stored here. IDs are typically not assigned by DKPro Core components. If an ID is present, it should be respected by writers.</description>
+
           <rangeTypeName>uima.cas.String</rangeTypeName>
+
         </featureDescription>
+
         <featureDescription>
+
           <name>form</name>
+
           <description>Potentially normalized form of the token text that should be used instead of the covered text if set.</description>
+
           <rangeTypeName>de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.TokenForm</rangeTypeName>
+
         </featureDescription>
+
         <featureDescription>
+
           <name>syntacticFunction</name>
+
           <description/>
+
           <rangeTypeName>uima.cas.String</rangeTypeName>
+
+        </featureDescription>
+
+        <featureDescription>
+
+          <name>order</name>
+
+          <description>Disambiguates the token order for tokens which have the same offsets, e.g. when the contraction "à" is analyzed as two tokens "a" and "a".</description>
+
+          <rangeTypeName>uima.cas.Integer</rangeTypeName>
+
         </featureDescription>
       </features>
+
     </typeDescription>
+
   </types>
+
 </typeSystemDescription>