-
Notifications
You must be signed in to change notification settings - Fork 40
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Baleen Entity Linking: In this commit, we provide a framework for entity linking and an initial implementation. Entity linking is the identification of entities in a document against an external data source. We provide example candidate suppliers from DBpedia and a simple Mongo document-based supplier. A general matching algorithm using a bag-of-words approach is also supplied, but particular situations will benefit from tuned implementations.
* Rename Propper to Proper
* Simplifying Information Collector and Ranker test
- Loading branch information
1 parent
8781ec9
commit 666c139
Showing
50 changed files
with
3,691 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven module descriptor for the Baleen entity linking framework. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <parent>
    <groupId>uk.gov.dstl.baleen</groupId>
    <artifactId>baleen</artifactId>
    <!-- The parent version must be a literal; properties cannot be used here. -->
    <version>2.6.0-SNAPSHOT</version>
  </parent>
  <modelVersion>4.0.0</modelVersion>

  <artifactId>baleen-entity-linking</artifactId>

  <dependencies>
    <!-- Apache Jena -->
    <dependency>
      <groupId>org.apache.jena</groupId>
      <artifactId>jena-arq</artifactId>
      <version>${jena.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.jena</groupId>
      <artifactId>jena-querybuilder</artifactId>
      <version>${jena.version}</version>
    </dependency>

    <!-- Baleen annotators: same version property as the test-jar dependency below,
         so the two cannot drift apart when the project version is bumped. -->
    <dependency>
      <groupId>uk.gov.dstl.baleen</groupId>
      <artifactId>baleen-annotators</artifactId>
      <version>${baleen.version}</version>
    </dependency>

    <!-- JSON Flattener -->
    <dependency>
      <groupId>com.github.wnameless</groupId>
      <artifactId>json-flattener</artifactId>
      <version>0.4.1</version>
    </dependency>

    <!-- Fongo -->
    <dependency>
      <groupId>com.github.fakemongo</groupId>
      <artifactId>fongo</artifactId>
      <version>${fongo.version}</version>
      <scope>test</scope>
    </dependency>

    <!-- Testcontainers -->
    <dependency>
      <groupId>org.testcontainers</groupId>
      <artifactId>testcontainers</artifactId>
      <version>${testcontainers.version}</version>
      <scope>test</scope>
    </dependency>

    <!-- Testing -->
    <dependency>
      <groupId>uk.gov.dstl.baleen</groupId>
      <artifactId>baleen-annotators</artifactId>
      <version>${baleen.version}</version>
      <type>test-jar</type>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>uk.gov.dstl.baleen</groupId>
      <artifactId>baleen-uima</artifactId>
      <version>${baleen.version}</version>
      <type>test-jar</type>
      <scope>test</scope>
    </dependency>

  </dependencies>

</project>
246 changes: 246 additions & 0 deletions
246
...nking/src/main/java/uk/gov/dstl/baleen/annotators/coreference/EntityLinkingAnnotator.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,246 @@ | ||
// Copyright (c) Committed Software 2018, opensource@committed.io | ||
package uk.gov.dstl.baleen.annotators.coreference; | ||
|
||
import java.util.Collection; | ||
import java.util.Optional; | ||
import java.util.Set; | ||
|
||
import org.apache.uima.UimaContext; | ||
import org.apache.uima.analysis_engine.AnalysisEngineProcessException; | ||
import org.apache.uima.fit.descriptor.ConfigurationParameter; | ||
import org.apache.uima.fit.descriptor.ExternalResource; | ||
import org.apache.uima.jcas.JCas; | ||
import org.apache.uima.resource.ResourceInitializationException; | ||
|
||
import com.google.common.collect.ImmutableSet; | ||
|
||
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction; | ||
import uk.gov.dstl.baleen.core.utils.BuilderUtils; | ||
import uk.gov.dstl.baleen.entity.linking.Candidate; | ||
import uk.gov.dstl.baleen.entity.linking.CandidateRanker; | ||
import uk.gov.dstl.baleen.entity.linking.CandidateSupplier; | ||
import uk.gov.dstl.baleen.entity.linking.EntityInformation; | ||
import uk.gov.dstl.baleen.entity.linking.InformationCollector; | ||
import uk.gov.dstl.baleen.entity.linking.collector.JCasInformationCollector; | ||
import uk.gov.dstl.baleen.entity.linking.collector.ProperNounInformationCollector; | ||
import uk.gov.dstl.baleen.entity.linking.ranker.BagOfWordsCandidateRanker; | ||
import uk.gov.dstl.baleen.resources.SharedStopwordResource; | ||
import uk.gov.dstl.baleen.types.language.WordToken; | ||
import uk.gov.dstl.baleen.types.semantic.Entity; | ||
import uk.gov.dstl.baleen.types.semantic.ReferenceTarget; | ||
import uk.gov.dstl.baleen.uima.BaleenAnnotator; | ||
|
||
/** | ||
* Annotator for linking Entities to a known data source | ||
* | ||
* <p>Entity linking is made up of an {@link InformationCollector}, a {@link CandidateSupplier} and | ||
* a {@link CandidateRanker}. By default the {@link ProperNounInformationCollector} is used, however | ||
* this requires part of speech tagging to also be in the pipeline, so an implementation that does | ||
* not require this is also supplied by {@link JCasInformationCollector}. A {@link | ||
* BagOfWordsCandidateRanker} is also used by default. To configure, an entity type and a candidate | ||
* supplier must be configured e.g. | ||
* | ||
* <pre> | ||
* - class: coreference.EntityLinkingAnnotator | ||
* entityType: Person | ||
* candidateSupplier: dbpedia.DBPediaPersonCandidateSupplier | ||
* </pre> | ||
* | ||
* @param <T> The Entity type | ||
*/ | ||
public class EntityLinkingAnnotator<T extends Entity> extends BaleenAnnotator { | ||
|
||
private static final String SEMANTIC_ENTITY_PACKAGE = "uk.gov.dstl.baleen.types.semantic"; | ||
private static final String COMMON_ENTITY_PACKAGE = "uk.gov.dstl.baleen.types.common"; | ||
private static final String COLLECTOR_PACKAGE = "uk.gov.dstl.baleen.entity.linking.collector"; | ||
private static final String RANKER_PACKAGE = "uk.gov.dstl.baleen.entity.linking.ranker"; | ||
private static final String SUPPLIER_PACKAGE = "uk.gov.dstl.baleen.entity.linking.supplier"; | ||
|
||
/** | ||
* Connection to Stopwords Resource | ||
* | ||
* @baleen.resource uk.gov.dstl.baleen.resources.SharedStopwordResource | ||
*/ | ||
public static final String KEY_STOPWORDS = "stopwords"; | ||
|
||
@ExternalResource(key = KEY_STOPWORDS) | ||
private SharedStopwordResource stopwordResource; | ||
|
||
/** | ||
* The stoplist to use. If the stoplist matches one of the enum's provided in {@link | ||
* SharedStopwordResource#StopwordList}, then that list will be loaded. | ||
* | ||
* <p>Otherwise, the string is taken to be a file path and that file is used. The format of the | ||
* file is expected to be one stopword per line. | ||
* | ||
* @baleen.config DEFAULT | ||
*/ | ||
public static final String PARAM_STOPLIST = "stoplist"; | ||
|
||
@ConfigurationParameter(name = PARAM_STOPLIST, defaultValue = "DEFAULT") | ||
private String stoplist; | ||
|
||
/** | ||
* The entity type class name. Can be a simple class name if the class is in | ||
* uk.gov.dstl.baleen.common or uk.gov.dstl.baleen.semantic, or a fully qualified name | ||
* | ||
* @baleen.config | ||
*/ | ||
public static final String PARAM_ENTITY_TYPE = "entityType"; | ||
|
||
@ConfigurationParameter(name = PARAM_ENTITY_TYPE) | ||
private String entityTypeClassName; | ||
|
||
/** | ||
* The InformationCollector implementation class name. Can be a fully qualified name or simple | ||
* class name if a package is declared or the implementation is in | ||
* uk.gov.dstl.baleen.entity.linking.collector | ||
* | ||
* @baleen.config | ||
*/ | ||
public static final String PARAM_INFORMATION_COLLECTOR = "informationCollector"; | ||
|
||
@ConfigurationParameter( | ||
name = PARAM_INFORMATION_COLLECTOR, | ||
mandatory = false, | ||
defaultValue = "ProperNounInformationCollector" | ||
) | ||
private String informationCollectorClassName; | ||
|
||
/** | ||
* The CandidateSupplier implementation class name. Can be a fully qualified name or simple class | ||
* name if a package is declared or the implementation is in | ||
* uk.gov.dstl.baleen.entity.linking.supplier | ||
* | ||
* @baleen.config | ||
*/ | ||
public static final String PARAM_CANDIDATE_SUPPLIER = "candidateSupplier"; | ||
|
||
@ConfigurationParameter(name = PARAM_CANDIDATE_SUPPLIER) | ||
private String candidateSupplierClassName; | ||
|
||
/** | ||
* The CandidateRanker implementation class name. Can be a fully qualified name or simple class | ||
* name if a package is declared or the implementation is in | ||
* uk.gov.dstl.baleen.entity.linking.ranker | ||
* | ||
* @baleen.config | ||
*/ | ||
public static final String PARAM_CANDIDATE_RANKER = "candidateRanker"; | ||
|
||
@ConfigurationParameter( | ||
name = PARAM_CANDIDATE_RANKER, | ||
mandatory = false, | ||
defaultValue = "BagOfWordsCandidateRanker" | ||
) | ||
private String candidateRankerClassName; | ||
|
||
/** | ||
* Configuration arguments for candidate supplier. This is mandatory for the | ||
* MongoCandidateSupplier. Mandatory fields for MongoCandidateSupplier are "collection" and | ||
* "searchField" Should be an array of Strings of key value pairs. For example, ["collection", | ||
* "peopleCollection", "language", "en", "port", "1234"] | ||
* | ||
* @baleen.config {} | ||
*/ | ||
public static final String PARAM_CANDIDATE_SUPPLIER_CONFIG_ARGUMENTS = | ||
"candidateSupplierArguments"; | ||
|
||
@ConfigurationParameter( | ||
name = PARAM_CANDIDATE_SUPPLIER_CONFIG_ARGUMENTS, | ||
mandatory = false, | ||
defaultValue = {} | ||
) | ||
private String[] candidateSupplierOptions; | ||
|
||
private Class<T> entityClass; | ||
private InformationCollector informationCollector; | ||
private CandidateSupplier<T> candidateSupplier; | ||
private CandidateRanker<T> candidateRanker; | ||
|
||
@Override | ||
@SuppressWarnings("unchecked") | ||
public void doInitialize(UimaContext aContext) throws ResourceInitializationException { | ||
super.doInitialize(aContext); | ||
|
||
Collection<String> stopwords = stopwordResource.getStopwords(stoplist); | ||
|
||
try { | ||
entityClass = | ||
BuilderUtils.getClassFromString( | ||
entityTypeClassName, SEMANTIC_ENTITY_PACKAGE, COMMON_ENTITY_PACKAGE); | ||
|
||
informationCollector = | ||
(InformationCollector) | ||
BuilderUtils.getClassFromString(informationCollectorClassName, COLLECTOR_PACKAGE) | ||
.newInstance(); | ||
|
||
candidateSupplier = | ||
(CandidateSupplier<T>) | ||
BuilderUtils.getClassFromString(candidateSupplierClassName, SUPPLIER_PACKAGE) | ||
.newInstance(); | ||
|
||
candidateSupplier.configure(candidateSupplierOptions); | ||
|
||
candidateRanker = | ||
(CandidateRanker<T>) | ||
BuilderUtils.getClassFromString(candidateRankerClassName, RANKER_PACKAGE) | ||
.newInstance(); | ||
|
||
candidateRanker.initialize(stopwords); | ||
|
||
} catch (Exception e) { | ||
throw new ResourceInitializationException(e); | ||
} | ||
} | ||
|
||
@Override | ||
protected void doProcess(JCas jCas) throws AnalysisEngineProcessException { | ||
|
||
Set<EntityInformation<T>> entityInformationSet = | ||
informationCollector.getEntityInformation(jCas, entityClass); | ||
|
||
entityInformationSet.forEach( | ||
entityInformation -> { | ||
Collection<Candidate> candidates = candidateSupplier.getCandidates(entityInformation); | ||
if (candidates.size() == 1) { | ||
setLinking(jCas, entityInformation, candidates.iterator().next().getId()); | ||
return; | ||
} | ||
Optional<Candidate> topCandidatesOptional = | ||
candidateRanker.getTopCandidate(entityInformation, candidates); | ||
if (topCandidatesOptional.isPresent()) { | ||
Candidate topCandidate = topCandidatesOptional.get(); | ||
setLinking(jCas, entityInformation, topCandidate.getId()); | ||
} | ||
}); | ||
} | ||
|
||
private void setLinking(JCas jCas, EntityInformation<T> entityInformation, String linking) { | ||
removeFromJCasIndex(entityInformation.getReferenceTarget()); | ||
ReferenceTarget referenceTarget = new ReferenceTarget(jCas); | ||
referenceTarget.setLinking(linking); | ||
for (Entity e : entityInformation.getMentions()) { | ||
e.setReferent(referenceTarget); | ||
} | ||
addToJCasIndex(referenceTarget); | ||
addToJCasIndex(entityInformation.getMentions()); | ||
} | ||
|
||
@Override | ||
public AnalysisEngineAction getAction() { | ||
return new AnalysisEngineAction( | ||
ImmutableSet.of(Entity.class, WordToken.class, ReferenceTarget.class), | ||
ImmutableSet.of(ReferenceTarget.class)); | ||
} | ||
|
||
@Override | ||
protected void doDestroy() { | ||
try { | ||
candidateSupplier.close(); | ||
} catch (Exception e) { | ||
getMonitor().warn("Error closing resources", e); | ||
} | ||
super.doDestroy(); | ||
} | ||
} |
29 changes: 29 additions & 0 deletions
29
baleen-entity-linking/src/main/java/uk/gov/dstl/baleen/entity/linking/Candidate.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
// Copyright (c) Committed Software 2018, opensource@committed.io | ||
package uk.gov.dstl.baleen.entity.linking; | ||
|
||
import java.util.Map; | ||
|
||
/**
 * A possible match in an external data source that a ReferenceTarget may be linked to.
 */
public interface Candidate {

  /**
   * Returns the unique identifier of this candidate, for example a URL or a database ID.
   *
   * @return the identifier of the candidate
   */
  String getId();

  /**
   * Returns the name of this candidate, i.e. the value of the property that was used to match it.
   *
   * @return the name of the candidate
   */
  String getName();

  /**
   * Returns the key value pairs held about this candidate.
   *
   * @return a map of the key value pairs describing the candidate
   */
  Map<String, String> getKeyValuePairs();
}
34 changes: 34 additions & 0 deletions
34
baleen-entity-linking/src/main/java/uk/gov/dstl/baleen/entity/linking/CandidateRanker.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
// Copyright (c) Committed Software 2018, opensource@committed.io | ||
package uk.gov.dstl.baleen.entity.linking; | ||
|
||
import java.util.Collection; | ||
import java.util.Optional; | ||
|
||
import uk.gov.dstl.baleen.types.semantic.Entity; | ||
|
||
/** | ||
* Interface for ranking a collection of candidates | ||
* | ||
* @param <T> The Entity type | ||
*/ | ||
public interface CandidateRanker<T extends Entity> { | ||
|
||
/** | ||
* Get the top candidate as an Optional | ||
* | ||
* @param entityInformation The entity information the candidates relate to | ||
* @param candidates The collection of candidates to be ranked | ||
* @return An Optional of type Candidate | ||
*/ | ||
Optional<Candidate> getTopCandidate( | ||
EntityInformation<T> entityInformation, Collection<Candidate> candidates); | ||
|
||
/** | ||
* Supply stopwords to the ranker (if required) | ||
* | ||
* <p>This is present as configuration can not be passed directly to these classes. | ||
* | ||
* @param stopwords | ||
*/ | ||
void initialize(Collection<String> stopwords); | ||
} |
Oops, something went wrong.