Skip to content

Commit

Permalink
Baleen Entity Linking (#73)
Browse files Browse the repository at this point in the history
* Baleen Entity Linking

In this commit, we provide a framework for entity linking and an initial
implementation. Entity linking is the matching of entities found in a document
to entries in an external data source.

We provide example candidate suppliers backed by DBpedia and a simple Mongo
document-based supplier. A general matching algorithm using a bag-of-words
approach is also supplied, but particular situations will benefit from tuned
implementations.

* Rename Propper to Proper

* Simplifying Information Collector and Ranker test
  • Loading branch information
stuarthendren authored and JohnDaws committed May 10, 2018
1 parent 8781ec9 commit 666c139
Show file tree
Hide file tree
Showing 50 changed files with 3,691 additions and 0 deletions.
69 changes: 69 additions & 0 deletions baleen-entity-linking/pom.xml
@@ -0,0 +1,69 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <!-- The parent version must be a literal; Maven does not allow properties here -->
  <parent>
    <groupId>uk.gov.dstl.baleen</groupId>
    <artifactId>baleen</artifactId>
    <version>2.6.0-SNAPSHOT</version>
  </parent>
  <modelVersion>4.0.0</modelVersion>

  <artifactId>baleen-entity-linking</artifactId>

  <dependencies>
    <!-- Apache Jena -->
    <dependency>
      <groupId>org.apache.jena</groupId>
      <artifactId>jena-arq</artifactId>
      <version>${jena.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.jena</groupId>
      <artifactId>jena-querybuilder</artifactId>
      <version>${jena.version}</version>
    </dependency>

    <!-- Baleen annotators; versioned with the same property as the test-jar below -->
    <dependency>
      <groupId>uk.gov.dstl.baleen</groupId>
      <artifactId>baleen-annotators</artifactId>
      <version>${baleen.version}</version>
    </dependency>

    <!-- JSON Flattener -->
    <dependency>
      <groupId>com.github.wnameless</groupId>
      <artifactId>json-flattener</artifactId>
      <version>0.4.1</version>
    </dependency>

    <!-- Fongo and Testcontainers (test scope only) -->
    <dependency>
      <groupId>com.github.fakemongo</groupId>
      <artifactId>fongo</artifactId>
      <version>${fongo.version}</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.testcontainers</groupId>
      <artifactId>testcontainers</artifactId>
      <version>${testcontainers.version}</version>
      <scope>test</scope>
    </dependency>

    <!-- Testing: test-jars from other Baleen modules -->
    <dependency>
      <groupId>uk.gov.dstl.baleen</groupId>
      <artifactId>baleen-annotators</artifactId>
      <version>${baleen.version}</version>
      <type>test-jar</type>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>uk.gov.dstl.baleen</groupId>
      <artifactId>baleen-uima</artifactId>
      <version>${baleen.version}</version>
      <type>test-jar</type>
      <scope>test</scope>
    </dependency>

  </dependencies>

</project>
@@ -0,0 +1,246 @@
// Copyright (c) Committed Software 2018, opensource@committed.io
package uk.gov.dstl.baleen.annotators.coreference;

import java.util.Collection;
import java.util.Optional;
import java.util.Set;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ExternalResource;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;

import com.google.common.collect.ImmutableSet;

import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.core.utils.BuilderUtils;
import uk.gov.dstl.baleen.entity.linking.Candidate;
import uk.gov.dstl.baleen.entity.linking.CandidateRanker;
import uk.gov.dstl.baleen.entity.linking.CandidateSupplier;
import uk.gov.dstl.baleen.entity.linking.EntityInformation;
import uk.gov.dstl.baleen.entity.linking.InformationCollector;
import uk.gov.dstl.baleen.entity.linking.collector.JCasInformationCollector;
import uk.gov.dstl.baleen.entity.linking.collector.ProperNounInformationCollector;
import uk.gov.dstl.baleen.entity.linking.ranker.BagOfWordsCandidateRanker;
import uk.gov.dstl.baleen.resources.SharedStopwordResource;
import uk.gov.dstl.baleen.types.language.WordToken;
import uk.gov.dstl.baleen.types.semantic.Entity;
import uk.gov.dstl.baleen.types.semantic.ReferenceTarget;
import uk.gov.dstl.baleen.uima.BaleenAnnotator;

/**
* Annotator for linking Entities to a known data source
*
* <p>Entity linking is made up of an {@link InformationCollector}, a {@link CandidateSupplier} and
* a {@link CandidateRanker}. By default the {@link ProperNounInformationCollector} is used, however
* this requires part of speech tagging to also be in the pipeline, so an implementation that does
* not require this is also supplied by {@link JCasInformationCollector}. A {@link
* BagOfWordsCandidateRanker} is also used by default. To configure, an entity type and a candidate
* supplier must be configured e.g.
*
* <pre>
* - class: coreference.EntityLinkingAnnotator
* entityType: Person
* candidateSupplier: dbpedia.DBPediaPersonCandidateSupplier
* </pre>
*
* @param <T> The Entity type
*/
public class EntityLinkingAnnotator<T extends Entity> extends BaleenAnnotator {

  // Packages searched when a configured class name is not fully qualified.
  private static final String SEMANTIC_ENTITY_PACKAGE = "uk.gov.dstl.baleen.types.semantic";
  private static final String COMMON_ENTITY_PACKAGE = "uk.gov.dstl.baleen.types.common";
  private static final String COLLECTOR_PACKAGE = "uk.gov.dstl.baleen.entity.linking.collector";
  private static final String RANKER_PACKAGE = "uk.gov.dstl.baleen.entity.linking.ranker";
  private static final String SUPPLIER_PACKAGE = "uk.gov.dstl.baleen.entity.linking.supplier";

  /**
   * Connection to Stopwords Resource
   *
   * @baleen.resource uk.gov.dstl.baleen.resources.SharedStopwordResource
   */
  public static final String KEY_STOPWORDS = "stopwords";

  @ExternalResource(key = KEY_STOPWORDS)
  private SharedStopwordResource stopwordResource;

  /**
   * The stoplist to use. If the stoplist matches one of the enums provided in {@link
   * SharedStopwordResource#StopwordList}, then that list will be loaded.
   *
   * <p>Otherwise, the string is taken to be a file path and that file is used. The format of the
   * file is expected to be one stopword per line.
   *
   * @baleen.config DEFAULT
   */
  public static final String PARAM_STOPLIST = "stoplist";

  @ConfigurationParameter(name = PARAM_STOPLIST, defaultValue = "DEFAULT")
  private String stoplist;

  /**
   * The entity type class name. Can be a simple class name if the class is in
   * uk.gov.dstl.baleen.types.semantic or uk.gov.dstl.baleen.types.common, or a fully qualified
   * name
   *
   * @baleen.config
   */
  public static final String PARAM_ENTITY_TYPE = "entityType";

  @ConfigurationParameter(name = PARAM_ENTITY_TYPE)
  private String entityTypeClassName;

  /**
   * The InformationCollector implementation class name. Can be a fully qualified name or simple
   * class name if a package is declared or the implementation is in
   * uk.gov.dstl.baleen.entity.linking.collector
   *
   * @baleen.config
   */
  public static final String PARAM_INFORMATION_COLLECTOR = "informationCollector";

  @ConfigurationParameter(
    name = PARAM_INFORMATION_COLLECTOR,
    mandatory = false,
    defaultValue = "ProperNounInformationCollector"
  )
  private String informationCollectorClassName;

  /**
   * The CandidateSupplier implementation class name. Can be a fully qualified name or simple class
   * name if a package is declared or the implementation is in
   * uk.gov.dstl.baleen.entity.linking.supplier
   *
   * @baleen.config
   */
  public static final String PARAM_CANDIDATE_SUPPLIER = "candidateSupplier";

  @ConfigurationParameter(name = PARAM_CANDIDATE_SUPPLIER)
  private String candidateSupplierClassName;

  /**
   * The CandidateRanker implementation class name. Can be a fully qualified name or simple class
   * name if a package is declared or the implementation is in
   * uk.gov.dstl.baleen.entity.linking.ranker
   *
   * @baleen.config
   */
  public static final String PARAM_CANDIDATE_RANKER = "candidateRanker";

  @ConfigurationParameter(
    name = PARAM_CANDIDATE_RANKER,
    mandatory = false,
    defaultValue = "BagOfWordsCandidateRanker"
  )
  private String candidateRankerClassName;

  /**
   * Configuration arguments for candidate supplier. This is mandatory for the
   * MongoCandidateSupplier. Mandatory fields for MongoCandidateSupplier are "collection" and
   * "searchField". Should be an array of Strings of key value pairs. For example, ["collection",
   * "peopleCollection", "language", "en", "port", "1234"]
   *
   * @baleen.config {}
   */
  public static final String PARAM_CANDIDATE_SUPPLIER_CONFIG_ARGUMENTS =
      "candidateSupplierArguments";

  @ConfigurationParameter(
    name = PARAM_CANDIDATE_SUPPLIER_CONFIG_ARGUMENTS,
    mandatory = false,
    defaultValue = {}
  )
  private String[] candidateSupplierOptions;

  // Resolved from the configured class names in doInitialize.
  private Class<T> entityClass;
  private InformationCollector informationCollector;
  private CandidateSupplier<T> candidateSupplier;
  private CandidateRanker<T> candidateRanker;

  /**
   * Resolves the configured entity type and instantiates the information collector, candidate
   * supplier and candidate ranker from their configured class names. The supplier is configured
   * with the supplier arguments and the ranker is initialised with the stopwords.
   *
   * @param aContext the UIMA context, passed on to the superclass
   * @throws ResourceInitializationException wrapping any exception raised while resolving,
   *     instantiating or configuring the components
   */
  @Override
  @SuppressWarnings("unchecked")
  public void doInitialize(UimaContext aContext) throws ResourceInitializationException {
    super.doInitialize(aContext);

    Collection<String> stopwords = stopwordResource.getStopwords(stoplist);

    try {
      entityClass =
          BuilderUtils.getClassFromString(
              entityTypeClassName, SEMANTIC_ENTITY_PACKAGE, COMMON_ENTITY_PACKAGE);

      informationCollector =
          (InformationCollector)
              BuilderUtils.getClassFromString(informationCollectorClassName, COLLECTOR_PACKAGE)
                  .newInstance();

      candidateSupplier =
          (CandidateSupplier<T>)
              BuilderUtils.getClassFromString(candidateSupplierClassName, SUPPLIER_PACKAGE)
                  .newInstance();

      candidateSupplier.configure(candidateSupplierOptions);

      candidateRanker =
          (CandidateRanker<T>)
              BuilderUtils.getClassFromString(candidateRankerClassName, RANKER_PACKAGE)
                  .newInstance();

      candidateRanker.initialize(stopwords);

    } catch (Exception e) {
      throw new ResourceInitializationException(e);
    }
  }

  /**
   * Collects the entity information for the configured entity type from the document and attempts
   * to link each one.
   *
   * @param jCas the document being processed
   * @throws AnalysisEngineProcessException declared by the overridden method
   */
  @Override
  protected void doProcess(JCas jCas) throws AnalysisEngineProcessException {
    Set<EntityInformation<T>> entityInformationSet =
        informationCollector.getEntityInformation(jCas, entityClass);

    for (EntityInformation<T> entityInformation : entityInformationSet) {
      linkEntity(jCas, entityInformation);
    }
  }

  /**
   * Links a single entity. A lone candidate is linked directly; otherwise the ranker chooses, and
   * nothing is linked when the ranker returns no candidate.
   *
   * @param jCas the document being processed
   * @param entityInformation the entity information to find a link for
   */
  private void linkEntity(JCas jCas, EntityInformation<T> entityInformation) {
    Collection<Candidate> candidates = candidateSupplier.getCandidates(entityInformation);

    // With exactly one candidate there is nothing to rank.
    if (candidates.size() == 1) {
      setLinking(jCas, entityInformation, candidates.iterator().next().getId());
      return;
    }

    Optional<Candidate> topCandidate =
        candidateRanker.getTopCandidate(entityInformation, candidates);
    topCandidate.ifPresent(best -> setLinking(jCas, entityInformation, best.getId()));
  }

  /**
   * Replaces the entity information's reference target with a new one carrying the given linking
   * and points every mention at it.
   *
   * @param jCas the document being processed
   * @param entityInformation the entity information whose mentions are re-targeted
   * @param linking the identifier of the chosen candidate
   */
  private void setLinking(JCas jCas, EntityInformation<T> entityInformation, String linking) {
    removeFromJCasIndex(entityInformation.getReferenceTarget());
    ReferenceTarget referenceTarget = new ReferenceTarget(jCas);
    referenceTarget.setLinking(linking);
    for (Entity e : entityInformation.getMentions()) {
      e.setReferent(referenceTarget);
    }
    addToJCasIndex(referenceTarget);
    addToJCasIndex(entityInformation.getMentions());
  }

  @Override
  public AnalysisEngineAction getAction() {
    return new AnalysisEngineAction(
        ImmutableSet.of(Entity.class, WordToken.class, ReferenceTarget.class),
        ImmutableSet.of(ReferenceTarget.class));
  }

  /**
   * Closes the candidate supplier, if one was created, then delegates to the superclass. Failures
   * while closing are logged as warnings rather than propagated.
   */
  @Override
  protected void doDestroy() {
    try {
      // The supplier is null when initialisation failed before it was instantiated.
      if (candidateSupplier != null) {
        candidateSupplier.close();
      }
    } catch (Exception e) {
      getMonitor().warn("Error closing resources", e);
    } finally {
      super.doDestroy();
    }
  }
}
@@ -0,0 +1,29 @@
// Copyright (c) Committed Software 2018, opensource@committed.io
package uk.gov.dstl.baleen.entity.linking;

import java.util.Map;

/** A Candidate to link to a ReferenceTarget */
public interface Candidate {

  /**
   * Returns the identifier that uniquely distinguishes this candidate, such as a URL or a database
   * ID.
   *
   * @return the candidate's identifier
   */
  String getId();

  /**
   * Returns the candidate's name, i.e. the value of the property that candidates are matched on.
   *
   * @return the candidate's name
   */
  String getName();

  /**
   * Returns the key value pairs held for this candidate.
   *
   * @return a map of the key value pairs describing the candidate
   */
  Map<String, String> getKeyValuePairs();
}
@@ -0,0 +1,34 @@
// Copyright (c) Committed Software 2018, opensource@committed.io
package uk.gov.dstl.baleen.entity.linking;

import java.util.Collection;
import java.util.Optional;

import uk.gov.dstl.baleen.types.semantic.Entity;

/**
* Interface for ranking a collection of candidates
*
* @param <T> The Entity type
*/
public interface CandidateRanker<T extends Entity> {

  /**
   * Ranks the supplied candidates and returns the top one, wrapped in an Optional.
   *
   * @param entityInformation the entity information that the candidates relate to
   * @param candidates the candidates to rank
   * @return an Optional holding the top candidate
   */
  Optional<Candidate> getTopCandidate(
      EntityInformation<T> entityInformation, Collection<Candidate> candidates);

  /**
   * Hands the ranker the stopwords, should it need them.
   *
   * <p>This method exists because configuration can not be passed directly to these classes.
   *
   * @param stopwords the stopwords supplied to the ranker
   */
  void initialize(Collection<String> stopwords);
}

0 comments on commit 666c139

Please sign in to comment.