Skip to content

Commit

Permalink
Fixes to Wiktionary & WebIsALOD wrt special characters.
Browse files Browse the repository at this point in the history
  • Loading branch information
janothan committed Apr 11, 2021
1 parent 76f531f commit 99a21a4
Show file tree
Hide file tree
Showing 7 changed files with 103 additions and 54 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -99,12 +99,16 @@ public boolean isHypernymous(String linkedConcept_1, String linkedConcept_2){
Set<String> hypernyms_1 = getHypernyms(linkedConcept_1);
Set<String> hypernyms_2 = getHypernyms(linkedConcept_2);

for(String hypernym : hypernyms_1){
if(linkedConcept_2.equals(hypernym)) return true;
}
for(String hypernym : hypernyms_2){
if(linkedConcept_1.equals(hypernym)) return true;
}
if(hypernyms_1 != null) {
for (String hypernym : hypernyms_1) {
if (linkedConcept_2.equals(hypernym)) return true;
}
}
if(hypernyms_2 != null) {
for (String hypernym : hypernyms_2) {
if (linkedConcept_1.equals(hypernym)) return true;
}
}
return false;
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
package de.uni_mannheim.informatik.dws.melt.matching_jena_matchers.external.matcher;

import de.uni_mannheim.informatik.dws.melt.matching_base.Filter;
import de.uni_mannheim.informatik.dws.melt.matching_jena.MatcherPipelineYAAAJenaConstructor;
import de.uni_mannheim.informatik.dws.melt.matching_jena.MatcherYAAAJena;
import de.uni_mannheim.informatik.dws.melt.matching_jena_matchers.external.LabelToConceptLinker;
import de.uni_mannheim.informatik.dws.melt.matching_jena_matchers.external.SemanticWordRelationDictionary;
import de.uni_mannheim.informatik.dws.melt.matching_jena_matchers.filter.TopXFilter;
import de.uni_mannheim.informatik.dws.melt.matching_jena_matchers.filter.extraction.HungarianExtractor;
import de.uni_mannheim.informatik.dws.melt.matching_jena_matchers.filter.extraction.MaxWeightBipartiteExtractor;
import de.uni_mannheim.informatik.dws.melt.yet_another_alignment_api.Alignment;
Expand Down Expand Up @@ -33,6 +33,8 @@ public class BackgroundMatcherStandAlone extends MatcherYAAAJena {

private BackgroundMatcher backgroundMatcher;

private TopXFilter topXFilter;

/**
* The name of the matcher.
*/
Expand Down Expand Up @@ -61,7 +63,7 @@ public BackgroundMatcherStandAlone(SemanticWordRelationDictionary backgroundKnow
ImplementedBackgroundMatchingStrategies strategy,
boolean isUseOneToOneExtractor,
double threshold){
this(backgroundKnowledgeSource, strategy, isUseOneToOneExtractor, null, threshold);
this(backgroundKnowledgeSource, strategy, isUseOneToOneExtractor, null, threshold, 1);
}

/**
Expand All @@ -71,26 +73,31 @@ public BackgroundMatcherStandAlone(SemanticWordRelationDictionary backgroundKnow
* @param isUseOneToOneExtractor True if alignment shall be transformed to a 1-1 alignment.
* @param extractor The desired extractor that shall be used.
* @param threshold The minimum threshold required for a match.
* @param topX The top X correspondences that shall be kept.
*/
public BackgroundMatcherStandAlone(SemanticWordRelationDictionary backgroundKnowledgeSource,
ImplementedBackgroundMatchingStrategies strategy,
boolean isUseOneToOneExtractor,
MatcherYAAAJena extractor,
double threshold){
double threshold,
int topX){
this.backgroundKnowledgeSource = backgroundKnowledgeSource;
this.strategy = strategy;
this.threshold = threshold;
this.simpleStringMatcher = new SimpleStringMatcher();
this.backgroundMatcher = new BackgroundMatcher(backgroundKnowledgeSource, strategy, threshold);
this.topXFilter = new TopXFilter(topX, TopXFilter.TopFilterMode.SMALLEST, threshold);

if(isUseOneToOneExtractor){
if(extractor == null) {
// default extractor: Use Hungarian at the moment due to infinite loop issues with MWBE.
//MaxWeightBipartiteExtractor mwb = new MaxWeightBipartiteExtractor();
HungarianExtractor he = new HungarianExtractor();
pipelineYAAAJena = new MatcherPipelineYAAAJenaConstructor(simpleStringMatcher, backgroundMatcher, he);
pipelineYAAAJena = new MatcherPipelineYAAAJenaConstructor(simpleStringMatcher, backgroundMatcher,
topXFilter, he);
} else {
pipelineYAAAJena = new MatcherPipelineYAAAJenaConstructor(simpleStringMatcher, backgroundMatcher, extractor);
pipelineYAAAJena = new MatcherPipelineYAAAJenaConstructor(simpleStringMatcher, backgroundMatcher,
topXFilter, extractor);
}
} else {
pipelineYAAAJena = new MatcherPipelineYAAAJenaConstructor(simpleStringMatcher, backgroundMatcher);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,14 @@ public static boolean safeAsk(QueryExecution queryExecutionInstance) {
} catch (InterruptedException ie) {
LOGGER.error("Interrupted exception.", ie);
}
LOGGER.error("Retry.");
result = queryExecutionInstance.execAsk();
try {
LOGGER.error("Retry.");
result = queryExecutionInstance.execAsk();
} catch (Exception e2){
LOGGER.error("Failed to execute ASK query. Returning false.", e2);
LOGGER.error("Problematic ASK query:\n" + queryExecutionInstance.getQuery().toString());
return false;
}
} // end of catch
return result;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
*/
public class WebIsAlodSPARQLservice {


/**
* Default logger
*/
Expand Down Expand Up @@ -379,13 +380,13 @@ private boolean isHypernymous(WebIsAlodEndpoint sparqlWebIsAlodEndpoint, String
}
QueryExecution qe = QueryExecutionFactory.sparqlService(sparqlWebIsAlodEndpoint.toString(), getIsHypernymousAskQueryClassic(uri1, uri2, minimumConfidence, this.webIsAlodEndpoint.isClassic()));
boolean result = safeAsk(qe);
qe.close();
hypernymyAskBuffer.put(uriTuple, result);
if (sparqlWebIsAlodEndpoint.equals(WebIsAlodEndpoint.ALOD_CLASSIC_ENDPOINT)) {
commit(ALOD_CLASSIC_HYPERNYMY_ASK_BUFFER);
} else {
commit(ALOD_XL_HYPERNYMY_ASK_BUFFER);
}
qe.close();
return result;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

/**
* Class utilizing DBnary, a SPARQL endpoint for Wiktionary.
* Alternatively, TDB1 can be used as offline storage.
*/
public class WiktionaryKnowledgeSource extends SemanticWordRelationDictionary {

Expand Down Expand Up @@ -120,7 +121,7 @@ public WiktionaryKnowledgeSource(String tdbDirectoryPath) {
//tdbDataset = TDB2Factory.connectDataset(tdbDirectoryPath);
tdbDataset = TDBFactory.createDataset(tdbDirectoryPath);
tdbDataset.begin(ReadWrite.READ);
this.isDiskBufferEnabled = false;
this.isDiskBufferEnabled = true;
initialize();
}

Expand Down Expand Up @@ -166,25 +167,33 @@ public boolean isInDictionary(String word) {
* @return boolean indicating whether the word exists in the dictionary in the corresponding language.
*/
public boolean isInDictionary(String word, Language language) {
if(word == null || language == null){
return false;
}
word = encodeWord(word);
String key = "in_dict_" + word + "_" + language.toSparqlChar2();
if(askBuffer.containsKey(key)){
return askBuffer.get(key);
}

String queryString =
"PREFIX lexvo: <http://lexvo.org/id/iso639-3/>\r\n" +
"PREFIX dbnary: <http://kaiko.getalp.org/dbnary#>\r\n" +
"ASK { <http://kaiko.getalp.org/dbnary/" + language.toWiktionaryChar3() + "/" + word + "> ?p ?o . }";
Query query = QueryFactory.create(queryString);
QueryExecution queryExecution;
if(isUseTdb) {
queryExecution = QueryExecutionFactory.create(query, tdbDataset);
} else {
queryExecution = QueryExecutionFactory.sparqlService(ENDPOINT_URL, query);
boolean result = false;
try {
Query query = QueryFactory.create(queryString);
QueryExecution queryExecution;
if (isUseTdb) {
queryExecution = QueryExecutionFactory.create(query, tdbDataset);
} else {
queryExecution = QueryExecutionFactory.sparqlService(ENDPOINT_URL, query);
}
result = queryExecution.execAsk();
queryExecution.close();
} catch (Exception e){
// logging actual error is disabled
LOGGER.warn("An error occurred while trying to look up: '" + word + "'. Returning false.");
}
boolean result = queryExecution.execAsk();
queryExecution.close();
askBuffer.put(key, result);
commit(WIKTIONARY_ASK_BUFFER);
return result;
Expand Down Expand Up @@ -285,18 +294,22 @@ public HashSet<String> getSynonyms(String word, Language language) {
"}\r\n" +
"}";
//System.out.println(queryString);
Query query = QueryFactory.create(queryString);
QueryExecution queryExecution;
if(isUseTdb) {
queryExecution = QueryExecutionFactory.create(query, tdbDataset);
} else {
queryExecution = QueryExecutionFactory.sparqlService(ENDPOINT_URL, query);
}
ResultSet queryResult = queryExecution.execSelect();
while (queryResult.hasNext()) {
result.add(getLemmaFromURI(queryResult.next().getResource("synonym").toString()));
try {
Query query = QueryFactory.create(queryString);
QueryExecution queryExecution;
if (isUseTdb) {
queryExecution = QueryExecutionFactory.create(query, tdbDataset);
} else {
queryExecution = QueryExecutionFactory.sparqlService(ENDPOINT_URL, query);
}
ResultSet queryResult = queryExecution.execSelect();
while (queryResult.hasNext()) {
result.add(getLemmaFromURI(queryResult.next().getResource("synonym").toString()));
}
queryExecution.close();
} catch (Exception e){
LOGGER.warn("Problem with query getSynonyms for word: '" + word + "'.");
}
queryExecution.close();
synonymyBuffer.put(word + "_" + language.toWiktionaryChar3(), result);
commit(WIKTIONARY_SYNONYMY_BUFFER);
return result;
Expand All @@ -320,10 +333,13 @@ private static String getLemmaFromURI(String uri) {
*/
static String encodeWord(String word) {
    // The Java default encoder cannot be used here due to some idiosyncratic encodings,
    // so only the characters listed below are rewritten by hand.
    return word.trim()
            // '%' has to be escaped first: the escapes produced below contain '%' themselves.
            .replace("%", "%25")
            // blanks are represented by underscores rather than by a percent escape
            .replace(" ", "_")
            .replace(".", "%2E")
            .replace("^", "%5E")
            .replace("<", "%3C")
            .replace(">", "%3E");
}

Expand All @@ -346,12 +362,13 @@ public HashSet<String> getHypernyms(String linkedConcept) {
* @return A set of hypernyms.
*/
public HashSet<String> getHypernyms(String linkedConcept, Language language) {
if (linkedConcept == null) return null;
HashSet<String> result = new HashSet<>();
if (linkedConcept == null) return result;
linkedConcept = encodeWord(linkedConcept);
if (hypernymyBuffer.containsKey(linkedConcept + "_" + linkedConcept)) {
return hypernymyBuffer.get(linkedConcept + "_" + linkedConcept);
String key = linkedConcept + "_" + language.toSparqlChar2();
if (hypernymyBuffer.containsKey(key)) {
return hypernymyBuffer.get(key);
}
HashSet<String> result = new HashSet<>();
String queryString = "PREFIX dbnary: <http://kaiko.getalp.org/dbnary#>\n" +
"PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\n" +
"PREFIX dbnarylan: <http://kaiko.getalp.org/dbnary/eng/>\n" +
Expand Down Expand Up @@ -384,13 +401,12 @@ public HashSet<String> getHypernyms(String linkedConcept, Language language) {
result.add(getLemmaFromURI(queryResult.next().getResource("hypernym").toString()));
}
queryExecution.close();
hypernymyBuffer.put(linkedConcept + "_" + language.toWiktionaryChar3(), result);
commit(WIKTIONARY_HYPERNYMY_BUFFER);
return result;
} catch (QueryParseException qpe) {
LOGGER.info("Faild to build query for concept '" + linkedConcept + "'", qpe);
return null;
LOGGER.warn("Failed to build getHypernyms query for concept '" + linkedConcept + "'");
}
hypernymyBuffer.put(key, result);
commit(WIKTIONARY_HYPERNYMY_BUFFER);
return result;
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import de.uni_mannheim.informatik.dws.melt.matching_jena_matchers.external.LabelToConceptLinker;
import de.uni_mannheim.informatik.dws.melt.matching_jena_matchers.external.Language;
import de.uni_mannheim.informatik.dws.melt.matching_jena_matchers.external.services.persistence.PersistenceService;
import de.uni_mannheim.informatik.dws.melt.matching_jena_matchers.external.services.testTools.TestOperations;
import org.apache.commons.io.FileUtils;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
Expand All @@ -28,11 +27,11 @@ public class WiktionaryKnowledgeSourceTdbTest {
public static void prepare() {
deletePersistenceDirectory();
String key = "wiktionaryTdbDirectory";
String tdbpath = getKeyFromConfigFiles("wiktionaryTdbDirectory");
if(tdbpath == null){
String tdbPath = getKeyFromConfigFiles("wiktionaryTdbDirectory");
if(tdbPath == null){
fail("Cannot find config.properties or local_config.properties with key " + key);
}
wiktionary = new WiktionaryKnowledgeSource(tdbpath);
wiktionary = new WiktionaryKnowledgeSource(tdbPath);
}

@AfterAll
Expand All @@ -46,7 +45,7 @@ public static void shutDown() {
*/
private static void deletePersistenceDirectory() {
File result = new File(PersistenceService.PERSISTENCE_DIRECTORY);
if (result != null && result.exists() && result.isDirectory()) {
if (result.exists() && result.isDirectory()) {
try {
FileUtils.deleteDirectory(result);
} catch (IOException e) {
Expand Down Expand Up @@ -93,8 +92,11 @@ public void testIsInDictionaryString() {
// true positive check; check for correct encoding of spaces
assertTrue(wiktionary.isInDictionary("seminal fluid"));

// true positive check; check for correct encoding of %
// true positive check; check for correct encoding of special characters
assertTrue(wiktionary.isInDictionary("%"));
assertTrue(wiktionary.isInDictionary("Alzheimer's"));
assertTrue(wiktionary.isInDictionary("Alzheimer's\n"));
assertTrue(wiktionary.isInDictionary("Alzheimer's Disease"));

// false positive check
assertFalse(wiktionary.isInDictionary("asdfasdfasdf"));
Expand Down Expand Up @@ -166,6 +168,8 @@ public void testHypernymy(){
// assert linking process compatibility
assertTrue(wiktionary.getHypernyms(wiktionary.getLinker().linkToSingleConcept("cat")).contains("feline"));
assertFalse(wiktionary.getHypernyms(wiktionary.getLinker().linkToSingleConcept("cat")).contains("dog"));

wiktionary.getHypernyms(wiktionary.getLinker().linkToSingleConcept("Alzheimer's disease")).size();
}

@Test
Expand All @@ -179,5 +183,4 @@ void isSynonymousOrHypernymyous(){
assertTrue(wiktionary.isSynonymousOrHypernymous(wiktionary.getLinker().linkToSingleConcept("dog"), wiktionary.getLinker().linkToSingleConcept("hound")));
assertFalse(wiktionary.isSynonymousOrHypernymous(wiktionary.getLinker().linkToSingleConcept("dog"), wiktionary.getLinker().linkToSingleConcept("cat")));
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ private static void deletePersistenceDirectory() {
} catch (IOException e) {
LOGGER.error("Failed to remove persistence directory.");
}

}

@Test
Expand All @@ -55,6 +54,16 @@ public void testIsInDictionaryString() {
// true positive check; check for correct encoding of %
assertTrue(wiktionary.isInDictionary("%"));

assertTrue(wiktionary.isInDictionary("Alzheimer's"));
assertTrue(wiktionary.isInDictionary("Alzheimer's\n"));
assertTrue(wiktionary.isInDictionary("Alzheimer's Disease"));

// true positive with language
assertTrue(wiktionary.isInDictionary("Ähre", Language.GERMAN ));

// false positive check; check for stability with random signs
assertFalse(wiktionary.isInDictionary("<"));

// false positive check
assertFalse(wiktionary.isInDictionary("asdfasdfasdf"));
}
Expand Down Expand Up @@ -111,20 +120,23 @@ public void testIsSynonymous() {
WiktionaryKnowledgeSource wiktionary = new WiktionaryKnowledgeSource();
assertTrue(wiktionary.isSynonymous("dog", "hound"));
assertTrue(wiktionary.isSynonymous("dog", "dog"));
assertFalse(wiktionary.isSynonymous("dog", "cat"));
assertFalse(wiktionary.isSynonymous("dog\n", "cat"));
}

@Test
public void testIsStrongFromSynonymous() {
    // Knowledge source under test, created through the no-argument constructor.
    final WiktionaryKnowledgeSource knowledgeSource = new WiktionaryKnowledgeSource();

    // A known synonym pair is expected to be reported as strong-form synonymous.
    assertTrue(knowledgeSource.isStrongFormSynonymous("dog", "hound"));

    // The same pair with a trailing line break on the first word is expected to match as well.
    assertTrue(knowledgeSource.isStrongFormSynonymous("dog\n", "hound"));

    // A word compared with itself is expected to match.
    assertTrue(knowledgeSource.isStrongFormSynonymous("dog", "dog"));

    // Two different animals are expected not to match.
    assertFalse(knowledgeSource.isStrongFormSynonymous("dog", "cat"));
}

@Test
public void testHypernymy() {
WiktionaryKnowledgeSource wiktionary = new WiktionaryKnowledgeSource();
WiktionaryLinker linker = (WiktionaryLinker) wiktionary.getLinker();

// using default language
assertTrue(wiktionary.getHypernyms("cat").contains("feline"));
assertFalse(wiktionary.getHypernyms("cat").contains("dog"));

Expand Down

0 comments on commit 99a21a4

Please sign in to comment.