Skip to content
Permalink
Browse files

Evaluator can now return NaN F1-Score in case of no reference tokens

  • Loading branch information...
asishallab committed Sep 23, 2016
1 parent 8981af2 commit 7d11bf39b24b285eb3ec568c743ba6532f19c720
Showing with 88 additions and 53 deletions.
  1. +30 −52 src/ahrd/model/EvaluationScoreCalculator.java
  2. +58 −1 test/ahrd/test/ReferenceDescriptionTest.java
@@ -49,8 +49,7 @@ public EvaluationScoreCalculator(Protein protein) {
* @param referenceTokens
* @return Double - The number of shared Tokens
*/
public static Double truePositives(Set<String> assignedTokens,
Set<String> referenceTokens) {
public static Double truePositives(Set<String> assignedTokens, Set<String> referenceTokens) {
double tp = 0.0;
if (assignedTokens != null && !assignedTokens.isEmpty()) {
for (String assignedTkn : assignedTokens) {
@@ -70,10 +69,8 @@ public static Double truePositives(Set<String> assignedTokens,
* @param referenceTokens
* @return Double - True-Positives-Rate
*/
public static Double truePositivesRate(Set<String> assignedTokens,
Set<String> referenceTokens) {
return truePositives(assignedTokens, referenceTokens)
/ referenceTokens.size();
public static Double truePositivesRate(Set<String> assignedTokens, Set<String> referenceTokens) {
return truePositives(assignedTokens, referenceTokens) / referenceTokens.size();
}

/**
@@ -88,8 +85,8 @@ public static Double truePositivesRate(Set<String> assignedTokens,
* @param allBlastTokens
* @return Double - False-Positives-Rates
*/
public static Double falsePositivesRate(Set<String> assignedTokens,
Set<String> referenceTokens, Set<String> allBlastTokens) {
public static Double falsePositivesRate(Set<String> assignedTokens, Set<String> referenceTokens,
Set<String> allBlastTokens) {
// Count false-positives
double fp = 0;
for (String asgnTkn : assignedTokens) {
@@ -130,14 +127,13 @@ public static Double falsePositivesRate(Set<String> assignedTokens,
* competitor-method
* @param referenceTkns
* - Tokens of the Reference
* @return Double - F-Beta-Score
* @return Double - F-Beta-Score or Double.NaN if no reference Tokens were
* given.
*/
public static Double fBetaScore(Set<String> assignedTkns,
Set<String> referenceTkns) {
public static Double fBetaScore(Set<String> assignedTkns, Set<String> referenceTkns) {
// Validate Reference:
if (referenceTkns == null || referenceTkns.isEmpty())
throw new IllegalArgumentException(
"Cannot calculate F1-Score, got an empty set of Reference-Tokens.");
return Double.NaN;
// Calculate f-beta-score:
double fBetaScore = 0.0;
if (assignedTkns != null && !assignedTkns.isEmpty()) {
@@ -148,8 +144,7 @@ public static Double fBetaScore(Set<String> assignedTkns,
double rc = tp / referenceTkns.size();
// F-Beta-Measure is the harmonic mean of precision and recall
// weighted by param beta:
Double bSqr = getSettings().getFMeasureBetaParameter()
* getSettings().getFMeasureBetaParameter();
Double bSqr = getSettings().getFMeasureBetaParameter() * getSettings().getFMeasureBetaParameter();
fBetaScore = (1 + bSqr) * (pr * rc) / (bSqr * pr + rc);
}
}
@@ -167,8 +162,7 @@ public static Double fBetaScore(Set<String> assignedTkns,
*/
public void addUnchangedBlastResult(String blastDb, BlastResult br) {
if (!getUnchangedBlastResults().containsKey(blastDb)
|| getUnchangedBlastResults().get(blastDb).getBitScore() < br
.getBitScore()) {
|| getUnchangedBlastResults().get(blastDb).getBitScore() < br.getBitScore()) {
getUnchangedBlastResults().put(blastDb, br);
}
}
@@ -179,29 +173,21 @@ public void addUnchangedBlastResult(String blastDb, BlastResult br) {
* Blast-Hit.
*/
public void assignEvlScrsToCompetitors() {
if (getReferenceDescription() != null
&& getReferenceDescription().getDescription() != null) {
if (getReferenceDescription() != null && getReferenceDescription().getDescription() != null) {
// First Competitor is the Description assigned by AHRD itself:
if (getProtein().getDescriptionScoreCalculator()
.getHighestScoringBlastResult() != null) {
if (getProtein().getDescriptionScoreCalculator().getHighestScoringBlastResult() != null) {
// Generate the set of Evaluation-Tokens from the
// actually assigned Description, WITHOUT filtering each
// Token with the BLACKLIST:
getProtein().getDescriptionScoreCalculator()
.getHighestScoringBlastResult().tokenizeForEvaluation();
Set<String> hrdEvlTkns = getProtein()
.getDescriptionScoreCalculator()
.getHighestScoringBlastResult().getEvaluationTokens();
getProtein().getDescriptionScoreCalculator().getHighestScoringBlastResult().tokenizeForEvaluation();
Set<String> hrdEvlTkns = getProtein().getDescriptionScoreCalculator().getHighestScoringBlastResult()
.getEvaluationTokens();
// Calculate the Evaluation-Score as the F-Beta-Score:
setEvalutionScore(fBetaScore(hrdEvlTkns,
getReferenceDescription().getTokens()));
setEvalutionScore(fBetaScore(hrdEvlTkns, getReferenceDescription().getTokens()));
// Enable calculation of the ROC-Curve:
setTruePositivesRate(truePositivesRate(hrdEvlTkns,
getReferenceDescription().getTokens()));
setFalsePositivesRate(falsePositivesRate(hrdEvlTkns,
getReferenceDescription().getTokens(), getProtein()
.getTokenScoreCalculator().getTokenScores()
.keySet()));
setTruePositivesRate(truePositivesRate(hrdEvlTkns, getReferenceDescription().getTokens()));
setFalsePositivesRate(falsePositivesRate(hrdEvlTkns, getReferenceDescription().getTokens(),
getProtein().getTokenScoreCalculator().getTokenScores().keySet()));
} else {
// Well, no Description assigned means scores ZERO:
setEvalutionScore(0.0);
@@ -213,16 +199,14 @@ public void assignEvlScrsToCompetitors() {
Double bestCompEvlScr = 0.0;
if (getUnchangedBlastResults().size() > 0) {
for (String blastDatabase : getUnchangedBlastResults().keySet()) {
BlastResult cmpt = getUnchangedBlastResults().get(
blastDatabase);
BlastResult cmpt = getUnchangedBlastResults().get(blastDatabase);
if (cmpt != null) {
// Generate the set of Evaluation-Tokens from the
// actually assigned Description, WITHOUT filtering each
// Token with the BLACKLIST:
cmpt.tokenizeForEvaluation();
cmpt.setEvaluationScore(fBetaScore(
cmpt.getEvaluationTokens(),
getReferenceDescription().getTokens()));
cmpt.setEvaluationScore(
fBetaScore(cmpt.getEvaluationTokens(), getReferenceDescription().getTokens()));
// Find best performing competitor-method:
if (cmpt.getEvaluationScore() > bestCompEvlScr)
bestCompEvlScr = cmpt.getEvaluationScore();
@@ -232,9 +216,8 @@ public void assignEvlScrsToCompetitors() {
// Also compare with the Blast2GO-Annotation(s), if present:
if (getBlast2GoAnnots() != null) {
for (Blast2GoAnnot b2ga : getBlast2GoAnnots()) {
b2ga.setEvaluationScore(fBetaScore(
b2ga.getEvaluationTokens(),
getReferenceDescription().getTokens()));
b2ga.setEvaluationScore(
fBetaScore(b2ga.getEvaluationTokens(), getReferenceDescription().getTokens()));
// Find best performing competitor-method:
if (b2ga.getEvaluationScore() > bestCompEvlScr)
bestCompEvlScr = b2ga.getEvaluationScore();
@@ -253,15 +236,13 @@ public void assignEvlScrsToCompetitors() {
*/
public void findHighestPossibleEvaluationScore() {
setHighestPossibleEvaluationScore(0.0);
for (List<BlastResult> resultsFromBlastDatabase : getProtein()
.getBlastResults().values()) {
for (List<BlastResult> resultsFromBlastDatabase : getProtein().getBlastResults().values()) {
for (BlastResult cmpt : resultsFromBlastDatabase) {
// Generate the set of Evaluation-Tokens from the
// actually assigned Description, WITHOUT filtering each
// Token with the BLACKLIST:
cmpt.tokenizeForEvaluation();
cmpt.setEvaluationScore(fBetaScore(cmpt.getEvaluationTokens(),
getReferenceDescription().getTokens()));
cmpt.setEvaluationScore(fBetaScore(cmpt.getEvaluationTokens(), getReferenceDescription().getTokens()));
// Find best performing BlastResult-Description:
if (cmpt.getEvaluationScore() > getHighestPossibleEvaluationScore())
setHighestPossibleEvaluationScore(cmpt.getEvaluationScore());
@@ -302,17 +283,15 @@ public ReferenceDescription getReferenceDescription() {
return referenceDescription;
}

public void setReferenceDescription(
ReferenceDescription referenceDescription) {
public void setReferenceDescription(ReferenceDescription referenceDescription) {
this.referenceDescription = referenceDescription;
}

/**
 * @return Map - the stored Blast-Results, keyed by the String under which
 *         they were registered
 */
public Map<String, BlastResult> getUnchangedBlastResults() {
	return this.unchangedBlastResults;
}

public void setUnchangedBlastResults(
Map<String, BlastResult> unchangedBlastResults) {
public void setUnchangedBlastResults(Map<String, BlastResult> unchangedBlastResults) {
this.unchangedBlastResults = unchangedBlastResults;
}

@@ -368,8 +347,7 @@ public Double getHighestPossibleEvaluationScore() {
return highestPossibleEvaluationScore;
}

public void setHighestPossibleEvaluationScore(
Double highestPossibleEvaluationScore) {
public void setHighestPossibleEvaluationScore(Double highestPossibleEvaluationScore) {
this.highestPossibleEvaluationScore = highestPossibleEvaluationScore;
}

@@ -1,9 +1,13 @@
package ahrd.test;

import static ahrd.controller.Settings.setSettings;
import static ahrd.model.ReferenceDescription.constructFromFastaEntry;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;

import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Map;
@@ -13,6 +17,7 @@
import org.xml.sax.SAXException;

import ahrd.controller.Evaluator;
import ahrd.controller.Settings;
import ahrd.exception.MissingAccessionException;
import ahrd.exception.MissingProteinException;
import ahrd.model.Protein;
@@ -26,7 +31,7 @@
public void testParsingOfReferences() throws IOException {
TestUtils.initTestSettings();
String fastaEntry = "AT06g1234 Sheep wool growth factor\nRSSPMSRATVDAAPLLASAAASSGTAPMIEISAAEPKRAPKRVSTTPVTPDRPNSSPPNE\nLIVTVWLFGKMMRSHPTVTRFWPTFRPDW";
ReferenceDescription rd = ReferenceDescription.constructFromFastaEntry(fastaEntry);
ReferenceDescription rd = constructFromFastaEntry(fastaEntry);
assertEquals("AT06g1234", rd.getAccession());
assertEquals("Sheep wool growth factor", rd.getDescription());
assertEquals(4, rd.getTokens().size());
@@ -57,4 +62,56 @@ public void testSwissprotBatch1ReferenceTokens()
!rd.getTokens().isEmpty());
}
}

/**
 * Loads the settings from evaluator_filter_references_example_input.yml and
 * checks the tokens of four reference descriptions built from FASTA
 * entries: "hypothetical protein" and "unknown protein" must yield no
 * tokens at all (presumably removed by the configured blacklist and
 * filters - confirm against the settings file), while the two informative
 * descriptions must yield each of their lower-cased words as a token.
 */
@Test
public void testReferenceDescriptionBlacklistingAndFiltering() throws IOException {
	setSettings(new Settings(
			Paths.get("test", "resources", "evaluator_filter_references_example_input.yml").toString()));
	ReferenceDescription rd1 = constructFromFastaEntry(
			"ATMG00450.1 hypothetical protein\nMVVTAYPKSSAGMGVTVLPEYLKQSSYEAYSRPYSAFFLSGCTKQERSPLLARRLVDAWL");
	ReferenceDescription rd2 = constructFromFastaEntry(
			"AT1G31870.1 unknown protein\nMAGNQSLKDYLKKYESSDVVEKKKKKKKQKKPSKPEPRGVLVVDEDPVWQKQVDPEEDEN");
	ReferenceDescription rd3 = constructFromFastaEntry(
			"AT1G75110.1 REDUCED RESIDUAL ARABINOSE 2\nMAGRRDRIQQLRGSRIAIAIFVGILIGCVCSVLFPNGFFNSGSSLIANEERISKSTSTDG");
	ReferenceDescription rd4 = constructFromFastaEntry(
			"AT1G75080.1 BRASSINAZOLE-RESISTANT 1; DNA binding / transcription regulator/ transcription repressor\nMTSDGATSTSAAAAAAAAAAARRKPSWRERENNRRRERRRRAVAAKIYTGLRAQGDYNLP");

	// Every entry must be parsed into a ReferenceDescription:
	assertNotNull("Expected ReferenceDescription rd1 but got null.", rd1);
	assertNotNull("Expected ReferenceDescription rd2 but got null.", rd2);
	assertNotNull("Expected ReferenceDescription rd3 but got null.", rd3);
	assertNotNull("Expected ReferenceDescription rd4 but got null.", rd4);

	// Uninformative descriptions must end up without any token:
	assertTrue("Expected rd1 to not have any tokens, instead has: (" + rd1.getTokens() + ")",
			rd1.getTokens().isEmpty());
	assertTrue("Expected rd2 to not have any tokens, instead has: (" + rd2.getTokens() + ")",
			rd2.getTokens().isEmpty());

	// Informative descriptions must keep each of their words as a token:
	for (String token : new String[] { "reduced", "residual", "arabinose", "2" }) {
		assertContainsToken(rd3, token);
	}
	// 'transcription' occurs twice in the description but needs to be
	// checked only once.
	for (String token : new String[] { "brassinazole", "resistant", "1", "dna", "binding", "transcription",
			"regulator", "repressor" }) {
		assertContainsToken(rd4, token);
	}
}

/**
 * Fails the running test unless the tokens of the given reference
 * description contain the expected token.
 *
 * @param rd
 *            - the reference description whose tokens are inspected
 * @param token
 *            - the token expected to be present
 */
private static void assertContainsToken(ReferenceDescription rd, String token) {
	assertTrue("Expected ReferenceDescription to contain Token '" + token + "', instead has: ("
			+ rd.getTokens() + ").", rd.getTokens().contains(token));
}
}

0 comments on commit 7d11bf3

Please sign in to comment.
You can’t perform that action at this time.