Skip to content
Permalink
Browse files

Implemented option to filter the reference descriptions as is done with candidate descriptions of normal BlastResults. That involves: Blacklisting, Filtering, and Blacklisting of Tokens.
  • Loading branch information...
asishallab committed Aug 18, 2016
1 parent d0148c8 commit 7adff266d975c771b2cb09aa673a14de650309fa
@@ -63,6 +63,9 @@ public static void setSettings(Settings s) {
public static final String TOKEN_SCORE_OVERLAP_SCORE_WEIGHT = "token_score_overlap_score_weight";
public static final String DESCRIPTION_SCORE_BIT_SCORE_WEIGHT = "description_score_bit_score_weight";
public static final String REFERENCES_FASTA_KEY = "references_fasta";
public static final String REFERENCES_DESCRIPTION_FILTER_KEY = "references_description_filter";
public static final String REFERENCES_DESCRIPTION_BLACKLIST_KEY = "references_description_blacklist";
public static final String REFERENCES_TOKEN_BLACKLIST_KEY = "references_token_blacklist";
public static final String F_MEASURE_BETA_PARAM_KEY = "f_measure_beta_parameter";
public static final String BLAST_2_GO_ANNOT_FILE_KEY = "blast2go";
public static final String TEMPERATURE_KEY = "temperature";
@@ -94,12 +97,19 @@ public static void setSettings(Settings s) {
public static final Pattern DEFAULT_REFERENCE_GO_REGEX = Pattern
.compile("^UniProtKB\\s+(?<shortAccession>\\S+)\\s+\\S+\\s+(?<goTerm>GO:\\d{7})");
public static final String PREFER_REFERENCE_WITH_GO_ANNOS_KEY = "prefer_reference_with_go_annos";
public static final String EVALUATE_VALID_TAKENS_KEY = "evaluate_valid_tokens";

/**
* Fields:
*/
private String pathToProteinsFasta;
private String pathToReferencesFasta;
private String pathToReferencesDescriptionBlacklist;
private List<String> referencesDescriptionBlacklist;
private String pathToReferencesDescriptionFilter;
private List<String> referencesDescriptionFilter;
private String pathToReferencesTokenBlacklist;
private List<String> referencesTokenBlacklist = new ArrayList<String>();
private String pathToInterproDatabase;
private String pathToInterproResults;
private String pathToGeneOntologyResults;
@@ -212,6 +222,11 @@ public static void setSettings(Settings s) {
* annotations AHRD works "as normal".
*/
private Boolean preferReferenceWithGoAnnos = false;
/**
* If set to TRUE the AHRD Evaluation Score is based ONLY on tokens that
* pass the Blacklisting. Otherwise all Tokens are submitted to evaluation.
*/
private Boolean evaluateValidTokens = true;

/**
* Construct from contents of file 'AHRD_input.yml'.
@@ -342,6 +357,21 @@ public void initialize(String pathToYml) throws IOException {
if (input.get(PREFER_REFERENCE_WITH_GO_ANNOS_KEY) != null) {
this.preferReferenceWithGoAnnos = true;
}
if (input.get(EVALUATE_VALID_TAKENS_KEY) != null) {
this.setEvaluateValidTokens(true);
}
if (input.get(REFERENCES_DESCRIPTION_BLACKLIST_KEY) != null) {
this.setPathToReferencesDescriptionBlacklist(input.get(REFERENCES_DESCRIPTION_BLACKLIST_KEY).toString());
this.setReferencesDescriptionBlacklist(fromFile(getPathToReferencesDescriptionBlacklist()));
}
if (input.get(REFERENCES_DESCRIPTION_FILTER_KEY) != null) {
this.setPathToReferencesDescriptionFilter(input.get(REFERENCES_DESCRIPTION_FILTER_KEY).toString());
this.setReferencesDescriptionFilter(fromFile(getPathToReferencesDescriptionFilter()));
}
if (input.get(REFERENCES_TOKEN_BLACKLIST_KEY) != null) {
this.setPathToReferencesTokenBlacklist(input.get(REFERENCES_TOKEN_BLACKLIST_KEY).toString());
this.setReferencesTokenBlacklist(fromFile(getPathToReferencesTokenBlacklist()));
}
}

/**
@@ -840,4 +870,59 @@ public void setPreferReferenceWithGoAnnos(Boolean preferReferenceWithGoAnnos) {
this.preferReferenceWithGoAnnos = preferReferenceWithGoAnnos;
}

/**
 * @return the flag deciding whether the AHRD Evaluation Score is based only
 *         on tokens that pass the Blacklisting (TRUE) or on all tokens.
 */
public Boolean getEvaluateValidTokens() {
return evaluateValidTokens;
}

/**
 * @param evaluateValidTokens
 *            new value of the flag deciding whether the AHRD Evaluation
 *            Score is based only on tokens that pass the Blacklisting.
 */
public void setEvaluateValidTokens(Boolean evaluateValidTokens) {
this.evaluateValidTokens = evaluateValidTokens;
}

/**
 * @return the stored path to the references' description filter file.
 *         NOTE(review): the field has no initializer, so this is presumably
 *         null until the setter has been called - confirm.
 */
public String getPathToReferencesDescriptionFilter() {
return pathToReferencesDescriptionFilter;
}

/**
 * @param pathToReferencesDescriptionFilter
 *            path to the file holding the references' description filter.
 */
public void setPathToReferencesDescriptionFilter(String pathToReferencesDescriptionFilter) {
this.pathToReferencesDescriptionFilter = pathToReferencesDescriptionFilter;
}

/**
 * @return the stored path to the references' description blacklist file.
 *         NOTE(review): the field has no initializer, so this is presumably
 *         null until the setter has been called - confirm.
 */
public String getPathToReferencesDescriptionBlacklist() {
return pathToReferencesDescriptionBlacklist;
}

/**
 * @param pathToReferencesDescriptionBlacklist
 *            path to the file holding the references' description
 *            blacklist.
 */
public void setPathToReferencesDescriptionBlacklist(String pathToReferencesDescriptionBlacklist) {
this.pathToReferencesDescriptionBlacklist = pathToReferencesDescriptionBlacklist;
}

/**
 * @return the stored path to the references' token blacklist file.
 *         NOTE(review): the field has no initializer, so this is presumably
 *         null until the setter has been called - confirm.
 */
public String getPathToReferencesTokenBlacklist() {
return pathToReferencesTokenBlacklist;
}

/**
 * @param pathToReferencesTokenBlacklist
 *            path to the file holding the references' token blacklist.
 */
public void setPathToReferencesTokenBlacklist(String pathToReferencesTokenBlacklist) {
this.pathToReferencesTokenBlacklist = pathToReferencesTokenBlacklist;
}

/**
 * @return the list stored as the references' description blacklist.
 *         NOTE(review): the field is declared without an initializer, so
 *         this appears to be null until the setter has been called -
 *         callers should null-check.
 */
public List<String> getReferencesDescriptionBlacklist() {
return referencesDescriptionBlacklist;
}

/**
 * @param referencesDescriptionBlacklist
 *            the list to store as the references' description blacklist.
 */
public void setReferencesDescriptionBlacklist(List<String> referencesDescriptionBlacklist) {
this.referencesDescriptionBlacklist = referencesDescriptionBlacklist;
}

/**
 * @return the list stored as the references' description filter.
 *         NOTE(review): the field is declared without an initializer, so
 *         this appears to be null until the setter has been called -
 *         callers should null-check.
 */
public List<String> getReferencesDescriptionFilter() {
return referencesDescriptionFilter;
}

/**
 * @param referencesDescriptionFilter
 *            the list to store as the references' description filter.
 */
public void setReferencesDescriptionFilter(List<String> referencesDescriptionFilter) {
this.referencesDescriptionFilter = referencesDescriptionFilter;
}

/**
 * @return the list stored as the references' token blacklist. The field is
 *         initialized with an empty list, so this is non-null unless a
 *         null has been passed to the setter.
 */
public List<String> getReferencesTokenBlacklist() {
return referencesTokenBlacklist;
}

/**
 * @param referencesTokenBlacklist
 *            the list to store as the references' token blacklist.
 */
public void setReferencesTokenBlacklist(List<String> referencesTokenBlacklist) {
this.referencesTokenBlacklist = referencesTokenBlacklist;
}
}
@@ -1,7 +1,8 @@
package ahrd.model;

import static ahrd.model.ReferenceDescription.tokenizeDescription;
import static ahrd.model.TokenScoreCalculator.tokenize;

import java.util.ArrayList;
import java.util.Set;

public class Blast2GoAnnot implements Comparable<Blast2GoAnnot> {
@@ -17,8 +18,7 @@ public static Blast2GoAnnot fromBlast2GoEntry(String resultLine) {
String accession = vals[0].trim();
// GO-Term-Accession is in position 2, which is ignored here.
String description = vals[2].trim();
if (accession != null && description != null && !accession.equals("")
&& !description.equals(""))
if (accession != null && description != null && !accession.equals("") && !description.equals(""))
res = new Blast2GoAnnot(accession, description);
return res;
}
@@ -27,7 +27,7 @@ public Blast2GoAnnot(String accession, String description) {
super();
setAccession(accession);
setDescription(description);
setEvaluationTokens(tokenizeDescription(getDescription()));
setEvaluationTokens(tokenize(getDescription(), new ArrayList<String>()));
}

/**
@@ -1,18 +1,14 @@
package ahrd.model;

import static ahrd.controller.Settings.getSettings;
import static ahrd.model.ReferenceDescription.tokenizeDescription;
import static ahrd.model.TokenScoreCalculator.tokenPassesBlacklist;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -404,51 +400,58 @@ public String patternize() {
return pattern;
}

/**
 * Tokenizes this' Human Readable Description and stores the result in this'
 * tokens field. The actual splitting and blacklisting is delegated to
 * <code>TokenScoreCalculator.tokenize(...)</code>, which is handed the
 * token blacklist registered for this' Blast database.
 */
public void tokenize() {
    List<String> tknBlackList = getSettings().getTokenBlackList(getBlastDatabaseName());
    // The former hand-rolled loop that added tokens one by one is dropped:
    // its result was replaced by the setTokens(...) call below anyway.
    this.setTokens(TokenScoreCalculator.tokenize(this.getDescription(), tknBlackList));
}

/**
 * Checks if the argument description line passes the blacklist associated
 * with this' sequence database name. The check itself is the global
 * implementation in
 * <code>DescriptionScoreCalculator.passesBlacklist(...)</code>.
 *
 * @param blastResultDescriptionLine
 *            the human readable description to check
 * @return boolean TRUE if and only if the human readable description passes
 *         the respective blacklist. FALSE otherwise.
 */
public boolean passesBlacklist(String blastResultDescriptionLine) {
    // Only the delegation is kept; the old inline loop ended in a return
    // that left these statements unreachable.
    List<String> blacklist = getSettings().getBlastResultsBlackList(getBlastDatabaseName());
    return DescriptionScoreCalculator.passesBlacklist(blastResultDescriptionLine, blacklist);
}

/**
 * Filters the argument description line using the global filter implemented
 * in <code>DescriptionScoreCalculator.filter(...)</code>, with the filter
 * expressions registered for this' Blast database.
 *
 * @param blastResultDescriptionLine
 *            the human readable description to filter
 * @return String the modified version of the human readable description, in
 *         which all matches to the respective filters are deleted.
 */
public String filter(String blastResultDescriptionLine) {
    // Only the delegation is kept; the old inline loop ended in a return
    // that left these statements unreachable.
    List<String> filter = getSettings().getBlastResultsFilter(getBlastDatabaseName());
    return DescriptionScoreCalculator.filter(blastResultDescriptionLine, filter);
}

/**
 * Sets this' evaluation tokens. By default Evaluation Tokens are <i>not</i>
 * filtered with the TOKEN-BLACKLIST, as we want to evaluate <i>all</i>
 * tokens, that are printed out, too. This set of evaluation-tokens is set
 * if and only if, AHRD is run in Evaluator-Mode and this BlastResult is the
 * best scoring of the Blast-Search-Result, it is obtained from. You can
 * evaluate AHRD based <i>only</i> on tokens that passed the Blacklist and
 * Filtering with the correct input parameter. See
 * <code>Settings.evaluateValidTokens</code> for details.
 * <p>
 * If <code>getSettings().getEvaluateValidTokens()</code> is TRUE this'
 * already computed tokens (<code>getTokens()</code>) are reused, otherwise
 * the description is tokenized with an empty token blacklist.
 */
public void tokenizeForEvaluation() {
    // The unconditional first assignment was dropped: both branches below
    // overwrite the evaluation tokens.
    if (getSettings().getEvaluateValidTokens()) {
        setEvaluationTokens(getTokens());
    } else {
        setEvaluationTokens(TokenScoreCalculator.tokenize(getDescription(), new ArrayList<String>()));
    }
}

public boolean isValid() {
@@ -496,7 +499,7 @@ public void generateHRDCandidateForProtein() {
// Pass best Blast-Hit's Description through filter:
theClone.setDescription(filter(theClone.getDescription()));
// Tokenize without filtering tokens through the Blacklist:
theClone.setTokens(tokenizeDescription(theClone.getDescription()));
theClone.setTokens(TokenScoreCalculator.tokenize(theClone.getDescription(), new ArrayList<String>()));
getProtein().getEvaluationScoreCalculator().addUnchangedBlastResult(getBlastDatabaseName(), theClone);
}
if (passesBlacklist(getDescription())) {
@@ -542,7 +545,7 @@ public String getShortAccession() {
}
return (shortAccession);
}

/**
 * @param shortAccession
 *            the value to store in this' shortAccession field.
 */
public void setShortAccession(String shortAccession) {
this.shortAccession = shortAccession;
}
@@ -5,11 +5,55 @@
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class DescriptionScoreCalculator {

/**
 * Global implementation of the Description Blacklist.
 *
 * @param description
 *            the human readable description to check; <code>null</code> or
 *            the empty String never pass.
 * @param blacklist
 *            regular expressions, any match of which (found anywhere in the
 *            description) rejects it; <code>null</code> is treated like an
 *            empty blacklist.
 * @return TRUE if and only if the description is neither <code>null</code>
 *         nor empty and none of the regular expressions in blacklist
 *         matches it. FALSE otherwise.
 * @throws java.util.regex.PatternSyntaxException
 *             if a blacklist entry is not a valid regular expression.
 */
public static boolean passesBlacklist(String description, List<String> blacklist) {
    if (description == null || description.equals("")) {
        return false;
    }
    if (blacklist == null) {
        // No blacklist configured: nothing can reject the description.
        return true;
    }
    for (String regex : blacklist) {
        if (Pattern.compile(regex).matcher(description).find()) {
            return false;
        }
    }
    return true;
}

/**
 * Global implementation of the filter Description function.
 *
 * @param description
 *            the human readable description to filter; must not be
 *            <code>null</code>.
 * @param filter
 *            regular expressions whose matches are removed from the
 *            description, applied in list order; <code>null</code> is
 *            treated like an empty list.
 * @return A modified version of argument description in which all matches
 *         to any of the regular expressions in argument filter are deleted.
 *         Finally the filtered description is trimmed and multiple
 *         white-spaces are condensed into a single white-space.
 * @throws NullPointerException
 *             if description is <code>null</code>.
 * @throws java.util.regex.PatternSyntaxException
 *             if a filter entry is not a valid regular expression.
 */
public static String filter(String description, List<String> filter) {
    String filteredDescLine = description;
    if (filter != null) {
        for (String regex : filter) {
            // Replace with whitespace, so word-boundaries are kept up
            filteredDescLine = Pattern.compile(regex).matcher(filteredDescLine).replaceAll(" ");
        }
    }
    // Condense multiple whitespaces into one and trim the description-line:
    return filteredDescLine.replaceAll("\\s{2,}", " ").trim();
}

private Protein protein;
private double maxBitScore = 0.0;
private BlastResult highestScoringBlastResult;
@@ -1,5 +1,6 @@
package ahrd.model;

import static ahrd.controller.Settings.getSettings;
import java.util.HashSet;
import java.util.Set;

@@ -22,29 +23,27 @@ public static ReferenceDescription constructFromFastaEntry(String fastaEntry) {
rd.setAccession(fastaData[0].split(" ")[0].trim());
// Everything after the Accession is considered the description-line:
rd.setDescription(fastaData[0].replace(rd.getAccession(), "").trim());
// Tokenize description and filter out tokens matching any regex in the
// blacklist:
rd.setTokens(tokenizeDescription(rd.getDescription()));

return rd;
}

/**
* Tokenizes a String using Blastresult.TOKEN_SPLITTER_REGEX and returns all
* resulting unique tokens.
*
* @param description
* @param blacklist
* @return Set<String>
*/
public static Set<String> tokenizeDescription(String description) {
Set<String> tkns = new HashSet<String>();
for (String tkn : description.split(BlastResult.TOKEN_SPLITTER_REGEX)) {
String tokenCandidate = tkn.trim().toLowerCase();
if (tokenCandidate != null && !tokenCandidate.equals(""))
tkns.add(tokenCandidate);
// Process the reference's human readable description as requested by
// the user (Settings) -
// NOTE, if the HRD passes the Blacklist and no filtering is
// requested the HRD does not have to be processed any further.
if (getSettings().getReferencesDescriptionBlacklist() != null
&& !getSettings().getReferencesDescriptionBlacklist().isEmpty()) {
if (!DescriptionScoreCalculator.passesBlacklist(rd.getDescription(),
getSettings().getReferencesDescriptionBlacklist())) {
// Does NOT pass blacklist
rd.setDescription("");
} else if (getSettings().getReferencesDescriptionFilter() != null
&& !getSettings().getReferencesDescriptionFilter().isEmpty()) {
// Passes Blacklist AND is requested to be filtered:
rd.setDescription(DescriptionScoreCalculator.filter(rd.getDescription(),
getSettings().getReferencesDescriptionFilter()));
}
}
return tkns;
// Tokenize, and if requested in Settings retain only those tokens that
// pass the Blacklist:
rd.setTokens(TokenScoreCalculator.tokenize(rd.getDescription(), getSettings().getReferencesTokenBlacklist()));
return rd;
}

public Set<String> getTokens() {

0 comments on commit 7adff26

Please sign in to comment.
You can’t perform that action at this time.