Skip to content

Commit

Permalink
AHRD is now capable of extending GO term annotations with parental GO…
Browse files Browse the repository at this point in the history
… terms.
  • Loading branch information
asishallab committed May 15, 2015
1 parent fdf7fcd commit 0be7567
Show file tree
Hide file tree
Showing 13 changed files with 455 additions and 6 deletions.
1 change: 1 addition & 0 deletions .classpath
Expand Up @@ -20,5 +20,6 @@
<classpathentry kind="lib" path="lib/xml-apis.jar"/> <classpathentry kind="lib" path="lib/xml-apis.jar"/>
<classpathentry kind="lib" path="lib/xom-1.2.6.jar"/> <classpathentry kind="lib" path="lib/xom-1.2.6.jar"/>
<classpathentry kind="lib" path="lib/yamlbeans-1.06.jar"/> <classpathentry kind="lib" path="lib/yamlbeans-1.06.jar"/>
<classpathentry kind="lib" path="lib/mysql-connector-java-5.1.35-bin.jar"/>
<classpathentry kind="output" path="classes"/> <classpathentry kind="output" path="classes"/>
</classpath> </classpath>
4 changes: 4 additions & 0 deletions .gitignore
Expand Up @@ -27,3 +27,7 @@ junitvmwatcher*
*.phr *.phr
*.pin *.pin
*.psq *.psq

# Ignore test output files
ahrd_extended_go_table.tsv
ahrd_output.csv
Binary file added lib/mysql-connector-java-5.1.35-bin.jar
Binary file not shown.
70 changes: 67 additions & 3 deletions src/ahrd/controller/AHRD.java
Expand Up @@ -5,6 +5,8 @@
import static ahrd.model.ReferenceGoAnnotations.parseReferenceGoAnnotations; import static ahrd.model.ReferenceGoAnnotations.parseReferenceGoAnnotations;


import java.io.IOException; import java.io.IOException;
import java.sql.Connection;
import java.sql.SQLException;
import java.util.Collection; import java.util.Collection;
import java.util.Date; import java.util.Date;
import java.util.HashMap; import java.util.HashMap;
Expand All @@ -20,22 +22,26 @@
import ahrd.exception.MissingInterproResultException; import ahrd.exception.MissingInterproResultException;
import ahrd.exception.MissingProteinException; import ahrd.exception.MissingProteinException;
import ahrd.model.BlastResult; import ahrd.model.BlastResult;
import ahrd.model.GOdbSQL;
import ahrd.model.GOterm;
import ahrd.model.InterproResult; import ahrd.model.InterproResult;
import ahrd.model.Protein; import ahrd.model.Protein;
import ahrd.view.ExtendedGOAnnotationTableWriter;
import ahrd.view.FastaOutputWriter; import ahrd.view.FastaOutputWriter;
import ahrd.view.IOutputWriter; import ahrd.view.IOutputWriter;
import ahrd.view.OutputWriter; import ahrd.view.OutputWriter;


public class AHRD { public class AHRD {


public static final String VERSION = "3.0"; public static final String VERSION = "3.1";


private Map<String, Protein> proteins; private Map<String, Protein> proteins;
private Map<String, Double> descriptionScoreBitScoreWeights = new HashMap<String, Double>(); private Map<String, Double> descriptionScoreBitScoreWeights = new HashMap<String, Double>();
private Map<String, Set<String>> referenceGoAnnotations; private Map<String, Set<String>> referenceGoAnnotations;
private Set<String> uniqueBlastResultShortAccessions; private Set<String> uniqueBlastResultShortAccessions;
private long timestamp; private long timestamp;
private long memorystamp; private long memorystamp;
private Map<String, GOterm> goDB;


protected long takeTime() { protected long takeTime() {
// Measure time: // Measure time:
Expand Down Expand Up @@ -80,7 +86,21 @@ public static void main(String[] args) {
// Log // Log
System.out.println("Wrote output in " + ahrd.takeTime() System.out.println("Wrote output in " + ahrd.takeTime()
+ "sec, currently occupying " + ahrd.takeMemoryUsage() + "sec, currently occupying " + ahrd.takeMemoryUsage()
+ " MB\n\nDONE"); + " MB");
// If requested, write extended Gene Ontology (GO) annotation table:
if (getSettings().generateExtendedGoResultTable()) {
System.out
.println("Writing extended Gene Ontology (GO) term annotation table to '"
+ getSettings().getExtendedGoResultTablePath()
+ "'.");
ExtendedGOAnnotationTableWriter gw = new ExtendedGOAnnotationTableWriter(
ahrd.getProteins().values(), ahrd.getGoDB());
gw.writeOutput();
System.out.println("Wrote extended GO table in "
+ ahrd.takeTime() + "sec, currently occupying "
+ ahrd.takeMemoryUsage() + " MB");
}
System.out.println("\n\nDONE");
} catch (Exception e) { } catch (Exception e) {
System.err.println("We are sorry, an un-expected ERROR occurred:"); System.err.println("We are sorry, an un-expected ERROR occurred:");
e.printStackTrace(System.err); e.printStackTrace(System.err);
Expand Down Expand Up @@ -226,9 +246,10 @@ public void setup(boolean writeLogMsgs) throws IOException,
* *
* @throws MissingInterproResultException * @throws MissingInterproResultException
* @throws IOException * @throws IOException
* @throws SQLException
*/ */
public void assignHumanReadableDescriptions() public void assignHumanReadableDescriptions()
throws MissingInterproResultException, IOException { throws MissingInterproResultException, IOException, SQLException {
for (String protAcc : getProteins().keySet()) { for (String protAcc : getProteins().keySet()) {
Protein prot = getProteins().get(protAcc); Protein prot = getProteins().get(protAcc);
// Find best scoring Blast-Hit's Description-Line (based on // Find best scoring Blast-Hit's Description-Line (based on
Expand Down Expand Up @@ -262,6 +283,41 @@ && getReferenceGoAnnotations().containsKey(
// interpro-results // interpro-results
InterproResult.filterForMostInforming(prot); InterproResult.filterForMostInforming(prot);
} }
if (getSettings().generateExtendedGoResultTable())
extendGOtermAnnotationsWithParentalTerms();
}

/**
* Extends each Proteins' Gene Ontology (GO) terms with their respective
* parental terms. Per protein each respective GO term is only annotated
* once.
*
* @throws SQLException
*/
public void extendGOtermAnnotationsWithParentalTerms() throws SQLException {
// Initialize the in memory Gene Ontology (GO) database:
System.out.println("1");
Set<String> gts = Protein.uniqueGOaccessions(getProteins().values());
System.out.println("2");
Connection goCon = null;
System.out.println("3");
try {
goCon = GOdbSQL.connectToGeneOntologyDb();
System.out.println("4");
setGoDB(GOdbSQL.parentGoTermsForAccessions(gts, goCon));
System.out.println("5");
} finally {
goCon.close();
System.out.println("6");
}
// Extend each Proteins' GO results with their parental terms:
for (Protein p : getProteins().values()) {
Collection<String> gos = p.getGoResults();
System.out.println("7");
if (gos != null)
p.setGoResults(GOdbSQL.uniqueGOAccessions(gos, getGoDB()));
System.out.println("8");
}
} }


public Map<String, Protein> getProteins() { public Map<String, Protein> getProteins() {
Expand Down Expand Up @@ -299,4 +355,12 @@ public void setUniqueBlastResultShortAccessions(
this.uniqueBlastResultShortAccessions = uniqueBlastResultShortAccessions; this.uniqueBlastResultShortAccessions = uniqueBlastResultShortAccessions;
} }


/**
 * @return the in memory Gene Ontology (GO) database, mapping GO accessions
 *         to their GOterm instances; {@code null} until it has been set
 */
public Map<String, GOterm> getGoDB() {
    return this.goDB;
}

/**
 * Stores the in memory Gene Ontology (GO) database.
 *
 * @param goDB
 *            map of GO accessions to their GOterm instances
 */
public void setGoDB(final Map<String, GOterm> goDB) {
    this.goDB = goDB;
}

} }
55 changes: 55 additions & 0 deletions src/ahrd/controller/Settings.java
Expand Up @@ -94,6 +94,18 @@ public static void setSettings(Settings s) {
public static final String REFERENCE_GO_REGEX_KEY = "reference_go_regex"; public static final String REFERENCE_GO_REGEX_KEY = "reference_go_regex";
public static final Pattern DEFAULT_REFERENCE_GO_REGEX = Pattern public static final Pattern DEFAULT_REFERENCE_GO_REGEX = Pattern
.compile("^UniProtKB\\s+(?<shortAccession>\\S+)\\s+\\S+\\s+(?<goTerm>GO:\\d{7})"); .compile("^UniProtKB\\s+(?<shortAccession>\\S+)\\s+\\S+\\s+(?<goTerm>GO:\\d{7})");
public static final String EXTENDED_GO_RESULT_TABLE_KEY = "extended_go_result_table";
public static final String GO_DB_URL_KEY = "go_db_url";
public static final String GO_DB_USER_KEY = "go_db_user";
public static final String GO_DB_PASSWORD_KEY = "go_db_password";

/**
* Constant parameters
*/
public static final String GO_DB_TERM_TBL_ACCESSION_KEY = "acc";
public static final String GO_DB_TERM_TBL_NAME_KEY = "name";
public static final String GO_DB_TERM_TBL_ONTOLOGY_KEY = "term_type";
public static final String GO_DB_DESCENDANT_TERM_TBL_ACC_KEY = "desc_acc";


/** /**
* Fields: * Fields:
Expand Down Expand Up @@ -206,6 +218,10 @@ public static void setSettings(Settings s) {
private Integer seqSimSearchTableEValueCol = 10; private Integer seqSimSearchTableEValueCol = 10;
private Integer seqSimSearchTableBitScoreCol = 11; private Integer seqSimSearchTableBitScoreCol = 11;
private Pattern referenceGoRegex; private Pattern referenceGoRegex;
private String extendedGoResultTablePath;
private String goDbURL = "jdbc:mysql://mysql.ebi.ac.uk:4085/go_latest";
private String goDbUser = "go_select";
private String goDbPassword = "amigo";


/** /**
* Construct from contents of file 'AHRD_input.yml'. * Construct from contents of file 'AHRD_input.yml'.
Expand Down Expand Up @@ -365,6 +381,20 @@ public void initialize(String pathToYml) throws IOException {
setReferenceGoRegex(Pattern.compile(input.get( setReferenceGoRegex(Pattern.compile(input.get(
REFERENCE_GO_REGEX_KEY).toString())); REFERENCE_GO_REGEX_KEY).toString()));
} }
// Enable generation of an extended GO result table:
if (input.get(EXTENDED_GO_RESULT_TABLE_KEY) != null) {
setExtendedGoResultTablePath(input
.get(EXTENDED_GO_RESULT_TABLE_KEY).toString());
}
if (input.get(GO_DB_URL_KEY) != null) {
this.goDbURL = input.get(GO_DB_URL_KEY).toString();
}
if (input.get(GO_DB_USER_KEY) != null) {
this.goDbUser = input.get(GO_DB_USER_KEY).toString();
}
if (input.get(GO_DB_PASSWORD_KEY) != null) {
this.goDbPassword = input.get(GO_DB_PASSWORD_KEY).toString();
}
} }


/** /**
Expand Down Expand Up @@ -883,4 +913,29 @@ public Pattern getReferenceGoRegex() {
public void setReferenceGoRegex(Pattern referenceGoRegex) { public void setReferenceGoRegex(Pattern referenceGoRegex) {
this.referenceGoRegex = referenceGoRegex; this.referenceGoRegex = referenceGoRegex;
} }

/**
 * Tells whether an extended GO result table is requested, which is the
 * case exactly when a path for that table has been set.
 *
 * @return {@code true} if the extended GO result table path is not
 *         {@code null}, {@code false} otherwise
 */
public Boolean generateExtendedGoResultTable() {
    final boolean pathIsSet = getExtendedGoResultTablePath() != null;
    return Boolean.valueOf(pathIsSet);
}

/**
 * @return the JDBC URL of the Gene Ontology (GO) database
 */
public String getGoDbURL() {
    return this.goDbURL;
}

/**
 * @return the user name used to connect to the Gene Ontology (GO) database
 */
public String getGoDbUser() {
    return this.goDbUser;
}

/**
 * @return the password used to connect to the Gene Ontology (GO) database
 */
public String getGoDbPassword() {
    return this.goDbPassword;
}

/**
 * @return the path of the extended GO result table, or {@code null} if
 *         none has been set
 */
public String getExtendedGoResultTablePath() {
    return this.extendedGoResultTablePath;
}

/**
 * Sets the path of the extended GO result table.
 *
 * @param extendedGoResultTablePath
 *            the path to store
 */
public void setExtendedGoResultTablePath(final String extendedGoResultTablePath) {
    this.extendedGoResultTablePath = extendedGoResultTablePath;
}

} }
5 changes: 4 additions & 1 deletion src/ahrd/controller/Trainer.java
Expand Up @@ -3,6 +3,7 @@
import static ahrd.controller.Settings.getSettings; import static ahrd.controller.Settings.getSettings;


import java.io.IOException; import java.io.IOException;
import java.sql.SQLException;
import java.util.HashSet; import java.util.HashSet;
import java.util.Random; import java.util.Random;
import java.util.Set; import java.util.Set;
Expand Down Expand Up @@ -87,8 +88,10 @@ public Trainer(String pathToInputYml) throws IOException {
* *
* @throws IOException * @throws IOException
* @throws MissingInterproResultException * @throws MissingInterproResultException
* @throws SQLException
*/ */
public void train() throws MissingInterproResultException, IOException { public void train() throws MissingInterproResultException, IOException,
SQLException {
while (getSettings().getTemperature() > 0) { while (getSettings().getTemperature() > 0) {
// If we run simulated annealing remembering tested Parameters and // If we run simulated annealing remembering tested Parameters and
// their scores, // their scores,
Expand Down
129 changes: 129 additions & 0 deletions src/ahrd/model/GOdbSQL.java
@@ -0,0 +1,129 @@
package ahrd.model;

import static ahrd.controller.Settings.getSettings;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import ahrd.controller.Settings;

/**
 * Static helpers to query a MySQL Gene Ontology (GO) database for the
 * parental terms of GO accessions and to merge the results.
 */
public class GOdbSQL {

    /**
     * Collects the parental GO term accessions of all Gene Ontology (GO)
     * terms given in argument goTermAccs into a single set.
     * <p>
     * Accessions that have no entry in goDB, or whose entry carries no
     * parent accessions, contribute nothing to the result instead of
     * raising a NullPointerException. The look-up query excludes obsolete
     * terms, so such accessions can legitimately occur.
     *
     * @param goTermAccs
     *            the GO accessions to look up
     * @param goDB
     *            map of GO accessions to their GOterm instances
     * @return the union of the parent accessions of all found terms; never
     *         {@code null}
     */
    public static Set<String> uniqueGOAccessions(Collection<String> goTermAccs,
            Map<String, GOterm> goDB) {
        Set<String> joined = new HashSet<String>();
        for (String acc : goTermAccs) {
            GOterm term = goDB.get(acc);
            if (term == null) {
                continue;
            }
            Collection<String> parents = term.getParentAccessions();
            if (parents != null) {
                joined.addAll(parents);
            }
        }
        return joined;
    }

    /**
     * Connects to the MySQL Gene Ontology database specified in Settings.
     *
     * @return a new connection; the caller is responsible for closing it
     * @throws SQLException
     *             if the connection cannot be established
     */
    public static Connection connectToGeneOntologyDb() throws SQLException {
        return DriverManager.getConnection(getSettings().getGoDbURL(),
                getSettings().getGoDbUser(), getSettings().getGoDbPassword());
    }

    /**
     * Finds the Gene Ontology (GO) terms that are parent to the argument
     * 'goTermAccs' and are NOT obsolete.
     * <p>
     * The accessions are bound as parameters of a prepared statement, so
     * their content can never alter the SQL that is executed. A
     * {@code null} or empty 'goTermAccs' yields an empty map without
     * contacting the database.
     *
     * @param goTermAccs
     *            the GO accessions whose parental terms are requested
     * @param goCon
     *            an open connection to the GO database; it is not closed
     *            by this method
     * @return map of GO accession to GOterm for every term returned by the
     *         query. For each queried accession that itself appears as a
     *         key, the parent accessions of its GOterm are set to the
     *         accessions of the terms found to be parental to it.
     * @throws SQLException
     *             if preparing or executing the query fails
     */
    public static Map<String, GOterm> parentGoTermsForAccessions(
            Collection<String> goTermAccs, Connection goCon)
            throws SQLException {
        Map<String, GOterm> res = new HashMap<String, GOterm>();
        // An empty IN list is not valid SQL, and there is nothing to find.
        if (goTermAccs == null || goTermAccs.isEmpty()) {
            return res;
        }
        Map<String, Set<String>> parentGos = new HashMap<String, Set<String>>();
        String query = buildParentGoTermsQuery(placeholders(goTermAccs.size()));
        try (PreparedStatement stmt = goCon.prepareStatement(query)) {
            int paramIndex = 1;
            for (String acc : goTermAccs) {
                stmt.setString(paramIndex++, acc);
            }
            try (ResultSet rs = stmt.executeQuery()) {
                while (rs.next()) {
                    String gAcc = rs
                            .getString(Settings.GO_DB_TERM_TBL_ACCESSION_KEY);
                    if (!res.containsKey(gAcc)) {
                        res.put(gAcc,
                                new GOterm(
                                        gAcc,
                                        rs.getString(Settings.GO_DB_TERM_TBL_NAME_KEY),
                                        rs.getString(Settings.GO_DB_TERM_TBL_ONTOLOGY_KEY)));
                    }
                    String descAcc = rs
                            .getString(Settings.GO_DB_DESCENDANT_TERM_TBL_ACC_KEY);
                    Set<String> parents = parentGos.get(descAcc);
                    if (parents == null) {
                        parents = new HashSet<String>();
                        parentGos.put(descAcc, parents);
                    }
                    parents.add(gAcc);
                }
            }
        }
        for (Map.Entry<String, Set<String>> entry : parentGos.entrySet()) {
            // A descendant only has a GOterm of its own if the query also
            // returned it as a term; skip it otherwise instead of failing.
            GOterm term = res.get(entry.getKey());
            if (term != null) {
                term.setParentAccessions(entry.getValue());
            }
        }
        return res;
    }

    /**
     * Generates a valid SQL query in table term and graph_path to SELECT all
     * parental Gene Ontology (GO) terms for argument goTermAccs. <i>Note</i>
     * that the query will also SELECT the GO terms indicated by argument
     * goTermAccs.
     * <p>
     * The accessions are inlined as quoted SQL string literals; backslashes
     * and single quotes inside them are escaped. A {@code null} or empty
     * argument produces an IN list that matches no row.
     *
     * @param goTermAccs
     *            the GO accessions whose parental terms are requested
     * @return the complete SQL query
     */
    public static String parentGoTermsForAccessionsSQLQuery(
            Collection<String> goTermAccs) {
        if (goTermAccs == null || goTermAccs.isEmpty()) {
            // "IN ()" is a syntax error; "IN (NULL)" is valid and matches
            // nothing.
            return buildParentGoTermsQuery("NULL");
        }
        StringBuilder literals = new StringBuilder();
        for (String acc : goTermAccs) {
            if (literals.length() > 0) {
                literals.append(",");
            }
            literals.append(quoteSqlLiteral(String.valueOf(acc)));
        }
        return buildParentGoTermsQuery(literals.toString());
    }

    /**
     * Assembles the parental term query around the given content of the IN
     * list, which is either quoted literals or '?' placeholders.
     */
    private static String buildParentGoTermsQuery(String inList) {
        // NOTE(review): GROUP BY t.id yields one row per parental term, so a
        // parent shared by several queried accessions is reported for only
        // one of them (desc_acc is not aggregated) - confirm this is
        // intended; the SQL is deliberately left unchanged here.
        return "SELECT t.*, to_root.relation_distance, child.acc as desc_acc "
                + "FROM graph_path res LEFT JOIN term t ON t.id = res.term1_id "
                + "LEFT JOIN graph_path to_root ON t.id = to_root.term2_id "
                + "LEFT JOIN term child ON child.id = res.term2_id "
                + "WHERE "
                + "res.term1_id != (SELECT r.id FROM term r WHERE r.is_root = 1) "
                + "AND child.acc in ("
                + inList
                + ") "
                + "AND to_root.term1_id = (SELECT r.id FROM term r WHERE r.is_root = 1) "
                + "AND t.is_obsolete = 0 "
                + "GROUP BY t.id ORDER BY to_root.relation_distance ASC";
    }

    /**
     * Returns n comma separated '?' placeholders for a prepared statement.
     */
    private static String placeholders(int n) {
        StringBuilder sb = new StringBuilder(2 * n);
        for (int i = 0; i < n; i++) {
            if (i > 0) {
                sb.append(",");
            }
            sb.append("?");
        }
        return sb.toString();
    }

    /**
     * Wraps value in single quotes, escaping backslashes and single quotes
     * so it cannot terminate the SQL string literal early.
     */
    private static String quoteSqlLiteral(String value) {
        return "'" + value.replace("\\", "\\\\").replace("'", "''") + "'";
    }
}

0 comments on commit 0be7567

Please sign in to comment.