Skip to content
Permalink
Browse files

Moved Asis Hallab's repository to the official work group's one.

  • Loading branch information...
groupschoof committed Mar 13, 2012
0 parents commit 2d28544279020d3c82057036509ba183412a3b0b
Showing with 41,850 additions and 0 deletions.
  1. +25 −0 .classpath
  2. +18 −0 .gitignore
  3. +23 −0 .project
  4. +308 −0 LICENSE.txt
  5. +1 −0 MANIFEST.MF
  6. +196 −0 README.textile
  7. +101 −0 build.xml
  8. BIN images/formulae.jpg
  9. +124 −0 interpro.dtd
  10. BIN lib/apps-1.7.1.jar
  11. BIN lib/biojava-1.7.1.jar
  12. BIN lib/bytecode.jar
  13. BIN lib/commons-cli.jar
  14. BIN lib/commons-collections-2.1.jar
  15. BIN lib/commons-dbcp-1.1.jar
  16. BIN lib/commons-pool-1.1.jar
  17. BIN lib/demos-1.7.1.jar
  18. BIN lib/dtd-xercesImpl.jar
  19. BIN lib/jarjar-1.0.jar
  20. BIN lib/jgrapht-jdk1.5.jar
  21. BIN lib/junit-4.9b2.jar
  22. BIN lib/objenesis-1.2.jar
  23. BIN lib/serializer.jar
  24. BIN lib/xalan.jar
  25. BIN lib/xml-apis.jar
  26. BIN lib/xom-1.2.6.jar
  27. BIN lib/yamlbeans-1.06.jar
  28. +233 −0 src/ahrd/controller/AHRD.java
  29. +378 −0 src/ahrd/controller/Batcher.java
  30. +117 −0 src/ahrd/controller/Evaluator.java
  31. +380 −0 src/ahrd/controller/Parameters.java
  32. +539 −0 src/ahrd/controller/Settings.java
  33. +263 −0 src/ahrd/controller/Trainer.java
  34. +224 −0 src/ahrd/controller/TrainerBatcher.java
  35. +103 −0 src/ahrd/controller/Utils.java
  36. +14 −0 src/ahrd/exception/MissingAccessionException.java
  37. +11 −0 src/ahrd/exception/MissingInterproResultException.java
  38. +14 −0 src/ahrd/exception/MissingProteinException.java
  39. +101 −0 src/ahrd/model/Blast2GoAnnot.java
  40. +294 −0 src/ahrd/model/BlastResult.java
  41. +195 −0 src/ahrd/model/BlastSearchContentAdapter.java
  42. +171 −0 src/ahrd/model/DescriptionScoreCalculator.java
  43. +376 −0 src/ahrd/model/EvaluationScoreCalculator.java
  44. +103 −0 src/ahrd/model/GeneOntologyResult.java
  45. +358 −0 src/ahrd/model/InterproResult.java
  46. +62 −0 src/ahrd/model/LexicalScoreCalculator.java
  47. +212 −0 src/ahrd/model/Protein.java
  48. +74 −0 src/ahrd/model/ReferenceDescription.java
  49. +316 −0 src/ahrd/model/TokenScoreCalculator.java
  50. +348 −0 src/ahrd/view/OutputWriter.java
  51. +79 −0 src/ahrd/view/TrainerOutputWriter.java
  52. +130 −0 test/ahrd/test/AhrdTest.java
  53. +20 −0 test/ahrd/test/AhrdTestRun.java
  54. +116 −0 test/ahrd/test/BatcherTest.java
  55. +88 −0 test/ahrd/test/Blast2GoAnnotTest.java
  56. +134 −0 test/ahrd/test/BlastResultTest.java
  57. +65 −0 test/ahrd/test/BlastSearchContentAdapterTest.java
  58. +110 −0 test/ahrd/test/DescriptionScoreCalculatorTest.java
  59. +331 −0 test/ahrd/test/EvaluationScoreCalculatorTest.java
  60. +86 −0 test/ahrd/test/EvaluatorTest.java
  61. +38 −0 test/ahrd/test/GeneOntologyResultTest.java
  62. +144 −0 test/ahrd/test/InterproResultTest.java
  63. +103 −0 test/ahrd/test/LexicalScoreCalculatorTest.java
  64. +310 −0 test/ahrd/test/ParametersTest.java
  65. +83 −0 test/ahrd/test/ProteinTest.java
  66. +37 −0 test/ahrd/test/ReferenceDescriptionTest.java
  67. +109 −0 test/ahrd/test/SettingsTest.java
  68. +61 −0 test/ahrd/test/TestRegexs.java
  69. +190 −0 test/ahrd/test/TestUtils.java
  70. +243 −0 test/ahrd/test/TokenScoreCalculatorTest.java
  71. +123 −0 test/ahrd/test/TrainerBatcherTest.java
  72. +170 −0 test/ahrd/test/TrainerTest.java
  73. +84 −0 test/ahrd/test/UtilsTest.java
  74. +41 −0 test/resources/ahrd_input.yml
  75. +34 −0 test/resources/ahrd_input_test_run.yml
  76. +45 −0 test/resources/batcher_input.yml
  77. +44 −0 test/resources/batcher_input_test.yml
  78. +10 −0 test/resources/blacklist_descline.txt
  79. +26 −0 test/resources/blacklist_token.txt
  80. +3 −0 test/resources/blast2go_out.annot
  81. 0 test/resources/empty_file.txt
  82. +10 −0 test/resources/filter_descline_sprot.txt
  83. +14 −0 test/resources/filter_descline_tair.txt
  84. +10 −0 test/resources/filter_descline_trembl.txt
  85. 0 test/resources/gene_ontology_results/batch001.csv
  86. 0 test/resources/gene_ontology_results/batch002.csv
  87. 0 test/resources/gene_ontology_results/batch003.csv
  88. +3 −0 test/resources/go_results.csv
  89. +4,782 −0 test/resources/interpro_31.xml
  90. +7 −0 test/resources/interpro_result.raw
  91. 0 test/resources/interpro_results/batch001.raw
  92. 0 test/resources/interpro_results/batch002.raw
  93. 0 test/resources/interpro_results/batch003.raw
  94. +12 −0 test/resources/match_list.txt
  95. +18 −0 test/resources/proteins.fasta
  96. 0 test/resources/proteins/batch001.fasta
  97. 0 test/resources/proteins/batch002.fasta
  98. 0 test/resources/proteins/batch003.fasta
  99. +18 −0 test/resources/references.fasta
  100. +26 −0 test/resources/regex_list.txt
  101. +38 −0 test/resources/scores_test/ahrd_input.yml
  102. +5 −0 test/resources/scores_test/prot_patternTest.fa
  103. +55 −0 test/resources/scores_test/sprot_patternTest.blastout
  104. +109 −0 test/resources/scores_test/tair_patternTest.blastout
  105. +606 −0 test/resources/scores_test/trembl_patternTest.blastout
  106. 0 test/resources/sprot_blast_results/batch001.pairwise
  107. 0 test/resources/sprot_blast_results/batch002.pairwise
  108. 0 test/resources/sprot_blast_results/batch003.pairwise
  109. +7,136 −0 test/resources/swissprot.pairwise
  110. +7,136 −0 test/resources/tair.pairwise
  111. +5,925 −0 test/resources/tair.xml
  112. 0 test/resources/tair_blast_results/batch001.pairwise
  113. 0 test/resources/tair_blast_results/batch002.pairwise
  114. 0 test/resources/tair_blast_results/batch003.pairwise
  115. +28 −0 test/resources/trainer_batcher_input_test.yml
  116. +37 −0 test/resources/trainer_input.yml
  117. +7,136 −0 test/resources/trembl.pairwise
  118. 0 test/resources/trembl_blast_results/batch001.pairwise
  119. 0 test/resources/trembl_blast_results/batch002.pairwise
  120. 0 test/resources/trembl_blast_results/batch003.pairwise
@@ -0,0 +1,25 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry kind="src" path="src"/>
<classpathentry kind="src" path="test"/>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
<classpathentry kind="lib" path="lib/apps-1.7.1.jar"/>
<classpathentry kind="lib" path="lib/biojava-1.7.1.jar"/>
<classpathentry kind="lib" path="lib/bytecode.jar"/>
<classpathentry kind="lib" path="lib/commons-cli.jar"/>
<classpathentry kind="lib" path="lib/commons-collections-2.1.jar"/>
<classpathentry kind="lib" path="lib/commons-dbcp-1.1.jar"/>
<classpathentry kind="lib" path="lib/commons-pool-1.1.jar"/>
<classpathentry kind="lib" path="lib/demos-1.7.1.jar"/>
<classpathentry kind="lib" path="lib/dtd-xercesImpl.jar"/>
<classpathentry kind="lib" path="lib/jarjar-1.0.jar"/>
<classpathentry kind="lib" path="lib/jgrapht-jdk1.5.jar"/>
<classpathentry kind="lib" path="lib/junit-4.9b2.jar"/>
<classpathentry kind="lib" path="lib/objenesis-1.2.jar"/>
<classpathentry kind="lib" path="lib/serializer.jar"/>
<classpathentry kind="lib" path="lib/xalan.jar"/>
<classpathentry kind="lib" path="lib/xml-apis.jar"/>
<classpathentry kind="lib" path="lib/xom-1.2.6.jar"/>
<classpathentry kind="lib" path="lib/yamlbeans-1.06.jar"/>
<classpathentry kind="output" path="classes"/>
</classpath>
@@ -0,0 +1,18 @@
# Ignore everything in directory ./classes
classes/**/*

# Ignore Batcher-Tests:
start_ahrd_batched.sh

# The following directory is only created, if Batcher is tested:
test/resources/batch_ymls

# Executable jar should also be ignored
dist/ahrd.jar

# Ignore AHRD-Output generated by Test-Suite:
test/ahrd_output.csv

# Ignore subversion related files
.svn
.svnignore
@@ -0,0 +1,23 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>AHRD_Java</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.jdt.core.javabuilder</name>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>net.sourceforge.metrics.builder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.jdt.core.javanature</nature>
<nature>net.sourceforge.metrics.nature</nature>
</natures>
</projectDescription>

Large diffs are not rendered by default.

@@ -0,0 +1 @@
Main-Class: ahrd.controller.AHRD
@@ -0,0 +1,196 @@
h1. Automated Assignment of Human Readable Descriptions (AHRD)

Protein function has often been transferred from characterized proteins to
novel proteins based on sequence similarity, e.g. using the best BLAST hit. To
assign human readable descriptions to predicted proteins we developed a new
program called Automated Assignment of Human Readable Descriptions (AHRD). We
aim to select descriptions that are concise and informative, precise in regard
to function and use standard nomenclature. AHRD scores BLAST hits taken from
searches against different databases on the basis of the trust put into these
databases and the local alignment quality. The BLAST hit descriptions are
tokenized into informative words and a lexical analysis scores these tokens
according to their frequency and the quality of the BLAST hits they occur in.
Shared tokens with Gene-Ontology Annotations increase the description-scoring
in order to use standard nomenclature where possible. Finally the best scoring
description is assigned.

h2. 1.1 Requirements:

AHRD is a Java-Program which requires Java 1.5 or higher.


h2. 1.2 Installation:

h3. 1.2.1 Get AHRD

Copy (clone) AHRD to your computer using git:
<pre>git clone git://github.com/asishallab/AHRD.git</pre>

h3. 1.2.2 Build the executable jar:

Running <pre>ant dist</pre> will create the executable JAR-File: ./dist/ahrd.jar

h2. 2 Usage:

All AHRD-Inputs are passed to AHRD in a single YML-File.
See test/resources/ahrd_input.yml for details.
(About YAML-Format see <a href="http://en.wikipedia.org/wiki/YAML">Wikipedia</a>)

Basically AHRD needs a FASTA-File of amino acid sequences and different files
containing the results from the respective BLAST searches, in our example we
searched three databases: Uniprot/trEMBL, Uniprot/Swissprot and TAIR10. Note,
that AHRD is generic and can make use of any number of different Blast
databases that do not necessarily have to be the above ones. If, for example, you
are annotating genes from a fungal genome, searching yeast databases might be
preferable to using TAIR (_Arabidopsis thaliana_).

All parameters can be set manually, or the default ones can be used as given in
the example input file ./test/resources/ahrd_input_test_run.yml (see section
Parameters).

AHRD is recommended to be run on batches of 1,000 to 2,000 proteins. If you
want to annotate a whole genome use the included Batcher to split your
input-data into Batches of appropriate size (see section Batcher).

h3. 2.2 AHRD Example run:

<pre>java -Xmx2g -jar ./dist/ahrd.jar ./test/resources/ahrd_input_test_run.yml </pre>

or just execute

<pre>ant test.run </pre>

h3. 2.3 Recommended BLAST-Search:

For your query proteins you should start independent BLAST searches e.g.
in the three different databases mentioned above:

<pre>
blastall -p blastp -i proteins.fasta -o swissprot_blastout.pairwise -d swissprot.fasta -e 0.0001 -v 200 -b 200 -m 0
</pre>

h3. 2.4 Batcher:

Start the Batcher with:
<pre>mkdir test/resources/batch_ymls
java -cp ./dist/ahrd.jar ahrd.controller.Batcher ./test/resources/batcher_input_test.yml</pre>
You will have to edit ./test/resources/batcher_input_test.yml according to your
needs.

h3. 2.5 Output:

AHRD writes out a CSV table with the following columns:
# Protein-Accession -- The Query Protein's Accession
# Blast-Hit-Accession -- The Accession of the Protein the assigned description was taken from.
# AHRD-Quality-Code -- explained below
# Human-Readable-Description -- The assigned HRD
# Interpro-ID (Description) -- If AHRD was started with InterProScan-Results, they are appended here.
# Gene-Ontology-ID (Name) -- If AHRD was started with Gene-Ontology-Annotations, they are appended here.

AHRD's quality-code consists of a four character string, where each character is
either '*' if the respective criterion is met or '-' otherwise. Their meaning is
explained in the following table:

| Position | Criteria |
| 1 | Bit score of the blast result is >50 and e-value is <e-10 |
| 2 | Overlap of the blast result is >60% |
| 3 | Top token score of assigned HRD is >0.5 |
| 4 | Gene ontology terms found in description line |

h3. 3 Algorithm:

Based on e-values the 200 best scoring blast results are chosen from each
database-search (e.g. Swissprot, TAIR, trEMBL). For all 600 resulting candidate
description lines a score is calculated using a lexical approach. First each
description line is passed through two regular expression filters. The first
filter discards any matching description line in order to ignore descriptions
like e.g. 'Whole genome shotgun sequence', while the second filter tailors the
description lines deleting matching parts, in order to discard e.g. the
trailing Species-Descriptions 'OS=Arabidopsis thaliana [...]'. In the first
step of the scoring each description line is split into single tokens, which are
passed through a blacklist filter, ignoring all matching tokens in terms of
score. Tokens are sequences of characters with a collective meaning. For each
token a score is calculated from three single scores with different weights,
the bit score, the database score and the overlap score. The bit score is
provided within the blast result. The database score is a fixed score for each
blast database, based on the description quality of the database. The overlap
score reflects the overlap of the query and subject sequence. In the second
step the sum of all token scores from a description line is divided by a
correction factor that prevents the scoring system from being biased towards
longer or shorter description lines. In the third step the predicted gene
ontology terms, if available, are used to evaluate the description lines and to
get a better ranking. For this purpose only gene ontology terms with a probability
greater than 0.4 are used. From this ranking now the best scoring description
line can be chosen. In the last step a domain name provided by InterProScan
results, if available, is extracted and appended to the best scoring
description line for each uncharacterized protein.

In the end for each uncharacterized protein a description line is selected that
comes from a high-scoring BLAST match, that contains words occurring frequently
in the descriptions of highest scoring BLAST matches and that does not contain
meaningless "fill words". If available an assigned Interpro domain is appended
to the description line and each line will contain an evaluation section that
reflects the significance of the assigned human readable description.

h4. 3.1 Pseudo-Code

# Choose 600 best scoring blast results
# Filter description lines of above blast-results using regular expressions:
## Reject those matched by any regex given in e.g. ./test/resources/blacklist_descline.txt,
## Delete those parts of each description line, matching any regex in e.g. ./test/resources/filter_descline_sprot.txt.
# Divide each description line into tokens (sequences of characters with a collective meaning)
## In terms of score ignore any tokens matching regexs given e.g. in ./test/resources/blacklist_token.txt.
# Token score (calculated from: bitscore, database weight, overlap score)
# Lexical score (calculated from: Token score, High score factor, Pattern factor, Correction factor)
# Description score (calculated from: Lexical score, GO score, Blast score)
# Choose best scoring description line
# Append InterProScan description to chosen description line if available

h4. 3.2 Used Formulae and Parameters:

<img src="http://github.com/asishallab/AHRD/raw/master/images/formulae.jpg" />

h5. 3.3 Parameters

Above formulae use the following parameters as given in *./test/resources/ahrd_input_test_run.yml*

h5. 3.3.1 The weights in formula Token-Score are:

<pre>token_score_bit_score_weight: 0.5
token_score_database_score_weight: 0.3
token_score_overlap_score_weight: 0.2 </pre> and Blast-Database specific:
<pre>weight: 100 </pre>

h5. 3.3.2 The weight in formula Lexical-Score is:

<pre>description_score_relative_description_frequency_weight: 0.6 </pre>

h5. 3.3.3 The weight in formula Description-Score is also Blast-database specific:

<pre>description_score_bit_score_weight: 0.2 </pre>

h2. 4. Testing:

If you want to run the complete JUnit Test-Suite execute: <pre>ant</pre>

h2. 5. License

See attached file LICENSE.txt for details.

h2. 6. Authors

Kathrin Klee and Asis Hallab

Group "Plant Computational Biology"
Prof. Dr. Heiko Schoof

Max Planck Institute for Plant Breeding Research
Carl-von-Linné-Weg 10
50829 Köln (Cologne)
Germany

INRES Crop Bioinformatics
University of Bonn
Katzenburgweg 2
53115 Bonn
Germany
101 build.xml
@@ -0,0 +1,101 @@
<project name="AHRD" default="test" basedir=".">
<description>
Assign Human Readable Description (AHRD)
</description>
<!-- set global properties for this build -->
<property name="src" location="src" />
<property name="classes" location="classes" />
<property name="test.classes" location="classes/ahrd/test" />
<property name="lib" location="lib" />
<property name="src.test" location="test" />
<property name="dist" location="dist" />

<path id="classpath.compile">
<fileset dir="${lib}">
<include name="**/*.jar" />
</fileset>
</path>

<path id="classpath.test">
<fileset dir="${lib}">
<include name="**/*.jar" />
</fileset>
<pathelement location="${classes}" />
</path>

<target name="init">
<!-- Create the time stamp -->
<tstamp />
<!-- Create the build directory structure used by compile -->
<mkdir dir="${classes}" />
<delete>
<fileset dir="${classes}" />
</delete>
<mkdir dir="${classes}" />
</target>

<target name="compile" depends="init" description="compile the source ">
<!-- Compile the java code from ${src} into ${classes} -->
<javac srcdir="${src}" destdir="${classes}" includeAntRuntime="yes" nowarn="off">
<compilerarg value="-Xlint:unchecked" />
<classpath refid="classpath.compile" />
</javac>
</target>

<target name="compile.test" depends="compile" description="compile the test-classes">
<javac srcdir="${src.test}" destdir="${classes}" includeAntRuntime="yes" nowarn="off">
<compilerarg value="-Xlint:unchecked" />
<classpath refid="classpath.test" />
</javac>
</target>

<target name="dist" depends="compile" description="generate the distribution">
<!-- Create the distribution directory -->
<mkdir dir="${dist}" />
<unzip dest="${classes}">
<fileset dir="${lib}" />
</unzip>

<!-- Put everything in ${classes} into the ahrd.jar file -->
<jar jarfile="${dist}/ahrd.jar" basedir="${classes}" manifest="MANIFEST.MF" />

<delete>
<fileset dir="${classes}" />
</delete>
</target>

<target name="clean" description="clean up">
<!-- Delete the ${classes} and ${dist} directory trees -->
<delete dir="${classes}" />
<delete dir="${dist}" />
</target>

<target name="test" depends="compile.test">
<junit printsummary="true" fork="true" forkmode="perTest" maxmemory="2048m">
<classpath refid="classpath.test" />
<formatter type="plain" usefile="false" />
<!-- <test name="ahrd.test.AhrdTest" /> -->
<batchtest>
<fileset dir="${classes}" includes="**/*Test.class" />
</batchtest>
</junit>
</target>

<target name="test.run" depends="compile.test">
<junit printsummary="true" fork="true" forkmode="perTest" maxmemory="2048m">
<classpath refid="classpath.test" />
<formatter type="plain" usefile="false" />
<test name="ahrd.test.AhrdTestRun" />
</junit>
</target>

<target name="test.regexs" depends="compile.test">
<junit printsummary="true" fork="true" forkmode="perTest">
<classpath refid="classpath.test" />
<formatter type="plain" usefile="false" />
<test name="ahrd.test.TestRegexs" />
</junit>
</target>

</project>

BIN +131 KB images/formulae.jpg
Binary file not shown.

0 comments on commit 2d28544

Please sign in to comment.
You can’t perform that action at this time.