Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Moved Asis Hallab's repository to the official work group's one.
- Loading branch information
0 parents
commit 2d28544
Showing
120 changed files
with
41,850 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1,25 @@ | |||
<?xml version="1.0" encoding="UTF-8"?> | |||
<classpath> <!-- Eclipse JDT build path: two source folders, the JRE container, the jars under lib/ and the output folder --> | |||
<classpathentry kind="src" path="src"/> | |||
<classpathentry kind="src" path="test"/> <!-- the test tree is registered as a second source folder --> | |||
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/> | |||
<classpathentry kind="lib" path="lib/apps-1.7.1.jar"/> <!-- from here on: one entry per jar referenced from the project's lib/ folder --> | |||
<classpathentry kind="lib" path="lib/biojava-1.7.1.jar"/> | |||
<classpathentry kind="lib" path="lib/bytecode.jar"/> | |||
<classpathentry kind="lib" path="lib/commons-cli.jar"/> | |||
<classpathentry kind="lib" path="lib/commons-collections-2.1.jar"/> | |||
<classpathentry kind="lib" path="lib/commons-dbcp-1.1.jar"/> | |||
<classpathentry kind="lib" path="lib/commons-pool-1.1.jar"/> | |||
<classpathentry kind="lib" path="lib/demos-1.7.1.jar"/> | |||
<classpathentry kind="lib" path="lib/dtd-xercesImpl.jar"/> | |||
<classpathentry kind="lib" path="lib/jarjar-1.0.jar"/> | |||
<classpathentry kind="lib" path="lib/jgrapht-jdk1.5.jar"/> | |||
<classpathentry kind="lib" path="lib/junit-4.9b2.jar"/> | |||
<classpathentry kind="lib" path="lib/objenesis-1.2.jar"/> | |||
<classpathentry kind="lib" path="lib/serializer.jar"/> | |||
<classpathentry kind="lib" path="lib/xalan.jar"/> | |||
<classpathentry kind="lib" path="lib/xml-apis.jar"/> | |||
<classpathentry kind="lib" path="lib/xom-1.2.6.jar"/> | |||
<classpathentry kind="lib" path="lib/yamlbeans-1.06.jar"/> | |||
<classpathentry kind="output" path="classes"/> <!-- compiled classes are written to the classes folder --> | |||
</classpath> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1,18 @@ | |||
# Ignore everything in directory ./classes | |||
classes/**/* | |||
|
|||
# Ignore Batcher-Tests: | |||
start_ahrd_batched.sh | |||
|
|||
# The following directory is only created, if Batcher is tested: | |||
test/resources/batch_ymls | |||
|
|||
# Executable jar should also be ignored | |||
dist/ahrd.jar | |||
|
|||
# Ignore AHRD-Output generated by Test-Suite: | |||
test/ahrd_output.csv | |||
|
|||
# Ignore subversion related files | |||
.svn | |||
.svnignore |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1,23 @@ | |||
<?xml version="1.0" encoding="UTF-8"?> | |||
<projectDescription> <!-- Eclipse project descriptor: project name, build commands and project natures --> | |||
<name>AHRD_Java</name> | |||
<comment></comment> | |||
<projects> | |||
</projects> | |||
<buildSpec> | |||
<buildCommand> | |||
<name>org.eclipse.jdt.core.javabuilder</name> <!-- the Eclipse JDT Java builder --> | |||
<arguments> | |||
</arguments> | |||
</buildCommand> | |||
<buildCommand> | |||
<name>net.sourceforge.metrics.builder</name> <!-- NOTE(review): presumably contributed by the Eclipse Metrics plugin; confirm it is installed wherever this project is imported --> | |||
<arguments> | |||
</arguments> | |||
</buildCommand> | |||
</buildSpec> | |||
<natures> | |||
<nature>org.eclipse.jdt.core.javanature</nature> | |||
<nature>net.sourceforge.metrics.nature</nature> <!-- nature matching the metrics build command above --> | |||
</natures> | |||
</projectDescription> |
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1 @@ | |||
Main-Class: ahrd.controller.AHRD |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1,196 @@ | |||
h1. Automated Assignment of Human Readable Descriptions (AHRD) | |||
|
|||
Protein function has often been transferred from characterized proteins to | |||
novel proteins based on sequence similarity, e.g. using the best BLAST hit. To | |||
assign human readable descriptions to predicted proteins we developed a new | |||
program called Automated Assignment of Human Readable Descriptions (AHRD). We | |||
aim to select descriptions that are concise and informative, precise in regard | |||
to function and use standard nomenclature. AHRD scores BLAST hits taken from | |||
searches against different databases on the basis of the trust put into these | |||
databases and the local alignment quality. The BLAST hit descriptions are | |||
tokenized into informative words and a lexical analysis scores these tokens | |||
according to their frequency and the quality of the BLAST hits they occur in. | |||
Shared tokens with Gene-Ontology Annotations increase the description-scoring | |||
in order to use standard nomenclature where possible. Finally the best scoring | |||
description is assigned. | |||
|
|||
h2. 1.1 Requirements: | |||
|
|||
AHRD is a Java-Program which requires Java 1.5 or higher. | |||
|
|||
|
|||
h2. 1.2 Installation: | |||
|
|||
h3. 1.2.1 Get AHRD | |||
|
|||
Copy (clone) AHRD to your computer using git: | |||
<pre>git clone git://github.com/asishallab/AHRD.git</pre> | |||
|
|||
h3. 1.2.2 Build the executable jar: | |||
|
|||
Running <pre>ant dist</pre> will create the executable JAR-File: ./dist/ahrd.jar | |||
|
|||
h2. 2 Usage: | |||
|
|||
All AHRD-Inputs are passed to AHRD in a single YML-File. | |||
See test/resources/ahrd_input.yml for details. | |||
(About YAML-Format see <a href="http://en.wikipedia.org/wiki/YAML">Wikipedia</a>) | |||
|
|||
Basically AHRD needs a FASTA-File of amino acid sequences and different files | |||
containing the results from the respective BLAST searches, in our example we | |||
searched three databases: Uniprot/trEMBL, Uniprot/Swissprot and TAIR10. Note, | |||
that AHRD is generic and can make use of any number of different Blast | |||
databases that do not necessarily have to be the above ones. If you are, e.g., | |||
annotating genes from a fungal genome, searching yeast databases might be more | |||
advisable than using TAIR (_Arabidopsis thaliana_). | |||
|
|||
All parameters can be set manually, or the default ones can be used as given in | |||
the example input file ./test/resources/ahrd_input_test_run.yml (see section | |||
Parameters). | |||
|
|||
AHRD is recommended to be run on batches of 1,000 to 2,000 proteins. If you | |||
want to annotate a whole genome use the included Batcher to split your | |||
input-data into Batches of appropriate size (see section Batcher). | |||
|
|||
h3. 2.2 AHRD Example run: | |||
|
|||
<pre>java -Xmx2g -jar ./dist/ahrd.jar ./test/resources/ahrd_input_test_run.yml </pre> | |||
|
|||
or just execute | |||
|
|||
<pre>ant test.run </pre> | |||
|
|||
h3. 2.3 Recommended BLAST-Search: | |||
|
|||
For your query proteins you should start independent BLAST searches e.g. | |||
in the three different databases mentioned above: | |||
|
|||
<pre> | |||
blastall -p blastp -i proteins.fasta -o swissprot_blastout.pairwise -d swissprot.fasta -e 0.0001 -v 200 -b 200 -m 0 | |||
</pre> | |||
|
|||
h3. 2.4 Batcher: | |||
|
|||
Start the Batcher with: | |||
<pre>mkdir test/resources/batch_ymls | |||
java -cp ./dist/ahrd.jar ahrd.controller.Batcher ./test/resources/batcher_input_test.yml</pre> | |||
You will have to edit ./test/resources/batcher_input_test.yml according to your | |||
needs. | |||
|
|||
h3. 2.5 Output: | |||
|
|||
AHRD writes out a CSV table with the following columns: | |||
# Protein-Accession -- The Query Protein's Accession | |||
# Blast-Hit-Accession -- The Accession of the Protein the assigned description was taken from. | |||
# AHRD-Quality-Code -- explained below | |||
# Human-Readable-Description -- The assigned HRD | |||
# Interpro-ID (Description) -- If AHRD was started with InterProScan-Results, they are appended here. | |||
# Gene-Ontology-ID (Name) -- If AHRD was started with Gene-Ontology-Annotations, they are appended here. | |||
|
|||
AHRD's quality-code consists of a four character string, where each character is | |||
either '*' if the respective criterion is met or '-' otherwise. Their meaning is | |||
explained in the following table: | |||
|
|||
| Position | Criteria | | |||
| 1 | Bit score of the blast result is >50 and e-value is <e-10 | | |||
| 2 | Overlap of the blast result is >60% | | |||
| 3 | Top token score of assigned HRD is >0.5 | | |||
| 4 | Gene ontology terms found in description line | | |||
|
|||
h3. 3 Algorithm: | |||
|
|||
Based on e-values the 200 best scoring blast results are chosen from each | |||
database-search (e.g. Swissprot, TAIR, trEMBL). For all 600 resulting candidate | |||
description lines a score is calculated using a lexical approach. First each | |||
description line is passed through two regular expression filters. The first | |||
filter discards any matching description line in order to ignore descriptions | |||
like e.g. 'Whole genome shotgun sequence', while the second filter tailors the | |||
description lines deleting matching parts, in order to discard e.g. the | |||
trailing Species-Descriptions 'OS=Arabidopsis thaliana [...]'. In the first | |||
step of the scoring each description line is split into single tokens, which are | |||
passed through a blacklist filter, ignoring all matching tokens in terms of | |||
score. Tokens are sequences of characters with a collective meaning. For each | |||
token a score is calculated from three single scores with different weights, | |||
the bit score, the database score and the overlap score. The bit score is | |||
provided within the blast result. The database score is a fixed score for each | |||
blast database, based on the description quality of the database. The overlap | |||
score reflects the overlap of the query and subject sequence. In the second | |||
step the sum of all token scores from a description line is divided by a | |||
correction factor that prevents the scoring system from being biased towards | |||
longer or shorter description lines. In the third step the predicted gene | |||
ontology terms, if available, are used to evaluate the description lines and to | |||
get a better ranking. Therefore only gene ontology terms with a probability | |||
greater than 0.4 are used. From this ranking now the best scoring description | |||
line can be chosen. In the last step a domain name provided by InterProScan | |||
results, if available, is extracted and appended to the best scoring | |||
description line for each uncharacterized protein. | |||
|
|||
In the end for each uncharacterized protein a description line is selected that | |||
comes from a high-scoring BLAST match, that contains words occurring frequently | |||
in the descriptions of highest scoring BLAST matches and that does not contain | |||
meaningless "fill words". If available an assigned Interpro domain is appended | |||
to the description line and each line will contain an evaluation section that | |||
reflects the significance of the assigned human readable description. | |||
|
|||
h4. 3.1 Pseudo-Code | |||
|
|||
# Choose 600 best scoring blast results | |||
# Filter description lines of above blast-results using regular expressions: | |||
## Reject those matched by any regex given in e.g. ./test/resources/blacklist_descline.txt, | |||
## Delete those parts of each description line, matching any regex in e.g. ./test/resources/filter_descline_sprot.txt. | |||
# Divide each description line into tokens (characters of collective meaning) | |||
## In terms of score ignore any tokens matching regexs given e.g. in ./test/resources/blacklist_token.txt. | |||
# Token score (calculated from: bitscore, database weight, overlap score) | |||
# Lexical score (calculated from: Token score, High score factor, Pattern factor, Correction factor) | |||
# Description score (calculated from: Lexical score, GO score, Blast score) | |||
# Choose best scoring description line | |||
# Append InterProScan description to chosen description line if available | |||
|
|||
h4. 3.2 Used Formulae and Parameters: | |||
|
|||
<img src="http://github.com/asishallab/AHRD/raw/master/images/formulae.jpg" /> | |||
|
|||
h5. 3.3 Parameters | |||
|
|||
Above formulae use the following parameters as given in *./test/resources/ahrd_input_test_run.yml* | |||
|
|||
h5. 3.3.1 The weights in formula Token-Score are: | |||
|
|||
<pre>token_score_bit_score_weight: 0.5 | |||
token_score_database_score_weight: 0.3 | |||
token_score_overlap_score_weight: 0.2 </pre> and Blast-Database specific: | |||
<pre>weight: 100 </pre> | |||
|
|||
h5. 3.3.2 The weight in formula Lexical-Score is: | |||
|
|||
<pre>description_score_relative_description_frequency_weight: 0.6 </pre> | |||
|
|||
h5. 3.3.3 The weight in formula Description-Score also is Blast-database specific: | |||
|
|||
<pre>description_score_bit_score_weight: 0.2 </pre> | |||
|
|||
h2. 4. Testing: | |||
|
|||
If you want to run the complete JUnit Test-Suite execute: <pre>ant</pre> | |||
|
|||
h2. 5. License | |||
|
|||
See attached file LICENSE.txt for details. | |||
|
|||
h2. 6. Authors | |||
|
|||
Kathrin Klee and Asis Hallab | |||
|
|||
Group "Plant Computational Biology" | |||
Prof. Dr. Heiko Schoof | |||
|
|||
Max Planck Institute for Plant Breeding Research | |||
Carl-von-Linné-Weg 10 | |||
50829 Köln (Cologne) | |||
Germany | |||
|
|||
INRES Crop Bioinformatics | |||
University of Bonn | |||
Katzenburgweg 2 | |||
53115 Bonn | |||
Germany |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1,101 @@ | |||
<project name="AHRD" default="test" basedir="."> | |||
<description> | |||
Assign Human Readable Description (AHRD) | |||
</description> | |||
<!-- set global properties for this build --> | |||
<!-- Input trees: production sources, test sources and the jar dependencies --> | |||
<property name="src" location="src" /> | |||
<property name="src.test" location="test" /> | |||
<property name="lib" location="lib" /> | |||
<!-- Output locations: compiled classes and the distributable jar --> | |||
<property name="classes" location="classes" /> | |||
<property name="dist" location="dist" /> | |||
<!-- test.classes is not referenced by any target visible in this build file --> | |||
<property name="test.classes" location="classes/ahrd/test" /> | |||
|
|||
<!-- Compile classpath: every jar found anywhere below ${lib} --> | |||
<path id="classpath.compile"> | |||
  <fileset dir="${lib}" includes="**/*.jar" /> | |||
</path> | |||
|
|||
<!-- Test classpath: the jars of classpath.compile first, then the compiled classes in ${classes} --> | |||
<path id="classpath.test"> | |||
  <path refid="classpath.compile" /> | |||
  <pathelement location="${classes}" /> | |||
</path> | |||
|
|||
<target name="init"> | |||
  <!-- Prepares a pristine output directory for the compile targets. --> | |||
  <!-- Create the time stamp --> | |||
  <tstamp /> | |||
  <!-- Remove the whole ${classes} tree, not only its files: a file-only delete leaves empty --> | |||
  <!-- directories behind, e.g. those created when the dist target unpacks the library jars. --> | |||
  <!-- Deleting a directory that does not exist yet is not an error, so no guard mkdir is needed. --> | |||
  <delete dir="${classes}" /> | |||
  <!-- Create the build directory structure used by compile --> | |||
  <mkdir dir="${classes}" /> | |||
</target> | |||
|
|||
<target name="compile" depends="init" description="compile the source "> | |||
  <!-- Builds the sources under ${src} into ${classes} against the jars of classpath.compile; --> | |||
  <!-- javac is asked to report unchecked operations. --> | |||
  <javac destdir="${classes}" srcdir="${src}" classpathref="classpath.compile" nowarn="off" includeAntRuntime="yes"> | |||
    <compilerarg value="-Xlint:unchecked" /> | |||
  </javac> | |||
</target> | |||
|
|||
<target name="compile.test" depends="compile" description="compile the test-classes"> | |||
  <!-- Builds the test sources under ${src.test} into the same ${classes} directory as the --> | |||
  <!-- production code, using classpath.test (library jars plus the already compiled classes). --> | |||
  <javac destdir="${classes}" srcdir="${src.test}" classpathref="classpath.test" nowarn="off" includeAntRuntime="yes"> | |||
    <compilerarg value="-Xlint:unchecked" /> | |||
  </javac> | |||
</target> | |||
|
|||
<target name="dist" depends="compile" description="generate the distribution"> | |||
  <!-- Builds the self-contained executable ${dist}/ahrd.jar: the compiled classes plus the --> | |||
  <!-- unpacked contents of every library jar, with MANIFEST.MF as the jar manifest. --> | |||
  <!-- Create the distribution directory --> | |||
  <mkdir dir="${dist}" /> | |||
  <!-- Unpack only the *.jar archives of ${lib}; any other file in that directory would make unzip fail. --> | |||
  <!-- Signature files are skipped: they no longer match once the libraries are repackaged into one jar, --> | |||
  <!-- and a stale signature makes the JVM refuse to load the resulting archive. --> | |||
  <unzip dest="${classes}"> | |||
    <fileset dir="${lib}" includes="**/*.jar" /> | |||
    <patternset> | |||
      <exclude name="META-INF/*.SF" /> | |||
      <exclude name="META-INF/*.DSA" /> | |||
      <exclude name="META-INF/*.RSA" /> | |||
    </patternset> | |||
  </unzip> | |||
  <!-- Put everything in ${classes} into the ahrd.jar file --> | |||
  <jar jarfile="${dist}/ahrd.jar" basedir="${classes}" manifest="MANIFEST.MF" /> | |||
  <!-- Empty ${classes} again so the unpacked library classes do not leak into later builds --> | |||
  <delete> | |||
    <fileset dir="${classes}" /> | |||
  </delete> | |||
</target> | |||
|
|||
<target name="clean" description="clean up"> | |||
  <!-- Removes both generated directory trees: the distribution output and the compiled classes --> | |||
  <delete dir="${dist}" /> | |||
  <delete dir="${classes}" /> | |||
</target> | |||
|
|||
<target name="test" depends="compile.test"> | |||
  <!-- Runs every compiled class matching **/*Test.class below ${classes} in forked JVMs (2048m heap). --> | |||
  <!-- All tests run to completion; a failure or error sets the tests.failed property so that the --> | |||
  <!-- build is reported as failed afterwards instead of finishing successfully with broken tests. --> | |||
  <junit printsummary="true" fork="true" forkmode="perTest" maxmemory="2048m" failureproperty="tests.failed" errorproperty="tests.failed"> | |||
    <classpath refid="classpath.test" /> | |||
    <formatter type="plain" usefile="false" /> | |||
    <!-- <test name="ahrd.test.AhrdTest" /> --> | |||
    <batchtest> | |||
      <fileset dir="${classes}" includes="**/*Test.class" /> | |||
    </batchtest> | |||
  </junit> | |||
  <fail if="tests.failed" message="One or more JUnit tests failed or ended with an error; see the test output above." /> | |||
</target> | |||
|
|||
<target name="test.run" depends="compile.test"> | |||
  <!-- Runs the single class ahrd.test.AhrdTestRun in a forked JVM with a 2048m heap, --> | |||
  <!-- printing plain-format results to the console rather than to a report file. --> | |||
  <junit fork="true" forkmode="perTest" maxmemory="2048m" printsummary="true"> | |||
    <formatter usefile="false" type="plain" /> | |||
    <classpath refid="classpath.test" /> | |||
    <test name="ahrd.test.AhrdTestRun" /> | |||
  </junit> | |||
</target> | |||
|
|||
<target name="test.regexs" depends="compile.test"> | |||
  <!-- Runs the single class ahrd.test.TestRegexs in a forked JVM (no explicit heap limit), --> | |||
  <!-- printing plain-format results to the console rather than to a report file. --> | |||
  <junit fork="true" forkmode="perTest" printsummary="true"> | |||
    <formatter usefile="false" type="plain" /> | |||
    <classpath refid="classpath.test" /> | |||
    <test name="ahrd.test.TestRegexs" /> | |||
  </junit> | |||
</target> | |||
|
|||
</project> | |||
|
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Oops, something went wrong.