Moved Asis Hallab's repository to the official work group's one.

groupschoof · Mar 13, 2012 · 2d28544 · 2d28544
commit 2d28544
Show file tree

Hide file tree

Showing 120 changed files with 41,850 additions and 0 deletions.
diff --git a/.classpath b/.classpath
@@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Eclipse build path: sources in src/ and test/, the JRE container, the jars
+     listed from lib/, and classes/ as the output folder. -->
+<classpath>
+	<classpathentry kind="src" path="src"/>
+	<classpathentry kind="src" path="test"/>
+	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
+	<classpathentry kind="lib" path="lib/apps-1.7.1.jar"/>
+	<classpathentry kind="lib" path="lib/biojava-1.7.1.jar"/>
+	<classpathentry kind="lib" path="lib/bytecode.jar"/>
+	<classpathentry kind="lib" path="lib/commons-cli.jar"/>
+	<classpathentry kind="lib" path="lib/commons-collections-2.1.jar"/>
+	<classpathentry kind="lib" path="lib/commons-dbcp-1.1.jar"/>
+	<classpathentry kind="lib" path="lib/commons-pool-1.1.jar"/>
+	<classpathentry kind="lib" path="lib/demos-1.7.1.jar"/>
+	<classpathentry kind="lib" path="lib/dtd-xercesImpl.jar"/>
+	<classpathentry kind="lib" path="lib/jarjar-1.0.jar"/>
+	<classpathentry kind="lib" path="lib/jgrapht-jdk1.5.jar"/>
+	<classpathentry kind="lib" path="lib/junit-4.9b2.jar"/>
+	<classpathentry kind="lib" path="lib/objenesis-1.2.jar"/>
+	<classpathentry kind="lib" path="lib/serializer.jar"/>
+	<classpathentry kind="lib" path="lib/xalan.jar"/>
+	<classpathentry kind="lib" path="lib/xml-apis.jar"/>
+	<classpathentry kind="lib" path="lib/xom-1.2.6.jar"/>
+	<classpathentry kind="lib" path="lib/yamlbeans-1.06.jar"/>
+	<classpathentry kind="output" path="classes"/>
+</classpath>
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,18 @@
+# Ignore everything in directory ./classes 
+classes/**/*
+
+# Ignore Batcher-Tests:
+start_ahrd_batched.sh
+
+# The following directory is only created, if Batcher is tested:
+test/resources/batch_ymls
+
+# Executable jar should also be ignored
+dist/ahrd.jar
+
+# Ignore AHRD-Output generated by Test-Suite:
+test/ahrd_output.csv
+
+# Ignore subversion related files
+.svn
+.svnignore
diff --git a/.project b/.project
@@ -0,0 +1,23 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Eclipse project description for AHRD_Java: registers the JDT Java builder and
+     nature plus the builder and nature named net.sourceforge.metrics.*.
+     NOTE(review): the metrics entries presumably require the Eclipse Metrics
+     plug-in to be installed - confirm. -->
+<projectDescription>
+	<name>AHRD_Java</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.eclipse.jdt.core.javabuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+		<buildCommand>
+			<name>net.sourceforge.metrics.builder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.eclipse.jdt.core.javanature</nature>
+		<nature>net.sourceforge.metrics.nature</nature>
+	</natures>
+</projectDescription>
diff --git a/LICENSE.txt b/LICENSE.txt
diff --git a/MANIFEST.MF b/MANIFEST.MF
@@ -0,0 +1 @@
+Main-Class: ahrd.controller.AHRD
diff --git a/README.textile b/README.textile
@@ -0,0 +1,196 @@
+h1. Automated Assignment of Human Readable Descriptions (AHRD)
+
+Protein function has often been transferred from characterized proteins to
+novel proteins based on sequence similarity, e.g. using the best BLAST hit. To
+assign human readable descriptions to predicted proteins we developed a new
+program called Automated Assignment of Human Readable Descriptions (AHRD). We
+aim to select descriptions that are concise and informative, precise in regard
+to function and use standard nomenclature. AHRD scores BLAST hits taken from
+searches against different databases on the basis of the trust put into these
+databases and the local alignment quality. The BLAST hit descriptions are
+tokenized into informative words and a lexical analysis scores these tokens
+according to their frequency and the quality of the BLAST hits they occur in.
+Shared tokens with Gene-Ontology Annotations increase the description-scoring
+in order to use standard nomenclature where possible. Finally the best scoring
+description is assigned.
+
+h2. 1.1 Requirements:
+
+AHRD is a Java-Program which requires Java 1.5 or higher.
+
+
+h2. 1.2 Installation:
+
+h3. 1.2.1 Get AHRD
+
+Copy (clone) AHRD to your computer using git:
+<pre>git clone git://github.com/asishallab/AHRD.git</pre>
+
+h3. 1.2.2 Build the executable jar:
+
+Running <pre>ant dist</pre> will create the executable JAR-File: ./dist/ahrd.jar
+
+h2. 2 Usage:
+
+All AHRD-Inputs are passed to AHRD in a single YML-File.
+See test/resources/ahrd_input.yml for details.
+(About YAML-Format see <a href="http://en.wikipedia.org/wiki/YAML">Wikipedia</a>)
+
+Basically AHRD needs a FASTA-File of amino acid sequences and different files
+containing the results from the respective BLAST searches. In our example we
+searched three databases: Uniprot/trEMBL, Uniprot/Swissprot and TAIR10. Note
+that AHRD is generic and can make use of any number of different BLAST
+databases that do not necessarily have to be the above ones. If, for example,
+you annotate genes from a fungal genome, searching yeast databases might be
+more advisable than using TAIR (_Arabidopsis thaliana_).
+
+All parameters can be set manually, or the default ones can be used as given in
+the example input file ./test/resources/ahrd_input_test_run.yml (see section
+Parameters).
+
+AHRD is recommended to be run on batches of 1,000 to 2,000 proteins. If you
+want to annotate a whole genome use the included Batcher to split your
+input-data into Batches of appropriate size (see section Batcher).
+
+h3. 2.2 AHRD Example run:
+
+<pre>java -Xmx2g -jar ./dist/ahrd.jar ./test/resources/ahrd_input_test_run.yml </pre>
+
+or just execute
+
+<pre>ant test.run </pre>
+
+h3. 2.3 Recommended BLAST-Search:
+
+For your query proteins you should start independent BLAST searches e.g.
+in the three different databases mentioned above:
+
+<pre>
+blastall -p blastp -i proteins.fasta -o swissprot_blastout.pairwise -d swissprot.fasta -e 0.0001 -v 200 -b 200 -m 0
+</pre>
+
+h3. 2.4 Batcher:
+
+Start the Batcher with:
+<pre>mkdir test/resources/batch_ymls 
+java -cp ./dist/ahrd.jar ahrd.controller.Batcher ./test/resources/batcher_input_test.yml</pre>
+You will have to edit ./test/resources/batcher_input_test.yml according to your
+needs.
+
+h3. 2.5 Output:
+
+AHRD writes out a CSV table with the following columns:
+# Protein-Accesion -- The Query Protein's Accession
+# Blast-Hit-Accession -- The Accession of the Protein the assigned description was taken from.
+# AHRD-Quality-Code -- explained below
+# Human-Readable-Description -- The assigned HRD
+# Interpro-ID (Description) -- If AHRD was started with InterProScan-Results, they are appended here.
+# Gene-Ontology-ID (Name) -- If AHRD was started with Gene-Ontology-Annotations, they are appended here.
+
+AHRD's quality-code consists of a four character string, where each character is
+either '*' if the respective criterion is met or '-' otherwise. Their meaning is
+explained in the following table:
+
+| Position | Criteria |
+| 1 | Bit score of the blast result is >50 and e-value is <e-10 |
+| 2 | Overlap of the blast result is >60% |
+| 3 | Top token score of assigned HRD is >0.5 |
+| 4 | Gene ontology terms found in description line |
+
+h3. 3 Algorithm:
+
+Based on e-values the 200 best scoring blast results are chosen from each
+database-search (e.g. Swissprot, TAIR, trEMBL). For all 600 resulting candidate
+description lines a score is calculated using a lexical approach. First each
+description line is passed through two regular expression filters. The first
+filter discards any matching description line in order to ignore descriptions
+like e.g. 'Whole genome shotgun sequence', while the second filter tailors the
+description lines deleting matching parts, in order to discard e.g. the
+trailing Species-Descriptions 'OS=Arabidopsis thaliana [...]'. In the first
+scoring step each description line is split into single tokens, which are
+passed through a blacklist filter, ignoring all matching tokens in terms of
+score. Tokens are sequences of characters with a collective meaning. For each
+token a score is calculated from three single scores with different weights,
+the bit score, the database score and the overlap score. The bit score is
+provided within the blast result. The database score is a fixed score for each
+blast database, based on the description quality of the database. The overlap
+score reflects the overlap of the query and subject sequence. In the second
+step the sum of all token scores from a description line is divided by a
+correction factor that keeps the scoring system from being biased towards
+longer or shorter description lines. In the third step the predicted gene
+ontology terms, if available, are used to evaluate the description lines and to
+get a better ranking. For this purpose only gene ontology terms with a probability
+greater than 0.4 are used. From this ranking the best scoring description
+line can be chosen. In the last step a domain name provided by InterProScan
+results, if available, is extracted and appended to the best scoring
+description line for each uncharacterized protein.
+
+In the end for each uncharacterized protein a description line is selected that
+comes from a high-scoring BLAST match, that contains words occurring frequently
+in the descriptions of highest scoring BLAST matches and that does not contain
+meaningless "fill words". If available an assigned Interpro domain is appended
+to the description line and each line will contain an evaluation section that
+reflects the significance of the assigned human readable description.
+
+h4. 3.1 Pseudo-Code
+
+# Choose 600 best scoring blast results
+# Filter description lines of above blast-results using regular expressions:
+## Reject those matched by any regex given in e.g. ./test/resources/blacklist_descline.txt,
+## Delete those parts of each description line, matching any regex in e.g. ./test/resources/filter_descline_sprot.txt. 
+# Divide each description line into tokens (sequences of characters with a collective meaning)
+## In terms of score ignore any tokens matching regexs given e.g. in ./test/resources/blacklist_token.txt.
+# Token score (calculated from: bitscore, database weight, overlap score)
+# Lexical score (calculated from: Token score, High score factor, Pattern factor, Correction factor)
+# Description score (calculated from: Lexical score, GO score, Blast score)
+# Choose best scoring description line
+# Append InterProScan description to chosen description line if available
+
+h4. 3.2 Used Formulae and Parameters:
+
+<img src="http://github.com/asishallab/AHRD/raw/master/images/formulae.jpg" />
+
+h5. 3.3 Parameters
+
+Above formulae use the following parameters as given in *./test/resources/ahrd_input_test_run.yml*
+
+h5. 3.3.1 The weights in formula Token-Score are:  
+
+<pre>token_score_bit_score_weight: 0.5
+token_score_database_score_weight: 0.3
+token_score_overlap_score_weight: 0.2 </pre> and Blast-Database specific:
+<pre>weight: 100 </pre>
+
+h5. 3.3.2 The weight in formula Lexical-Score is:
+
+<pre>description_score_relative_description_frequency_weight: 0.6 </pre>
+
+h5. 3.3.3 The weight in formula Description-Score also is Blast-database specific:
+
+<pre>description_score_bit_score_weight: 0.2 </pre>
+
+h2. 4. Testing:
+
+If you want to run the complete JUnit Test-Suite execute: <pre>ant</pre>
+
+h2. 5. License
+
+See attached file LICENSE.txt for details.
+
+h2. 6. Authors
+
+Kathrin Klee and Asis Hallab
+
+Group "Plant Computational Biology"
+Prof. Dr. Heiko Schoof
+
+Max Planck Institute for Plant Breeding Research
+Carl-von-Linné-Weg 10
+50829 Köln (Cologne)
+Germany
+
+INRES Crop Bioinformatics
+University of Bonn
+Katzenburgweg 2
+53115 Bonn
+Germany
diff --git a/build.xml b/build.xml
@@ -0,0 +1,101 @@
+<project name="AHRD" default="test" basedir=".">
+	<description>
+    Assign Human Readable Description (AHRD)
+  </description>
+	<!-- Directory layout shared by all targets; each location resolves against basedir. -->
+	<property name="src" location="src" />
+	<property name="src.test" location="test" />
+	<property name="lib" location="lib" />
+	<property name="classes" location="classes" />
+	<!-- NOTE(review): test.classes is not referenced by any target visible in this file. -->
+	<property name="test.classes" location="classes/ahrd/test" />
+	<property name="dist" location="dist" />
+
+	<!-- Compile classpath: every jar found below ${lib}. -->
+	<path id="classpath.compile">
+		<fileset dir="${lib}" includes="**/*.jar" />
+	</path>
+
+	<!-- Test classpath: the compile classpath followed by the ${classes} directory. -->
+	<path id="classpath.test">
+		<path refid="classpath.compile" />
+		<pathelement location="${classes}" />
+	</path>
+
+	<!-- Prepares an empty ${classes} directory so that every build starts clean. -->
+	<target name="init">
+		<!-- Create the time stamp -->
+		<tstamp />
+		<!-- Remove the whole tree rather than only the files in it: a files-only
+		     delete leaves the empty directories of earlier builds (and of the
+		     libraries unpacked by the "dist" target) behind. -->
+		<delete dir="${classes}" />
+		<!-- Create the build directory structure used by compile -->
+		<mkdir dir="${classes}" />
+	</target>
+
+	<!-- Compiles the sources below ${src} into ${classes} against the library jars,
+	     with javac's unchecked-operation lint enabled. -->
+	<target name="compile" depends="init" description="compile the source ">
+		<javac
+			srcdir="${src}"
+			destdir="${classes}"
+			classpathref="classpath.compile"
+			includeAntRuntime="yes"
+			nowarn="off">
+			<compilerarg value="-Xlint:unchecked" />
+		</javac>
+	</target>
+
+	<!-- Compiles the sources below ${src.test} into the same ${classes} directory,
+	     using the test classpath (library jars plus ${classes}). -->
+	<target name="compile.test" depends="compile" description="compile the test-classes">
+		<javac
+			srcdir="${src.test}"
+			destdir="${classes}"
+			classpathref="classpath.test"
+			includeAntRuntime="yes"
+			nowarn="off">
+			<compilerarg value="-Xlint:unchecked" />
+		</javac>
+	</target>
+
+	<!-- Builds ${dist}/ahrd.jar from the compiled classes plus the unpacked contents
+	     of the library jars, using MANIFEST.MF as the manifest. -->
+	<target name="dist" depends="compile" description="generate the distribution">
+		<!-- Create the distribution directory -->
+		<mkdir dir="${dist}" />
+		<!-- Unpack the libraries next to the compiled classes. Only *.jar files are
+		     handed to unzip, so a non-archive file in ${lib} cannot break the build. -->
+		<unzip dest="${classes}">
+			<fileset dir="${lib}" includes="**/*.jar" />
+			<!-- Signature files of a signed library no longer match once its
+			     entries are merged into ahrd.jar, and the JVM would then refuse
+			     to load the archive; leave them out. -->
+			<patternset>
+				<exclude name="META-INF/*.SF" />
+				<exclude name="META-INF/*.DSA" />
+				<exclude name="META-INF/*.RSA" />
+			</patternset>
+		</unzip>
+
+		<!-- Put everything in ${classes} into the ahrd.jar file -->
+		<jar jarfile="${dist}/ahrd.jar" basedir="${classes}" manifest="MANIFEST.MF" />
+
+		<!-- Empty ${classes} again; it now also holds the unpacked library files. -->
+		<delete>
+			<fileset dir="${classes}" />
+		</delete>
+	</target>
+
+	<!-- Removes the two build output directories. -->
+	<target name="clean" description="clean up">
+		<!-- Delete the ${classes} and ${dist} directory trees -->
+		<delete dir="${classes}" />
+		<delete dir="${dist}" />
+	</target>
+
+	<!-- Runs every compiled class matching *Test.class in forked JVMs and fails the
+	     build when any of them reports a failure or an error. -->
+	<target name="test" depends="compile.test">
+		<!-- junit does not stop the build on failing tests by default. Record the
+		     outcome in a property so the whole suite still runs, then fail below. -->
+		<junit printsummary="true" fork="true" forkmode="perTest" maxmemory="2048m"
+			failureproperty="test.failed" errorproperty="test.failed">
+			<classpath refid="classpath.test" />
+			<formatter type="plain" usefile="false" />
+			<!-- <test name="ahrd.test.AhrdTest" /> -->
+			<batchtest>
+				<fileset dir="${classes}" includes="**/*Test.class" />
+			</batchtest>
+		</junit>
+		<fail if="test.failed" message="JUnit test-suite reported failures or errors." />
+	</target>
+
+	<!-- Runs the single JUnit class ahrd.test.AhrdTestRun in a forked JVM and fails
+	     the build when it reports a failure or an error. -->
+	<target name="test.run" depends="compile.test">
+		<!-- junit does not stop the build on failing tests by default; record the
+		     outcome in a property and fail explicitly afterwards. -->
+		<junit printsummary="true" fork="true" forkmode="perTest" maxmemory="2048m"
+			failureproperty="test.run.failed" errorproperty="test.run.failed">
+			<classpath refid="classpath.test" />
+			<formatter type="plain" usefile="false" />
+			<test name="ahrd.test.AhrdTestRun" />
+		</junit>
+		<fail if="test.run.failed" message="ahrd.test.AhrdTestRun reported failures or errors." />
+	</target>
+
+	<!-- Runs the single JUnit class ahrd.test.TestRegexs in a forked JVM and fails
+	     the build when it reports a failure or an error. -->
+	<target name="test.regexs" depends="compile.test">
+		<!-- junit does not stop the build on failing tests by default; record the
+		     outcome in a property and fail explicitly afterwards. -->
+		<junit printsummary="true" fork="true" forkmode="perTest"
+			failureproperty="test.regexs.failed" errorproperty="test.regexs.failed">
+			<classpath refid="classpath.test" />
+			<formatter type="plain" usefile="false" />
+			<test name="ahrd.test.TestRegexs" />
+		</junit>
+		<fail if="test.regexs.failed" message="ahrd.test.TestRegexs reported failures or errors." />
+	</target>
+
+</project>
+
diff --git a/images/formulae.jpg b/images/formulae.jpg