This repository has been archived by the owner on Oct 20, 2018. It is now read-only.

Adds log-linear weighting of features for disambiguation #390

Open · wants to merge 62 commits into base branch `development`

Commits
3f62f2f
migrating to scala 2.10
tgalery Jun 1, 2015
839e41e
add dependencies for new breeze version
Jun 12, 2015
99095bb
started vector model wrapper in contextsimilarity interface, added br…
Jun 13, 2015
c44546b
VectorContextSimilarity partly implemented with basic vector model fu…
Jun 14, 2015
d063627
fixed dependencies, but mvn package still fails.
Jun 15, 2015
b16b760
Seems that I was misusing breeze. core now compiles but is untested
Jun 15, 2015
4a7ac33
adding scala actors
tgalery Jun 17, 2015
b8a5030
add dependencies for new breeze version
Jun 12, 2015
0f9894a
started vector model wrapper in contextsimilarity interface, added br…
Jun 13, 2015
ce93716
VectorContextSimilarity partly implemented with basic vector model fu…
Jun 14, 2015
d537800
fixed dependencies, but mvn package still fails.
Jun 15, 2015
ad2491e
Seems that I was misusing breeze. core now compiles but is untested
Jun 15, 2015
58b99de
trying to fix breeze errors
Jun 17, 2015
a95776a
everything builds now
Jun 17, 2015
841584f
use vector context similarity
Jul 1, 2015
292ee27
disregard, just testing
Jul 1, 2015
e5ea9d4
fix paths
Jul 1, 2015
51c792f
Integrated VectorContextSimilarity to the point where it's basically …
Jul 1, 2015
ef353aa
work in progress: building memorystore for vector models, and refacto…
Jul 9, 2015
f89a80f
finishing up implementation of memorystore for vector models. implem…
Jul 11, 2015
3f2ad3b
edited .gitignore
Jul 12, 2015
0444238
implemented vector model store including model creation
Jul 13, 2015
9a1db6f
evaluation and vector store indexing implemented
Jul 15, 2015
cbd8250
working on training data generation for RankLib
Jul 22, 2015
5fbd072
undo some unneeded changes from before
Jul 22, 2015
2c194e4
added RankLib training data generation as a side effect of evaluation
Jul 22, 2015
6114561
basic log-linear model weighting implemented (gives a small performan…
Jul 24, 2015
00b6169
work in progress: LLM training integration into index
Aug 2, 2015
9cdb668
refactored LLM training to separate script
Aug 9, 2015
20ac1cc
updated ranklib training data generation
Aug 10, 2015
2ab209d
update ranklib model creation. Training data generation works, but sc…
Aug 10, 2015
2c0a2b0
RanklibTrainingDataWriter bug fixed, llm weights training implemented
Aug 11, 2015
4db1012
integrated LLM training as option into index_db.sh
Aug 11, 2015
4018540
made run_server executable
Aug 11, 2015
5cfcf7f
add dependencies for new breeze version
Jun 12, 2015
9e1c13a
add dependencies for new breeze version
Jun 12, 2015
f2b0718
trying to fix breeze errors
Jun 17, 2015
727c322
everything builds now
Jun 17, 2015
e22733d
use vector context similarity
Jul 1, 2015
8162a2d
Integrated VectorContextSimilarity to the point where it's basically …
Jul 1, 2015
3919e7c
work in progress: building memorystore for vector models, and refacto…
Jul 9, 2015
1a07e8d
finishing up implementation of memorystore for vector models. implem…
Jul 11, 2015
ddeb717
working on training data generation for RankLib
Jul 22, 2015
069495d
undo some unneeded changes from before
Jul 22, 2015
3941b8e
added RankLib training data generation as a side effect of evaluation
Jul 22, 2015
bef2a03
basic log-linear model weighting implemented (gives a small performan…
Jul 24, 2015
f7ef127
work in progress: LLM training integration into index
Aug 2, 2015
5430db4
refactored LLM training to separate script
Aug 9, 2015
d4de688
updated ranklib training data generation
Aug 10, 2015
2523b7a
update ranklib model creation. Training data generation works, but sc…
Aug 10, 2015
21d6171
RanklibTrainingDataWriter bug fixed, llm weights training implemented
Aug 11, 2015
fabf86d
integrated LLM training as option into index_db.sh
Aug 11, 2015
86a95fe
made llm mixture weights loading optional
Aug 12, 2015
0d09881
added new features to LLM model, but there are still some details lef…
Aug 15, 2015
6b672cd
saving all progress on new features and LLM training
Aug 17, 2015
2b9205b
only use vectorcontextsimilarity if we can actually load it here
Sep 2, 2015
44fdfb7
fixing errors that were introduced in the rebase
Jan 8, 2016
f1e5318
General cleanup in preparation for PR to @tgalery's new branch. This …
phdowling Mar 20, 2016
f649c4d
Merge pull request #1 from phdowling/feature/gsoc-llm-hopefully-final
tgalery Mar 20, 2016
beff993
improving readme and version bump
tgalery Jun 19, 2016
b80397b
removing unecessary .sh files
tgalery Jun 28, 2016
c807e63
removing another missing script
tgalery Jun 28, 2016
9 changes: 8 additions & 1 deletion .gitignore
@@ -3,9 +3,16 @@
.classpath
.project
.settings/
data_quickstart
target
*.log
*~
index/output
core/.cache
data_quickstart/
dist
eval/.cache
index/.cache
push.sh
rest-tomcat/.cache
rest/.cache
uima/.cache
49 changes: 44 additions & 5 deletions README.md
@@ -1,4 +1,4 @@
# DBpedia Spotlight
# DBpedia Spotlight
#### Shedding Light on the Web of Documents

DBpedia Spotlight looks for ~3.5M things of unknown or ~320 known types in text and tries to link them to their global unique identifiers in [DBpedia](http://dbpedia.org).
@@ -28,12 +28,51 @@ or for JSON:

#### Run your own server

##### Download jar file and data

If you need service reliability and lower response times, you can run DBpedia Spotlight in your own [In-House Server](https://github.com/dbpedia-spotlight/dbpedia-spotlight/wiki/Installation). Just download a model and Spotlight from [here](http://spotlight.sztaki.hu/downloads/) to get started.

wget http://spotlight.sztaki.hu/downloads/dbpedia-spotlight-latest.jar
wget http://spotlight.sztaki.hu/downloads/latest_models/en.tar.gz
tar xzf en.tar.gz
java -jar dbpedia-spotlight-latest.jar en http://localhost:2222/rest
1. wget http://spotlight.sztaki.hu/downloads/dbpedia-spotlight-latest.jar
2. wget http://spotlight.sztaki.hu/downloads/latest_models/en.tar.gz
3. tar xzf en.tar.gz
4. java -jar dbpedia-spotlight-latest.jar en http://localhost:2222/rest

Note that `en` above is the path to the English model downloaded in step 2, and
`http://localhost:2222/rest` is the mount point of the Spotlight server.
Although you can change the base address and port, you cannot change the `/rest` mount point.
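Once the server is up, you query it over plain HTTP. A minimal sketch of building an annotation request against the mount point above; the `/rest/annotate` endpoint and its `text`/`confidence` parameters follow Spotlight's public REST API, but adjust if your deployment differs:

```python
from urllib.parse import urlencode

def annotate_url(text, confidence=0.5, base="http://localhost:2222/rest"):
    # Build a GET request URL for Spotlight's /rest/annotate endpoint.
    # `text` and `confidence` are standard parameters of the public REST API;
    # the default base URL matches the `java -jar` invocation above.
    query = urlencode({"text": text, "confidence": confidence})
    return f"{base}/annotate?{query}"

url = annotate_url("Berlin is the capital of Germany.")
```

Fetching that URL with any HTTP client (or curl) returns the annotated entities.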

##### Build from source

If you want to run the latest version of Spotlight (to be packaged as v0.8), do the following:

1. Clone this repository
2. Check out the development branch (`git checkout -b development origin/development`, where `origin` is the name of the official Spotlight remote)
3. Build the package: `cd dbpedia-spotlight && mvn clean package` (requires Java 7 and Maven)
4. Download an entity model (as per step 2 in the subsection above)
5. Uncompress the language model tarball (as per step 3 in the subsection above)
6. Run the Spotlight server (as per step 4 in the subsection above; the jar file is built in the `dist/target` folder)

Note: the current development branch (v0.8) works with the vanilla datasets provided in the steps above,
but it also works with datasets containing (i) weights for a log-linear model for disambiguation,
and (ii) serialized dense vector representations (word2vec) that are loaded and used in the disambiguation step.
The weights are a simple `ranklib-model.txt` file that should be included in the language model's folder (if it's not there already), with content as follows:

```
## Coordinate Ascent
## Restart = 5
## MaxIteration = 25
## StepBase = 0.05
## StepScale = 2.0
## Tolerance = 0.001
## Regularized = false
## Slack = 0.001
1:0.37391416006434364 2:0.07140601847073497 3:0.2616870643056067 4:0.07643781575763943 5:0.21655494140167517
```
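The `##` lines record RankLib's training hyperparameters; the final line maps feature indices to weights. A minimal sketch of how a linear model like this scores a disambiguation candidate, i.e. as a weighted sum of feature values (what each of the five features measures is not recorded in the file, so the inputs here are purely illustrative):

```python
def load_weights(lines):
    # Return {feature_index: weight} from ranklib-model.txt content.
    # Lines starting with "##" are RankLib's hyperparameter comments.
    for line in lines:
        line = line.strip()
        if not line or line.startswith("##"):
            continue
        return {int(idx): float(w)
                for idx, w in (pair.split(":") for pair in line.split())}
    return {}

def score(weights, features):
    # Log-linear ranking score: weighted sum of the candidate's feature values.
    return sum(w * features.get(i, 0.0) for i, w in weights.items())

weights = load_weights(["## Coordinate Ascent", "1:0.5 2:0.25 3:0.25"])
```

The candidate with the highest such score wins the disambiguation step.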

The serialized dense vectors should be placed under a `word2vec` folder inside the Spotlight language model's root.
Most of the work of generating these vectors is done by Idio's [wiki2vec](https://github.com/idio/wiki2vec), plus some tooling.
The use of these models is extremely experimental, so testing and bug reporting are very welcome.
A full wiki on how to generate these dense vector representations and obtain the LLM weights is in the works, but for a tentative guide see [this document](https://github.com/phdowling/gsoc-progress/wiki/Final-Summary).
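Under the hood, vector-based context similarity boils down to comparing an aggregated context vector against a candidate entity's vector, typically by cosine similarity. A pure-Python sketch of that core idea (the real implementation is the Scala `VectorContextSimilarity` class using breeze over the serialized word2vec vectors, so this is an illustration only):

```python
import math

def cosine(u, v):
    # Cosine similarity between two equal-length vectors.
    dot = sum(a * b for a, b in zip(u, v))
    nu = math.sqrt(sum(a * a for a in u))
    nv = math.sqrt(sum(b * b for b in v))
    return dot / (nu * nv) if nu and nv else 0.0

def context_vector(word_vectors):
    # Aggregate the context by averaging its word vectors.
    dim = len(word_vectors[0])
    return [sum(v[i] for v in word_vectors) / len(word_vectors)
            for i in range(dim)]
```

A candidate whose entity vector has higher cosine similarity to the averaged context vector is considered a better fit for the mention.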

#### Models and data

39 changes: 35 additions & 4 deletions bin/index_db.sh
@@ -10,12 +10,15 @@
# $4 Analyzer+Stemmer language prefix e.g. Dutch
# $5 Model target folder


# TODO: call ranklib to train LLM and generate output

export MAVEN_OPTS="-Xmx26G"

usage ()
{
echo "index_db.sh"
echo "usage: ./index_db.sh -o /data/spotlight/nl/opennlp wdir nl_NL /data/spotlight/nl/stopwords.nl.list Dutch /data/spotlight/nl/final_model"
echo "usage: ./index_db.sh -v -o /data/spotlight/nl/opennlp wdir nl_NL /data/spotlight/nl/stopwords.nl.list Dutch /data/spotlight/nl/final_model"
echo "Create a database-backed model of DBpedia Spotlight for a specified language."
echo " "
}
@@ -24,12 +27,18 @@ usage ()
opennlp="None"
eval="false"
blacklist="false"
data_only="false"
local_mode="false"
train_llm="false"

while getopts "eo:b:" opt; do
while getopts "leo:dvb:" opt; do
case $opt in
o) opennlp="$OPTARG";;
e) eval="true";;
b) blacklist="$OPTARG";;
d) data_only="true";;
l) local_mode="true";;
v) train_llm="true";;
esac
done

@@ -206,8 +215,30 @@ cd $BASE_WDIR/dbpedia-spotlight

mvn -pl index exec:java -Dexec.mainClass=org.dbpedia.spotlight.db.CreateSpotlightModel -Dexec.args="$2 $WDIR $TARGET_DIR $opennlp $STOPWORDS $4Stemmer"

if [ "$eval" == "true" ]; then
mvn -pl eval exec:java -Dexec.mainClass=org.dbpedia.spotlight.evaluation.EvaluateSpotlightModel -Dexec.args="$TARGET_DIR $WDIR/heldout.txt" > $TARGET_DIR/evaluation.txt
if [ "$data_only" == "true" ]; then
echo "$CREATE_MODEL" >> create_models.job.sh
else
eval "$CREATE_MODEL"

if [ "$train_llm" == "true" ]; then
echo "Training LLM Weights"
[Review comment · Member Author] Should this be a separate script? Could it run in isolation?

[Reply] There is the `train_llm.sh` script, but I figured I would keep this in here so that a model can be built by just calling one script. Should I simply remove this?

[Reply · Member Author] I think it would be better to keep it as a separate script. Plus, @jodaiber is doing a lot of changes to index_db.sh, so we might have to talk about the best way of doing this.

echo "Downloading ranklib..."
mkdir -p $BASE_WDIR/ranklib/
cd $BASE_WDIR/ranklib/
curl -L -o RankLib-2.1-patched.jar "http://downloads.sourceforge.net/project/lemur/lemur/RankLib-2.1/RankLib-2.1-patched.jar?r=http%3A%2F%2Fsourceforge.net%2Fprojects%2Flemur%2Ffiles%2Flemur%2FRankLib-2.1%2F&ts=1439317425&use_mirror=skylink"
[Review comment · Member Author] I was wondering if we could add RankLib as a maven dependency? There is a suggestion on how to do it here: http://sourceforge.net/p/lemur/discussion/ranklib/thread/a45e2a7c/?limit=25.

[Reply · Member] Yeah, might be nice to do that! On the other hand it's only a training-time dependency, right?

[Reply · Member Author] True, but I find it a bit ugly downloading the jar at training time.

[Reply] We do the same type of thing for pignlproc and other training dependencies, I think I kind of imitated that.


cd $BASE_DIR
echo "Generating features and writing ranklib train data..."
MAVEN_OPTS='-Xmx15G' mvn -pl index exec:java -Dexec.mainClass=org.dbpedia.spotlight.db.CreateLLMTrainData -Dexec.args="$2 $WDIR $TARGET_DIR";

echo "Training LLM weights using ranklib..."
java -jar $BASE_WDIR/ranklib/RankLib-2.1-patched.jar -ranker 4 -train $TARGET_DIR/ranklib-training-data.txt -save $TARGET_DIR/ranklib-model.txt -metric2t ERR@1
fi

if [ "$eval" == "true" ]; then
mvn -pl eval exec:java -Dexec.mainClass=org.dbpedia.spotlight.evaluation.EvaluateSpotlightModel -Dexec.args="$TARGET_DIR $WDIR/heldout.txt" > $TARGET_DIR/evaluation.txt
fi

fi

curl https://raw.githubusercontent.com/dbpedia-spotlight/model-quickstarter/master/model_readme.txt > $TARGET_DIR/README.txt
136 changes: 136 additions & 0 deletions bin/train_llm.sh
@@ -0,0 +1,136 @@
#!/bin/bash
[Review comment · Member Author] It feels that this has a lot of similarities with the other scripts. It might be a better idea to split those into scripts that download the data and scripts that can actually be used to train the LLM.

[Reply] Let's discuss on slack what this should look like.

#+------------------------------------------------------------------------------------------------------------------------------+
#| DBpedia Spotlight - Create database-backed model |
#| @author Joachim Daiber |
#| @author Philipp Dowling |
#+------------------------------------------------------------------------------------------------------------------------------+

# $1 Working directory
# $2 Locale (en_US)
# $3 Stopwords file
# $4 Analyzer+Stemmer language prefix e.g. Dutch
# $5 Model target folder

# TODO test run, fix usage string, integrate into index_db.sh

export MAVEN_OPTS="-Xmx26G"

usage ()
{
echo "train_llm.sh"
echo "usage: ./train_llm.sh wdir en_US /data/spotlight/stopwords.list English /data/spotlight/output_model_folder"
echo "Train weights for the log-linear model used by Spotlight's vector-based context similarity."
echo " "
}


opennlp="None"
eval="false"
data_only="false"
local_mode="false"


while getopts "ledo:" opt; do
case $opt in
e) eval="true";;
d) data_only="true";;
l) local_mode="true";;
esac
done


shift $((OPTIND - 1))

if [ $# != 5 ]
then
usage
exit
fi

BASE_DIR=$(pwd)

if [[ "$1" = /* ]]
then
BASE_WDIR="$1"
else
BASE_WDIR="$BASE_DIR/$1"
fi

if [[ "$5" = /* ]]
then
TARGET_DIR="$5"
else
TARGET_DIR="$BASE_DIR/$5"
fi

if [[ "$3" = /* ]]
then
STOPWORDS="$3"
else
STOPWORDS="$BASE_DIR/$3"
fi

WDIR="$BASE_WDIR/$2"

if [[ "$opennlp" == "None" ]]; then
echo "";
elif [[ "$opennlp" != /* ]]; then
opennlp="$BASE_DIR/$opennlp";
fi


LANGUAGE=`echo $2 | sed "s/_.*//g"`

echo "Language: $LANGUAGE"
echo "Working directory: $WDIR"

mkdir -p $WDIR

# Stop processing if one step fails
set -e

cd $BASE_DIR
#Set up pig:
if [ -d $BASE_WDIR/pig ]; then
echo "Updating PigNLProc..."
cd $BASE_WDIR/pig/pignlproc
git reset --hard HEAD
git pull
else
echo "Setting up PigNLProc..."
mkdir -p $BASE_WDIR/pig/
cd $BASE_WDIR/pig/
git clone --depth 1 https://github.com/dbpedia-spotlight/pignlproc.git
cd pignlproc
echo "Building PigNLProc..."
fi


echo "Generating train data."
mkdir -p $BASE_WDIR/wikipedia/
cd $BASE_WDIR/wikipedia/
echo "Downloading wikipedia dump..."
# curl -O "http://dumps.wikimedia.org/${LANGUAGE}wiki/latest/${LANGUAGE}wiki-latest-pages-articles.xml.bz2"

echo "Splitting off train set..."
# bzcat ${LANGUAGE}wiki-latest-pages-articles.xml.bz2 | python $BASE_WDIR/pig/pignlproc/utilities/split_train_test.py 12000 $WDIR/heldout.txt > /dev/null

echo "Downloading DBpedia redirects and disambiguations..."
cd $WDIR
if [ ! -f "redirects.nt" ]; then
curl -# http://downloads.dbpedia.org/current/$LANGUAGE/redirects_$LANGUAGE.nt.bz2 | bzcat > redirects.nt
curl -# http://downloads.dbpedia.org/current/$LANGUAGE/disambiguations_$LANGUAGE.nt.bz2 | bzcat > disambiguations.nt
fi

echo "Downloading ranklib..."
mkdir -p $BASE_WDIR/ranklib/
cd $BASE_WDIR/ranklib/
curl -L -o RankLib-2.1-patched.jar "http://downloads.sourceforge.net/project/lemur/lemur/RankLib-2.1/RankLib-2.1-patched.jar?r=http%3A%2F%2Fsourceforge.net%2Fprojects%2Flemur%2Ffiles%2Flemur%2FRankLib-2.1%2F&ts=1439317425&use_mirror=skylink"

cd $BASE_DIR
echo "Generating features and writing ranklib train data..."
MAVEN_OPTS='-Xmx15G' mvn -pl index exec:java -Dexec.mainClass=org.dbpedia.spotlight.db.CreateLLMTrainData -Dexec.args="$2 $WDIR $TARGET_DIR";

echo "Training model using ranklib..."
java -jar $BASE_WDIR/ranklib/RankLib-2.1-patched.jar -ranker 4 -train $TARGET_DIR/ranklib-training-data.txt -save $TARGET_DIR/ranklib-model.txt -metric2t ERR@1

3 changes: 2 additions & 1 deletion core/pom.xml
@@ -25,7 +25,7 @@
<parent>
<groupId>org.dbpedia.spotlight</groupId>
<artifactId>spotlight</artifactId>
<version>0.7</version>
<version>0.8</version>
<relativePath>../pom.xml</relativePath>
</parent>

@@ -242,6 +242,7 @@
<version>0.10</version>
</dependency>


<dependency>
<groupId>com.typesafe.akka</groupId>
<artifactId>akka-actor_2.10</artifactId>