-
Notifications
You must be signed in to change notification settings - Fork 201
Adds log-linear weighting of features for disambiguation #390
base: development
Are you sure you want to change the base?
Changes from all commits
3f62f2f
839e41e
99095bb
c44546b
d063627
b16b760
4a7ac33
b8a5030
0f9894a
ce93716
d537800
ad2491e
58b99de
a95776a
841584f
292ee27
e5ea9d4
51c792f
ef353aa
f89a80f
3f2ad3b
0444238
9a1db6f
cbd8250
5fbd072
2c194e4
6114561
00b6169
9cdb668
20ac1cc
2ab209d
2c0a2b0
4db1012
4018540
5cfcf7f
9e1c13a
f2b0718
727c322
e22733d
8162a2d
3919e7c
1a07e8d
ddeb717
069495d
3941b8e
bef2a03
f7ef127
5430db4
d4de688
2523b7a
21d6171
fabf86d
86a95fe
0d09881
6b672cd
2b9205b
44fdfb7
f1e5318
f649c4d
beff993
b80397b
c807e63
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,12 +10,15 @@ | |
# $4 Analyzer+Stemmer language prefix e.g. Dutch | ||
# $5 Model target folder | ||
|
||
|
||
# TODO: call ranklib to train LLM and generate output | ||
|
||
export MAVEN_OPTS="-Xmx26G" | ||
|
||
# Print the help text for index_db.sh to stdout: the script name, two example
# invocations (without and with the -v flag) and a one-line description.
# Takes no arguments.
usage() {
  echo "index_db.sh"
  echo "usage: ./index_db.sh -o /data/spotlight/nl/opennlp wdir nl_NL /data/spotlight/nl/stopwords.nl.list Dutch /data/spotlight/nl/final_model"
  echo "usage: ./index_db.sh -v -o /data/spotlight/nl/opennlp wdir nl_NL /data/spotlight/nl/stopwords.nl.list Dutch /data/spotlight/nl/final_model"
  echo "Create a database-backed model of DBpedia Spotlight for a specified language."
  echo " "
}
|
@@ -24,12 +27,18 @@ usage () | |
# Option defaults; each can be overridden by the matching flag parsed below.
opennlp="None"       # -o <path>
eval="false"         # -e
blacklist="false"    # -b <arg>
data_only="false"    # -d
local_mode="false"   # -l
train_llm="false"    # -v

# -o and -b consume an argument, so each must be followed by ':' in the
# optstring; without it $OPTARG is empty and the value is left behind as a
# positional argument.
while getopts "leo:dvb:" opt; do
  case "$opt" in
    o) opennlp="$OPTARG" ;;
    e) eval="true" ;;
    b) blacklist="$OPTARG" ;;
    d) data_only="true" ;;
    l) local_mode="true" ;;
    v) train_llm="true" ;;
  esac
done
|
||
|
@@ -206,8 +215,30 @@ cd $BASE_WDIR/dbpedia-spotlight | |
|
||
mvn -pl index exec:java -Dexec.mainClass=org.dbpedia.spotlight.db.CreateSpotlightModel -Dexec.args="$2 $WDIR $TARGET_DIR $opennlp $STOPWORDS $4Stemmer" | ||
|
||
if [ "$eval" == "true" ]; then | ||
mvn -pl eval exec:java -Dexec.mainClass=org.dbpedia.spotlight.evaluation.EvaluateSpotlightModel -Dexec.args="$TARGET_DIR $WDIR/heldout.txt" > $TARGET_DIR/evaluation.txt | ||
if [ "$data_only" == "true" ]; then | ||
echo "$CREATE_MODEL" >> create_models.job.sh | ||
else | ||
eval "$CREATE_MODEL" | ||
|
||
if ["$train_llm" == "true"]; then | ||
echo "Training LLM Weights" | ||
echo "Downloading ranklib..." | ||
mkdir -p $BASE_WDIR/ranklib/ | ||
cd $BASE_WDIR/ranklib/ | ||
curl -L -o RankLib-2.1-patched.jar http://downloads.sourceforge.net/project/lemur/lemur/RankLib-2.1/RankLib-2.1-patched.jar?r=http%3A%2F%2Fsourceforge.net%2Fprojects%2Flemur%2Ffiles%2Flemur%2FRankLib-2.1%2F&ts=1439317425&use_mirror=skylink | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I was wondering if we could add RankLib as a Maven dependency? There is a suggestion on how to do it here: http://sourceforge.net/p/lemur/discussion/ranklib/thread/a45e2a7c/?limit=25. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yeah, might be nice to do that! On the other hand, it's only a training-time dependency, right? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. True, but I find it a bit ugly downloading the jar at training time. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We do the same type of thing for pignlproc and other training dependencies; I think I kind of imitated that. |
||
|
||
cd $BASE_DIR | ||
echo "Generating features and writing ranklib train data..." | ||
MAVEN_OPTS='-Xmx15G' mvn -pl index exec:java -Dexec.mainClass=org.dbpedia.spotlight.db.CreateLLMTrainData -Dexec.args="$2 $WDIR $TARGET_DIR"; | ||
|
||
echo "Training LLM weights using ranklib..." | ||
java -jar $BASE_WDIR/ranklib/RankLib-2.1-patched.jar -ranker 4 -train $TARGET_DIR/ranklib-training-data.txt -save $TARGET_DIR/ranklib-model.txt -metric2t ERR@1 | ||
fi | ||
|
||
if [ "$eval" == "true" ]; then | ||
mvn -pl eval exec:java -Dexec.mainClass=org.dbpedia.spotlight.evaluation.EvaluateSpotlightModel -Dexec.args="$TARGET_DIR $WDIR/heldout.txt" > $TARGET_DIR/evaluation.txt | ||
fi | ||
|
||
fi | ||
|
||
curl https://raw.githubusercontent.com/dbpedia-spotlight/model-quickstarter/master/model_readme.txt > $TARGET_DIR/README.txt | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,136 @@ | ||
#!/bin/bash | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It feels that this has a lot of similarities with the other scripts. It might be a better idea to split those into scripts that download the data and scripts that can actually be used to train the LLM. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Let's discuss on Slack what this should look like. |
||
#+------------------------------------------------------------------------------------------------------------------------------+ | ||
#| DBpedia Spotlight - Create database-backed model | | ||
#| @author Joachim Daiber | | ||
#| @author Philipp Dowling | | ||
#+------------------------------------------------------------------------------------------------------------------------------+ | ||
|
||
# $1 Working directory | ||
# $2 Locale (en_US) | ||
# $3 Stopwords file | ||
# $4 Analyzer+Stemmer language prefix e.g. Dutch | ||
# $5 Model target folder | ||
|
||
# TODO: test run, fix usage string, integrate into index_db.sh
# (Shell comments start with '#'; a line starting with '//' is executed as a
# command and fails with "//: Is a directory".)

# Default JVM options for the Maven invocations in this script.
export MAVEN_OPTS="-Xmx26G"
|
||
# Print the help text for train_llm.sh to stdout: the script name, an example
# invocation with the five positional arguments (working directory, locale,
# stopwords file, analyzer/stemmer language, model target folder) and a
# one-line description. Takes no arguments.
usage() {
  # Previously printed "index_db.sh", the name of the script this was copied from.
  echo "train_llm.sh"
  echo "usage: ./train_llm.sh wdir en_US /data/spotlight/stopwords.list English /data/spotlight/output_model_folder"
  echo "Train weights for the log-linear model used by Spotlight's vector-based context similarity."
  echo " "
}
|
||
|
||
# Option defaults; each can be overridden by the matching flag parsed below.
opennlp="None"       # -o <path>
eval="false"         # -e
data_only="false"    # -d
local_mode="false"   # -l

# The optstring accepts "-o <arg>", so the value has to be stored; without an
# 'o)' arm the option was silently discarded and $opennlp always stayed "None".
while getopts "ledo:" opt; do
  case "$opt" in
    o) opennlp="$OPTARG" ;;
    e) eval="true" ;;
    d) data_only="true" ;;
    l) local_mode="true" ;;
  esac
done
|
||
|
||
# Drop the options consumed by getopts so that $1..$5 are the positional
# arguments.
shift $((OPTIND - 1))

# Exactly five positional arguments are required. Print the help text and
# exit with a non-zero status otherwise, so that callers can detect the
# usage error (a bare 'exit' would report success).
if [ "$#" -ne 5 ]; then
  usage
  exit 1
fi
|
||
# Print path $2 unchanged when it is already absolute; otherwise print it
# prefixed with the base directory $1.
_abs_path() {
  if [[ "$2" = /* ]]; then
    printf '%s\n' "$2"
  else
    printf '%s\n' "$1/$2"
  fi
}

# Directory the script was started from; relative arguments are resolved
# against it.
BASE_DIR=$(pwd)

BASE_WDIR=$(_abs_path "$BASE_DIR" "$1")    # $1: working directory
TARGET_DIR=$(_abs_path "$BASE_DIR" "$5")   # $5: model target folder
STOPWORDS=$(_abs_path "$BASE_DIR" "$3")    # $3: stopwords file

# Per-locale working directory ($2 is the locale, e.g. en_US).
WDIR="$BASE_WDIR/$2"

# "None" is the default meaning no path was given; anything else is resolved
# like the other paths.
if [[ "$opennlp" == "None" ]]; then
  echo ""
else
  opennlp=$(_abs_path "$BASE_DIR" "$opennlp")
fi
|
||
|
||
# Language code: the locale ($2) with everything from the first '_' onwards
# removed, e.g. en_US -> en.
LANGUAGE=${2%%_*}

echo "Language: $LANGUAGE"
echo "Working directory: $WDIR"

# Quoted so that a working directory containing spaces is created as one path.
mkdir -p "$WDIR"

# Stop processing if one step fails
set -e
|
||
cd "$BASE_DIR"
# Set up pig: refresh an existing PigNLProc checkout under the working
# directory, or clone a fresh one. Paths are quoted because the working
# directory is user-supplied and may contain spaces.
if [ -d "$BASE_WDIR/pig" ]; then
  echo "Updating PigNLProc..."
  cd "$BASE_WDIR/pig/pignlproc"
  # Discard local modifications so that the pull cannot conflict.
  git reset --hard HEAD
  git pull
else
  echo "Setting up PigNLProc..."
  mkdir -p "$BASE_WDIR/pig/"
  cd "$BASE_WDIR/pig/"
  git clone --depth 1 https://github.com/dbpedia-spotlight/pignlproc.git
  cd pignlproc
  # NOTE(review): the message below announces a build, but no build command
  # follows it - confirm whether a build step is missing here.
  echo "Building PigNLProc..."
fi
|
||
|
||
echo "Generating train data."
mkdir -p "$BASE_WDIR/wikipedia/"
cd "$BASE_WDIR/wikipedia/"
echo "Downloading wikipedia dump..."
# curl -O "http://dumps.wikimedia.org/${LANGUAGE}wiki/latest/${LANGUAGE}wiki-latest-pages-articles.xml.bz2"

echo "Splitting off train set..."
# bzcat ${LANGUAGE}wiki-latest-pages-articles.xml.bz2 | python $BASE_WDIR/pig/pignlproc/utilities/split_train_test.py 12000 $WDIR/heldout.txt > /dev/null

echo "Downloading DBpedia redirects and disambiguations..."
cd "$WDIR"
# Both files are fetched only when redirects.nt is not present yet, so a
# re-run in the same working directory skips the download.
if [ ! -f "redirects.nt" ]; then
  curl -# "http://downloads.dbpedia.org/current/${LANGUAGE}/redirects_${LANGUAGE}.nt.bz2" | bzcat > redirects.nt
  curl -# "http://downloads.dbpedia.org/current/${LANGUAGE}/disambiguations_${LANGUAGE}.nt.bz2" | bzcat > disambiguations.nt
fi
|
||
echo "Downloading ranklib..."
mkdir -p "$BASE_WDIR/ranklib/"
cd "$BASE_WDIR/ranklib/"
# The URL must be quoted: unquoted, the first '&' ends the command and runs
# curl in the background with a truncated query string, so the training step
# below could start before the jar has been downloaded.
curl -L -o RankLib-2.1-patched.jar "http://downloads.sourceforge.net/project/lemur/lemur/RankLib-2.1/RankLib-2.1-patched.jar?r=http%3A%2F%2Fsourceforge.net%2Fprojects%2Flemur%2Ffiles%2Flemur%2FRankLib-2.1%2F&ts=1439317425&use_mirror=skylink"

cd "$BASE_DIR"
echo "Generating features and writing ranklib train data..."
# MAVEN_OPTS is overridden for this single invocation only.
MAVEN_OPTS='-Xmx15G' mvn -pl index exec:java -Dexec.mainClass=org.dbpedia.spotlight.db.CreateLLMTrainData -Dexec.args="$2 $WDIR $TARGET_DIR"

echo "Training model using ranklib..."
# Reads ranklib-training-data.txt from the model target folder and saves the
# trained model next to it as ranklib-model.txt.
java -jar "$BASE_WDIR/ranklib/RankLib-2.1-patched.jar" -ranker 4 -train "$TARGET_DIR/ranklib-training-data.txt" -save "$TARGET_DIR/ranklib-model.txt" -metric2t ERR@1
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should this be a separate script? Could it run in isolation?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There is the 'train_llm.sh' script, but I figured I would keep this in here so that a model can be built by just calling one script. Should I simply remove this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it would be better to keep as a separate script. Plus, @jodaiber is doing a lot of changes to index_db.sh, so we might have to talk about the best way of doing this a lot.