Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Init repository with the Java code, the Python code, and the bash scripts "prepare_data", "train", "evaluate" and "decode"
- Loading branch information
Loïc Vial
committed
Nov 6, 2018
1 parent
7387b87
commit 9167d7a
Showing
54 changed files
with
4,552 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
#!/bin/bash | ||
"$(dirname "$0")"/java/launch.sh NeuralWSDDecode --python_path "$(dirname "$0")"/python "$@" | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
#!/bin/bash | ||
"$(dirname "$0")"/java/launch.sh NeuralWSDTest --python_path "$(dirname "$0")"/python "$@" | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
/bin/ | ||
/target/ | ||
.classpath | ||
.project | ||
.settings | ||
.idea | ||
*.iml | ||
.m2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
#!/bin/sh | ||
cd "$(dirname "$0")" | ||
mvn compile |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
#!/bin/bash | ||
mvn exec:java -f "$(dirname "$0")"/pom.xml -e --quiet -Dexec.mainClass="$1" -Dexec.args="${*:2}" | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | ||
|
||
<modelVersion>4.0.0</modelVersion> | ||
<groupId>getalp</groupId> | ||
<artifactId>disambiguate</artifactId> | ||
<version>1.0-SNAPSHOT</version> | ||
|
||
<properties> | ||
<maven.compiler.source>1.8</maven.compiler.source> | ||
<maven.compiler.target>1.8</maven.compiler.target> | ||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> | ||
</properties> | ||
|
||
<dependencies> | ||
<dependency> | ||
<groupId>getalp</groupId> | ||
<artifactId>ufsac</artifactId> | ||
<version>1.0-SNAPSHOT</version> | ||
</dependency> | ||
<dependency> | ||
<groupId>com.google.guava</groupId> | ||
<artifactId>guava</artifactId> | ||
<version>19.0</version> | ||
</dependency> | ||
<dependency> | ||
<groupId>com.google.code.gson</groupId> | ||
<artifactId>gson</artifactId> | ||
<version>2.8.0</version> | ||
</dependency> | ||
<dependency> | ||
<groupId>org.apache.lucene</groupId> | ||
<artifactId>lucene-snowball</artifactId> | ||
<version>3.0.3</version> | ||
</dependency> | ||
<dependency> | ||
<groupId>org.apache.commons</groupId> | ||
<artifactId>commons-math3</artifactId> | ||
<version>3.6.1</version> | ||
</dependency> | ||
<dependency> | ||
<groupId>org.apache.commons</groupId> | ||
<artifactId>commons-lang3</artifactId> | ||
<version>3.5</version> | ||
</dependency> | ||
<dependency> | ||
<groupId>commons-cli</groupId> | ||
<artifactId>commons-cli</artifactId> | ||
<version>1.4</version> | ||
</dependency> | ||
<dependency> | ||
<groupId>com.panayotis.javaplot</groupId> | ||
<artifactId>javaplot</artifactId> | ||
<version>0.5.0</version> | ||
</dependency> | ||
<dependency> | ||
<groupId>nz.ac.waikato.cms.weka</groupId> | ||
<artifactId>weka-dev</artifactId> | ||
<version>3.9.1</version> | ||
</dependency> | ||
<dependency> | ||
<groupId>nz.ac.waikato.cms.weka</groupId> | ||
<artifactId>LibLINEAR</artifactId> | ||
<version>1.9.7</version> | ||
</dependency> | ||
</dependencies> | ||
|
||
<repositories> | ||
<repository> | ||
<id>central</id> | ||
<url>http://central.maven.org/maven2/</url> | ||
</repository> | ||
</repositories> | ||
</project> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
import getalp.wsd.common.wordnet.WordnetHelper; | ||
import getalp.wsd.method.neural.NeuralDisambiguator; | ||
import getalp.wsd.ufsac.core.Sentence; | ||
import getalp.wsd.ufsac.core.Word; | ||
import getalp.wsd.ufsac.utils.CorpusPOSTaggerAndLemmatizer; | ||
import getalp.wsd.utils.ArgumentParser; | ||
import getalp.wsd.utils.WordnetUtils; | ||
import java.io.BufferedReader; | ||
import java.io.BufferedWriter; | ||
import java.io.InputStreamReader; | ||
import java.io.OutputStreamWriter; | ||
import java.util.List; | ||
|
||
public class NeuralWSDDecode | ||
{ | ||
public static void main(String[] args) throws Exception | ||
{ | ||
ArgumentParser parser = new ArgumentParser(); | ||
parser.addArgument("python_path"); | ||
parser.addArgument("data_path"); | ||
parser.addArgumentList("weights"); | ||
parser.addArgument("lowercase", "true"); | ||
parser.addArgument("sense_reduction", "true"); | ||
if (!parser.parse(args)) return; | ||
|
||
String pythonPath = parser.getArgValue("python_path"); | ||
String dataPath = parser.getArgValue("data_path"); | ||
List<String> weights = parser.getArgValueList("weights"); | ||
boolean lowercase = parser.getArgValueBoolean("lowercase"); | ||
boolean senseReduction = parser.getArgValueBoolean("sense_reduction"); | ||
|
||
CorpusPOSTaggerAndLemmatizer tagger = new CorpusPOSTaggerAndLemmatizer(); | ||
NeuralDisambiguator disambiguator = new NeuralDisambiguator(pythonPath, dataPath, weights); | ||
disambiguator.lowercaseWords = lowercase; | ||
if (senseReduction) disambiguator.reducedOutputVocabulary = WordnetUtils.getReducedSynsetKeysWithHypernyms3(WordnetHelper.wn30()); | ||
else disambiguator.reducedOutputVocabulary = null; | ||
|
||
BufferedReader reader = new BufferedReader(new InputStreamReader(System.in)); | ||
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(System.out)); | ||
for (String line = reader.readLine() ; line != null ; line = reader.readLine()) | ||
{ | ||
Sentence sentence = new Sentence(line); | ||
tagger.tag(sentence.getWords()); | ||
disambiguator.disambiguate(sentence, "wsd"); | ||
for (Word word : sentence.getWords()) | ||
{ | ||
writer.write(word.getValue().replace("|", "")); | ||
if (word.hasAnnotation("lemma") && word.hasAnnotation("pos") && word.hasAnnotation("wsd")) | ||
{ | ||
writer.write("|" + word.getAnnotationValue("wsd")); | ||
} | ||
writer.write(" "); | ||
} | ||
writer.newLine(); | ||
writer.flush(); | ||
} | ||
writer.close(); | ||
reader.close(); | ||
disambiguator.close(); | ||
} | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
import getalp.wsd.common.wordnet.WordnetHelper; | ||
import getalp.wsd.method.neural.NeuralDisambiguator; | ||
import getalp.wsd.ufsac.core.Sentence; | ||
import getalp.wsd.ufsac.core.Word; | ||
import getalp.wsd.ufsac.streaming.modifier.StreamingCorpusModifierSentence; | ||
import getalp.wsd.ufsac.streaming.reader.StreamingCorpusReaderSentence; | ||
import getalp.wsd.ufsac.streaming.writer.StreamingCorpusWriterSentence; | ||
import getalp.wsd.ufsac.utils.CorpusPOSTaggerAndLemmatizer; | ||
import getalp.wsd.utils.ArgumentParser; | ||
import getalp.wsd.utils.WordnetUtils; | ||
|
||
import java.io.BufferedReader; | ||
import java.io.BufferedWriter; | ||
import java.io.InputStreamReader; | ||
import java.io.OutputStreamWriter; | ||
import java.util.List; | ||
|
||
public class NeuralWSDDecodeUFSAC | ||
{ | ||
public static void main(String[] args) throws Exception | ||
{ | ||
ArgumentParser parser = new ArgumentParser(); | ||
parser.addArgument("python_path"); | ||
parser.addArgument("data_path"); | ||
parser.addArgumentList("weights"); | ||
parser.addArgument("input"); | ||
parser.addArgument("output"); | ||
parser.addArgument("lowercase", "true"); | ||
parser.addArgument("sense_reduction", "true"); | ||
if (!parser.parse(args)) return; | ||
|
||
String pythonPath = parser.getArgValue("python_path"); | ||
String dataPath = parser.getArgValue("data_path"); | ||
List<String> weights = parser.getArgValueList("weights"); | ||
String inputPath = parser.getArgValue("input"); | ||
String outputPath = parser.getArgValue("output"); | ||
boolean lowercase = parser.getArgValueBoolean("lowercase"); | ||
boolean senseReduction = parser.getArgValueBoolean("sense_reduction"); | ||
|
||
CorpusPOSTaggerAndLemmatizer tagger = new CorpusPOSTaggerAndLemmatizer(); | ||
NeuralDisambiguator disambiguator = new NeuralDisambiguator(pythonPath, dataPath, weights); | ||
disambiguator.lowercaseWords = lowercase; | ||
if (senseReduction) disambiguator.reducedOutputVocabulary = WordnetUtils.getReducedSynsetKeysWithHypernyms3(WordnetHelper.wn30()); | ||
else disambiguator.reducedOutputVocabulary = null; | ||
|
||
StreamingCorpusModifierSentence modifier = new StreamingCorpusModifierSentence() | ||
{ | ||
public void modifySentence(Sentence sentence) | ||
{ | ||
tagger.tag(sentence.getWords()); | ||
disambiguator.disambiguate(sentence, "wsd"); | ||
} | ||
}; | ||
|
||
modifier.load(inputPath, outputPath); | ||
disambiguator.close(); | ||
} | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
import java.util.Collections; | ||
import java.util.List; | ||
|
||
import getalp.wsd.common.wordnet.WordnetHelper; | ||
import getalp.wsd.method.neural.NeuralDataPreparator; | ||
import getalp.wsd.utils.ArgumentParser; | ||
import getalp.wsd.utils.WordnetUtils; | ||
|
||
public class NeuralWSDPrepare | ||
{ | ||
public static void main(String[] args) throws Exception | ||
{ | ||
ArgumentParser parser = new ArgumentParser(); | ||
parser.addArgument("data_path"); | ||
parser.addArgumentList("train"); | ||
parser.addArgumentList("dev"); | ||
parser.addArgumentList("input_features", Collections.singletonList("surface_form")); | ||
parser.addArgumentList("output_features", Collections.singletonList("wn30_key")); | ||
parser.addArgument("lowercase", "true"); | ||
parser.addArgument("uniform_dash", "false"); | ||
parser.addArgument("dev_from_train", "0"); | ||
parser.addArgument("sense_reduction", "true"); | ||
parser.addArgument("add_monosemics", "false"); | ||
parser.addArgument("remove_monosemics", "false"); | ||
parser.addArgument("remove_duplicates", "true"); | ||
if (!parser.parse(args, true)) return; | ||
|
||
String dataPath = parser.getArgValue("data_path"); | ||
List<String> trainingCorpusPaths = parser.getArgValueList("train"); | ||
List<String> devCorpusPaths = parser.getArgValueList("dev"); | ||
List<String> inputFeatures = parser.getArgValueList("input_features"); | ||
List<String> outputFeatures = parser.getArgValueList("output_features"); | ||
boolean lowercase = parser.getArgValueBoolean("lowercase"); | ||
boolean uniformDash = parser.getArgValueBoolean("uniform_dash"); | ||
int devFromTrain = parser.getArgValueInteger("dev_from_train"); | ||
boolean senseReduction = parser.getArgValueBoolean("sense_reduction"); | ||
boolean addMonosemics = parser.getArgValueBoolean("add_monosemics"); | ||
boolean removeMonosemics = parser.getArgValueBoolean("remove_monosemics"); | ||
boolean removeDuplicateSentences = parser.getArgValueBoolean("remove_duplicates"); | ||
|
||
NeuralDataPreparator preparator = new NeuralDataPreparator(); | ||
|
||
preparator.setOutputDirectoryPath(dataPath); | ||
|
||
for (String corpusPath : trainingCorpusPaths) | ||
{ | ||
preparator.addTrainingCorpus(corpusPath); | ||
} | ||
|
||
for (String corpusPath : devCorpusPaths) | ||
{ | ||
preparator.addDevelopmentCorpus(corpusPath); | ||
} | ||
|
||
for (int i = 0 ; i < inputFeatures.size() ; i += 2) | ||
{ | ||
String inputFeatureAnnotationName = inputFeatures.get(i); | ||
String inputFeatureEmbeddings = "null"; | ||
if (i + 1 < inputFeatures.size()) | ||
{ | ||
inputFeatureEmbeddings = inputFeatures.get(i + 1); | ||
} | ||
if (inputFeatureEmbeddings.equals("null")) | ||
{ | ||
preparator.addInputFeature(inputFeatureAnnotationName); | ||
} | ||
else | ||
{ | ||
preparator.addInputFeature(inputFeatureAnnotationName, inputFeatureEmbeddings); | ||
} | ||
} | ||
|
||
for (int i = 0 ; i < outputFeatures.size() ; i += 2) | ||
{ | ||
String outputFeatureAnnotationName = outputFeatures.get(i); | ||
String outputFeatureVocabulary = "null"; | ||
if (i + 1 < outputFeatures.size()) | ||
{ | ||
outputFeatureVocabulary = outputFeatures.get(i + 1); | ||
} | ||
if (outputFeatureVocabulary.equals("null")) | ||
{ | ||
preparator.addOutputFeature(outputFeatureAnnotationName); | ||
} | ||
else | ||
{ | ||
preparator.addOutputFeature(outputFeatureAnnotationName, outputFeatureVocabulary); | ||
} | ||
} | ||
|
||
preparator.maxLineLength = 80; | ||
preparator.lowercaseWords = lowercase; | ||
preparator.uniformDash = uniformDash; | ||
preparator.multisenses = false; | ||
preparator.removeAllCoarseGrained = true; | ||
preparator.addMonosemics = addMonosemics; | ||
preparator.removeMonosemics = removeMonosemics; | ||
if (senseReduction) preparator.reducedOutputVocabulary = WordnetUtils.getReducedSynsetKeysWithHypernyms3(WordnetHelper.wn30()); | ||
else preparator.reducedOutputVocabulary = null; | ||
preparator.additionalDevFromTrainSize = devFromTrain; | ||
preparator.removeDuplicateSentences = removeDuplicateSentences; | ||
|
||
preparator.prepareTrainingFile(); | ||
} | ||
} | ||
|
Oops, something went wrong.