Skip to content

Commit

Permalink
Init repository with the Java code, the Python code, and the bash scripts "prepare_data", "train", "evaluate" and "decode"
Browse files Browse the repository at this point in the history
  • Loading branch information
Loïc Vial committed Nov 6, 2018
1 parent 7387b87 commit 9167d7a
Show file tree
Hide file tree
Showing 54 changed files with 4,552 additions and 0 deletions.
3 changes: 3 additions & 0 deletions decode.sh
@@ -0,0 +1,3 @@
#!/bin/bash
# Runs the NeuralWSDDecode Java main class through java/launch.sh.
# --python_path is set to the "python" directory next to this script, and
# every argument given to this script is forwarded unchanged.
script_dir="$(dirname "$0")"
"$script_dir"/java/launch.sh NeuralWSDDecode --python_path "$script_dir"/python "$@"

3 changes: 3 additions & 0 deletions evaluate.sh
@@ -0,0 +1,3 @@
#!/bin/bash
# Runs the NeuralWSDTest Java main class through java/launch.sh.
# --python_path is set to the "python" directory next to this script, and
# every argument given to this script is forwarded unchanged.
script_dir="$(dirname "$0")"
"$script_dir"/java/launch.sh NeuralWSDTest --python_path "$script_dir"/python "$@"

8 changes: 8 additions & 0 deletions java/.gitignore
@@ -0,0 +1,8 @@
# Directories ignored only at the root of this folder (leading slash);
# presumably compiler/Maven build output.
/bin/
/target/
# Presumably Eclipse project metadata.
.classpath
.project
.settings
# Presumably IntelliJ IDEA project metadata.
.idea
*.iml
# Presumably a local Maven repository/cache directory.
.m2
3 changes: 3 additions & 0 deletions java/compile.sh
@@ -0,0 +1,3 @@
#!/bin/sh
# Compiles the Java module with Maven from the directory that contains this script.
# Abort if the directory change fails, so that "mvn compile" never runs against
# whatever working directory the caller happened to be in.
cd "$(dirname "$0")" || exit 1
mvn compile
3 changes: 3 additions & 0 deletions java/launch.sh
@@ -0,0 +1,3 @@
#!/bin/bash
# Launches a Java main class with the Maven exec plugin, using the pom.xml
# located next to this script.
#   $1      name of the main class to run
#   $2...   arguments handed to that class
# The remaining arguments are joined into one string before being passed as
# exec.args.
# NOTE(review): an argument containing whitespace is flattened into that single
# string; confirm how exec:java splits it again before relying on such arguments.
pom_file="$(dirname "$0")"/pom.xml
main_class="$1"
program_args="${*:2}"
mvn exec:java -f "$pom_file" -e --quiet -Dexec.mainClass="$main_class" -Dexec.args="$program_args"

75 changes: 75 additions & 0 deletions java/pom.xml
@@ -0,0 +1,75 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the "disambiguate" Java module. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">

    <modelVersion>4.0.0</modelVersion>
    <groupId>getalp</groupId>
    <artifactId>disambiguate</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <!-- Compile for Java 8 and read the sources as UTF-8. -->
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <!-- NOTE(review): SNAPSHOT artifact from the same group; presumably it has to be
             installed in the local Maven repository beforehand. Confirm. -->
        <dependency>
            <groupId>getalp</groupId>
            <artifactId>ufsac</artifactId>
            <version>1.0-SNAPSHOT</version>
        </dependency>
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>19.0</version>
        </dependency>
        <dependency>
            <groupId>com.google.code.gson</groupId>
            <artifactId>gson</artifactId>
            <version>2.8.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-snowball</artifactId>
            <version>3.0.3</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-math3</artifactId>
            <version>3.6.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.5</version>
        </dependency>
        <dependency>
            <groupId>commons-cli</groupId>
            <artifactId>commons-cli</artifactId>
            <version>1.4</version>
        </dependency>
        <dependency>
            <groupId>com.panayotis.javaplot</groupId>
            <artifactId>javaplot</artifactId>
            <version>0.5.0</version>
        </dependency>
        <dependency>
            <groupId>nz.ac.waikato.cms.weka</groupId>
            <artifactId>weka-dev</artifactId>
            <version>3.9.1</version>
        </dependency>
        <dependency>
            <groupId>nz.ac.waikato.cms.weka</groupId>
            <artifactId>LibLINEAR</artifactId>
            <version>1.9.7</version>
        </dependency>
    </dependencies>

    <repositories>
        <!-- Maven Central must be reached over HTTPS: the former
             http://central.maven.org endpoint no longer serves artifacts and
             recent Maven releases refuse plain-HTTP repositories. -->
        <repository>
            <id>central</id>
            <url>https://repo.maven.apache.org/maven2/</url>
        </repository>
    </repositories>
</project>
62 changes: 62 additions & 0 deletions java/src/main/java/NeuralWSDDecode.java
@@ -0,0 +1,62 @@
import getalp.wsd.common.wordnet.WordnetHelper;
import getalp.wsd.method.neural.NeuralDisambiguator;
import getalp.wsd.ufsac.core.Sentence;
import getalp.wsd.ufsac.core.Word;
import getalp.wsd.ufsac.utils.CorpusPOSTaggerAndLemmatizer;
import getalp.wsd.utils.ArgumentParser;
import getalp.wsd.utils.WordnetUtils;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.List;

/**
 * Command-line entry point that annotates text read from standard input.
 *
 * <p>Each input line is wrapped in a {@code Sentence}, passed to the tagger and
 * then to the disambiguator, which is asked to write its result under the
 * {@code "wsd"} annotation. One output line is written per input line: every
 * word value (with any {@code '|'} removed) is printed, followed by
 * {@code "|" + wsd value} when the word carries the {@code lemma}, {@code pos}
 * and {@code wsd} annotations, and then by a single space.
 */
public class NeuralWSDDecode
{
    /**
     * Parses the command line, then processes standard input line by line.
     *
     * <p>Recognized arguments: {@code python_path}, {@code data_path},
     * {@code weights} (list), {@code lowercase} (default {@code "true"}) and
     * {@code sense_reduction} (default {@code "true"}).
     *
     * @param args command-line arguments handed to {@code ArgumentParser}
     * @throws Exception if tagging, disambiguation or I/O fails
     */
    public static void main(String[] args) throws Exception
    {
        ArgumentParser parser = new ArgumentParser();
        parser.addArgument("python_path");
        parser.addArgument("data_path");
        parser.addArgumentList("weights");
        parser.addArgument("lowercase", "true");
        parser.addArgument("sense_reduction", "true");
        if (!parser.parse(args)) return;

        String pythonPath = parser.getArgValue("python_path");
        String dataPath = parser.getArgValue("data_path");
        List<String> weights = parser.getArgValueList("weights");
        boolean lowercase = parser.getArgValueBoolean("lowercase");
        boolean senseReduction = parser.getArgValueBoolean("sense_reduction");

        CorpusPOSTaggerAndLemmatizer tagger = new CorpusPOSTaggerAndLemmatizer();
        NeuralDisambiguator disambiguator = new NeuralDisambiguator(pythonPath, dataPath, weights);
        try
        {
            disambiguator.lowercaseWords = lowercase;
            if (senseReduction)
            {
                disambiguator.reducedOutputVocabulary = WordnetUtils.getReducedSynsetKeysWithHypernyms3(WordnetHelper.wn30());
            }
            else
            {
                disambiguator.reducedOutputVocabulary = null;
            }

            // Explicit UTF-8 so the result does not depend on the platform default charset.
            // The writer is declared last so that it is closed (and flushed) before the reader.
            try (BufferedReader reader = new BufferedReader(
                         new InputStreamReader(System.in, java.nio.charset.StandardCharsets.UTF_8));
                 BufferedWriter writer = new BufferedWriter(
                         new OutputStreamWriter(System.out, java.nio.charset.StandardCharsets.UTF_8)))
            {
                for (String line = reader.readLine() ; line != null ; line = reader.readLine())
                {
                    Sentence sentence = new Sentence(line);
                    tagger.tag(sentence.getWords());
                    disambiguator.disambiguate(sentence, "wsd");
                    for (Word word : sentence.getWords())
                    {
                        // '|' separates a word from its annotation in the output, so strip it from the word.
                        writer.write(word.getValue().replace("|", ""));
                        if (word.hasAnnotation("lemma") && word.hasAnnotation("pos") && word.hasAnnotation("wsd"))
                        {
                            writer.write("|" + word.getAnnotationValue("wsd"));
                        }
                        writer.write(" ");
                    }
                    writer.newLine();
                    // Flush after every sentence so a consumer of the output sees results line by line.
                    writer.flush();
                }
            }
        }
        finally
        {
            // Always release the disambiguator, even when processing fails midway.
            disambiguator.close();
        }
    }
}

59 changes: 59 additions & 0 deletions java/src/main/java/NeuralWSDDecodeUFSAC.java
@@ -0,0 +1,59 @@
import getalp.wsd.common.wordnet.WordnetHelper;
import getalp.wsd.method.neural.NeuralDisambiguator;
import getalp.wsd.ufsac.core.Sentence;
import getalp.wsd.ufsac.core.Word;
import getalp.wsd.ufsac.streaming.modifier.StreamingCorpusModifierSentence;
import getalp.wsd.ufsac.streaming.reader.StreamingCorpusReaderSentence;
import getalp.wsd.ufsac.streaming.writer.StreamingCorpusWriterSentence;
import getalp.wsd.ufsac.utils.CorpusPOSTaggerAndLemmatizer;
import getalp.wsd.utils.ArgumentParser;
import getalp.wsd.utils.WordnetUtils;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.List;

/**
 * Command-line entry point that tags and disambiguates a corpus file.
 *
 * <p>A {@code StreamingCorpusModifierSentence} is loaded with the input and
 * output paths; for each sentence it is given, the tagger is applied and the
 * disambiguator is asked to write its result under the {@code "wsd"} annotation.
 */
public class NeuralWSDDecodeUFSAC
{
    /**
     * Parses the command line and runs the modifier from {@code input} to {@code output}.
     *
     * <p>Recognized arguments: {@code python_path}, {@code data_path},
     * {@code weights} (list), {@code input}, {@code output},
     * {@code lowercase} (default {@code "true"}) and
     * {@code sense_reduction} (default {@code "true"}).
     *
     * @param args command-line arguments handed to {@code ArgumentParser}
     * @throws Exception if tagging, disambiguation or corpus processing fails
     */
    public static void main(String[] args) throws Exception
    {
        ArgumentParser parser = new ArgumentParser();
        parser.addArgument("python_path");
        parser.addArgument("data_path");
        parser.addArgumentList("weights");
        parser.addArgument("input");
        parser.addArgument("output");
        parser.addArgument("lowercase", "true");
        parser.addArgument("sense_reduction", "true");
        if (!parser.parse(args)) return;

        String pythonPath = parser.getArgValue("python_path");
        String dataPath = parser.getArgValue("data_path");
        List<String> weights = parser.getArgValueList("weights");
        String inputPath = parser.getArgValue("input");
        String outputPath = parser.getArgValue("output");
        boolean lowercase = parser.getArgValueBoolean("lowercase");
        boolean senseReduction = parser.getArgValueBoolean("sense_reduction");

        CorpusPOSTaggerAndLemmatizer tagger = new CorpusPOSTaggerAndLemmatizer();
        NeuralDisambiguator disambiguator = new NeuralDisambiguator(pythonPath, dataPath, weights);
        try
        {
            disambiguator.lowercaseWords = lowercase;
            if (senseReduction)
            {
                disambiguator.reducedOutputVocabulary = WordnetUtils.getReducedSynsetKeysWithHypernyms3(WordnetHelper.wn30());
            }
            else
            {
                disambiguator.reducedOutputVocabulary = null;
            }

            StreamingCorpusModifierSentence modifier = new StreamingCorpusModifierSentence()
            {
                public void modifySentence(Sentence sentence)
                {
                    tagger.tag(sentence.getWords());
                    disambiguator.disambiguate(sentence, "wsd");
                }
            };

            modifier.load(inputPath, outputPath);
        }
        finally
        {
            // Always release the disambiguator, even when corpus processing fails midway.
            disambiguator.close();
        }
    }
}

106 changes: 106 additions & 0 deletions java/src/main/java/NeuralWSDPrepare.java
@@ -0,0 +1,106 @@
import java.util.Collections;
import java.util.List;

import getalp.wsd.common.wordnet.WordnetHelper;
import getalp.wsd.method.neural.NeuralDataPreparator;
import getalp.wsd.utils.ArgumentParser;
import getalp.wsd.utils.WordnetUtils;

/**
 * Command-line entry point that configures a {@code NeuralDataPreparator}
 * from the parsed arguments and then calls {@code prepareTrainingFile()}.
 */
public class NeuralWSDPrepare
{
    /**
     * Reads the options, registers corpora and features on the preparator,
     * copies the remaining settings onto it and runs the preparation.
     *
     * <p>{@code input_features} and {@code output_features} are read as
     * consecutive pairs: an annotation name followed by a companion value
     * (embeddings for inputs, vocabulary for outputs). When the companion is
     * missing or is the literal string {@code "null"}, the feature is
     * registered with its annotation name only.
     *
     * @param args command-line arguments handed to {@code ArgumentParser}
     * @throws Exception if the preparation fails
     */
    public static void main(String[] args) throws Exception
    {
        ArgumentParser cli = new ArgumentParser();
        cli.addArgument("data_path");
        cli.addArgumentList("train");
        cli.addArgumentList("dev");
        cli.addArgumentList("input_features", Collections.singletonList("surface_form"));
        cli.addArgumentList("output_features", Collections.singletonList("wn30_key"));
        cli.addArgument("lowercase", "true");
        cli.addArgument("uniform_dash", "false");
        cli.addArgument("dev_from_train", "0");
        cli.addArgument("sense_reduction", "true");
        cli.addArgument("add_monosemics", "false");
        cli.addArgument("remove_monosemics", "false");
        cli.addArgument("remove_duplicates", "true");
        if (!cli.parse(args, true))
        {
            return;
        }

        String outputDirectory = cli.getArgValue("data_path");
        List<String> trainPaths = cli.getArgValueList("train");
        List<String> devPaths = cli.getArgValueList("dev");
        List<String> inputFeatures = cli.getArgValueList("input_features");
        List<String> outputFeatures = cli.getArgValueList("output_features");
        boolean lowercase = cli.getArgValueBoolean("lowercase");
        boolean uniformDash = cli.getArgValueBoolean("uniform_dash");
        int devFromTrain = cli.getArgValueInteger("dev_from_train");
        boolean senseReduction = cli.getArgValueBoolean("sense_reduction");
        boolean addMonosemics = cli.getArgValueBoolean("add_monosemics");
        boolean removeMonosemics = cli.getArgValueBoolean("remove_monosemics");
        boolean removeDuplicateSentences = cli.getArgValueBoolean("remove_duplicates");

        NeuralDataPreparator preparator = new NeuralDataPreparator();
        preparator.setOutputDirectoryPath(outputDirectory);

        for (String trainPath : trainPaths)
        {
            preparator.addTrainingCorpus(trainPath);
        }
        for (String devPath : devPaths)
        {
            preparator.addDevelopmentCorpus(devPath);
        }

        // Input features: (annotation name, embeddings) pairs.
        int inputCount = inputFeatures.size();
        for (int nameIndex = 0; nameIndex < inputCount; nameIndex += 2)
        {
            int companionIndex = nameIndex + 1;
            String annotationName = inputFeatures.get(nameIndex);
            String embeddings = (companionIndex < inputCount) ? inputFeatures.get(companionIndex) : "null";
            if (!embeddings.equals("null"))
            {
                preparator.addInputFeature(annotationName, embeddings);
            }
            else
            {
                preparator.addInputFeature(annotationName);
            }
        }

        // Output features: (annotation name, vocabulary) pairs.
        int outputCount = outputFeatures.size();
        for (int nameIndex = 0; nameIndex < outputCount; nameIndex += 2)
        {
            int companionIndex = nameIndex + 1;
            String annotationName = outputFeatures.get(nameIndex);
            String vocabulary = (companionIndex < outputCount) ? outputFeatures.get(companionIndex) : "null";
            if (!vocabulary.equals("null"))
            {
                preparator.addOutputFeature(annotationName, vocabulary);
            }
            else
            {
                preparator.addOutputFeature(annotationName);
            }
        }

        // Fixed settings that are not exposed on the command line.
        preparator.maxLineLength = 80;
        preparator.lowercaseWords = lowercase;
        preparator.uniformDash = uniformDash;
        preparator.multisenses = false;
        preparator.removeAllCoarseGrained = true;
        preparator.addMonosemics = addMonosemics;
        preparator.removeMonosemics = removeMonosemics;
        preparator.reducedOutputVocabulary = senseReduction
                ? WordnetUtils.getReducedSynsetKeysWithHypernyms3(WordnetHelper.wn30())
                : null;
        preparator.additionalDevFromTrainSize = devFromTrain;
        preparator.removeDuplicateSentences = removeDuplicateSentences;

        preparator.prepareTrainingFile();
    }
}

0 comments on commit 9167d7a

Please sign in to comment.