Skip to content

Commit

Permalink
Removing resources from master pom. Refactored tests and Datasets obj…
Browse files Browse the repository at this point in the history
…ect.
  • Loading branch information
datumbox committed Jan 2, 2017
1 parent 2f43c85 commit 13333ce
Show file tree
Hide file tree
Showing 9 changed files with 117 additions and 90 deletions.
Expand Up @@ -51,7 +51,7 @@ public void testTrainAndValidate() {
logger.info("testTrainAndValidate");

Configuration configuration = Configuration.getConfiguration();

Dataframe[] data = Datasets.heartDiseaseClusters(configuration);

Dataframe trainingData = data[0];
Expand Down
Expand Up @@ -16,16 +16,10 @@
package com.datumbox.framework.applications.nlp;

import com.datumbox.framework.common.Configuration;
import com.datumbox.framework.core.common.Datasets;
import com.datumbox.framework.tests.abstracts.AbstractTest;
import org.junit.Test;

import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;

import static org.junit.Assert.assertEquals;

/**
Expand All @@ -43,20 +37,8 @@ public void testExtract() {
logger.info("extract");

Configuration configuration = Configuration.getConfiguration();

String text;
try {
List<String> lines = Files.readAllLines(Paths.get(this.getClass().getClassLoader().getResource("datasets/example.com.html").toURI()), StandardCharsets.UTF_8);
StringBuilder sb = new StringBuilder();
for(String line: lines){
sb.append(line);
sb.append("\r\n");
}
text = sb.toString().trim();
}
catch(IOException | URISyntaxException ex) {
throw new RuntimeException(ex);
}

String text = Datasets.exampleHtmlCode();

CETR.Parameters parameters = new CETR.Parameters();
parameters.setNumberOfClusters(2);
Expand Down
Expand Up @@ -16,6 +16,7 @@
package com.datumbox.framework.applications.nlp;

import com.datumbox.framework.common.Configuration;
import com.datumbox.framework.core.common.Datasets;
import com.datumbox.framework.core.common.dataobjects.Dataframe;
import com.datumbox.framework.core.common.dataobjects.Record;
import com.datumbox.framework.core.machinelearning.MLBuilder;
Expand All @@ -33,11 +34,8 @@
import com.datumbox.framework.tests.abstracts.AbstractTest;
import org.junit.Test;

import java.io.UncheckedIOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

Expand Down Expand Up @@ -283,16 +281,8 @@ private <ML extends AbstractClassifier, FS extends AbstractFeatureSelector, NS e


String storageName = this.getClass().getSimpleName() + testId;

Map<Object, URI> dataset = new HashMap<>();
try {
dataset.put("negative", this.getClass().getClassLoader().getResource("datasets/sentimentAnalysis.neg.txt").toURI());
dataset.put("positive", this.getClass().getClassLoader().getResource("datasets/sentimentAnalysis.pos.txt").toURI());
}
catch(UncheckedIOException | URISyntaxException ex) {
logger.warn("Unable to download datasets, skipping test.");
throw new RuntimeException(ex);
}

Map<Object, URI> dataset = Datasets.sentimentAnalysis();

TextClassifier.TrainingParameters trainingParameters = new TextClassifier.TrainingParameters();

Expand Down Expand Up @@ -327,14 +317,7 @@ private <ML extends AbstractClassifier, FS extends AbstractFeatureSelector, NS e


instance = MLBuilder.load(TextClassifier.class, storageName, configuration);
Dataframe validationData;
try {
validationData = instance.predict(this.getClass().getClassLoader().getResource("datasets/sentimentAnalysis.unlabelled.txt").toURI());
}
catch(UncheckedIOException | URISyntaxException ex) {
logger.warn("Unable to download datasets, skipping test.");
throw new RuntimeException(ex);
}
Dataframe validationData = instance.predict(Datasets.sentimentAnalysisUnlabeled());

List<Object> expResult = Arrays.asList("negative","positive");
int i = 0;
Expand Down
18 changes: 18 additions & 0 deletions datumbox-framework-common/pom.xml
Expand Up @@ -34,6 +34,24 @@
<main.basedir>..</main.basedir>
</properties>

<build>
<!-- Main resources are read from this module's own src/main/resources directory. -->
<resources>
<resource>
<directory>./src/main/resources</directory>
<!-- Filtering is off: files are copied as they are, without property substitution. -->
<filtering>false</filtering>
<!-- license.txt is left out of the processed resources. -->
<excludes>
<exclude>license.txt</exclude>
</excludes>
</resource>
</resources>
<!-- Test resources are read from this module's own src/test/resources directory, also unfiltered. -->
<testResources>
<testResource>
<directory>./src/test/resources</directory>
<filtering>false</filtering>
</testResource>
</testResources>
</build>

<dependencies>
<dependency>
<groupId>org.apache.commons</groupId>
Expand Down
9 changes: 9 additions & 0 deletions datumbox-framework-core/pom.xml
Expand Up @@ -34,6 +34,15 @@
<main.basedir>..</main.basedir>
</properties>

<build>
<!-- Test resources are read from this module's own src/test/resources directory. -->
<testResources>
<testResource>
<directory>./src/test/resources</directory>
<!-- Filtering is off: files are copied as they are, without property substitution. -->
<filtering>false</filtering>
</testResource>
</testResources>
</build>

<dependencies>
<dependency>
<groupId>org.apache.commons</groupId>
Expand Down
Expand Up @@ -23,12 +23,12 @@
import com.datumbox.framework.core.common.dataobjects.Dataframe;
import com.datumbox.framework.core.common.dataobjects.Record;

import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UncheckedIOException;
import java.util.LinkedHashMap;
import java.util.Random;
import java.io.*;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;
import java.util.*;


/**
Expand Down Expand Up @@ -75,7 +75,7 @@ public static Dataframe[] carsNumeric(Configuration configuration) {
- c2: no
*/
Dataframe trainingData;
try (Reader fileReader = new InputStreamReader(Datasets.class.getClassLoader().getResourceAsStream("datasets/carsNumeric.csv"), "UTF-8")) {
try (Reader fileReader = new InputStreamReader(Datasets.class.getClassLoader().getResourceAsStream("datasets/carsNumeric.csv"), StandardCharsets.UTF_8)) {
LinkedHashMap<String, TypeInference.DataType> headerDataTypes = new LinkedHashMap<>();
headerDataTypes.put("red", TypeInference.DataType.BOOLEAN);
headerDataTypes.put("yellow", TypeInference.DataType.BOOLEAN);
Expand Down Expand Up @@ -121,7 +121,7 @@ public static Dataframe[] carsCategorical(Configuration configuration) {
- stolen: yes/no
*/
Dataframe trainingData;
try (Reader fileReader = new InputStreamReader(Datasets.class.getClassLoader().getResourceAsStream("datasets/carsCategorical.csv"), "UTF-8")) {
try (Reader fileReader = new InputStreamReader(Datasets.class.getClassLoader().getResourceAsStream("datasets/carsCategorical.csv"), StandardCharsets.UTF_8)) {
LinkedHashMap<String, TypeInference.DataType> headerDataTypes = new LinkedHashMap<>();
headerDataTypes.put("color", TypeInference.DataType.CATEGORICAL);
headerDataTypes.put("type", TypeInference.DataType.CATEGORICAL);
Expand Down Expand Up @@ -153,7 +153,7 @@ public static Dataframe[] carsCategorical(Configuration configuration) {
public static Dataframe[] housingNumerical(Configuration configuration) {
//Data from https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.names
Dataframe trainingData;
try (Reader fileReader = new InputStreamReader(Datasets.class.getClassLoader().getResourceAsStream("datasets/housing.csv"), "UTF-8")) {
try (Reader fileReader = new InputStreamReader(Datasets.class.getClassLoader().getResourceAsStream("datasets/housing.csv"), StandardCharsets.UTF_8)) {
LinkedHashMap<String, TypeInference.DataType> headerDataTypes = new LinkedHashMap<>();
headerDataTypes.put("CRIM", TypeInference.DataType.NUMERICAL);
headerDataTypes.put("ZN", TypeInference.DataType.NUMERICAL);
Expand Down Expand Up @@ -190,7 +190,7 @@ public static Dataframe[] housingNumerical(Configuration configuration) {
public static Dataframe[] winesOrdinal(Configuration configuration) {
//Data from http://www.unt.edu/rss/class/Jon/R_SC/
Dataframe trainingData;
try (Reader fileReader = new InputStreamReader(Datasets.class.getClassLoader().getResourceAsStream("datasets/winesOrdinal.csv"), "UTF-8")) {
try (Reader fileReader = new InputStreamReader(Datasets.class.getClassLoader().getResourceAsStream("datasets/winesOrdinal.csv"), StandardCharsets.UTF_8)) {
LinkedHashMap<String, TypeInference.DataType> headerDataTypes = new LinkedHashMap<>();
headerDataTypes.put("c1", TypeInference.DataType.NUMERICAL);
headerDataTypes.put("c2", TypeInference.DataType.NUMERICAL);
Expand Down Expand Up @@ -315,7 +315,7 @@ public static Dataframe[] heartDiseaseClusters(Configuration configuration) {
//Heart Disease - C2: Age, Sex, ChestPain, RestBP, Cholesterol, BloodSugar, ECG, MaxHeartRate, Angina, OldPeak, STSlope, Vessels, Thal
//http://www.sgi.com/tech/mlc/db/heart.names
Dataframe trainingData;
try (Reader fileReader = new InputStreamReader(Datasets.class.getClassLoader().getResourceAsStream("datasets/heart.csv"), "UTF-8")) {
try (Reader fileReader = new InputStreamReader(Datasets.class.getClassLoader().getResourceAsStream("datasets/heart.csv"), StandardCharsets.UTF_8)) {
LinkedHashMap<String, TypeInference.DataType> headerDataTypes = new LinkedHashMap<>();
headerDataTypes.put("Age", TypeInference.DataType.NUMERICAL);
headerDataTypes.put("Sex", TypeInference.DataType.CATEGORICAL);
Expand Down Expand Up @@ -653,7 +653,7 @@ public static Dataframe[] regressionNumeric(Configuration configuration) {
$dataTable[]=array(array($x1,$x2),null);
*/
Dataframe trainingData;
try (Reader fileReader = new InputStreamReader(Datasets.class.getClassLoader().getResourceAsStream("datasets/regressionNumeric.csv"), "UTF-8")) {
try (Reader fileReader = new InputStreamReader(Datasets.class.getClassLoader().getResourceAsStream("datasets/regressionNumeric.csv"), StandardCharsets.UTF_8)) {
LinkedHashMap<String, TypeInference.DataType> headerDataTypes = new LinkedHashMap<>();
headerDataTypes.put("c1", TypeInference.DataType.NUMERICAL);
headerDataTypes.put("c2", TypeInference.DataType.NUMERICAL);
Expand Down Expand Up @@ -689,7 +689,7 @@ public static Dataframe[] regressionMixed(Configuration configuration) {
$dataTable[]=array(array((string)$x1,$x2,$x3,(string)$x4),null);
*/
Dataframe trainingData;
try (Reader fileReader = new InputStreamReader(Datasets.class.getClassLoader().getResourceAsStream("datasets/regressionMixed.csv"), "UTF-8")) {
try (Reader fileReader = new InputStreamReader(Datasets.class.getClassLoader().getResourceAsStream("datasets/regressionMixed.csv"), StandardCharsets.UTF_8)) {
LinkedHashMap<String, TypeInference.DataType> headerDataTypes = new LinkedHashMap<>();
headerDataTypes.put("c1", TypeInference.DataType.CATEGORICAL);
headerDataTypes.put("c2", TypeInference.DataType.NUMERICAL);
Expand All @@ -707,5 +707,70 @@ public static Dataframe[] regressionMixed(Configuration configuration) {

return new Dataframe[] {trainingData, validationData};
}


/**
 * Builds the labelled sentiment analysis dataset. Each bundled text resource is handed to
 * {@code inputStreamToURI} and the resulting URI is stored under its class label.
 *
 * @return a map with the keys {@code "negative"} and {@code "positive"}, each pointing to the
 *         URI produced for the corresponding resource
 */
public static Map<Object, URI> sentimentAnalysis() {
    ClassLoader loader = Datasets.class.getClassLoader();

    URI negativeExamples = inputStreamToURI(loader.getResourceAsStream("datasets/sentimentAnalysis.neg.txt"));
    URI positiveExamples = inputStreamToURI(loader.getResourceAsStream("datasets/sentimentAnalysis.pos.txt"));

    Map<Object, URI> urisByLabel = new HashMap<>();
    urisByLabel.put("negative", negativeExamples);
    urisByLabel.put("positive", positiveExamples);
    return urisByLabel;
}

/**
 * Provides the unlabelled sentiment analysis dataset by handing the bundled text resource
 * to {@code inputStreamToURI}.
 *
 * @return the URI produced for the unlabelled dataset resource
 */
public static URI sentimentAnalysisUnlabeled() {
    ClassLoader loader = Datasets.class.getClassLoader();
    URI unlabelledExamples = inputStreamToURI(loader.getResourceAsStream("datasets/sentimentAnalysis.unlabelled.txt"));
    return unlabelledExamples;
}

/**
 * Reads the bundled example.com HTML page as UTF-8 text. Every line read is followed by
 * a {@code "\r\n"} separator and the assembled text is trimmed before it is returned.
 *
 * @return the trimmed HTML source of the bundled page
 * @throws UncheckedIOException if reading the resource fails
 */
public static String exampleHtmlCode() {
    try (BufferedReader htmlReader = new BufferedReader(
            new InputStreamReader(
                    Datasets.class.getClassLoader().getResourceAsStream("datasets/example.com.html"),
                    StandardCharsets.UTF_8))) {
        StringBuilder html = new StringBuilder();
        String currentLine = htmlReader.readLine();
        while (currentLine != null) {
            html.append(currentLine).append("\r\n");
            currentLine = htmlReader.readLine();
        }
        String trimmedHtml = html.toString().trim();
        return trimmedHtml;
    }
    catch (IOException ex) {
        throw new UncheckedIOException(ex);
    }
}

/**
 * Copies the content of an input stream into a newly created temporary file and returns
 * the URI of that file. The stream is closed before the method returns and the temporary
 * file is registered for deletion when the JVM exits.
 *
 * @param is the stream to copy; must not be null
 * @return the URI of the temporary file holding the copied content
 * @throws NullPointerException if {@code is} is null
 * @throws UncheckedIOException if the temporary file cannot be created or written, or if
 *         closing the stream fails
 */
private static URI inputStreamToURI(InputStream is) {
    // Callers pass the result of getResourceAsStream(), which is null for a missing resource.
    // Fail with an explicit message before any temporary file is created.
    if (is == null) {
        throw new NullPointerException("The input stream is null; the requested resource was probably not found.");
    }

    // try-with-resources always closes the stream; a failure in close() is attached as a
    // suppressed exception and no longer replaces a failure raised by the copy itself.
    try (InputStream source = is) {
        File f = File.createTempFile("is2uri", "tmp");
        f.deleteOnExit();
        Files.copy(source, f.toPath(), StandardCopyOption.REPLACE_EXISTING);
        return f.toURI();
    }
    catch (IOException ex) {
        throw new UncheckedIOException(ex);
    }
}
}
Expand Up @@ -66,7 +66,6 @@ public void testParseCSVFile() {
dataset = Dataframe.Builder.parseCSVFile(fileReader, "metro_population", headerDataTypes, ',', '"', "\r\n", null, null, configuration);
}
catch(UncheckedIOException | IOException ex) {
logger.warn("Unable to download datasets, skipping test.");
throw new RuntimeException(ex);
}

Expand Down
Expand Up @@ -16,6 +16,7 @@
package com.datumbox.framework.core.machinelearning.topicmodeling;

import com.datumbox.framework.common.Configuration;
import com.datumbox.framework.core.common.Datasets;
import com.datumbox.framework.core.common.dataobjects.Dataframe;
import com.datumbox.framework.core.common.dataobjects.Record;
import com.datumbox.framework.core.machinelearning.MLBuilder;
Expand All @@ -28,10 +29,7 @@
import com.datumbox.framework.tests.abstracts.AbstractTest;
import org.junit.Test;

import java.io.UncheckedIOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;

import static org.junit.Assert.assertEquals;
Expand All @@ -55,16 +53,8 @@ public void testPredict() {

String storageName = this.getClass().getSimpleName();


Map<Object, URI> dataset = new HashMap<>();
try {
dataset.put("negative", this.getClass().getClassLoader().getResource("datasets/sentimentAnalysis.neg.txt").toURI());
dataset.put("positive", this.getClass().getClassLoader().getResource("datasets/sentimentAnalysis.pos.txt").toURI());
}
catch(UncheckedIOException | URISyntaxException ex) {
logger.warn("Unable to download datasets, skipping test.");
throw new RuntimeException(ex);
}

Map<Object, URI> dataset = Datasets.sentimentAnalysis();

UniqueWordSequenceExtractor wsExtractor = new UniqueWordSequenceExtractor(new UniqueWordSequenceExtractor.Parameters());

Expand Down
19 changes: 0 additions & 19 deletions pom.xml
Expand Up @@ -135,25 +135,6 @@
</modules>

<build>
<resources>
<resource>
<directory>${main.basedir}/datumbox-framework-common/src/main/resources</directory>
<filtering>false</filtering>
<excludes>
<exclude>license.txt</exclude>
</excludes>
</resource>
</resources>
<testResources>
<testResource>
<directory>${main.basedir}/datumbox-framework-common/src/test/resources</directory>
<filtering>false</filtering>
</testResource>
<testResource>
<directory>${main.basedir}/datumbox-framework-core/src/test/resources</directory>
<filtering>false</filtering>
</testResource>
</testResources>
<pluginManagement>
<plugins>
<plugin>
Expand Down

0 comments on commit 13333ce

Please sign in to comment.