Removing resources from master pom. Refactored tests and Datasets object.
datumbox · Jan 2, 2017 · 13333ce · 13333ce
1 parent 2f43c85
commit 13333ce
Show file tree

Hide file tree

Showing 9 changed files with 117 additions and 90 deletions.
diff --git a/...lications/src/test/java/com/datumbox/framework/applications/datamodeling/ModelerTest.java b/...lications/src/test/java/com/datumbox/framework/applications/datamodeling/ModelerTest.java
@@ -51,7 +51,7 @@ public void testTrainAndValidate() {
         logger.info("testTrainAndValidate");
 
         Configuration configuration = Configuration.getConfiguration();
-        
+
         Dataframe[] data = Datasets.heartDiseaseClusters(configuration);
 
         Dataframe trainingData = data[0];

diff --git a/...ramework-applications/src/test/java/com/datumbox/framework/applications/nlp/CETRTest.java b/...ramework-applications/src/test/java/com/datumbox/framework/applications/nlp/CETRTest.java
@@ -16,16 +16,10 @@
 package com.datumbox.framework.applications.nlp;
 
 import com.datumbox.framework.common.Configuration;
+import com.datumbox.framework.core.common.Datasets;
 import com.datumbox.framework.tests.abstracts.AbstractTest;
 import org.junit.Test;
 
-import java.io.IOException;
-import java.net.URISyntaxException;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.Files;
-import java.nio.file.Paths;
-import java.util.List;
-
 import static org.junit.Assert.assertEquals;
 
 /**
@@ -43,20 +37,8 @@ public void testExtract() {
         logger.info("extract");
 
         Configuration configuration = Configuration.getConfiguration();
-
-        String text;        
-        try {
-            List<String> lines = Files.readAllLines(Paths.get(this.getClass().getClassLoader().getResource("datasets/example.com.html").toURI()), StandardCharsets.UTF_8);
-            StringBuilder sb = new StringBuilder();
-            for(String line: lines){
-                sb.append(line);
-                sb.append("\r\n");
-            }
-            text = sb.toString().trim();
-        }
-        catch(IOException | URISyntaxException ex) {
-            throw new RuntimeException(ex);
-        }
+
+        String text = Datasets.exampleHtmlCode();
 
         CETR.Parameters parameters = new CETR.Parameters();
         parameters.setNumberOfClusters(2);

diff --git a/...pplications/src/test/java/com/datumbox/framework/applications/nlp/TextClassifierTest.java b/...pplications/src/test/java/com/datumbox/framework/applications/nlp/TextClassifierTest.java
@@ -16,6 +16,7 @@
 package com.datumbox.framework.applications.nlp;
 
 import com.datumbox.framework.common.Configuration;
+import com.datumbox.framework.core.common.Datasets;
 import com.datumbox.framework.core.common.dataobjects.Dataframe;
 import com.datumbox.framework.core.common.dataobjects.Record;
 import com.datumbox.framework.core.machinelearning.MLBuilder;
@@ -33,11 +34,8 @@
 import com.datumbox.framework.tests.abstracts.AbstractTest;
 import org.junit.Test;
 
-import java.io.UncheckedIOException;
 import java.net.URI;
-import java.net.URISyntaxException;
 import java.util.Arrays;
-import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 
@@ -283,16 +281,8 @@ private <ML extends AbstractClassifier, FS extends AbstractFeatureSelector, NS e
 
 
         String storageName = this.getClass().getSimpleName() + testId;
-
-        Map<Object, URI> dataset = new HashMap<>();
-        try {
-            dataset.put("negative", this.getClass().getClassLoader().getResource("datasets/sentimentAnalysis.neg.txt").toURI());
-            dataset.put("positive", this.getClass().getClassLoader().getResource("datasets/sentimentAnalysis.pos.txt").toURI());
-        }
-        catch(UncheckedIOException | URISyntaxException ex) {
-            logger.warn("Unable to download datasets, skipping test.");
-            throw new RuntimeException(ex);
-        }
+
+        Map<Object, URI> dataset = Datasets.sentimentAnalysis();
 
         TextClassifier.TrainingParameters trainingParameters = new TextClassifier.TrainingParameters();
 
@@ -327,14 +317,7 @@ private <ML extends AbstractClassifier, FS extends AbstractFeatureSelector, NS e
 
 
         instance = MLBuilder.load(TextClassifier.class, storageName, configuration);
-        Dataframe validationData;
-        try {
-            validationData = instance.predict(this.getClass().getClassLoader().getResource("datasets/sentimentAnalysis.unlabelled.txt").toURI());
-        }
-        catch(UncheckedIOException | URISyntaxException ex) {
-            logger.warn("Unable to download datasets, skipping test.");
-            throw new RuntimeException(ex);
-        }
+        Dataframe validationData = instance.predict(Datasets.sentimentAnalysisUnlabeled());
 
         List<Object> expResult = Arrays.asList("negative","positive");
         int i = 0;

diff --git a/datumbox-framework-common/pom.xml b/datumbox-framework-common/pom.xml
@@ -34,6 +34,24 @@
         <main.basedir>..</main.basedir>
     </properties>
 
+    <build>
+        <resources>
+            <resource>
+                <directory>./src/main/resources</directory>
+                <filtering>false</filtering>
+                <excludes>
+                    <exclude>license.txt</exclude>
+                </excludes>
+            </resource>
+        </resources>
+        <testResources>
+            <testResource>
+                <directory>./src/test/resources</directory>
+                <filtering>false</filtering>
+            </testResource>
+        </testResources>
+    </build>
+
     <dependencies>
         <dependency>
             <groupId>org.apache.commons</groupId>

diff --git a/datumbox-framework-core/pom.xml b/datumbox-framework-core/pom.xml
@@ -34,6 +34,15 @@
         <main.basedir>..</main.basedir>
     </properties>
 
+    <build>
+        <testResources>
+            <testResource>
+                <directory>./src/test/resources</directory>
+                <filtering>false</filtering>
+            </testResource>
+        </testResources>
+    </build>
+
     <dependencies>
         <dependency>
             <groupId>org.apache.commons</groupId>

diff --git a/datumbox-framework-core/src/test/java/com/datumbox/framework/core/common/Datasets.java b/datumbox-framework-core/src/test/java/com/datumbox/framework/core/common/Datasets.java
@@ -23,12 +23,12 @@
 import com.datumbox.framework.core.common.dataobjects.Dataframe;
 import com.datumbox.framework.core.common.dataobjects.Record;
 
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.Reader;
-import java.io.UncheckedIOException;
-import java.util.LinkedHashMap;
-import java.util.Random;
+import java.io.*;
+import java.net.URI;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.StandardCopyOption;
+import java.util.*;
 
 
 /**
@@ -75,7 +75,7 @@ public static Dataframe[] carsNumeric(Configuration configuration) {
             - c2: no
         */
         Dataframe trainingData;
-        try (Reader fileReader = new InputStreamReader(Datasets.class.getClassLoader().getResourceAsStream("datasets/carsNumeric.csv"), "UTF-8")) {
+        try (Reader fileReader = new InputStreamReader(Datasets.class.getClassLoader().getResourceAsStream("datasets/carsNumeric.csv"), StandardCharsets.UTF_8)) {
             LinkedHashMap<String, TypeInference.DataType> headerDataTypes = new LinkedHashMap<>(); 
             headerDataTypes.put("red", TypeInference.DataType.BOOLEAN);
             headerDataTypes.put("yellow", TypeInference.DataType.BOOLEAN);
@@ -121,7 +121,7 @@ public static Dataframe[] carsCategorical(Configuration configuration) {
             - stolen: yes/no
         */
         Dataframe trainingData;
-        try (Reader fileReader = new InputStreamReader(Datasets.class.getClassLoader().getResourceAsStream("datasets/carsCategorical.csv"), "UTF-8")) {
+        try (Reader fileReader = new InputStreamReader(Datasets.class.getClassLoader().getResourceAsStream("datasets/carsCategorical.csv"), StandardCharsets.UTF_8)) {
             LinkedHashMap<String, TypeInference.DataType> headerDataTypes = new LinkedHashMap<>(); 
             headerDataTypes.put("color", TypeInference.DataType.CATEGORICAL);
             headerDataTypes.put("type", TypeInference.DataType.CATEGORICAL);
@@ -153,7 +153,7 @@ public static Dataframe[] carsCategorical(Configuration configuration) {
     public static Dataframe[] housingNumerical(Configuration configuration) {
         //Data from https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.names
         Dataframe trainingData;
-        try (Reader fileReader = new InputStreamReader(Datasets.class.getClassLoader().getResourceAsStream("datasets/housing.csv"), "UTF-8")) {
+        try (Reader fileReader = new InputStreamReader(Datasets.class.getClassLoader().getResourceAsStream("datasets/housing.csv"), StandardCharsets.UTF_8)) {
             LinkedHashMap<String, TypeInference.DataType> headerDataTypes = new LinkedHashMap<>();
             headerDataTypes.put("CRIM", TypeInference.DataType.NUMERICAL);
             headerDataTypes.put("ZN", TypeInference.DataType.NUMERICAL);
@@ -190,7 +190,7 @@ public static Dataframe[] housingNumerical(Configuration configuration) {
     public static Dataframe[] winesOrdinal(Configuration configuration) {
         //Data from http://www.unt.edu/rss/class/Jon/R_SC/
         Dataframe trainingData;
-        try (Reader fileReader = new InputStreamReader(Datasets.class.getClassLoader().getResourceAsStream("datasets/winesOrdinal.csv"), "UTF-8")) {
+        try (Reader fileReader = new InputStreamReader(Datasets.class.getClassLoader().getResourceAsStream("datasets/winesOrdinal.csv"), StandardCharsets.UTF_8)) {
             LinkedHashMap<String, TypeInference.DataType> headerDataTypes = new LinkedHashMap<>(); 
             headerDataTypes.put("c1", TypeInference.DataType.NUMERICAL);
             headerDataTypes.put("c2", TypeInference.DataType.NUMERICAL);
@@ -315,7 +315,7 @@ public static Dataframe[] heartDiseaseClusters(Configuration configuration) {
         //Heart Disease - C2: Age, Sex, ChestPain, RestBP, Cholesterol, BloodSugar, ECG, MaxHeartRate, Angina, OldPeak, STSlope, Vessels, Thal
         //http://www.sgi.com/tech/mlc/db/heart.names
         Dataframe trainingData;
-        try (Reader fileReader = new InputStreamReader(Datasets.class.getClassLoader().getResourceAsStream("datasets/heart.csv"), "UTF-8")) {
+        try (Reader fileReader = new InputStreamReader(Datasets.class.getClassLoader().getResourceAsStream("datasets/heart.csv"), StandardCharsets.UTF_8)) {
             LinkedHashMap<String, TypeInference.DataType> headerDataTypes = new LinkedHashMap<>(); 
             headerDataTypes.put("Age", TypeInference.DataType.NUMERICAL);
             headerDataTypes.put("Sex", TypeInference.DataType.CATEGORICAL);
@@ -653,7 +653,7 @@ public static Dataframe[] regressionNumeric(Configuration configuration) {
         $dataTable[]=array(array($x1,$x2),null);
         */
         Dataframe trainingData;
-        try (Reader fileReader = new InputStreamReader(Datasets.class.getClassLoader().getResourceAsStream("datasets/regressionNumeric.csv"), "UTF-8")) {
+        try (Reader fileReader = new InputStreamReader(Datasets.class.getClassLoader().getResourceAsStream("datasets/regressionNumeric.csv"), StandardCharsets.UTF_8)) {
             LinkedHashMap<String, TypeInference.DataType> headerDataTypes = new LinkedHashMap<>(); 
             headerDataTypes.put("c1", TypeInference.DataType.NUMERICAL);
             headerDataTypes.put("c2", TypeInference.DataType.NUMERICAL);
@@ -689,7 +689,7 @@ public static Dataframe[] regressionMixed(Configuration configuration) {
         $dataTable[]=array(array((string)$x1,$x2,$x3,(string)$x4),null);
         */
         Dataframe trainingData;
-        try (Reader fileReader = new InputStreamReader(Datasets.class.getClassLoader().getResourceAsStream("datasets/regressionMixed.csv"), "UTF-8")) {
+        try (Reader fileReader = new InputStreamReader(Datasets.class.getClassLoader().getResourceAsStream("datasets/regressionMixed.csv"), StandardCharsets.UTF_8)) {
             LinkedHashMap<String, TypeInference.DataType> headerDataTypes = new LinkedHashMap<>(); 
             headerDataTypes.put("c1", TypeInference.DataType.CATEGORICAL);
             headerDataTypes.put("c2", TypeInference.DataType.NUMERICAL);
@@ -707,5 +707,70 @@ public static Dataframe[] regressionMixed(Configuration configuration) {
 
         return new Dataframe[] {trainingData, validationData};
     }
-
+
+    /**
+     * Builds the labelled sentiment analysis dataset as a map from class label to URI.
+     *
+     * @return a map whose "negative" and "positive" keys point to the URIs of the respective examples
+     */
+    public static Map<Object, URI> sentimentAnalysis() {
+        ClassLoader loader = Datasets.class.getClassLoader();
+        Map<Object, URI> uriByLabel = new HashMap<>();
+        uriByLabel.put("negative", inputStreamToURI(loader.getResourceAsStream("datasets/sentimentAnalysis.neg.txt")));
+        uriByLabel.put("positive", inputStreamToURI(loader.getResourceAsStream("datasets/sentimentAnalysis.pos.txt")));
+        return uriByLabel;
+    }
+
+    /**
+     * Returns the URI of an unlabelled sentiment analysis dataset.
+     *
+     * @return
+     */
+    public static URI sentimentAnalysisUnlabeled() {
+        return inputStreamToURI(Datasets.class.getClassLoader().getResourceAsStream("datasets/sentimentAnalysis.unlabelled.txt"));
+    }
+
+    /**
+     * Reads the "datasets/example.com.html" classpath resource as UTF-8 and returns it as one string.
+     * Every line is terminated with CRLF and the final result is trimmed.
+     *
+     * @return the trimmed HTML code stored in the resource
+     */
+    public static String exampleHtmlCode() {
+        InputStream htmlStream = Datasets.class.getClassLoader().getResourceAsStream("datasets/example.com.html");
+        try(BufferedReader htmlReader = new BufferedReader(new InputStreamReader(htmlStream, StandardCharsets.UTF_8))) {
+            StringBuilder html = new StringBuilder();
+            String currentLine = htmlReader.readLine();
+            while(currentLine != null) {
+                html.append(currentLine).append("\r\n");
+                currentLine = htmlReader.readLine();
+            }
+            return html.toString().trim();
+        }
+        catch(IOException ioe) {
+            throw new UncheckedIOException(ioe);
+        }
+    }
+
+    /**
+     * Copies the contents of an input stream into a temporary file and returns the URI of that file.
+     * The temporary file is registered for deletion on JVM exit and the stream is always closed
+     * before the method returns.
+     *
+     * @param is the stream to copy; must not be null
+     * @return the URI of the temporary file that holds the contents of the stream
+     * @throws NullPointerException if {@code is} is null
+     * @throws UncheckedIOException if the temporary file can not be created or written, or if closing the stream fails
+     */
+    private static URI inputStreamToURI(InputStream is) {
+        //fail fast with a clear message, before a temp file is created, instead of an NPE raised from the cleanup code
+        Objects.requireNonNull(is, "The input stream is null; the requested resource was probably not found.");
+
+        //try-with-resources closes the stream and records a close() failure as suppressed rather than masking the copy error
+        try (InputStream in = is) {
+            File f = File.createTempFile("is2uri", "tmp");
+            f.deleteOnExit();
+            Files.copy(in, f.toPath(), StandardCopyOption.REPLACE_EXISTING);
+            return f.toURI();
+        }
+        catch (IOException ex) {
+            throw new UncheckedIOException(ex);
+        }
+    }
 }
diff --git a/...work-core/src/test/java/com/datumbox/framework/core/common/dataobjects/DataframeTest.java b/...work-core/src/test/java/com/datumbox/framework/core/common/dataobjects/DataframeTest.java
@@ -66,7 +66,6 @@ public void testParseCSVFile() {
             dataset = Dataframe.Builder.parseCSVFile(fileReader, "metro_population", headerDataTypes, ',', '"', "\r\n", null, null, configuration);
         }
         catch(UncheckedIOException | IOException ex) {
-            logger.warn("Unable to download datasets, skipping test.");
             throw new RuntimeException(ex);
         }
 

diff --git a/.../datumbox/framework/core/machinelearning/topicmodeling/LatentDirichletAllocationTest.java b/.../datumbox/framework/core/machinelearning/topicmodeling/LatentDirichletAllocationTest.java
@@ -16,6 +16,7 @@
 package com.datumbox.framework.core.machinelearning.topicmodeling;
 
 import com.datumbox.framework.common.Configuration;
+import com.datumbox.framework.core.common.Datasets;
 import com.datumbox.framework.core.common.dataobjects.Dataframe;
 import com.datumbox.framework.core.common.dataobjects.Record;
 import com.datumbox.framework.core.machinelearning.MLBuilder;
@@ -28,10 +29,7 @@
 import com.datumbox.framework.tests.abstracts.AbstractTest;
 import org.junit.Test;
 
-import java.io.UncheckedIOException;
 import java.net.URI;
-import java.net.URISyntaxException;
-import java.util.HashMap;
 import java.util.Map;
 
 import static org.junit.Assert.assertEquals;
@@ -55,16 +53,8 @@ public void testPredict() {
 
         String storageName = this.getClass().getSimpleName();
 
-
-        Map<Object, URI> dataset = new HashMap<>();
-        try {
-            dataset.put("negative", this.getClass().getClassLoader().getResource("datasets/sentimentAnalysis.neg.txt").toURI());
-            dataset.put("positive", this.getClass().getClassLoader().getResource("datasets/sentimentAnalysis.pos.txt").toURI());
-        }
-        catch(UncheckedIOException | URISyntaxException ex) {
-            logger.warn("Unable to download datasets, skipping test.");
-            throw new RuntimeException(ex);
-        }
+
+        Map<Object, URI> dataset = Datasets.sentimentAnalysis();
 
         UniqueWordSequenceExtractor wsExtractor = new UniqueWordSequenceExtractor(new UniqueWordSequenceExtractor.Parameters());
 

diff --git a/pom.xml b/pom.xml
@@ -135,25 +135,6 @@
     </modules>
 
     <build>
-        <resources>
-            <resource>
-                <directory>${main.basedir}/datumbox-framework-common/src/main/resources</directory>
-                <filtering>false</filtering>
-                <excludes>
-                    <exclude>license.txt</exclude>
-                </excludes>
-            </resource>
-        </resources>
-        <testResources>
-            <testResource>
-                <directory>${main.basedir}/datumbox-framework-common/src/test/resources</directory>
-                <filtering>false</filtering>
-            </testResource>
-            <testResource>
-                <directory>${main.basedir}/datumbox-framework-core/src/test/resources</directory>
-                <filtering>false</filtering>
-            </testResource>
-        </testResources>
         <pluginManagement>
             <plugins>
                 <plugin>