From de1ecb335d8aba781c3ce55ab523ee470292a3d8 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Sun, 20 Mar 2016 22:29:54 +0000 Subject: [PATCH] Removing commons lang from dependencies. --- CHANGELOG.md | 4 +- NOTICE | 5 - README.md | 2 +- .../framework/applications/nlp/CETRTest.java | 8 +- datumbox-framework-common/pom.xml | 4 - .../common/utilities/PHPMethods.java | 13 - datumbox-framework-core/pom.xml | 4 - .../validators/AbstractValidator.java | 3 +- .../utilities/text/parsers/HTMLParser.java | 223 +++++++++++++++++- pom.xml | 6 - 10 files changed, 229 insertions(+), 43 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ac18bc65..7b1b7594 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,10 +1,10 @@ CHANGELOG ========= -Version 0.7.1-SNAPSHOT - Build 20160319 +Version 0.7.1-SNAPSHOT - Build 20160320 --------------------------------------- -- Add changes here. +- Removing Apache Commons Lang from the dependencies. Adding a faster custom unescapeHtml method in HTMLParser. Version 0.7.0 - Build 20160319 ------------------------------ diff --git a/NOTICE b/NOTICE index 89e5e453..fa6d510b 100755 --- a/NOTICE +++ b/NOTICE @@ -4,11 +4,6 @@ Copyright (C) 2013 Vasilis Vryniotis The following libraries are included in packaged versions of this project: -* Apache Commons Lang - * COPYRIGHT: Copyright 2001 The Apache Software Foundation - * LICENSE: http://www.apache.org/licenses/LICENSE-2.0.txt (Apache License, Version 2.0) - * HOMEPAGE: https://commons.apache.org/proper/commons-lang/ - * Apache Commons Math * COPYRIGHT: Copyright 2003 The Apache Software Foundation * LICENSE: http://www.apache.org/licenses/LICENSE-2.0.txt (Apache License, Version 2.0) diff --git a/README.md b/README.md index 382eb2d6..596643f4 100755 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ The code is licensed under the [Apache License, Version 2.0](./LICENSE). Version ------- -The latest version is 0.7.1-SNAPSHOT (Build 20160319). 
+The latest version is 0.7.1-SNAPSHOT (Build 20160320). The [master branch](https://github.com/datumbox/datumbox-framework/tree/master) is the latest stable version of the framework. The [devel branch](https://github.com/datumbox/datumbox-framework/tree/devel) is the development branch. All the previous stable versions are marked with [tags](https://github.com/datumbox/datumbox-framework/releases). diff --git a/datumbox-framework-applications/src/test/java/com/datumbox/framework/applications/nlp/CETRTest.java b/datumbox-framework-applications/src/test/java/com/datumbox/framework/applications/nlp/CETRTest.java index 53bb60f7..c404f8a0 100755 --- a/datumbox-framework-applications/src/test/java/com/datumbox/framework/applications/nlp/CETRTest.java +++ b/datumbox-framework-applications/src/test/java/com/datumbox/framework/applications/nlp/CETRTest.java @@ -17,7 +17,6 @@ import com.datumbox.framework.common.Configuration; import com.datumbox.framework.tests.abstracts.AbstractTest; -import org.apache.commons.lang3.StringUtils; import org.junit.Test; import java.io.IOException; @@ -50,7 +49,12 @@ public void testExtract() { String text; try { List lines = Files.readAllLines(Paths.get(this.getClass().getClassLoader().getResource("datasets/example.com.html").toURI()), StandardCharsets.UTF_8); - text = StringUtils.join(lines, "\r\n"); + StringBuilder sb = new StringBuilder(); + for(String line: lines){ + sb.append(line); + sb.append("\r\n"); + } + text = sb.toString().trim(); } catch(IOException | URISyntaxException ex) { throw new RuntimeException(ex); diff --git a/datumbox-framework-common/pom.xml b/datumbox-framework-common/pom.xml index 97a73425..c5b55890 100755 --- a/datumbox-framework-common/pom.xml +++ b/datumbox-framework-common/pom.xml @@ -35,10 +35,6 @@ - - org.apache.commons - commons-lang3 - org.apache.commons commons-math3 diff --git a/datumbox-framework-common/src/main/java/com/datumbox/framework/common/utilities/PHPMethods.java 
b/datumbox-framework-common/src/main/java/com/datumbox/framework/common/utilities/PHPMethods.java index a6e2b5dd..a232c871 100755 --- a/datumbox-framework-common/src/main/java/com/datumbox/framework/common/utilities/PHPMethods.java +++ b/datumbox-framework-common/src/main/java/com/datumbox/framework/common/utilities/PHPMethods.java @@ -15,8 +15,6 @@ */ package com.datumbox.framework.common.utilities; -import org.apache.commons.lang3.builder.ToStringBuilder; - import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -248,17 +246,6 @@ public static void shuffle(T[] array) { } } - /** - * Returns the contexts of an Objects in a human readable format. - * - * @param - * @param object - * @return - */ - public static String var_export(T object) { - return ToStringBuilder.reflectionToString(object); - } - /** * Sorts an array in ascending order and returns an array with indexes of * the original order. diff --git a/datumbox-framework-core/pom.xml b/datumbox-framework-core/pom.xml index c7d187e9..c7cdd2be 100755 --- a/datumbox-framework-core/pom.xml +++ b/datumbox-framework-core/pom.xml @@ -35,10 +35,6 @@ - - org.apache.commons - commons-lang3 - org.apache.commons commons-math3 diff --git a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/common/abstracts/validators/AbstractValidator.java b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/common/abstracts/validators/AbstractValidator.java index f953e8f6..b1940f40 100755 --- a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/common/abstracts/validators/AbstractValidator.java +++ b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/common/abstracts/validators/AbstractValidator.java @@ -20,6 +20,7 @@ import com.datumbox.framework.common.dataobjects.FlatDataList; import com.datumbox.framework.common.interfaces.Trainable; import 
com.datumbox.framework.common.utilities.PHPMethods; +import com.datumbox.framework.core.machinelearning.common.abstracts.AbstractTrainer; import com.datumbox.framework.core.machinelearning.common.abstracts.modelers.AbstractModeler; import com.datumbox.framework.core.machinelearning.common.interfaces.ModelParameters; import com.datumbox.framework.core.machinelearning.common.interfaces.TrainingParameters; @@ -122,7 +123,7 @@ public VM kFoldCrossValidation(Dataframe dataset, int k, String dbName, Configur Dataframe trainingData = dataset.getSubset(foldTrainingIds); - modeler.fit(trainingData, trainingParameters); + modeler.fit(trainingData, (AbstractTrainer.AbstractTrainingParameters) trainingParameters); trainingData.delete(); //trainingData = null; diff --git a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/utilities/text/parsers/HTMLParser.java b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/utilities/text/parsers/HTMLParser.java index 8cd2533f..95689ae9 100755 --- a/datumbox-framework-core/src/main/java/com/datumbox/framework/core/utilities/text/parsers/HTMLParser.java +++ b/datumbox-framework-core/src/main/java/com/datumbox/framework/core/utilities/text/parsers/HTMLParser.java @@ -16,7 +16,6 @@ package com.datumbox.framework.core.utilities.text.parsers; import com.datumbox.framework.common.utilities.StringCleaner; -import org.apache.commons.lang3.StringEscapeUtils; import java.util.*; import java.util.regex.Matcher; @@ -35,7 +34,221 @@ public class HTMLParser { private static final Pattern TITLE_PATTERN = Pattern.compile("]*>(.*?)", Pattern.DOTALL | Pattern.CASE_INSENSITIVE); private static final Pattern HYPERLINK_PATTERN = Pattern.compile("<[\\s]*a[^>]*href[\\s]*=[\\s]*[\\\"']([^\\\"']*)[\\\"'][^>]*>(.*?)", Pattern.DOTALL | Pattern.CASE_INSENSITIVE); private static final Pattern METATAG_PATTERN = 
Pattern.compile("<[\\s]*meta[^>]*name[\\s]*=[\\s]*[\\\"']([^\\\"']*)[\\\"'][^>]*content[\\s]*=[\\s]*[\\\"']([^\\\"']*)[\\\"'][^>]*>", Pattern.DOTALL | Pattern.CASE_INSENSITIVE); - private static final Pattern HX_PATTERN = Pattern.compile("<[\\s]*(H[1-6])[^>]*?>(.*?)", Pattern.DOTALL|Pattern.CASE_INSENSITIVE); + private static final Pattern HX_PATTERN = Pattern.compile("<[\\s]*(H[1-6])[^>]*?>(.*?)", Pattern.DOTALL|Pattern.CASE_INSENSITIVE); + + private static final String[][] ESCAPES = { + {"\"", "quot"}, // " - double-quote + {"&", "amp"}, // & - ampersand + {"<", "lt"}, // < - less-than + {">", "gt"}, // > - greater-than + + // Mapping to escape ISO-8859-1 characters to their named HTML 3.x equivalents. + {"\u00A0", "nbsp"}, // non-breaking space + {"\u00A1", "iexcl"}, // inverted exclamation mark + {"\u00A2", "cent"}, // cent sign + {"\u00A3", "pound"}, // pound sign + {"\u00A4", "curren"}, // currency sign + {"\u00A5", "yen"}, // yen sign = yuan sign + {"\u00A6", "brvbar"}, // broken bar = broken vertical bar + {"\u00A7", "sect"}, // section sign + {"\u00A8", "uml"}, // diaeresis = spacing diaeresis + {"\u00A9", "copy"}, // © - copyright sign + {"\u00AA", "ordf"}, // feminine ordinal indicator + {"\u00AB", "laquo"}, // left-pointing double angle quotation mark = left pointing guillemet + {"\u00AC", "not"}, // not sign + {"\u00AD", "shy"}, // soft hyphen = discretionary hyphen + {"\u00AE", "reg"}, // ® - registered trademark sign + {"\u00AF", "macr"}, // macron = spacing macron = overline = APL overbar + {"\u00B0", "deg"}, // degree sign + {"\u00B1", "plusmn"}, // plus-minus sign = plus-or-minus sign + {"\u00B2", "sup2"}, // superscript two = superscript digit two = squared + {"\u00B3", "sup3"}, // superscript three = superscript digit three = cubed + {"\u00B4", "acute"}, // acute accent = spacing acute + {"\u00B5", "micro"}, // micro sign + {"\u00B6", "para"}, // pilcrow sign = paragraph sign + {"\u00B7", "middot"}, // middle dot = Georgian comma = Greek middle 
dot + {"\u00B8", "cedil"}, // cedilla = spacing cedilla + {"\u00B9", "sup1"}, // superscript one = superscript digit one + {"\u00BA", "ordm"}, // masculine ordinal indicator + {"\u00BB", "raquo"}, // right-pointing double angle quotation mark = right pointing guillemet + {"\u00BC", "frac14"}, // vulgar fraction one quarter = fraction one quarter + {"\u00BD", "frac12"}, // vulgar fraction one half = fraction one half + {"\u00BE", "frac34"}, // vulgar fraction three quarters = fraction three quarters + {"\u00BF", "iquest"}, // inverted question mark = turned question mark + {"\u00C0", "Agrave"}, // À - uppercase A, grave accent + {"\u00C1", "Aacute"}, // Á - uppercase A, acute accent + {"\u00C2", "Acirc"}, // Â - uppercase A, circumflex accent + {"\u00C3", "Atilde"}, // Ã - uppercase A, tilde + {"\u00C4", "Auml"}, // Ä - uppercase A, umlaut + {"\u00C5", "Aring"}, // Å - uppercase A, ring + {"\u00C6", "AElig"}, // Æ - uppercase AE + {"\u00C7", "Ccedil"}, // Ç - uppercase C, cedilla + {"\u00C8", "Egrave"}, // È - uppercase E, grave accent + {"\u00C9", "Eacute"}, // É - uppercase E, acute accent + {"\u00CA", "Ecirc"}, // Ê - uppercase E, circumflex accent + {"\u00CB", "Euml"}, // Ë - uppercase E, umlaut + {"\u00CC", "Igrave"}, // Ì - uppercase I, grave accent + {"\u00CD", "Iacute"}, // Í - uppercase I, acute accent + {"\u00CE", "Icirc"}, // Î - uppercase I, circumflex accent + {"\u00CF", "Iuml"}, // Ï - uppercase I, umlaut + {"\u00D0", "ETH"}, // Ð - uppercase Eth, Icelandic + {"\u00D1", "Ntilde"}, // Ñ - uppercase N, tilde + {"\u00D2", "Ograve"}, // Ò - uppercase O, grave accent + {"\u00D3", "Oacute"}, // Ó - uppercase O, acute accent + {"\u00D4", "Ocirc"}, // Ô - uppercase O, circumflex accent + {"\u00D5", "Otilde"}, // Õ - uppercase O, tilde + {"\u00D6", "Ouml"}, // Ö - uppercase O, umlaut + {"\u00D7", "times"}, // multiplication sign + {"\u00D8", "Oslash"}, // Ø - uppercase O, slash + {"\u00D9", "Ugrave"}, // Ù - uppercase U, grave accent + {"\u00DA", "Uacute"}, // 
Ú - uppercase U, acute accent + {"\u00DB", "Ucirc"}, // Û - uppercase U, circumflex accent + {"\u00DC", "Uuml"}, // Ü - uppercase U, umlaut + {"\u00DD", "Yacute"}, // Ý - uppercase Y, acute accent + {"\u00DE", "THORN"}, // Þ - uppercase THORN, Icelandic + {"\u00DF", "szlig"}, // ß - lowercase sharps, German + {"\u00E0", "agrave"}, // à - lowercase a, grave accent + {"\u00E1", "aacute"}, // á - lowercase a, acute accent + {"\u00E2", "acirc"}, // â - lowercase a, circumflex accent + {"\u00E3", "atilde"}, // ã - lowercase a, tilde + {"\u00E4", "auml"}, // ä - lowercase a, umlaut + {"\u00E5", "aring"}, // å - lowercase a, ring + {"\u00E6", "aelig"}, // æ - lowercase ae + {"\u00E7", "ccedil"}, // ç - lowercase c, cedilla + {"\u00E8", "egrave"}, // è - lowercase e, grave accent + {"\u00E9", "eacute"}, // é - lowercase e, acute accent + {"\u00EA", "ecirc"}, // ê - lowercase e, circumflex accent + {"\u00EB", "euml"}, // ë - lowercase e, umlaut + {"\u00EC", "igrave"}, // ì - lowercase i, grave accent + {"\u00ED", "iacute"}, // í - lowercase i, acute accent + {"\u00EE", "icirc"}, // î - lowercase i, circumflex accent + {"\u00EF", "iuml"}, // ï - lowercase i, umlaut + {"\u00F0", "eth"}, // ð - lowercase eth, Icelandic + {"\u00F1", "ntilde"}, // ñ - lowercase n, tilde + {"\u00F2", "ograve"}, // ò - lowercase o, grave accent + {"\u00F3", "oacute"}, // ó - lowercase o, acute accent + {"\u00F4", "ocirc"}, // ô - lowercase o, circumflex accent + {"\u00F5", "otilde"}, // õ - lowercase o, tilde + {"\u00F6", "ouml"}, // ö - lowercase o, umlaut + {"\u00F7", "divide"}, // division sign + {"\u00F8", "oslash"}, // ø - lowercase o, slash + {"\u00F9", "ugrave"}, // ù - lowercase u, grave accent + {"\u00FA", "uacute"}, // ú - lowercase u, acute accent + {"\u00FB", "ucirc"}, // û - lowercase u, circumflex accent + {"\u00FC", "uuml"}, // ü - lowercase u, umlaut + {"\u00FD", "yacute"}, // ý - lowercase y, acute accent + {"\u00FE", "thorn"}, // þ - lowercase thorn, Icelandic + {"\u00FF", 
"yuml"}, // ÿ - lowercase y, umlaut + }; + + private static final int MIN_ESCAPE = 2; + private static final int MAX_ESCAPE = 6; + + private static final HashMap<String, CharSequence> LOOKUP_MAP; + static { + LOOKUP_MAP = new HashMap<>(); + for (final CharSequence[] seq : ESCAPES) { + LOOKUP_MAP.put(seq[1].toString(), seq[0]); + } + } + + /** + * Unescapes the named HTML3 entities listed in ESCAPES and the numeric (decimal or hexadecimal) character references of a string. + * + * Modified version of http://stackoverflow.com/questions/994331/java-how-to-decode-html-character-entities-in-java-like-httputility-htmldecode/24575417#24575417. + * + * @param input the text to unescape; must not be null + * @return the unescaped text, or the input instance itself when nothing was replaced + */ + private static String unescapeHtml(final String input) { + StringBuilder writer = null; + int len = input.length(); + int i = 1; + int st = 0; + while (true) { + // look for '&' + while (i < len && input.charAt(i-1) != '&') { + i++; + } + if (i >= len) { + break; + } + + // found '&', look for ';' + int j = i; + while (j < len && j < i + MAX_ESCAPE + 1 && input.charAt(j) != ';') { + j++; + } + if (j == len || j < i + MIN_ESCAPE || j == i + MAX_ESCAPE + 1) { + i++; + continue; + } + + // found escape + if (input.charAt(i) == '#') { + // numeric escape + int k = i + 1; + int radix = 10; + + final char firstChar = input.charAt(k); + if (firstChar == 'x' || firstChar == 'X') { + k++; + radix = 16; + } + + try { + int entityValue = Integer.parseInt(input.substring(k, j), radix); + if (Character.digit(input.charAt(k), radix) < 0) { throw new NumberFormatException("signed character reference"); } // parseInt accepts a leading sign, a character reference must not + if (writer == null) { + writer = new StringBuilder(input.length()); + } + writer.append(input.substring(st, i - 1)); + + if (entityValue > 0xFFFF) { + final char[] chrs = Character.toChars(entityValue); + writer.append(chrs[0]); + writer.append(chrs[1]); + } + else if(entityValue == 39) { + writer.append('\''); + } + else { + writer.append((char) entityValue); // append(int) would write the decimal digits instead of the character + } + + } + catch (NumberFormatException ex) { + i++; + continue; + } + } + else { + // named escape + CharSequence value = LOOKUP_MAP.get(input.substring(i, j)); + if (value == null) { + i++; + continue; + } + + if (writer == null) { + writer = new StringBuilder(input.length()); + 
} + writer.append(input.substring(st, i - 1)); + + writer.append(value); + } + + // skip escape + st = j + 1; + i = st; + } + + if (writer != null) { + writer.append(input.substring(st, len)); + return writer.toString(); + } + return input; + } /** * Replaces the img tags with their alt text. @@ -111,7 +324,7 @@ public static String removeNonTextTagsAndAttributes(String html) { html = m.replaceAll("<$1$2>"); } - html = StringEscapeUtils.unescapeHtml4(html); + html = unescapeHtml(html); return html; } @@ -127,13 +340,13 @@ public static String extractText(String html) { html = replaceImgWithAlt(html); html = safeRemoveAllTags(html); - html = StringEscapeUtils.unescapeHtml4(html); + html = unescapeHtml(html); return html; } private static String clear(String html) { - return StringCleaner.removeExtraSpaces(StringEscapeUtils.unescapeHtml4(unsafeRemoveAllTags(html))); + return StringCleaner.removeExtraSpaces(unescapeHtml(unsafeRemoveAllTags(html))); } /** diff --git a/pom.xml b/pom.xml index 5aa38ceb..43a98fd7 100755 --- a/pom.xml +++ b/pom.xml @@ -107,7 +107,6 @@ 1.6 - 3.4 3.6 1.2 1.7.19 @@ -362,11 +361,6 @@ - - org.apache.commons - commons-lang3 - ${commons-lang-version} - org.apache.commons commons-math3