Skip to content

Commit

Permalink
Removing commons lang from dependencies.
Browse files Browse the repository at this point in the history
  • Loading branch information
datumbox committed Mar 20, 2016
1 parent ac488ee commit de1ecb3
Show file tree
Hide file tree
Showing 10 changed files with 229 additions and 43 deletions.
4 changes: 2 additions & 2 deletions CHANGELOG.md
@@ -1,10 +1,10 @@
CHANGELOG
=========

Version 0.7.1-SNAPSHOT - Build 20160319
Version 0.7.1-SNAPSHOT - Build 20160320
---------------------------------------

- Add changes here.
- Removing Apache Commons Lang from the dependencies. Adding a faster custom unescapeHtml method in HTMLParser.

Version 0.7.0 - Build 20160319
------------------------------
Expand Down
5 changes: 0 additions & 5 deletions NOTICE
Expand Up @@ -4,11 +4,6 @@ Copyright (C) 2013 Vasilis Vryniotis <bbriniotis@datumbox.com>

The following libraries are included in packaged versions of this project:

* Apache Commons Lang
* COPYRIGHT: Copyright 2001 The Apache Software Foundation
* LICENSE: http://www.apache.org/licenses/LICENSE-2.0.txt (Apache License, Version 2.0)
* HOMEPAGE: https://commons.apache.org/proper/commons-lang/

* Apache Commons Math
* COPYRIGHT: Copyright 2003 The Apache Software Foundation
* LICENSE: http://www.apache.org/licenses/LICENSE-2.0.txt (Apache License, Version 2.0)
Expand Down
2 changes: 1 addition & 1 deletion README.md
Expand Up @@ -16,7 +16,7 @@ The code is licensed under the [Apache License, Version 2.0](./LICENSE).
Version
-------

The latest version is 0.7.1-SNAPSHOT (Build 20160319).
The latest version is 0.7.1-SNAPSHOT (Build 20160320).

The [master branch](https://github.com/datumbox/datumbox-framework/tree/master) is the latest stable version of the framework. The [devel branch](https://github.com/datumbox/datumbox-framework/tree/devel) is the development branch. All the previous stable versions are marked with [tags](https://github.com/datumbox/datumbox-framework/releases).

Expand Down
Expand Up @@ -17,7 +17,6 @@

import com.datumbox.framework.common.Configuration;
import com.datumbox.framework.tests.abstracts.AbstractTest;
import org.apache.commons.lang3.StringUtils;
import org.junit.Test;

import java.io.IOException;
Expand Down Expand Up @@ -50,7 +49,12 @@ public void testExtract() {
String text;
try {
List<String> lines = Files.readAllLines(Paths.get(this.getClass().getClassLoader().getResource("datasets/example.com.html").toURI()), StandardCharsets.UTF_8);
text = StringUtils.join(lines, "\r\n");
StringBuilder sb = new StringBuilder();
for(String line: lines){
sb.append(line);
sb.append("\r\n");
}
text = sb.toString().trim();
}
catch(IOException | URISyntaxException ex) {
throw new RuntimeException(ex);
Expand Down
4 changes: 0 additions & 4 deletions datumbox-framework-common/pom.xml
Expand Up @@ -35,10 +35,6 @@
</properties>

<dependencies>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-math3</artifactId>
Expand Down
Expand Up @@ -15,8 +15,6 @@
*/
package com.datumbox.framework.common.utilities;

import org.apache.commons.lang3.builder.ToStringBuilder;

import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
Expand Down Expand Up @@ -248,17 +246,6 @@ public static <T> void shuffle(T[] array) {
}
}

/**
* Returns the contexts of an Objects in a human readable format.
*
* @param <T>
* @param object
* @return
*/
public static <T> String var_export(T object) {
return ToStringBuilder.reflectionToString(object);
}

/**
* Sorts an array in ascending order and returns an array with indexes of
* the original order.
Expand Down
4 changes: 0 additions & 4 deletions datumbox-framework-core/pom.xml
Expand Up @@ -35,10 +35,6 @@
</properties>

<dependencies>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-math3</artifactId>
Expand Down
Expand Up @@ -20,6 +20,7 @@
import com.datumbox.framework.common.dataobjects.FlatDataList;
import com.datumbox.framework.common.interfaces.Trainable;
import com.datumbox.framework.common.utilities.PHPMethods;
import com.datumbox.framework.core.machinelearning.common.abstracts.AbstractTrainer;
import com.datumbox.framework.core.machinelearning.common.abstracts.modelers.AbstractModeler;
import com.datumbox.framework.core.machinelearning.common.interfaces.ModelParameters;
import com.datumbox.framework.core.machinelearning.common.interfaces.TrainingParameters;
Expand Down Expand Up @@ -122,7 +123,7 @@ public VM kFoldCrossValidation(Dataframe dataset, int k, String dbName, Configur


Dataframe trainingData = dataset.getSubset(foldTrainingIds);
modeler.fit(trainingData, trainingParameters);
modeler.fit(trainingData, (AbstractTrainer.AbstractTrainingParameters) trainingParameters);
trainingData.delete();
//trainingData = null;

Expand Down
Expand Up @@ -16,7 +16,6 @@
package com.datumbox.framework.core.utilities.text.parsers;

import com.datumbox.framework.common.utilities.StringCleaner;
import org.apache.commons.lang3.StringEscapeUtils;

import java.util.*;
import java.util.regex.Matcher;
Expand All @@ -35,7 +34,221 @@ public class HTMLParser {
// Matches a <title ...>...</title> element; group 1 is the text between the tags.
// DOTALL lets the content span several lines and the match ignores tag case.
private static final Pattern TITLE_PATTERN = Pattern.compile("<title[^>]*>(.*?)</title>", Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
// Matches an <a ... href="...">...</a> element with a single- or double-quoted href;
// group 1 is the href value and group 2 is the content between the opening and closing tag.
private static final Pattern HYPERLINK_PATTERN = Pattern.compile("<[\\s]*a[^>]*href[\\s]*=[\\s]*[\\\"']([^\\\"']*)[\\\"'][^>]*>(.*?)</a>", Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
// Matches a <meta> tag whose name attribute appears before its content attribute;
// group 1 is the name value and group 2 is the content value.
private static final Pattern METATAG_PATTERN = Pattern.compile("<[\\s]*meta[^>]*name[\\s]*=[\\s]*[\\\"']([^\\\"']*)[\\\"'][^>]*content[\\s]*=[\\s]*[\\\"']([^\\\"']*)[\\\"'][^>]*>", Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
private static final Pattern HX_PATTERN = Pattern.compile("<[\\s]*(H[1-6])[^>]*?>(.*?)</\\1>", Pattern.DOTALL|Pattern.CASE_INSENSITIVE);
private static final Pattern HX_PATTERN = Pattern.compile("<[\\s]*(H[1-6])[^>]*?>(.*?)</\\1>", Pattern.DOTALL|Pattern.CASE_INSENSITIVE);

// Table of supported named character references. Each row is {character, entity name},
// where the entity name is written without the leading '&' and the trailing ';'.
// Covers the four markup-significant characters plus the ISO-8859-1 (Latin-1) range.
private static final String[][] ESCAPES = {
{"\"", "quot"}, // " - double-quote
{"&", "amp"}, // & - ampersand
{"<", "lt"}, // < - less-than
{">", "gt"}, // > - greater-than

// Mapping to escape ISO-8859-1 characters to their named HTML 3.x equivalents.
{"\u00A0", "nbsp"}, // non-breaking space
{"\u00A1", "iexcl"}, // inverted exclamation mark
{"\u00A2", "cent"}, // cent sign
{"\u00A3", "pound"}, // pound sign
{"\u00A4", "curren"}, // currency sign
{"\u00A5", "yen"}, // yen sign = yuan sign
{"\u00A6", "brvbar"}, // broken bar = broken vertical bar
{"\u00A7", "sect"}, // section sign
{"\u00A8", "uml"}, // diaeresis = spacing diaeresis
{"\u00A9", "copy"}, // © - copyright sign
{"\u00AA", "ordf"}, // feminine ordinal indicator
{"\u00AB", "laquo"}, // left-pointing double angle quotation mark = left pointing guillemet
{"\u00AC", "not"}, // not sign
{"\u00AD", "shy"}, // soft hyphen = discretionary hyphen
{"\u00AE", "reg"}, // ® - registered trademark sign
{"\u00AF", "macr"}, // macron = spacing macron = overline = APL overbar
{"\u00B0", "deg"}, // degree sign
{"\u00B1", "plusmn"}, // plus-minus sign = plus-or-minus sign
{"\u00B2", "sup2"}, // superscript two = superscript digit two = squared
{"\u00B3", "sup3"}, // superscript three = superscript digit three = cubed
{"\u00B4", "acute"}, // acute accent = spacing acute
{"\u00B5", "micro"}, // micro sign
{"\u00B6", "para"}, // pilcrow sign = paragraph sign
{"\u00B7", "middot"}, // middle dot = Georgian comma = Greek middle dot
{"\u00B8", "cedil"}, // cedilla = spacing cedilla
{"\u00B9", "sup1"}, // superscript one = superscript digit one
{"\u00BA", "ordm"}, // masculine ordinal indicator
{"\u00BB", "raquo"}, // right-pointing double angle quotation mark = right pointing guillemet
{"\u00BC", "frac14"}, // vulgar fraction one quarter = fraction one quarter
{"\u00BD", "frac12"}, // vulgar fraction one half = fraction one half
{"\u00BE", "frac34"}, // vulgar fraction three quarters = fraction three quarters
{"\u00BF", "iquest"}, // inverted question mark = turned question mark
{"\u00C0", "Agrave"}, // À - uppercase A, grave accent
{"\u00C1", "Aacute"}, // Á - uppercase A, acute accent
{"\u00C2", "Acirc"}, // Â - uppercase A, circumflex accent
{"\u00C3", "Atilde"}, // Ã - uppercase A, tilde
{"\u00C4", "Auml"}, // Ä - uppercase A, umlaut
{"\u00C5", "Aring"}, // Å - uppercase A, ring
{"\u00C6", "AElig"}, // Æ - uppercase AE
{"\u00C7", "Ccedil"}, // Ç - uppercase C, cedilla
{"\u00C8", "Egrave"}, // È - uppercase E, grave accent
{"\u00C9", "Eacute"}, // É - uppercase E, acute accent
{"\u00CA", "Ecirc"}, // Ê - uppercase E, circumflex accent
{"\u00CB", "Euml"}, // Ë - uppercase E, umlaut
{"\u00CC", "Igrave"}, // Ì - uppercase I, grave accent
{"\u00CD", "Iacute"}, // Í - uppercase I, acute accent
{"\u00CE", "Icirc"}, // Î - uppercase I, circumflex accent
{"\u00CF", "Iuml"}, // Ï - uppercase I, umlaut
{"\u00D0", "ETH"}, // Ð - uppercase Eth, Icelandic
{"\u00D1", "Ntilde"}, // Ñ - uppercase N, tilde
{"\u00D2", "Ograve"}, // Ò - uppercase O, grave accent
{"\u00D3", "Oacute"}, // Ó - uppercase O, acute accent
{"\u00D4", "Ocirc"}, // Ô - uppercase O, circumflex accent
{"\u00D5", "Otilde"}, // Õ - uppercase O, tilde
{"\u00D6", "Ouml"}, // Ö - uppercase O, umlaut
{"\u00D7", "times"}, // multiplication sign
{"\u00D8", "Oslash"}, // Ø - uppercase O, slash
{"\u00D9", "Ugrave"}, // Ù - uppercase U, grave accent
{"\u00DA", "Uacute"}, // Ú - uppercase U, acute accent
{"\u00DB", "Ucirc"}, // Û - uppercase U, circumflex accent
{"\u00DC", "Uuml"}, // Ü - uppercase U, umlaut
{"\u00DD", "Yacute"}, // Ý - uppercase Y, acute accent
{"\u00DE", "THORN"}, // Þ - uppercase THORN, Icelandic
{"\u00DF", "szlig"}, // ß - lowercase sharp s, German
{"\u00E0", "agrave"}, // à - lowercase a, grave accent
{"\u00E1", "aacute"}, // á - lowercase a, acute accent
{"\u00E2", "acirc"}, // â - lowercase a, circumflex accent
{"\u00E3", "atilde"}, // ã - lowercase a, tilde
{"\u00E4", "auml"}, // ä - lowercase a, umlaut
{"\u00E5", "aring"}, // å - lowercase a, ring
{"\u00E6", "aelig"}, // æ - lowercase ae
{"\u00E7", "ccedil"}, // ç - lowercase c, cedilla
{"\u00E8", "egrave"}, // è - lowercase e, grave accent
{"\u00E9", "eacute"}, // é - lowercase e, acute accent
{"\u00EA", "ecirc"}, // ê - lowercase e, circumflex accent
{"\u00EB", "euml"}, // ë - lowercase e, umlaut
{"\u00EC", "igrave"}, // ì - lowercase i, grave accent
{"\u00ED", "iacute"}, // í - lowercase i, acute accent
{"\u00EE", "icirc"}, // î - lowercase i, circumflex accent
{"\u00EF", "iuml"}, // ï - lowercase i, umlaut
{"\u00F0", "eth"}, // ð - lowercase eth, Icelandic
{"\u00F1", "ntilde"}, // ñ - lowercase n, tilde
{"\u00F2", "ograve"}, // ò - lowercase o, grave accent
{"\u00F3", "oacute"}, // ó - lowercase o, acute accent
{"\u00F4", "ocirc"}, // ô - lowercase o, circumflex accent
{"\u00F5", "otilde"}, // õ - lowercase o, tilde
{"\u00F6", "ouml"}, // ö - lowercase o, umlaut
{"\u00F7", "divide"}, // division sign
{"\u00F8", "oslash"}, // ø - lowercase o, slash
{"\u00F9", "ugrave"}, // ù - lowercase u, grave accent
{"\u00FA", "uacute"}, // ú - lowercase u, acute accent
{"\u00FB", "ucirc"}, // û - lowercase u, circumflex accent
{"\u00FC", "uuml"}, // ü - lowercase u, umlaut
{"\u00FD", "yacute"}, // ý - lowercase y, acute accent
{"\u00FE", "thorn"}, // þ - lowercase thorn, Icelandic
{"\u00FF", "yuml"}, // ÿ - lowercase y, umlaut
};

// Bounds on how many characters may sit between '&' and ';' for the span
// to be treated as a candidate character reference.
private static final int MIN_ESCAPE = 2;
private static final int MAX_ESCAPE = 6;

// Reverse index of ESCAPES: entity name (without '&' and ';') -> the text it decodes to.
private static final HashMap<String, CharSequence> LOOKUP_MAP = new HashMap<>();
static {
    for (int row = 0; row < ESCAPES.length; row++) {
        final String[] pair = ESCAPES[row];
        // pair[0] is the decoded character, pair[1] the entity name used as the key.
        LOOKUP_MAP.put(pair[1], pair[0]);
    }
}

/**
 * Longest numeric reference body accepted between '&' and ';'. Eight characters
 * cover the largest Unicode code point in both notations ("#x10FFFF", "#1114111").
 */
private static final int MAX_NUMERIC_ESCAPE = 8;

/**
 * Decodes HTML character references found in a string.
 *
 * Two kinds of references are recognised:
 * <ul>
 * <li>named references listed in the lookup table (e.g. {@code &amp;}, {@code &nbsp;});</li>
 * <li>numeric references in decimal ({@code &#169;}) or hexadecimal ({@code &#xA9;})
 * notation that denote a valid Unicode code point.</li>
 * </ul>
 * Anything that looks like a reference but cannot be decoded (unknown name, missing
 * semicolon, malformed or out-of-range number) is copied to the output unchanged.
 *
 * Modified version of http://stackoverflow.com/questions/994331/java-how-to-decode-html-character-entities-in-java-like-httputility-htmldecode/24575417#24575417.
 *
 * @param input the text to decode; must not be null
 * @return the decoded text, or the very same {@code input} instance when nothing was decoded
 * @throws NullPointerException if {@code input} is null
 */
private static String unescapeHtml(final String input) {
    final int len = input.length();
    StringBuilder writer = null; // allocated lazily so unchanged input costs no copy
    int st = 0;                  // start of the literal run not yet copied to the writer
    int i = 1;                   // candidate index of the first character AFTER an '&'
    while (true) {
        // look for '&'
        while (i < len && input.charAt(i - 1) != '&') {
            i++;
        }
        if (i >= len) {
            break;
        }

        // found '&', look for ';' within the length allowed for this kind of reference
        final boolean numeric = input.charAt(i) == '#';
        final int maxLen = numeric ? MAX_NUMERIC_ESCAPE : MAX_ESCAPE;
        int j = i;
        while (j < len && j < i + maxLen + 1 && input.charAt(j) != ';') {
            j++;
        }
        if (j == len || j < i + MIN_ESCAPE || j == i + maxLen + 1) {
            // no terminator, or the body is too short/long: not a reference
            i++;
            continue;
        }

        if (numeric) {
            final int codePoint = parseNumericEntity(input, i + 1, j);
            if (codePoint < 0) {
                i++;
                continue;
            }
            if (writer == null) {
                writer = new StringBuilder(len);
            }
            writer.append(input, st, i - 1);
            // appendCodePoint emits the character itself (a surrogate pair when needed);
            // append(int) would write the number's decimal digits instead.
            writer.appendCodePoint(codePoint);
        }
        else {
            final CharSequence value = LOOKUP_MAP.get(input.substring(i, j));
            if (value == null) {
                i++;
                continue;
            }
            if (writer == null) {
                writer = new StringBuilder(len);
            }
            writer.append(input, st, i - 1);
            writer.append(value);
        }

        // skip escape
        st = j + 1;
        i = st;
    }

    if (writer != null) {
        writer.append(input, st, len);
        return writer.toString();
    }
    return input;
}

/**
 * Parses the digits of a numeric character reference.
 *
 * @param input the text containing the reference
 * @param from index of the first character after '#'; must be smaller than {@code to}
 * @param to index of the terminating ';'
 * @return the referenced code point, or -1 when the digits are malformed or do not
 *         denote a valid Unicode code point (negative values included)
 */
private static int parseNumericEntity(final String input, final int from, final int to) {
    int start = from;
    int radix = 10;
    final char first = input.charAt(start);
    if (first == 'x' || first == 'X') {
        start++;
        radix = 16;
    }
    try {
        final int value = Integer.parseInt(input.substring(start, to), radix);
        return Character.isValidCodePoint(value) ? value : -1;
    }
    catch (NumberFormatException ex) {
        // not a number (e.g. "&#x;" or "&#abc;"): the caller keeps the text as-is
        return -1;
    }
}

/**
* Replaces the img tags with their alt text.
Expand Down Expand Up @@ -111,7 +324,7 @@ public static String removeNonTextTagsAndAttributes(String html) {
html = m.replaceAll("<$1$2>");
}

html = StringEscapeUtils.unescapeHtml4(html);
html = unescapeHtml(html);

return html;
}
Expand All @@ -127,13 +340,13 @@ public static String extractText(String html) {
html = replaceImgWithAlt(html);
html = safeRemoveAllTags(html);

html = StringEscapeUtils.unescapeHtml4(html);
html = unescapeHtml(html);

return html;
}

private static String clear(String html) {
return StringCleaner.removeExtraSpaces(StringEscapeUtils.unescapeHtml4(unsafeRemoveAllTags(html)));
return StringCleaner.removeExtraSpaces(unescapeHtml(unsafeRemoveAllTags(html)));
}

/**
Expand Down
6 changes: 0 additions & 6 deletions pom.xml
Expand Up @@ -107,7 +107,6 @@
<gpg-plugin-version>1.6</gpg-plugin-version>

<!-- Code Dependencies -->
<commons-lang-version>3.4</commons-lang-version>
<commons-math-version>3.6</commons-math-version>
<commons-csv-version>1.2</commons-csv-version>
<slf4j-api-version>1.7.19</slf4j-api-version>
Expand Down Expand Up @@ -362,11 +361,6 @@

<dependencyManagement>
<dependencies>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>${commons-lang-version}</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-math3</artifactId>
Expand Down

0 comments on commit de1ecb3

Please sign in to comment.