Skip to content

Commit

Permalink
HTML cleaner
Browse files Browse the repository at this point in the history
  • Loading branch information
Emir Munoz committed Sep 1, 2014
1 parent 1518086 commit 14c230b
Show file tree
Hide file tree
Showing 7 changed files with 185 additions and 7 deletions.
6 changes: 6 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,12 @@
<artifactId>htmlcleaner</artifactId>
<version>2.9</version>
</dependency>
<!-- JSOUP -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.7.3</version>
</dependency>
</dependencies>

<build>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,28 @@
import java.io.IOException;

import org.apache.commons.io.FileUtils;
import org.ki2na.ld4ie.extractor.HtmlCleaner;
import org.ki2na.ld4ie.io.HtmlInputReader;

/**
*
* @author Emir Munoz (Emir.Munoz@ie.fujitsu.com)
* @version 0.0.1
* @since 01/09/2014
*
*/
public class App
public class CleanSetGenerator
{

/**
* Main.
*
* @param args
* @throws IOException
*/
public static void main(String[] args) throws IOException
{
HtmlCleaner cleaner = new HtmlCleaner();
HtmlInputReader reader = new HtmlInputReader("data/train1.clean.html.txt.gz");
reader.read();
System.out.println(reader.getCount() + " documents found!");
Expand All @@ -23,8 +34,8 @@ public static void main(String[] args) throws IOException
for (int i = 0; i < reader.getCount(); i++)
{
System.out.println(reader.get(i));
htmlTemplateFile = new File("./data/cleanCorpus/" + i + ".html");
FileUtils.writeStringToFile(htmlTemplateFile, reader.get(i).getContent());
htmlTemplateFile = new File("./data/cleanCorpus2/" + i + ".html");
FileUtils.writeStringToFile(htmlTemplateFile, cleaner.clean(reader.get(i).getContent()));
}
}

Expand Down
33 changes: 33 additions & 0 deletions src/main/java/org/ki2na/ld4ie/TrainSerGenerator.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
package org.ki2na.ld4ie;

import java.io.File;
import java.io.IOException;

import org.apache.commons.io.FileUtils;
import org.ki2na.ld4ie.extractor.HtmlCleaner;
import org.ki2na.ld4ie.io.HtmlInputReader;

/**
 * Command-line entry point that runs every document loaded by
 * {@link HtmlInputReader} through {@link HtmlCleaner#annotate(String)} and
 * writes each result to its own numbered <code>.html</code> file.
 *
 * @author Emir Munoz (Emir.Munoz@ie.fujitsu.com)
 *
 */
public class TrainSerGenerator
{

    /** Input corpus handed to the {@link HtmlInputReader}. */
    private static final String INPUT_FILE = "data/train1.html.txt.gz";

    /** Directory prefix of the generated files; the document index is appended. */
    private static final String OUTPUT_DIR = "./data/trainCorpus3/";

    /**
     * Encoding of the generated files. Stated explicitly because the
     * two-argument <code>writeStringToFile</code> falls back to the platform
     * default charset, which can corrupt non-Latin page content.
     */
    private static final String OUTPUT_ENCODING = "UTF-8";

    /**
     * Main.
     *
     * @param args
     *            not used
     * @throws IOException
     *             declared for the reader and file-writing calls made here
     */
    public static void main(String[] args) throws IOException
    {
        HtmlCleaner cleaner = new HtmlCleaner();
        HtmlInputReader reader = new HtmlInputReader(INPUT_FILE);
        reader.read();
        System.out.println(reader.getCount() + " documents found!");

        for (int i = 0; i < reader.getCount(); i++)
        {
            System.out.println(reader.get(i));
            File htmlTemplateFile = new File(OUTPUT_DIR + i + ".html");
            String annotated = cleaner.annotate(reader.get(i).getContent());
            FileUtils.writeStringToFile(htmlTemplateFile, annotated, OUTPUT_ENCODING);
        }
    }

}
6 changes: 5 additions & 1 deletion src/main/java/org/ki2na/ld4ie/extractor/HCardExtractor.java
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ public class HCardExtractor
// private static final VCard vVCARD = VCard.getInstance();
/** Internal connection used to collect extraction results. */
private RepositoryConnection conn;
/** Graph name */
private String graphName = "<ex:html-mf-hcard> .";

/**
* Constructor.
Expand Down Expand Up @@ -138,7 +140,9 @@ public void showStatements() throws RepositoryException
_log.info("Conn object does not contains elements");

while (statements.hasNext())
System.out.println(statements.next().toString());
{
System.out.println(statements.next().toString().replace(" .\n", graphName));
}
}

/**
Expand Down
88 changes: 88 additions & 0 deletions src/main/java/org/ki2na/ld4ie/extractor/HtmlCleaner.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
package org.ki2na.ld4ie.extractor;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Entities.EscapeMode;
import org.jsoup.safety.Cleaner;
import org.jsoup.safety.Whitelist;
import org.jsoup.select.Elements;

/**
 * Cleaner process for HTML pages, built on the JSOUP class
 * <code>Whitelist</code>
 * (http://jsoup.org/apidocs/org/jsoup/safety/Whitelist.html).
 *
 * The whitelist starts from <code>Whitelist.relaxed()</code> and additionally
 * allows the <code>title</code> and <code>span</code> tags and the
 * <code>class</code> attribute on every tag.
 *
 * Based on <a
 * href="http://stackoverflow.com/questions/8683018/jsoup-clean-without-adding-html-entities"
 * >link</a>
 *
 * @author Emir Munoz (Emir.Munoz@ie.fujitsu.com)
 * @version 0.0.1
 * @since 01/09/2014
 *
 */
public class HtmlCleaner
{

    /** Name of the attribute added by {@link #annotate(String)}. */
    private static final String SLOT_ATTRIBUTE = "slot";

    /** Whitelist applied by every cleaning call; extended once in the constructor. */
    private final Whitelist wl = Whitelist.relaxed();

    /**
     * Class constructor. Extends the relaxed whitelist with the extra tags and
     * the <code>class</code> attribute.
     */
    public HtmlCleaner()
    {
        // NOTE(review): jsoup documents that Cleaner only copies elements from
        // the body of the dirty document, so whitelisting "title" may have no
        // effect for a <title> that lives in <head> -- confirm.
        String[] tags = new String[] { "title", "span" };
        wl.addTags(tags);
        wl.addAttributes(":all", "class");
    }

    /**
     * Cleans the given HTML and serialises the result.
     *
     * @param html
     *            HTML source to clean
     * @return the cleaned document as a string, written with the XHTML escape
     *         mode
     */
    public String clean(String html)
    {
        return toXhtmlString(clean2(html));
    }

    /**
     * Cleans the given HTML and returns the resulting document object.
     *
     * @param html
     *            HTML source to clean
     * @return a new document produced by the whitelist-based cleaner
     */
    public Document clean2(String html)
    {
        // Parse the string into a Document, then keep only whitelisted content
        Document dirty = Jsoup.parse(html);
        return new Cleaner(wl).clean(dirty);
    }

    /**
     * Cleans the given HTML and marks hCard elements with a
     * <code>slot</code> attribute: <code>slot="vcard"</code> on elements having
     * the class <code>vcard</code> and <code>slot="fn"</code> on elements
     * having the class <code>fn</code>. The attribute is added after cleaning,
     * so it is not subject to the whitelist.
     *
     * @param html
     *            HTML source to clean and annotate
     * @return the annotated document as a string, written with the XHTML
     *         escape mode
     */
    public String annotate(String html)
    {
        Document doc = clean2(html);

        // Class-token selectors are used instead of substring matches such as
        // [class*=fn], which would also tag unrelated classes that merely
        // contain the letters "fn" or "vcard".
        // "fn" is applied last, so an element carrying both classes ends up
        // with slot="fn".
        markSlot(doc, ".vcard", "vcard");
        markSlot(doc, ".fn", "fn");

        return toXhtmlString(doc);
    }

    /** Sets the slot attribute to the given value on every element matching the query. */
    private static void markSlot(Document doc, String cssQuery, String slotValue)
    {
        Elements matches = doc.select(cssQuery);
        for (Element match : matches)
        {
            match.attr(SLOT_ATTRIBUTE, slotValue);
        }
    }

    /** Switches the document to the XHTML escape mode and serialises it. */
    private static String toXhtmlString(Document doc)
    {
        doc.outputSettings().escapeMode(EscapeMode.xhtml);
        return doc.html();
    }

}
11 changes: 11 additions & 0 deletions src/main/java/org/ki2na/ld4ie/util/FileUtils.java
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
package org.ki2na.ld4ie.util;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.Scanner;

import org.apache.commons.io.IOUtils;

/**
* Helper class for file utils.
*
Expand Down Expand Up @@ -35,4 +38,12 @@ public static String readFile(String path, Charset encoding) throws IOException
return new Scanner(new File(path), "UTF-8").useDelimiter("\\A").next();
}

/**
 * Reads the whole file at the given path into a string.
 *
 * @param path
 *            path of the file to read
 * @param encoding
 *            charset used to decode the file; <code>null</code> falls back to
 *            the platform default
 * @return the decoded file content
 * @throws IOException
 *             if the file cannot be opened or read
 */
public static String readFile2(String path, Charset encoding) throws IOException
{
    // Pass the charset by name: IOUtils treats a null name as "platform default".
    String charsetName = (encoding == null) ? null : encoding.name();
    FileInputStream is = new FileInputStream(path);
    try
    {
        return IOUtils.toString(is, charsetName);
    }
    finally
    {
        // Release the file handle even when reading fails.
        is.close();
    }
}

}
31 changes: 28 additions & 3 deletions src/test/java/org/ki2na/ld4ie/extractor/TestHCardExtractor.java
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
package org.ki2na.ld4ie.extractor;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.charset.Charset;

import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;
import org.ki2na.ld4ie.io.HtmlInputReader;
import org.ki2na.ld4ie.util.FileUtils;
import org.openrdf.repository.RepositoryException;
import org.openrdf.sail.SailException;

Expand Down Expand Up @@ -36,7 +41,8 @@ public void setUp() throws SailException, RepositoryException, IOException
}

@Test
public void extract() throws RepositoryException
@Ignore
public void extractFromCorpus() throws RepositoryException
{
// HCardExtractor hCard = new HCardExtractor();

Expand All @@ -53,7 +59,9 @@ public void extract() throws RepositoryException
// hCard.extract(content, baseURI);

// test extraction for first document
int index = 2;
int index = 0;
System.out.println(String.format("%s %s", reader.get(index).getContent(), reader.get(index).getURI()));

hCard.extract(reader.get(index).getContent(), reader.get(index).getURI());

if (hCard.isModelEmpty())
Expand All @@ -65,5 +73,22 @@ public void extract() throws RepositoryException
// close
hCard.tearDown();
}


/**
 * Reads one generated corpus file, runs the hCard extraction on it and
 * prints the resulting model as N-Quads. The extractor is torn down even
 * when reading or extraction fails.
 */
@Test
public void extractFromFile() throws URISyntaxException, IOException, RepositoryException
{
    try
    {
        String content = FileUtils.readFile2("data/trainCorpus3/0.html", Charset.forName("UTF-8"));
        System.out.println(content);
        hCard.extract(content, new URI("http://0016304756.blogspot.ru/"));

        if (hCard.isModelEmpty())
        {
            System.out.println("model is empty");
        }

        System.out.println(hCard.dumpModelToNQuads());
    }
    finally
    {
        // close
        hCard.tearDown();
    }
}

}

0 comments on commit 14c230b

Please sign in to comment.