-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Emir Munoz committed Sep 1, 2014 · 1 parent 1518086 · commit 14c230b
Showing 7 changed files with 185 additions and 7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
package org.ki2na.ld4ie; | ||
|
||
import java.io.File; | ||
import java.io.IOException; | ||
|
||
import org.apache.commons.io.FileUtils; | ||
import org.ki2na.ld4ie.extractor.HtmlCleaner; | ||
import org.ki2na.ld4ie.io.HtmlInputReader; | ||
|
||
/** | ||
* @author Emir Munoz (Emir.Munoz@ie.fujitsu.com) | ||
* | ||
*/ | ||
public class TrainSerGenerator | ||
{ | ||
|
||
public static void main(String[] args) throws IOException | ||
{ | ||
HtmlCleaner cleaner = new HtmlCleaner(); | ||
HtmlInputReader reader = new HtmlInputReader("data/train1.html.txt.gz"); | ||
reader.read(); | ||
System.out.println(reader.getCount() + " documents found!"); | ||
|
||
File htmlTemplateFile; | ||
for (int i = 0; i < reader.getCount(); i++) | ||
{ | ||
System.out.println(reader.get(i)); | ||
htmlTemplateFile = new File("./data/trainCorpus3/" + i + ".html"); | ||
FileUtils.writeStringToFile(htmlTemplateFile, cleaner.annotate(reader.get(i).getContent())); | ||
} | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
package org.ki2na.ld4ie.extractor; | ||
|
||
import org.jsoup.Jsoup; | ||
import org.jsoup.nodes.Document; | ||
import org.jsoup.nodes.Element; | ||
import org.jsoup.nodes.Entities.EscapeMode; | ||
import org.jsoup.safety.Cleaner; | ||
import org.jsoup.safety.Whitelist; | ||
import org.jsoup.select.Elements; | ||
|
||
/** | ||
* Cleaner process for HTML pages. Using the JSOUP class <code>WhiteList</code> | ||
* (http://jsoup.org/apidocs/org/jsoup/safety/Whitelist.html) | ||
* | ||
* Based on <a | ||
* href="http://stackoverflow.com/questions/8683018/jsoup-clean-without-adding-html-entities" | ||
* >link</a> | ||
* | ||
* @author Emir Munoz (Emir.Munoz@ie.fujitsu.com) | ||
* @version 0.0.1 | ||
* @since 01/09/2014 | ||
* | ||
*/ | ||
public class HtmlCleaner | ||
{ | ||
|
||
private final Whitelist wl = Whitelist.relaxed(); | ||
|
||
/** | ||
* Class constructor. | ||
*/ | ||
public HtmlCleaner() | ||
{ | ||
String[] tags = new String[] { "title", "span" }; | ||
wl.addTags(tags); | ||
wl.addAttributes(":all", "class"); | ||
} | ||
|
||
public String clean(String html) | ||
{ | ||
// Parser str into a Document | ||
Document doc = Jsoup.parse(html); | ||
// Clean the document | ||
doc = new Cleaner(wl).clean(doc); | ||
// Adjust escape mode | ||
doc.outputSettings().escapeMode(EscapeMode.xhtml); | ||
|
||
// Get back the string of the Document | ||
return doc.html(); | ||
} | ||
|
||
public Document clean2(String html) | ||
{ | ||
// Parser str into a Document | ||
Document doc = Jsoup.parse(html); | ||
// Clean the document | ||
doc = new Cleaner(wl).clean(doc); | ||
|
||
// Get back the string of the Document | ||
return doc; | ||
} | ||
|
||
public String annotate(String html) | ||
{ | ||
// Parser str into a Document | ||
Document doc = clean2(html); | ||
// Visit all nodes and look for properties | ||
Elements vcards = doc.select("[class*=vcard]"); | ||
// add slot attribute | ||
for (Element vcard : vcards) | ||
{ | ||
vcard.attr("slot", "vcard"); | ||
} | ||
|
||
Elements fns = doc.select("[class*=fn]"); | ||
// add slot attribute | ||
for (Element fn : fns) | ||
{ | ||
fn.attr("slot", "fn"); | ||
} | ||
|
||
// Adjust escape mode | ||
doc.outputSettings().escapeMode(EscapeMode.xhtml); | ||
|
||
return doc.html(); | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters