/
HTMLParser.scala
36 lines (28 loc) · 1012 Bytes
/
HTMLParser.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
package biz.neumann
/**
* AN-iT
*
* User: Andreas Neumann
* Mail: andreas@neumann.biz
* URL: http://www.an-it.com
* Date: 31.10.11
* Time: 19:52
* Package: biz.neumann
*/
import xml.NodeSeq
import java.net.URL
import java.io.{FileInputStream, File, InputStream}
/**
 * Parses HTML into a Scala XML [[scala.xml.NodeSeq]] by feeding the input through
 * the TagSoup SAX parser and a `NoBindingFactoryAdapter`.
 *
 * Source of the approach: http://www.hars.de/2009/01/html-as-xml-in-scala.html
 *
 * NOTE(review): `parser` and `adapter` are created once per instance and reused by
 * every call; presumably an instance should not be shared between threads — confirm.
 */
class HTMLParser {
  val parserFactory = new org.ccil.cowan.tagsoup.jaxp.SAXFactoryImpl
  val parser = parserFactory.newSAXParser()
  val adapter = new scala.xml.parsing.NoBindingFactoryAdapter

  /**
   * Parses the HTML document stored in `file`.
   *
   * The file stream opened here is always closed before returning, whether or not
   * parsing succeeds.
   *
   * @param file the file to read
   * @return the parsed document
   * @throws java.io.FileNotFoundException if the file cannot be opened for reading
   */
  def fromFile(file: File): NodeSeq = parseAndClose(new FileInputStream(file))

  /**
   * Parses the HTML document stored at the file system path `file`.
   *
   * @param file path of the file to read
   * @return the parsed document
   * @throws java.io.FileNotFoundException if the file cannot be opened for reading
   */
  def fromFile(file: String): NodeSeq = fromFile(new File(file))

  /**
   * Fetches `url` and parses the response body as HTML.
   *
   * The stream opened here is always closed before returning, whether or not
   * parsing succeeds.
   *
   * @param url the URL to fetch, in string form
   * @return the parsed document
   * @throws java.net.MalformedURLException if `url` is not a valid URL
   * @throws java.io.IOException if the connection cannot be opened
   */
  def fromURL(url: String): NodeSeq = parseAndClose(new URL(url).openStream())

  /**
   * Parses HTML from an already opened stream.
   *
   * The stream is NOT closed by this method; it stays the caller's responsibility.
   *
   * @param stream the byte stream to parse
   * @return the parsed document
   */
  def fromStream(stream: InputStream): NodeSeq = {
    val source = new org.xml.sax.InputSource(stream)
    // The encoding is forced to UTF-8 for every input.
    source.setEncoding("UTF-8")
    adapter.loadXML(source, parser)
  }

  /**
   * Parses a stream that this class opened itself and closes it afterwards, so the
   * underlying file handle or connection is released even when parsing throws.
   */
  private def parseAndClose(stream: InputStream): NodeSeq =
    try fromStream(stream)
    finally stream.close()
}