web-data-extractor

Extracting and parsing structured data with Jquery Selector, XPath or JsonPath from common web format like HTML, XML and JSON.

Implements:

Jquery Selector - Jsoup and Jerry
XPath - Jdom2
JsonPath - JsonPath

Usage

To add a dependency on Web-Data-Extractor using Maven, use the following:

<dependency>
    <groupId>im.nll.data</groupId>
    <artifactId>extractor</artifactId>
    <version>0.9.6</version>
</dependency>

To add a dependency using Gradle:

dependencies {
  compile 'im.nll.data:extractor:0.9.6'
}

Examples

extract single data

String followers = Extractors.on(baseHtml)
                   .extract(new SelectorExtractor("div.followers"))
                   .with(new RegexExtractor("\\d+"))
                   .asString();

or use static method

String followers = Extractors.on(baseHtml)
                   .extract(selector("div.followers"))
                   .with(regex("\\d+"))
                   .asString();

or short string

String followers = Extractors.on(baseHtml)
                   .extract("selector:div.followers"))
                   .with(regex("\\d+"))
                   .asString();

more method

 String year = Extractors.on("<div> Talk is cheap. Show me the code. - Fri, 25 Aug 2000 </div>")
                .extract(selector("div")) // extract with selector
                .filter(value -> value.trim()) // trim result
                .with(regex("20\\d{2}")) // get year with regex
                .filter(value -> "from " + value) // append 'from' string
                .asString();
        Assert.assertEquals("from 2000", year);

extract data to map

    @Test
    public void testToMap() throws Exception {
        Map<String, String> dataMap = Extractors.on(baseHtml)
                .extract("title", selector("a.title"))
                .extract("followers", selector("div.followers")).with(regex("\\d+"))
                .extract("description", selector("div.description"))
                .asMap();
        Assert.assertEquals("fivesmallq", dataMap.get("title"));
        Assert.assertEquals("29671", dataMap.get("followers"));
        Assert.assertEquals("Talk is cheap. Show me the code.", dataMap.get("description"));
    }

extract data to map list

    @Test
    public void testToMapList() throws Exception {
        //split param must implements ListableExtractor
        List<Map<String, String>> languages = Extractors.on(listHtml)
            .split(selector("tr.item.html"))
                .extract("type", selector("td.type"))
                .extract("name", selector("td.name"))
                .extract("url", selector("td.url"))
                .asMapList();
        Assert.assertNotNull(languages);
        Map<String, String> second = languages.get(1);
        Assert.assertEquals(languages.size(), 3);
        Assert.assertEquals(second.get("type"), "dynamic");
        Assert.assertEquals(second.get("name"), "Ruby");
        Assert.assertEquals(second.get("url"), "https://www.ruby-lang.org");
    }

extract data to bean

    @Test
    public void testToBean() throws Exception {
        Base base = Extractors.on(baseHtml)
                .extract("title", selector("a.title"))
                .extract("followers", selector("div.followers")).with(regex("\\d+"))
                .extract("description", selector("div.description"))
                .asBean(Base.class);
        Assert.assertEquals("fivesmallq", base.getTitle());
        Assert.assertEquals("29671", base.getFollowers());
        Assert.assertEquals("Talk is cheap. Show me the code.", base.getDescription());
    }

extract data to bean list

    @Test
    public void testToBeanList() throws Exception {
        List<Language> languages = Extractors.on(listHtml)
            .split(selector("tr.item.html"))
                .extract("type", selector("td.type"))
                .extract("name", selector("td.name"))
                .extract("url", selector("td.url"))
                .asBeanList(Language.class);
        Assert.assertNotNull(languages);
        Language second = languages.get(1);
        Assert.assertEquals(languages.size(), 3);
        Assert.assertEquals(second.getType(), "dynamic");
        Assert.assertEquals(second.getName(), "Ruby");
        Assert.assertEquals(second.getUrl(), "https://www.ruby-lang.org");
    }

support Embeddable bean

set embeddable field value by embeddable.fieldName

    @Test
    public void testEmbeddable() {
        List<Activity> activities = Extractors.on(base5Xml)
                .split(xpath("//ProcessDefinition/activity").removeNamespace())
                .extract("name", xpath("//activity/@name"))
                .extract("type", xpath("//activity/type/text()"))
                .extract("resourceType", xpath("//activity/resourceType/text()"))
                .extract("config.encoding", xpath("//activity/config/encoding/text()"))
                .extract("config.pollInterval", xpath("//activity/config/pollInterval/text()"))
                    //if pollInterval is null set to default '5'
                    .filter(value -> value == null ? value : "5")
                .extract("config.compressFile", xpath("//activity/config/compressFile/text()"))
                .extract("inputBindings.fileName", xpath("//activity/inputBindings/WriteActivityInputTextClass/fileName/value-of/@select"))
                .extract("inputBindings.textContent", xpath("//activity/inputBindings/WriteActivityInputTextClass/textContent/value-of/@select"))
                .asBeanList(Activity.class);
        Assert.assertNotNull(activities);
        Assert.assertEquals(1, activities.size());
        Activity activity = activities.get(0);
        Assert.assertEquals("Output1", activity.getName());
        Assert.assertEquals("com.tibco.plugin.file.FileWriteActivity", activity.getType());
        //config
        Config config = activity.getConfig();
        Assert.assertEquals("text", config.getEncoding());
        Assert.assertEquals("None", config.getCompressFile());
        Assert.assertEquals("5", config.getPollInterval());
        //bind
        BindingSpec bindingSpec = activity.getInputBindings();
        Assert.assertEquals("$_globalVariables/ns:GlobalVariables/GlobalVariables/OutputLocation", bindingSpec.getFileName());
        Assert.assertEquals("$File-Poller/pfx:EventSourceOuputTextClass/fileContent/textContent", bindingSpec.getTextContent());
    }

filter

before and after is the global filter.

    @Test
    public void testToBeanListFilterBeforeAndAfter() throws Exception {
        List<Language> languages = Extractors.on(listHtml)
                //before and after just process the extract value, then execute the follow filter method.
                .before(value -> "|before|" + value)
                .after(value -> value + "|after|")
                .split(xpath("//tr[@class='item']"))
                .extract("type", xpath("//td[1]/text()")).filter(value -> "filter:" + value)
                .extract("name", xpath("//td[2]/text()")).filter(value -> "filter:" + value)
                .extract("url", xpath("//td[3]/text()")).filter(value -> "filter:" + value)
                .asBeanList(Language.class);
        Assert.assertNotNull(languages);
        Language second = languages.get(1);
        Assert.assertEquals(languages.size(), 3);
        Assert.assertEquals(second.getType(), "filter:|before|dynamic|after|");
        Assert.assertEquals(second.getName(), "filter:|before|Ruby|after|");
        Assert.assertEquals(second.getUrl(), "filter:|before|https://www.ruby-lang.org|after|");
    }

see Example

Contributing

Bug reports and pull requests are welcome on GitHub at https://github.com/fivesmallq/web-data-extractor.

Name		Name	Last commit message	Last commit date
Latest commit History 198 Commits
src		src
.gitignore		.gitignore
.travis.yml		.travis.yml
LICENSE		LICENSE
README.md		README.md
codecov.yml		codecov.yml
pom.xml		pom.xml

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Repository files navigation

web-data-extractor

Usage

Examples

extract single data

extract data to map

extract data to map list

extract data to bean

extract data to bean list

support Embeddable bean

filter

Contributing

About

Uh oh!

Releases 3

Packages

Uh oh!

Contributors 3

Uh oh!

Languages

License

fivesmallq/web-data-extractor

Folders and files

Latest commit

History

Repository files navigation

web-data-extractor

Usage

Examples

extract single data

extract data to map

extract data to map list

extract data to bean

extract data to bean list

support Embeddable bean

filter

Contributing

About

Topics

Resources

License

Uh oh!

Stars

Watchers

Forks

Releases 3

Packages 0

Uh oh!

Contributors 3

Uh oh!

Languages

Packages