Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
package us.codecraft.webmagic.downloader;

import org.apache.http.annotation.ThreadSafe;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.selector.PlainText;

import java.io.*;

/**
 * Downloader for pages that need their JavaScript rendered before extraction.
 * <p/>
 * Each download launches an external {@code phantomjs} process that runs the
 * {@code crawl.js} script located at the classpath root and captures what the
 * script prints on standard output as the page content.
 *
 * @author dolphineor@gmail.com
 * @version 0.5.3
 */
@ThreadSafe
public class PhantomJSDownloader extends AbstractDownloader {

    private static final Logger logger = LoggerFactory.getLogger(PhantomJSDownloader.class);

    /** Text printed by crawl.js when PhantomJS could not open the requested URL. */
    private static final String FAILURE_MARKER = "HTTP request failed";

    /** File-system path of the crawl.js script handed to phantomjs. */
    private final String phantomJSPath;

    private int retryNum;
    private int threadNum;

    /**
     * Resolves the location of {@code crawl.js} relative to the classpath root.
     */
    public PhantomJSDownloader() {
        // No trailing space any more: the script path is passed to phantomjs as its own argument.
        this.phantomJSPath = new File(this.getClass().getResource("/").getPath()).getPath()
                + System.getProperty("file.separator") + "crawl.js";
    }

    /**
     * Downloads the page for the given request, retrying up to {@link #getRetryNum()} times
     * when PhantomJS reports a failure or could not be started.
     *
     * @param request the request whose URL is rendered
     * @param task    the task this download belongs to (not used here)
     * @return a page with raw text, URL and status code 200 on success; on failure a page
     *         that only carries the request
     */
    @Override
    public Page download(Request request, Task task) {
        logger.info("downloading page: {}", request.getUrl());

        String content = getPage(request);
        for (int attempt = 1; attempt <= getRetryNum() && isFailed(content); attempt++) {
            content = getPage(request);
        }

        if (isFailed(content)) {
            // All attempts failed: hand back a page without content.
            logger.warn("download page {} failed", request.getUrl());
            Page failedPage = new Page();
            failedPage.setRequest(request);
            return failedPage;
        }

        Page page = new Page();
        page.setRawText(content);
        page.setUrl(new PlainText(request.getUrl()));
        page.setRequest(request);
        page.setStatusCode(200);
        return page;
    }

    @Override
    public void setThread(int threadNum) {
        this.threadNum = threadNum;
    }

    /**
     * Runs {@code phantomjs <crawl.js> <url>} and collects its standard output.
     *
     * @param request the request whose URL is passed to the script
     * @return the output of the script with every line terminated by {@code \n},
     *         or {@code null} when the process could not be started or read
     */
    protected String getPage(Request request) {
        Process process = null;
        BufferedReader reader = null;
        try {
            // Separate arguments: neither the script path nor the URL is split on whitespace.
            process = new ProcessBuilder("phantomjs", phantomJSPath, request.getUrl()).start();
            reader = new BufferedReader(new InputStreamReader(process.getInputStream()));
            StringBuilder output = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                output.append(line).append("\n");
            }
            return output.toString();
        } catch (IOException e) {
            logger.warn("failed to run phantomjs for " + request.getUrl(), e);
            return null;
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing the pipe fails.
                }
            }
            if (process != null) {
                // Release the child process and its remaining pipes.
                process.destroy();
            }
        }
    }

    /**
     * @return the number of additional attempts made after a failed download
     */
    public int getRetryNum() {
        return retryNum;
    }

    /**
     * @param retryNum the number of additional attempts made after a failed download
     * @return this downloader, to allow call chaining
     */
    public PhantomJSDownloader setRetryNum(int retryNum) {
        this.retryNum = retryNum;
        return this;
    }

    /** Tells whether the captured output is missing or carries the failure marker. */
    private static boolean isFailed(String content) {
        return content == null || content.contains(FAILURE_MARKER);
    }
}
17 changes: 17 additions & 0 deletions webmagic-extension/src/main/resources/crawl.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
// PhantomJS script: opens the URL given as the first command-line argument and
// prints either the page content or a failure message to the console.
var cliArgs = require('system').args;
var targetUrl = cliArgs[1];

var webPage = require('webpage').create();
webPage.settings.loadImages = false;
webPage.settings.resourceTimeout = 5000;

// Reports the outcome of the load, then releases the page and ends the process.
function reportAndExit(status) {
    var output = (status == 'success') ? webPage.content : "HTTP request failed!";
    console.log(output);

    webPage.close();
    phantom.exit();
}

webPage.open(targetUrl, reportAndExit);
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
package us.codecraft.webmagic.samples;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.downloader.PhantomJSDownloader;
import us.codecraft.webmagic.pipeline.CollectorPipeline;
import us.codecraft.webmagic.pipeline.ResultItemsCollectorPipeline;
import us.codecraft.webmagic.processor.PageProcessor;

import java.util.List;

/**
 * Created by dolphineor on 2014-11-21.
 * <p/>
 * Sample that uses Taobao as an example: fetches the search results for
 * "winter clothing" through the {@link PhantomJSDownloader} and prints the HTML.
 */
public class PhantomJSPageProcessor implements PageProcessor {

    private final Site site = Site.me()
            .setDomain("s.taobao.com")
            .setCharset("GBK")
            .addHeader("Referer", "http://www.taobao.com/")
            .setRetryTimes(3).setSleepTime(1000);

    /**
     * Stores the raw text of the page under the {@code html} field when there is any.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        if (page.getRawText() != null) {
            page.putField("html", page.getRawText());
        }
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Runs the sample crawl and prints the HTML of the first collected result.
     *
     * @param args ignored
     * @throws Exception if the crawl fails unexpectedly
     */
    public static void main(String[] args) throws Exception {
        PhantomJSDownloader phantomDownloader = new PhantomJSDownloader().setRetryNum(3);

        CollectorPipeline<ResultItems> collectorPipeline = new ResultItemsCollectorPipeline();

        // (cores - 1) * 2 evaluates to 0 on a single-core machine, so use at least one thread.
        // NOTE(review): Spider presumably rejects a thread count of 0 - confirm.
        int threadCount = Math.max(1, (Runtime.getRuntime().availableProcessors() - 1) << 1);

        Spider.create(new PhantomJSPageProcessor())
                // %B6%AC%D7%B0 is the GBK encoding of the search keyword ("winter clothing")
                .addUrl("http://s.taobao.com/search?q=%B6%AC%D7%B0&sort=sale-desc")
                .setDownloader(phantomDownloader)
                .addPipeline(collectorPipeline)
                .thread(threadCount)
                .run();

        List<ResultItems> resultItemsList = collectorPipeline.getCollected();
        if (resultItemsList.isEmpty()) {
            System.out.println("no result was collected");
            return;
        }

        // The html field is only set when the page had raw text (see process()).
        Object html = resultItemsList.get(0).get("html");
        if (html == null) {
            System.out.println("the page could not be downloaded");
            return;
        }
        System.out.println(html.toString());
    }

}
17 changes: 17 additions & 0 deletions webmagic-samples/src/main/resources/crawl.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
// PhantomJS script: loads the URL passed as the first command-line argument and
// writes the page content, or a failure message, to the console.
var cliArgs = require('system').args;
var targetUrl = cliArgs[1];

var webPage = require('webpage').create();
webPage.settings.loadImages = false;
webPage.settings.resourceTimeout = 5000;

// Prints the result of the load, then closes the page and exits PhantomJS.
function reportAndExit(status) {
    var output = (status == 'success') ? webPage.content : "HTTP request failed!";
    console.log(output);

    webPage.close();
    phantom.exit();
}

webPage.open(targetUrl, reportAndExit);