Skip to content

Commit

Permalink
#27 customize http header for downloader
Browse files Browse the repository at this point in the history
  • Loading branch information
code4craft committed Oct 11, 2013
1 parent 1a2c84e commit 16e12e3
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 6 deletions.
32 changes: 29 additions & 3 deletions webmagic-core/src/main/java/us/codecraft/webmagic/Site.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
* Object contains setting for crawler.<br>
*
* @author code4crafter@gmail.com <br>
* @since 0.1.0
* @see us.codecraft.webmagic.processor.PageProcessor
* @since 0.1.0
*/
public class Site {

Expand Down Expand Up @@ -38,6 +38,14 @@ public class Site {

private Set<Integer> acceptStatCode = DEFAULT_STATUS_CODE_SET;

private Map<String,String> headers = new HashMap<String, String>();

/**
 * Names of HTTP request headers that can be passed as the key of
 * {@link Site#addHeader(String, String)}.
 */
public interface HeaderConst {

    /** Name of the HTTP {@code Referer} request header. */
    String REFERER = "Referer";
}


// Seed the default accepted-status set: only HTTP 200 is accepted unless a site overrides it.
static {
DEFAULT_STATUS_CODE_SET.add(200);
}
Expand Down Expand Up @@ -139,10 +147,12 @@ public int getTimeOut() {

/**
* set timeout for downloader in ms
*
* @param timeOut timeout in milliseconds
* @return this
*/
public void setTimeOut(int timeOut) {
public Site setTimeOut(int timeOut) {
this.timeOut = timeOut;
return this;
}

/**
Expand Down Expand Up @@ -216,14 +226,30 @@ public int getSleepTime() {
}

/**
* Get retry times when download fail immediately, 0 by default.<br>
* Get retry times immediately when download fail, 0 by default.<br>
*
* @return retry times when download fail
*/
public int getRetryTimes() {
    return this.retryTimes;
}

/**
 * Get the HTTP headers registered through {@link #addHeader(String, String)}.
 *
 * @return the header map held by this site (the map itself, not a copy), keyed by header name
 */
public Map<String, String> getHeaders() {
    return this.headers;
}

/**
 * Register an HTTP header for the downloader to send with its requests.<br>
 * Adding a header under a key that is already present replaces the previous value.<br>
 * Cookies and the user-agent have dedicated methods: use
 * {@link #addCookie(String, String)} and {@link #setUserAgent(String)} for those.
 *
 * @param key   name of the http header; some common names are defined in {@link HeaderConst}
 * @param value value of the header
 * @return this
 */
public Site addHeader(String key, String value) {
    this.headers.put(key, value);
    return this;
}

/**
* Set retry times when download fail, 0 by default.<br>
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import java.io.IOException;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;


Expand Down Expand Up @@ -66,10 +67,12 @@ public Page download(Request request, Task task) {
int retryTimes = 0;
Set<Integer> acceptStatCode;
String charset = null;
Map<String,String> headers = null;
if (site != null) {
retryTimes = site.getRetryTimes();
acceptStatCode = site.getAcceptStatCode();
charset = site.getCharset();
headers = site.getHeaders();
} else {
acceptStatCode = new HashSet<Integer>();
acceptStatCode.add(200);
Expand All @@ -78,6 +81,11 @@ public Page download(Request request, Task task) {
HttpClient httpClient = HttpClientPool.getInstance(poolSize).getClient(site);
try {
HttpGet httpGet = new HttpGet(request.getUrl());
if (headers!=null){
for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
httpGet.addHeader(headerEntry.getKey(),headerEntry.getValue());
}
}
HttpResponse httpResponse = null;
int tried = 0;
boolean retry;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ private HttpClient generateClient(Site site) {
}
params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, site.getTimeOut());
params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, site.getTimeOut());

params.setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH);
HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params);
paramsBean.setVersion(HttpVersion.HTTP_1_1);
if (site != null && site.getCharset() != null) {
Expand All @@ -73,8 +73,7 @@ private HttpClient generateClient(Site site) {
if (site != null) {
generateCookie(httpClient, site);
}
httpClient.getParams().setIntParameter("http.socket.timeout", 60000);
httpClient.getParams().setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH);

return httpClient;
}

Expand Down

0 comments on commit 16e12e3

Please sign in to comment.