Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file modified webmagic-scripts/README.md
100644 → 100755
Empty file.
Empty file modified webmagic-scripts/deploy.sh
100644 → 100755
Empty file.
4 changes: 4 additions & 0 deletions webmagic-scripts/pom.xml
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@
<artifactId>jruby</artifactId>
<version>1.7.6</version>
</dependency>
<dependency><groupId>org.python</groupId>
<artifactId>jython</artifactId>
<version>2.5.3</version>
</dependency>
<dependency>
<groupId>commons-cli</groupId>
<artifactId>commons-cli</artifactId>
Expand Down
4 changes: 3 additions & 1 deletion webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/Language.java
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@ public enum Language {

JavaScript("javascript","js/defines.js",""),

JRuby("jruby","ruby/defines.rb","");
JRuby("jruby","ruby/defines.rb",""),

Jython("jython","python/defines.py","");

private String engineName;

Expand Down
Empty file.
Empty file.
48 changes: 34 additions & 14 deletions webmagic-scripts/src/main/java/us/codecraft/webmagic/scripts/ScriptProcessor.java
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
package us.codecraft.webmagic.scripts;

import org.apache.commons.io.IOUtils;
import org.jruby.RubyHash;
import org.python.core.PyDictionary;
import sun.org.mozilla.javascript.internal.NativeObject;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
Expand All @@ -10,6 +13,8 @@
import javax.script.ScriptException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.Map;

/**
* @author code4crafter@gmail.com
Expand Down Expand Up @@ -50,20 +55,34 @@ public void process(Page page) {
context.setAttribute("page", page, ScriptContext.ENGINE_SCOPE);
context.setAttribute("config", site, ScriptContext.ENGINE_SCOPE);
try {
engine.eval(defines + "\n" + script, context);
// switch (language) {
// case JavaScript:
// NativeObject o = (NativeObject) engine.get("result");
// if (o != null) {
// for (Map.Entry<Object, Object> objectObjectEntry : o.entrySet()) {
// page.getResultItems().put(objectObjectEntry.getKey().toString(), objectObjectEntry.getValue());
// }
// }
// break;
// case JRuby:
// Object o1 = engine.get("result");
// break;
// }
switch (language) {
case JavaScript:
engine.eval(defines + "\n" + script, context);
NativeObject o = (NativeObject) engine.get("result");
if (o != null) {
for (Map.Entry<Object, Object> objectObjectEntry : o.entrySet()) {
page.getResultItems().put(objectObjectEntry.getKey().toString(), objectObjectEntry.getValue());
}
}
break;
case JRuby:
RubyHash oRuby=(RubyHash)engine.eval(defines+"\n"+script,context);
Iterator itruby = oRuby.entrySet().iterator();
while (itruby.hasNext()) {
Map.Entry pairs = (Map.Entry)itruby.next();
page.getResultItems().put(pairs.getKey().toString(),pairs.getValue());
}
break;
case Jython:
engine.eval(defines + "\n" + script, context);
PyDictionary oJython=(PyDictionary)engine.get("result");
Iterator it = oJython.entrySet().iterator();
while (it.hasNext()) {
Map.Entry pairs = (Map.Entry)it.next();
page.getResultItems().put(pairs.getKey().toString(),pairs.getValue());
}
break;
}
} catch (ScriptException e) {
e.printStackTrace();
}
Expand All @@ -72,6 +91,7 @@ public void process(Page page) {
}
}


@Override
public Site getSite() {
return site;
Expand Down
Empty file.
Empty file modified webmagic-scripts/src/main/resources/js/defines.js
100644 → 100755
Empty file.
Empty file modified webmagic-scripts/src/main/resources/js/github.js
100644 → 100755
Empty file.
Empty file modified webmagic-scripts/src/main/resources/js/oschina.js
100644 → 100755
Empty file.
Empty file modified webmagic-scripts/src/main/resources/log4j.xml
100644 → 100755
Empty file.
13 changes: 13 additions & 0 deletions webmagic-scripts/src/main/resources/python/defines.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
def xpath(str):
    """Evaluate an XPath expression against the current page's HTML.

    Relies on a global ``page`` object being present in the script scope.
    The parameter name ``str`` shadows the builtin; it is kept unchanged so
    existing keyword callers keep working.

    :param str: XPath expression passed to ``page.getHtml().xpath``.
    :return: whatever ``toString()`` yields on the selection.
    """
    return page.getHtml().xpath(str).toString()

def css(str):
    """Evaluate a CSS selector against the current page's HTML.

    Relies on a global ``page`` object being present in the script scope.
    The parameter name ``str`` shadows the builtin; it is kept unchanged so
    existing keyword callers keep working.

    :param str: CSS selector passed to ``page.getHtml().css``.
    :return: whatever ``toString()`` yields on the selection.
    """
    return page.getHtml().css(str).toString()

def urls(str):
    """Queue every link on the current page that matches a regex.

    Collects ``page.getHtml().links().regex(str).all()`` and hands the result
    to ``page.addTargetRequests``. Relies on a global ``page`` object being
    present in the script scope. Returns nothing.

    :param str: regular expression applied to the page's links (the name
        shadows the builtin; kept for caller compatibility).
    """
    links = page.getHtml().links().regex(str).all()
    page.addTargetRequests(links)

def tomap(key, value):
    """Placeholder helper: currently ignores both arguments.

    NOTE(review): this always returns the literal "hello world"; the name
    suggests it was meant to build a mapping from key/value -- confirm the
    intended behaviour before relying on it.

    :param key: unused.
    :param value: unused.
    :return: the constant string "hello world".
    """
    return "hello world"

4 changes: 4 additions & 0 deletions webmagic-scripts/src/main/resources/python/oschina.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Sample Jython extraction script for oschina blog pages.
# The host evaluates python/defines.py followed by this script and then reads
# the global `result` dictionary, so xpath()/css()/urls() come from defines.py.

# Call the urls() helper to queue matching blog-post links. Assigning a string
# to the name `urls` would only shadow the helper and queue nothing.
urls("http://my\\.oschina\\.net/flashsword/blog/\\d+")

# The attribute value must be a quoted literal, and `//` searches the whole
# document rather than only children of the context node.
title = xpath("//div[@class='BlogTitle']")
# Mirrors the Ruby sample, which also extracts the post body.
content = css("div.BlogContent")

result = {"title": title, "content": content}
Empty file modified webmagic-scripts/src/main/resources/ruby/defines.rb
100644 → 100755
Empty file.
Empty file modified webmagic-scripts/src/main/resources/ruby/github.rb
100644 → 100755
Empty file.
5 changes: 4 additions & 1 deletion webmagic-scripts/src/main/resources/ruby/oschina.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
urls "http://my\\.oschina\\.net/flashsword/blog/\\d+"
title = css "div.BlogTitle h1"
content = css "div.BlogContent"
urls "http://my\\.oschina\\.net/flashsword/blog/\\d+"

return {"title"=>title,"content"=>content}

8 changes: 8 additions & 0 deletions webmagic-scripts/src/test/java/us/codecraft/webmagic/scripts/ScriptProcessorTest.java
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,12 @@ public void testRubyProcessor() {
pageProcessor.getSite().setSleepTime(0);
Spider.create(pageProcessor).addUrl("http://my.oschina.net/flashsword/blog").setSpawnUrl(false).run();
}


/**
 * Builds a {@link ScriptProcessor} for the Jython language from the classpath
 * script {@code python/oschina.py}, sets its site's sleep time to 0 and runs a
 * spider starting from the oschina blog URL. Makes no assertions.
 */
@Test
public void testPythonProcessor() {
    final String scriptPath = "python/oschina.py";
    final String startUrl = "http://my.oschina.net/flashsword/blog";

    ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom()
            .language(Language.Jython)
            .scriptFromClassPathFile(scriptPath)
            .build();
    pageProcessor.getSite().setSleepTime(0);

    Spider.create(pageProcessor)
            .addUrl(startUrl)
            .setSpawnUrl(false)
            .run();
}
}
Empty file modified webmagic-scripts/src/test/resouces/log4j.xml
100644 → 100755
Empty file.