Skip to content

Commit

Permalink
全面升级
Browse files Browse the repository at this point in the history
一、界面升级:
1. Windows下编译时自动添加图标
2. web及cmd版的命令行窗口添加软件名称

二、下载器相关升级:
1. 初步增加PhantomJS下载器,实现双下载引擎,可在请求中指定
2. 下载控制转移至Request中,从而下载更灵活
3. Request增加重定向跳转次数控制功能
4. Request可以支持无限重载
5. Request增加序列化方法
6. 下载器实例改为全局唯一

三、规则语法升级:
1. 文本结果输出方法改为Output(ruleName string, resp *context.Response, item interface{})
2. 文件输出方法建议用FileOutput(resp *context.Response, name ...string)
3. 更改解析函数Parse(resp *context.Response, ruleName ...string),ruleName为空时调用Root函数
4. OutFeild()方法改为IndexOutFeild()
5. Spider结构体新增CreatItem(ruleName string, item map[int]interface{}) map[string]interface{},将map索引自动转换为OutFeild字段
6. Spider结构体新增代理服务器列表
7. AddQueue()接收*Request参数,UseCookie改为EnableCookie
8. 规则方法库中升级timer计时器

四、数据库操作升级:
1. 优化更新mongodb操作方法库
2. 数据库连接池增加空闲连接定时回收
  • Loading branch information
andeya committed Oct 20, 2015
1 parent ccb5e04 commit 5b2fa32
Show file tree
Hide file tree
Showing 14 changed files with 202 additions and 132 deletions.
27 changes: 16 additions & 11 deletions README.md
Expand Up @@ -4,7 +4,7 @@ Pholcus(幽灵蛛)是一款纯Go语言编写的高并发、分布式、重

![image](https://github.com/henrylee2cn/pholcus/blob/master/doc/icon.png)

* 稳定版: [Version 0.7.2 (Oct 11, 2015)](https://github.com/henrylee2cn/pholcus/releases)
* 稳定版: [Version 0.7.3 (Oct 20, 2015)](https://github.com/henrylee2cn/pholcus/releases)

* 官方QQ群:Go大数据 42731170 [![Go大数据群](http://pub.idqqimg.com/wpa/images/group.png)](http://shang.qq.com/wpa/qunwpa?idkey=83ee3e1a4be6bdb2b08a51a044c06ae52cf10a082f7c5cf6b36c1f78e8b03589)

Expand Down Expand Up @@ -56,7 +56,7 @@ import (
"github.com/henrylee2cn/pholcus/logs"
_ "github.com/pholcus/spider_lib" // 此为公开维护的spider规则库
// _ "path/myrule_lib" // 同样你也可以自由添加自己的规则库
// _ "github.com/pholcus/spider_lib_pte" // 同样你也可以自由添加自己的规则库
)
func main() {
Expand All @@ -74,30 +74,35 @@ func main() {
// 自定义相关配置,将覆盖默认值
func SetConf() {
//mongodb服务器地址
config.MGO_OUTPUT.Host = "127.0.0.1:27017"
config.MGO_OUTPUT.HOST = "127.0.0.1:27017"
// mongodb数据库
config.MGO_OUTPUT.DB = "pholcus"
// mongodb输出时的内容分类
// key:蜘蛛规则清单
// value:数据库名
config.MGO_OUTPUT.DBClass = map[string]string{
config.MGO_OUTPUT.DB_CLASS = map[string]string{
"百度RSS新闻": "1_1",
}
// mongodb输出时非默认数据库时以当前时间为集合名
// h: 精确到小时 (格式 2015-08-28-09)
// d: 精确到天 (格式 2015-08-28)
config.MGO_OUTPUT.TableFmt = "d"
config.MGO_OUTPUT.COLLECTION_FMT = "d"
//mongodb连接池容量
config.MGO_OUTPUT.MaxConns = 1024
config.MGO_OUTPUT.MAX_CONNS = 1024
//mysql服务器地址
config.MYSQL_OUTPUT.Host = "127.0.0.1:3306"
config.MYSQL_OUTPUT.HOST = "127.0.0.1:3306"
//mysql数据库
config.MYSQL_OUTPUT.DefaultDB = "pholcus"
config.MYSQL_OUTPUT.DB = "pholcus"
//mysql用户
config.MYSQL_OUTPUT.User = "root"
config.MYSQL_OUTPUT.USER = "root"
//mysql密码
config.MYSQL_OUTPUT.Password = ""
config.MYSQL_OUTPUT.PASSWORD = ""
//mysql连接池容量
config.MYSQL_OUTPUT.MaxConns = 1024
config.MYSQL_OUTPUT.MAX_CONNS = 1024
// Surfer-Phantom下载器配置
config.SURFER_PHANTOM.FULL_APP_NAME = "phantomjs" //phantomjs软件相对路径与名称
}
```
 
Expand Down
10 changes: 9 additions & 1 deletion app/downloader/context/request.go
Expand Up @@ -56,6 +56,9 @@ type Request struct {

// 是否可以被重复下载(即不被去重)
Duplicatable bool

//是否使用PhantomJS下载器,特点破防力强,速度慢
UsePhantomJS bool
}

// 发送请求前的准备工作,设置一系列默认值
Expand All @@ -68,7 +71,8 @@ type Request struct {
// Request.ConnTimeout默认为常量DefaultConnTimeout,小于0时不限制下载超时;
// Request.TryTimes默认为常量DefaultTryTimes,小于0时不限制失败重载次数;
// Request.RedirectTimes默认不限制重定向次数,小于0时可禁止重定向跳转;
// Request.RetryPause默认为常量DefaultRetryPause.
// Request.RetryPause默认为常量DefaultRetryPause;
// Request.UsePhantomJS为true时,使用PhantomJS下载器下载,破防力强,速度慢,暂不支持图片下载。
func (self *Request) Prepare() *Request {
if self.Method == "" {
self.Method = "GET"
Expand Down Expand Up @@ -233,3 +237,7 @@ func (self *Request) SetPriority(priority int) *Request {
self.Priority = priority
return self
}

// GetUsePhantomJS reports whether this request is flagged to be fetched with
// the PhantomJS downloader (the UsePhantomJS field).
func (self *Request) GetUsePhantomJS() bool {
	viaPhantom := self.UsePhantomJS
	return viaPhantom
}
40 changes: 40 additions & 0 deletions app/downloader/downloader_surf.go
@@ -0,0 +1,40 @@
package downloader

import (
	"net/http"
	"sync"

	"github.com/henrylee2cn/pholcus/app/downloader/context"
	"github.com/henrylee2cn/pholcus/config"
	"github.com/henrylee2cn/surfer"
)

// Surfer is the downloader that hands each request to one of two surfer
// engines: the default engine, or the PhantomJS-backed engine when the request
// asks for it via GetUsePhantomJS.
type Surfer struct {
	// once guards the one-time construction of surf and phantom.
	once sync.Once
	// surf serves requests that do not ask for PhantomJS.
	surf surfer.Surfer
	// phantom serves requests whose GetUsePhantomJS returns true.
	phantom surfer.Surfer
}

// SurferDownloader is the package-wide downloader instance.
//
// Its engines are deliberately NOT built in this initializer. Package-level
// variables are initialized before main starts, so constructing the PhantomJS
// engine here would read config.SURFER_PHANTOM before the application has had
// any chance to override it (for example from a SetConf call in main). The
// engines are built on the first Download call instead.
var SurferDownloader = &Surfer{}

// engines builds both download engines exactly once and returns them.
// config.SURFER_PHANTOM is read at the moment of the first call, so values
// assigned before the first download are honored. sync.Once makes the
// construction safe when Download is called from several goroutines.
func (self *Surfer) engines() (surf, phantom surfer.Surfer) {
	self.once.Do(func() {
		self.surf = surfer.New()
		self.phantom = surfer.NewPhantom(config.SURFER_PHANTOM.FULL_APP_NAME, config.SURFER_PHANTOM.FULL_TEMP_JS)
	})
	return self.surf, self.phantom
}

// Download fetches cReq and returns a context.Response carrying the request,
// the raw *http.Response and the download error (if any).
//
// The PhantomJS engine is used when cReq.GetUsePhantomJS() is true; otherwise
// the default engine is used. The error is not returned separately: callers
// read it from the returned response.
func (self *Surfer) Download(cReq *context.Request) *context.Response {
	cResp := context.NewResponse(nil)

	surf, phantom := self.engines()

	var resp *http.Response
	var err error

	if cReq.GetUsePhantomJS() {
		resp, err = phantom.Download(cReq)
	} else {
		resp, err = surf.Download(cReq)
	}

	cResp.SetRequest(cReq)
	cResp.SetResponse(resp)
	cResp.SetError(err)

	return cResp
}
10 changes: 5 additions & 5 deletions app/pipeline/collector/output_csv.go
Expand Up @@ -6,6 +6,7 @@ import (
"os"

"github.com/henrylee2cn/pholcus/common/util"
"github.com/henrylee2cn/pholcus/config"
"github.com/henrylee2cn/pholcus/logs"
)

Expand All @@ -18,14 +19,13 @@ func init() {
}
}()

folder1 := "result/data"
folder2 := folder1 + "/" + self.startTime.Format("2006年01月02日 15时04分05秒")
filenameBase := folder2 + "/" + util.FileNameReplace(self.Spider.GetName()+"_"+self.Spider.GetKeyword()+" "+fmt.Sprintf("%v", self.sum[0])+"-"+fmt.Sprintf("%v", self.sum[1]))
folder := config.COMM_PATH.TEXT + "/" + self.startTime.Format("2006年01月02日 15时04分05秒")
filenameBase := folder + "/" + util.FileNameReplace(self.Spider.GetName()+"_"+self.Spider.GetKeyword()+" "+fmt.Sprintf("%v", self.sum[0])+"-"+fmt.Sprintf("%v", self.sum[1]))

// 创建/打开目录
f2, err := os.Stat(folder2)
f2, err := os.Stat(folder)
if err != nil || !f2.IsDir() {
if err := os.MkdirAll(folder2, 0777); err != nil {
if err := os.MkdirAll(folder, 0777); err != nil {
logs.Log.Error("Error: %v\n", err)
}
}
Expand Down
13 changes: 7 additions & 6 deletions app/pipeline/collector/output_excel.go
Expand Up @@ -2,10 +2,12 @@ package collector

import (
"fmt"
"os"

"github.com/henrylee2cn/pholcus/common/util"
"github.com/henrylee2cn/pholcus/config"
"github.com/henrylee2cn/pholcus/logs"
"github.com/tealeg/xlsx"
"os"
)

/************************ excel 输出 ***************************/
Expand All @@ -23,9 +25,8 @@ func init() {
var cell *xlsx.Cell
var err error

folder1 := "result/data"
folder2 := folder1 + "/" + self.startTime.Format("2006年01月02日 15时04分05秒")
filename := folder2 + "/" + util.FileNameReplace(self.Spider.GetName()+"_"+self.Spider.GetKeyword()+" "+fmt.Sprintf("%v", self.sum[0])+"-"+fmt.Sprintf("%v", self.sum[1])) + ".xlsx"
folder := config.COMM_PATH.TEXT + "/" + self.startTime.Format("2006年01月02日 15时04分05秒")
filename := folder + "/" + util.FileNameReplace(self.Spider.GetName()+"_"+self.Spider.GetKeyword()+" "+fmt.Sprintf("%v", self.sum[0])+"-"+fmt.Sprintf("%v", self.sum[1])) + ".xlsx"

// 创建文件
file = xlsx.NewFile()
Expand Down Expand Up @@ -79,9 +80,9 @@ func init() {
}

// 创建/打开目录
f2, err := os.Stat(folder2)
f2, err := os.Stat(folder)
if err != nil || !f2.IsDir() {
if err := os.MkdirAll(folder2, 0777); err != nil {
if err := os.MkdirAll(folder, 0777); err != nil {
logs.Log.Error("Error: %v\n", err)
}
}
Expand Down
6 changes: 3 additions & 3 deletions app/pipeline/collector/output_util.go
Expand Up @@ -9,8 +9,8 @@ import (

// 返回数据库及集合名称
func dbOrTabName(c *Collector) (dbName, tableName string) {
if v, ok := config.MGO_OUTPUT.DBClass[c.Spider.GetName()]; ok {
switch config.MGO_OUTPUT.TableFmt {
if v, ok := config.MGO_OUTPUT.DB_CLASS[c.Spider.GetName()]; ok {
switch config.MGO_OUTPUT.COLLECTION_FMT {
case "h":
return v, time.Now().Format("2006-01-02-15")
case "d":
Expand All @@ -19,7 +19,7 @@ func dbOrTabName(c *Collector) (dbName, tableName string) {
return v, time.Now().Format("2006-01-02")
}
}
return config.MGO_OUTPUT.DefaultDB, ""
return config.MGO_OUTPUT.DB, ""
}

// 当输出数据库为config.MGO_OUTPUT.DB时,使用tabName获取table名
Expand Down
8 changes: 5 additions & 3 deletions app/pipeline/collector/savefile.go
@@ -1,11 +1,13 @@
package collector

import (
"github.com/henrylee2cn/pholcus/common/util"
"github.com/henrylee2cn/pholcus/logs"
"io"
"os"
"time"

"github.com/henrylee2cn/pholcus/common/util"
"github.com/henrylee2cn/pholcus/config"
"github.com/henrylee2cn/pholcus/logs"
)

//文件输出管理
Expand All @@ -19,7 +21,7 @@ func (self *Collector) SaveFile() {
self.setFileSum(1)

// 路径: file/"RuleName"/"time"/"Name"
dir := `result/file/` + self.Spider.GetName() + `/` + util.FileNameReplace(file["RuleName"].(string)) + `/` + self.startTime.Format("2006年01月02日 15时04分05秒") + `/`
dir := config.COMM_PATH.FILE + `/` + self.Spider.GetName() + `/` + util.FileNameReplace(file["RuleName"].(string)) + `/` + self.startTime.Format("2006年01月02日 15时04分05秒") + `/`

// 创建/打开目录
d, err := os.Stat(dir)
Expand Down
3 changes: 2 additions & 1 deletion app/spider/spider.go
Expand Up @@ -38,7 +38,8 @@ type Spider struct {
// Request.ConnTimeout默认为常量context.DefaultConnTimeout,小于0时不限制下载超时;
// Request.TryTimes默认为常量context.DefaultTryTimes,小于0时不限制失败重载次数;
// Request.RedirectTimes默认不限制重定向次数,小于0时可禁止重定向跳转;
// Request.RetryPause默认为常量context.DefaultRetryPause.
// Request.RetryPause默认为常量context.DefaultRetryPause;
// Request.UsePhantomJS为true时,使用PhantomJS下载器下载,破防力强,速度慢,暂不支持图片下载。
func (self *Spider) AddQueue(req *context.Request) {
req.
SetSpiderName(self.Name).
Expand Down
25 changes: 10 additions & 15 deletions common/deduplicate/deduplicate.go
Expand Up @@ -2,7 +2,6 @@ package deduplicate

import (
"encoding/json"
// "fmt"
"github.com/henrylee2cn/pholcus/common/mgo"
"github.com/henrylee2cn/pholcus/common/util"
"github.com/henrylee2cn/pholcus/config"
Expand All @@ -11,16 +10,11 @@ import (
"gopkg.in/mgo.v2/bson"
"io/ioutil"
"os"
"path"
"strings"
"sync"
)

const (
dir = `result/cache/`
fileName = "deduplication"
collection = "deduplication_log"
)

type Deduplicate interface {
// 采集非重复样本并返回对比结果,重复为true
Compare(obj interface{}) bool
Expand Down Expand Up @@ -79,24 +73,25 @@ func (self *Deduplication) Write(provider string) {
i++
}
mgo.Mgo(nil, "insert", map[string]interface{}{
"Database": config.MGO_OUTPUT.DefaultDB,
"Collection": collection,
"Database": config.DEDUPLICATION.DB,
"Collection": config.DEDUPLICATION.COLLECTION,
"Docs": docs,
})

case status.FILE:
fallthrough
default:
p, _ := path.Split(config.DEDUPLICATION.FULL_FILE_NAME)
// 创建/打开目录
d, err := os.Stat(dir)
d, err := os.Stat(p)
if err != nil || !d.IsDir() {
if err := os.MkdirAll(dir, 0777); err != nil {
if err := os.MkdirAll(p, 0777); err != nil {
logs.Log.Error("Error: %v\n", err)
}
}

// 创建并写入文件
f, _ := os.Create(dir + fileName)
f, _ := os.Create(config.DEDUPLICATION.FULL_FILE_NAME)
b, _ := json.Marshal(self.sampling)
f.Write(b)
f.Close()
Expand All @@ -110,8 +105,8 @@ func (self *Deduplication) ReRead(provider string) {
case status.MGO:
var docs = map[string]interface{}{}
err := mgo.Mgo(&docs, "find", map[string]interface{}{
"Database": config.MGO_OUTPUT.DefaultDB,
"Collection": collection,
"Database": config.DEDUPLICATION.DB,
"Collection": config.DEDUPLICATION.COLLECTION,
})
if err != nil {
logs.Log.Error("去重读取mgo: %v", err)
Expand All @@ -124,7 +119,7 @@ func (self *Deduplication) ReRead(provider string) {
case status.FILE:
fallthrough
default:
f, err := os.Open(dir + fileName)
f, err := os.Open(config.DEDUPLICATION.FULL_FILE_NAME)
if err != nil {
return
}
Expand Down
4 changes: 2 additions & 2 deletions common/mgo/pool.go
Expand Up @@ -12,11 +12,11 @@ type MgoSrc struct {
*mgo.Session
}

var MgoPool = pool.NewPool(new(MgoSrc), config.MGO_OUTPUT.MaxConns)
var MgoPool = pool.NewPool(new(MgoSrc), config.MGO_OUTPUT.MAX_CONNS)

// 新建数据库连接
func (*MgoSrc) New() pool.Src {
session, err := mgo.Dial(config.MGO_OUTPUT.Host)
session, err := mgo.Dial(config.MGO_OUTPUT.HOST)
if err != nil {
logs.Log.Error("%v", err)
}
Expand Down
4 changes: 2 additions & 2 deletions common/mysql/mysql.go
Expand Up @@ -10,14 +10,14 @@ import (
)

/************************ Mysql 输出 ***************************/
var MysqlPool = pool.NewPool(new(MysqlSrc), config.MYSQL_OUTPUT.MaxConns)
var MysqlPool = pool.NewPool(new(MysqlSrc), config.MYSQL_OUTPUT.MAX_CONNS)

// MysqlSrc is the resource type registered with pool.NewPool for MysqlPool.
// It embeds *sql.DB, so the database handle's methods are promoted onto it.
type MysqlSrc struct {
*sql.DB
}

func (self *MysqlSrc) New() pool.Src {
db, err := sql.Open("mysql", config.MYSQL_OUTPUT.User+":"+config.MYSQL_OUTPUT.Password+"@tcp("+config.MYSQL_OUTPUT.Host+")/"+config.MYSQL_OUTPUT.DefaultDB+"?charset=utf8")
db, err := sql.Open("mysql", config.MYSQL_OUTPUT.USER+":"+config.MYSQL_OUTPUT.PASSWORD+"@tcp("+config.MYSQL_OUTPUT.HOST+")/"+config.MYSQL_OUTPUT.DB+"?charset=utf8")
if err != nil {
panic(err)
}
Expand Down

0 comments on commit 5b2fa32

Please sign in to comment.