Skip to content

爬虫+Web demo #9

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Sep 18, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions apps/regex-test2.ss
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
(import (scheme) (regex regex))

(define pattern "([^=&]+)=([^=&]*)")
(define str "a=b&c=d&=e&g=&1=2")

(printf "match?: ~a \nmatch: ~a \nmatch-count?: ~a \nmatches: ~a \n"
(regex-match? pattern str)
(regex-match pattern str)
(regex-match-count pattern str)
(regex-matches pattern str)
)
2 changes: 1 addition & 1 deletion apps/web-demo.ss
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
(load "../packages/web/libra.scm")
(import (web libra))

(get! "/"
(lambda (p) (view "../apps/index.html")))
Expand Down
25 changes: 25 additions & 0 deletions apps/web/app.ss
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
(import (web libra)
(sqlite sqlite))
(using "spider.ss")

(sqlite-name! (string-append (get-option "app-path") "/spider.db"))
(sqlite-exec "create table if not exists ImageInfo (id int primary key not null, page int);")

(get! "/"
(lambda (p) (view "index")))

(get! "/spider"
(lambda (p)
(define url (hashtable-ref p "key" "http://www.mm131.com/xinggan"))
(if (eq? #f (string-index url #\_))
(set! url (string-append url "/")))
(default-make-json (url->id/page url (get-option "app-path")))))

(run 8080)







3 changes: 3 additions & 0 deletions apps/web/content/css/index.css
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#img-container{
margin-top: 30px
}
Binary file added apps/web/content/images/favicon.ico
Binary file not shown.
43 changes: 43 additions & 0 deletions apps/web/content/js/app.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
$(function(){
for(i=1; i<200; i++){
$('#page').append('<option value="' + i + '">' + i + '</option>');
}
var id = getUrlParam('id');
if (id == undefined || id == null || id == "")
return;
var page = new Number(getUrlParam('page'));
$('#img-container').html('');
for(i=1; i <= page; i++){
$('#img-container').append('<img src="/content/images/mm/' + id + '-' + i + '.jpg"><br/>');
}
});

$('.summit').click(function(){
var type = $('#type').val();
var page = $('#page').val();
var id = new Number($('#type option:selected').data('id'));
var url = "http://www.mm131.com/" + type;
if (page > 1)
url = url + "/list_" + id + "_" + page + ".html";
$('#tip').html('请求中...');
$.ajax({
url: "/spider?key=" +url,
success: function(data){
if (data == false || data.length == 0){
$('#tip').html('无内容, 请重试');
return;
}
$('#img-container').html('');
for(i in data){
$('#img-container').append('<a target="_blank" href="/?id=' + i + '&page=' + data[i] +'"><img src="/content/images/mm/' + i + '-1.jpg"><br/></a>');
}
$('#tip').html('请求完成, 点击图片查看详情');
}
});
});

function getUrlParam(name) {
var reg = new RegExp("(^|&)" + name + "=([^&]*)(&|$)"); //构造一个含有目标参数的正则表达式对象
var r = window.location.search.substr(1).match(reg); //匹配目标参数
if (r != null) return unescape(r[2]); return null; //返回参数值
}
71 changes: 71 additions & 0 deletions apps/web/spider.ss
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
(import (net curl)
(regex regex)
(sqlite sqlite))

;; 获取图片列表html
(define (get-images-html html)
(define pattern "<(dl*)\\b[^>]*>(.*?)</dl>")
(define rst (regex-match pattern html))
(if (null? rst)
""
(car rst))
)

;; 获取名称/id
(define (html->ids html)
(define pattern "<dd><a target.+?xinggan/(\\d+?).html\">[^>]+?>(.+?)</a>")
(define rst (regex-matches pattern html))
(define ids '())
(map (lambda (lst)
(set! ids (cons (cadr lst) ids)))
rst)
(reverse ids)
)

;; 获取一组图片数目
(define (get-max-page html)
(define pattern "<span class=\"page-ch\">.+?(\\d+).+?</span>")
(define rst (regex-match pattern html))
(if (null? rst)
0
(string->number (cadr rst)))
)

;; 下载某组图片所有地址
(define (id->page id path)
(define rst (sqlite-exec (string-append "select * from ImageInfo where id=" id)))

(define max-page (if (null? rst) 0 (string->number (cadr (car rst)))))
(if (= max-page 0)
(begin
(set! max-page (get-max-page (url->html (string-append "http://www.mm131.com/xinggan/" id ".html"))))
(let loop [(index 1)]
(if (<= index max-page)
(begin
(url->file
(string-append "http://img1.mm131.com/pic/" id "/" (number->string index) ".jpg")
(string-append path "/content/images/mm/" id "-" (number->string index) ".jpg"))
(loop (+ 1 index))
)
)
)
(sqlite-exec (string-append "INSERT INTO ImageInfo VALUES (" id "," (number->string max-page) ");"))
)
)
max-page
)

;; 方法组合
(define (url->id/page url path)
(define html (get-images-html (url->html url)))
(define infos (make-hashtable string-hash string=?))
(if (string=? html "")
#f
(begin
(map
(lambda (id) (hashtable-set! infos id (id->page id path)))
(html->ids html))
infos
)
)
)
31 changes: 31 additions & 0 deletions apps/web/views/index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta http-equiv="X-UA-Compatible" content="ie=edge">
<link rel="icon" href="/content/images/favicon.ico" type="image/x-icon" />
<link rel="stylesheet" href="/content/css/index.css" type="text/css">
<title>爬虫</title>
</head>
<body>
<div>
<select id="type">
<option value="xinggan" data-id="6">性感美女</option>
<option value="qingchun" data-id="1">清纯美眉</option>
<option value="xiaohua" data-id="2">美女校花</option>
<option value="chemo" data-id="3">性感车模</option>
<option value="qipao" data-id="4">旗袍美女</option>
<option value="mingxing" data-id="7">性感车模</option>
</select>
<select id="page">
</select>
<button class="summit">提交</button>
<span id="tip"></span>
</div>
<div id="img-container">
</div>
<script src="http://lib.sinaapp.com/js/jquery/2.0.2/jquery-2.0.2.min.js"></script>
<script src="/content/js/app.js"></script>
</body>
</html>
65 changes: 65 additions & 0 deletions packages/net/curl.ss
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; created by : 1481892212@qq.com
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(library (net curl)
(export
url->html
url->file)
(import (scheme)
(cffi cffi)
(net curl-ffi)
(c c-ffi))

(def-function-callback
make-write-callback
(void* int int void*)
int)

;; 返回html字符串
(define (url->html url)
(define curl (cffi-alloc 1024))
(define content "")
(define res -1)
(curl-global-init 3)
(set! curl (curl-easy-init))
(curl-easy-setopt curl 10002 url)
(curl-easy-setopt curl 20011
(make-write-callback
(lambda (ptr size nmemb stream)
(set! content (string-append content (cffi-string ptr)))
;; (display (format "callback ~a ~a ~a ~a\n" ptr size nmemb stream))
(* size nmemb))))
(set! res (curl-easy-perform curl))
(if (not (= 0 res))
(display (format "curl-easy-perform failed ~s\n" (curl-easy-strerror res))))
(curl-easy-cleanup curl)
(curl-global-cleanup)
(cffi-free curl)
content
)

;; 保存为文件
(define (url->file url file-name)
(define curl (cffi-alloc 1024))
(define my-file (c-fopen file-name "wb"))
(define res -1)
(define rst #f)
(curl-global-init CURL_GLOBAL_ALL)
(set! curl (curl-easy-init))
(curl-easy-setopt curl 10002 url)
(curl-easy-setopt curl 20011
(make-write-callback
(lambda (ptr size nmemb stream)
(c-fwrite ptr size nmemb my-file)
(* size nmemb))))
(set! res (curl-easy-perform curl))
(if (= 0 res)
(set! rst #t)
(display (format "curl-easy-perform failed ~s\n" (curl-easy-strerror res))))
(c-fclose my-file)
(curl-easy-cleanup curl)
(curl-global-cleanup)
(cffi-free curl)
rst
)
)
Loading