Commit

Make crawlers able to crawl internal hyper links

fanfank committed Oct 26, 2015
1 parent f74afc1 commit ab66444

Showing 4 changed files with 191 additions and 4 deletions.
57 changes: 54 additions & 3 deletions Phpfetcher/Crawler/Default.php
@@ -36,8 +36,18 @@ abstract class Phpfetcher_Crawler_Default extends Phpfetcher_Crawler_Abstract {
protected $_arrFetchJobs = array();
protected $_arrHash = array();
protected $_arrAdditionalUrls = array();
    protected $_objSchemeTrie = array(); //Trie of valid URL schemes
//protected $_objPage = NULL; //Phpfetcher_Page_Default;

public function __construct($arrInitParam = array()) {
if (!isset($arrInitParam['url_schemes'])) {
$arrInitParam['url_schemes'] = array("http", "https", "ftp");
}

$this->_objSchemeTrie =
new Phpfetcher_Util_Trie($arrInitParam['url_schemes']);
}

/**
* @author xuruiqi
* @param
@@ -220,6 +230,7 @@ public function &run($arrInput = array()) {
while (!empty($arrJobs[$arrIndice[0]])
&& ($job_rules['max_depth'] === -1 || $intDepth < $job_rules['max_depth'])
&& ($job_rules['max_pages'] === -1 || $intPageNum < $job_rules['max_pages'])) {

$intDepth += 1;
$intPopIndex = $arrIndice[0];
$intPushIndex = $arrIndice[1];
@@ -232,15 +243,55 @@ public function &run($arrInput = array()) {
$objPage->read();

                //Get all hyperlinks on the page
                $arrLinks = $objPage->getHyperLinks();

                //Parse the components of the current URL so that internal
                //links such as "/entry" can be resolved to absolute URLs
$strCurUrl = $objPage->getUrl();
$arrUrlComponents = parse_url($strCurUrl);

                //Match hyperlinks against the link rules
foreach ($job_rules['link_rules'] as $link_rule) {
foreach ($arrLinks as $link) {
//if (preg_match($link_rule, $link) === 1
// && !$this->getHash($link)) {
// $this->setHash($link, true);
// $arrJobs[$intPushIndex][] = $link;
//}
                        if (preg_match($link_rule, $link) === 1
                                && !$this->getHash($link)) {

                            //Assemble the actual URL
$real_link = $link;

                            //Avoid strpos so that the whole string is not scanned;
                            //only the first 6 characters (enough for "https:") need checking
                            $colon_pos = false;
                            $scan_len  = min(6, strlen($link));
                            for ($i = 0; $i < $scan_len; ++$i) {
                                if ($link[$i] == ':') {
                                    $colon_pos = $i;
                                    break;
                                }
                            }

if ($colon_pos === false
|| !$this->_objSchemeTrie->has(
substr($link, 0, $colon_pos))) {
                                //Convert the internal address into a full URL
$real_link = $arrUrlComponents['scheme']
. "://"
. $arrUrlComponents['host']
. (isset($arrUrlComponents['port'])
&& strlen($arrUrlComponents['port']) != 0 ?
":{$arrUrlComponents['port']}" :
"")
. ($link[0] == '/' ?
$link : "/$link");
}

                            $this->setHash($real_link, true);
                            $arrJobs[$intPushIndex][] = $real_link;
}
}
}
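For readers following the diff above, here is a minimal standalone sketch (not part of the commit) of the internal-link resolution it performs. The function name resolve_internal_link is hypothetical, and in_array stands in for the Phpfetcher_Util_Trie scheme lookup:

<?php
// Illustrative sketch only: resolve an internal link against the current page URL.
function resolve_internal_link($strCurUrl, $strLink,
        $arrSchemes = array('http', 'https', 'ftp')) {
    // Look for a ':' within the first 6 characters (enough for "https:").
    $intColonPos = false;
    $intScanLen  = min(6, strlen($strLink));
    for ($i = 0; $i < $intScanLen; ++$i) {
        if ($strLink[$i] === ':') {
            $intColonPos = $i;
            break;
        }
    }

    // A recognized scheme before the colon means the link is already absolute.
    if ($intColonPos !== false
            && in_array(substr($strLink, 0, $intColonPos), $arrSchemes, true)) {
        return $strLink;
    }

    // Otherwise rebuild a full URL from the current page's components.
    $arrParts = parse_url($strCurUrl);
    $strPort  = isset($arrParts['port']) ? ":{$arrParts['port']}" : '';
    $strPath  = (strlen($strLink) > 0 && $strLink[0] === '/') ? $strLink : "/$strLink";
    return $arrParts['scheme'] . '://' . $arrParts['host'] . $strPort . $strPath;
}

// "/entry" found on http://reetsee.com:8080/index resolves to
// "http://reetsee.com:8080/entry".
echo resolve_internal_link('http://reetsee.com:8080/index', '/entry'), "\n";

Scanning only the first six characters is enough to find the colon of any accepted scheme, which avoids walking very long URLs just to decide whether they are absolute.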
87 changes: 87 additions & 0 deletions Phpfetcher/Util/Trie.php
@@ -0,0 +1,87 @@
<?php
/*
* @author xuruiqi
* @date 2015-10-26
* @copyright reetsee.com
* @desc A simple implementation of a trie, without memory optimizations
*/
class Phpfetcher_Util_Trie {
protected $_arrTrieRoot = array();

public function __construct($arrStrings = array()) {
$this->_arrTrieRoot = array(
'children' => array(),
'count' => 0,
);
foreach ($arrStrings as $str) {
$this->insert($str);
}
}

public function insert($str) {
try {
$str = strval($str);
$intLen = strlen($str);
$arrCurNode = &$this->_arrTrieRoot;

for ($i = 0; $i < $intLen; ++$i) {
if (!isset($arrCurNode['children'][$str[$i]])) {
$arrCurNode['children'][$str[$i]] = array(
'children' => array(),
'count' => 0,
);
}
$arrCurNode = &$arrCurNode['children'][$str[$i]];
}

$arrCurNode['count'] += 1;
unset($arrCurNode);

} catch (Exception $e) {
Phpfetcher_Log::fatal($e->getMessage());
return false;
}

return true;
}

public function delete($str) {
$arrCurNode = &$this->_locateNode($str);
if (!is_null($arrCurNode) && $arrCurNode['count'] > 0) {
$arrCurNode['count'] -= 1;
}
unset($arrCurNode);
return true;
}

public function has($str) {
$arrTargetNode = &$this->_locateNode($str);
$bolRes = false;
if (!is_null($arrTargetNode) && $arrTargetNode['count'] > 0) {
$bolRes = true;
}
unset($arrTargetNode);
return $bolRes;
}

    protected function &_locateNode($str) {
        //This method returns by reference, so keep a local null holder
        //instead of returning a literal null (which would raise a notice)
        $arrNullNode = null;
        $str = strval($str);
        $intLen = strlen($str);
        $arrCurNode = &$this->_arrTrieRoot;

        for ($i = 0; $i < $intLen; ++$i) {
            if (!isset($arrCurNode['children'][$str[$i]])) {
                return $arrNullNode;
            }
$arrCurNode = &$arrCurNode['children'][$str[$i]];
}

return $arrCurNode;
}

//public function startsWith($str) {
// $str = strval($str);
// //TODO
//}
};
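A short illustrative sketch (not part of the commit) of the count-based insert()/delete()/has() semantics. The include-path setup mirrors the test script further below and assumes the sketch file lives under tests/:

<?php
// Illustrative sketch only.
set_include_path(get_include_path() . PATH_SEPARATOR . dirname(__FILE__) . '/../');
require_once('phpfetcher.php');

$trie = new Phpfetcher_Util_Trie(array('http', 'https'));

// Each inserted string ends on a node whose 'count' records how many times
// it was inserted; has() only reports true while that count is positive.
$trie->insert('http');            // "http" now has count 2
$trie->delete('http');            // count drops back to 1
var_dump($trie->has('http'));     // bool(true)  -- still present
$trie->delete('http');            // count drops to 0
var_dump($trie->has('http'));     // bool(false) -- treated as absent
var_dump($trie->has('https'));    // bool(true)  -- unaffected, counts are per terminating node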
7 changes: 6 additions & 1 deletion README.md
@@ -1,5 +1,8 @@
#Phpfetcher

##Important Improvements Log
2015-10-26 Able to crawl a website's internal hyperlinks (e.g. links of the form "/entry").

##Chinese Description (Scroll Down to See The English Description)
A PHP crawler framework.
For the origins of the framework, see: http://blog.reetsee.com/archives/366
@@ -279,4 +282,6 @@ There are still many imperfect aspects of Phpfetcher, including multi-threading
But that is probably what makes this framework easy to learn and maintain.
I will not deny that there are design problems in addition to the missing features, and I will push the project forward as more developers ask for the features they need.
So far, this framework has met most of the demands of its small user group.
I hope you enjoy using Phpfetcher!


44 changes: 44 additions & 0 deletions tests/test_phpfetcher_util_trie.php
@@ -0,0 +1,44 @@
<?php
$test_include_path = dirname(__FILE__) . '/../';
set_include_path(get_include_path() . PATH_SEPARATOR . $test_include_path);

require_once('phpfetcher.php');

function print_trie(&$trie) {
echo "has ftp:" . var_export($trie->has("ftp"), true) . "\n";
echo "has http:" . var_export($trie->has("http"), true) . "\n";
echo "has https:" . var_export($trie->has("https"), true) . "\n";
echo "\n";
}

$arrSchemes = array(
"http",
"https",
"ftp",
);
$trie = new Phpfetcher_Util_Trie($arrSchemes);
print_trie($trie);

echo "delete 'abc'\n";
$trie->delete("abc");
print_trie($trie);

echo "delete 'ftp'\n";
$trie->delete("ftp");
print_trie($trie);

echo "delete 'http'\n";
$trie->delete("http");
print_trie($trie);

echo "insert 'ftp'\n";
$trie->insert("ftp");
print_trie($trie);

echo "delete 'https'\n";
$trie->delete("https");
print_trie($trie);

echo "insert 'http'\n";
$trie->insert("http");
print_trie($trie);
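
For reference, a run of the script above is expected to print the following: deleting the unknown key "abc" leaves the trie untouched, and deleting "http" does not affect "https" because each inserted string is counted on its own terminating node.

has ftp:true
has http:true
has https:true

delete 'abc'
has ftp:true
has http:true
has https:true

delete 'ftp'
has ftp:false
has http:true
has https:true

delete 'http'
has ftp:false
has http:false
has https:true

insert 'ftp'
has ftp:true
has http:false
has https:true

delete 'https'
has ftp:true
has http:false
has https:false

insert 'http'
has ftp:true
has http:true
has https:false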
