Commit

Make crawlers able to crawl internal hyper links

fanfank committed Oct 26, 2015
1 parent f74afc1 commit ab66444

Showing 4 changed files with 191 additions and 4 deletions.
57 changes: 54 additions & 3 deletions Phpfetcher/Crawler/Default.php
@@ -36,8 +36,18 @@ abstract class Phpfetcher_Crawler_Default extends Phpfetcher_Crawler_Abstract {
protected $_arrFetchJobs = array();
protected $_arrHash = array();
protected $_arrAdditionalUrls = array();
    protected $_objSchemeTrie = array(); //Trie of valid URL schemes
//protected $_objPage = NULL; //Phpfetcher_Page_Default;

public function __construct($arrInitParam = array()) {
if (!isset($arrInitParam['url_schemes'])) {
$arrInitParam['url_schemes'] = array("http", "https", "ftp");
}

$this->_objSchemeTrie =
new Phpfetcher_Util_Trie($arrInitParam['url_schemes']);
}

/**
* @author xuruiqi
* @param
@@ -220,6 +230,7 @@ public function &run($arrInput = array()) {
while (!empty($arrJobs[$arrIndice[0]])
&& ($job_rules['max_depth'] === -1 || $intDepth < $job_rules['max_depth'])
&& ($job_rules['max_pages'] === -1 || $intPageNum < $job_rules['max_pages'])) {

$intDepth += 1;
$intPopIndex = $arrIndice[0];
$intPushIndex = $arrIndice[1];
@@ -232,15 +243,55 @@ public function &run($arrInput = array()) {
$objPage->read();

                //Get all hyperlinks on the page
                $arrLinks = $objPage->getHyperLinks();

                //Parse the components of the current URL so that internal
                //links such as "/entry" can be resolved to absolute URLs
$strCurUrl = $objPage->getUrl();
$arrUrlComponents = parse_url($strCurUrl);

                //Match hyperlinks against the link rules
foreach ($job_rules['link_rules'] as $link_rule) {
foreach ($arrLinks as $link) {
//if (preg_match($link_rule, $link) === 1
// && !$this->getHash($link)) {
// $this->setHash($link, true);
// $arrJobs[$intPushIndex][] = $link;
//}
                        if (preg_match($link_rule, $link) === 1
                                && !$this->getHash($link)) {

                            //Assemble the actual URL
$real_link = $link;

                            //Avoid strpos so that the whole string is not scanned;
                            //only the first 6 characters (enough for "https:") need checking
                            $colon_pos = false;
                            $scan_len  = min(6, strlen($link));
                            for ($i = 0; $i < $scan_len; ++$i) {
                                if ($link[$i] == ':') {
                                    $colon_pos = $i;
                                    break;
                                }
                            }

if ($colon_pos === false
|| !$this->_objSchemeTrie->has(
substr($link, 0, $colon_pos))) {
                                //Convert the internal address into a full URL
$real_link = $arrUrlComponents['scheme']
. "://"
. $arrUrlComponents['host']
. (isset($arrUrlComponents['port'])
&& strlen($arrUrlComponents['port']) != 0 ?
":{$arrUrlComponents['port']}" :
"")
. ($link[0] == '/' ?
$link : "/$link");
}

                            $this->setHash($real_link, true);
                            $arrJobs[$intPushIndex][] = $real_link;
}
}
}
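For readers following the diff above, here is a minimal standalone sketch (not part of the commit) of the internal-link resolution it performs. The function name resolve_internal_link is hypothetical, and in_array stands in for the Phpfetcher_Util_Trie scheme lookup:

<?php
// Illustrative sketch only: resolve an internal link against the current page URL.
function resolve_internal_link($strCurUrl, $strLink,
        $arrSchemes = array('http', 'https', 'ftp')) {
    // Look for a ':' within the first 6 characters (enough for "https:").
    $intColonPos = false;
    $intScanLen  = min(6, strlen($strLink));
    for ($i = 0; $i < $intScanLen; ++$i) {
        if ($strLink[$i] === ':') {
            $intColonPos = $i;
            break;
        }
    }

    // A recognized scheme before the colon means the link is already absolute.
    if ($intColonPos !== false
            && in_array(substr($strLink, 0, $intColonPos), $arrSchemes, true)) {
        return $strLink;
    }

    // Otherwise rebuild a full URL from the current page's components.
    $arrParts = parse_url($strCurUrl);
    $strPort  = isset($arrParts['port']) ? ":{$arrParts['port']}" : '';
    $strPath  = (strlen($strLink) > 0 && $strLink[0] === '/') ? $strLink : "/$strLink";
    return $arrParts['scheme'] . '://' . $arrParts['host'] . $strPort . $strPath;
}

// "/entry" found on http://reetsee.com:8080/index resolves to
// "http://reetsee.com:8080/entry".
echo resolve_internal_link('http://reetsee.com:8080/index', '/entry'), "\n";

Scanning only the first six characters is enough to find the colon of any accepted scheme, which avoids walking very long URLs just to decide whether they are absolute.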
87 changes: 87 additions & 0 deletions Phpfetcher/Util/Trie.php
@@ -0,0 +1,87 @@
<?php
/*
* @author xuruiqi
* @date 2015-10-26
* @copyright reetsee.com
* @desc A simple implementation of a trie, without memory optimizations
*/
class Phpfetcher_Util_Trie {
protected $_arrTrieRoot = array();

public function __construct($arrStrings = array()) {
$this->_arrTrieRoot = array(
'children' => array(),
'count' => 0,
);
foreach ($arrStrings as $str) {
$this->insert($str);
}
}

public function insert($str) {
try {
$str = strval($str);
$intLen = strlen($str);
$arrCurNode = &$this->_arrTrieRoot;

for ($i = 0; $i < $intLen; ++$i) {
if (!isset($arrCurNode['children'][$str[$i]])) {
$arrCurNode['children'][$str[$i]] = array(
'children' => array(),
'count' => 0,
);
}
$arrCurNode = &$arrCurNode['children'][$str[$i]];
}

$arrCurNode['count'] += 1;
unset($arrCurNode);

} catch (Exception $e) {
Phpfetcher_Log::fatal($e->getMessage());
return false;
}

return true;
}

public function delete($str) {
$arrCurNode = &$this->_locateNode($str);
if (!is_null($arrCurNode) && $arrCurNode['count'] > 0) {
$arrCurNode['count'] -= 1;
}
unset($arrCurNode);
return true;
}

public function has($str) {
$arrTargetNode = &$this->_locateNode($str);
$bolRes = false;
if (!is_null($arrTargetNode) && $arrTargetNode['count'] > 0) {
$bolRes = true;
}
unset($arrTargetNode);
return $bolRes;
}

    protected function &_locateNode($str) {
        //This method returns by reference, so keep a local null holder
        //instead of returning a literal null (which would raise a notice)
        $arrNullNode = null;
        $str = strval($str);
        $intLen = strlen($str);
        $arrCurNode = &$this->_arrTrieRoot;

        for ($i = 0; $i < $intLen; ++$i) {
            if (!isset($arrCurNode['children'][$str[$i]])) {
                return $arrNullNode;
            }
$arrCurNode = &$arrCurNode['children'][$str[$i]];
}

return $arrCurNode;
}

//public function startsWith($str) {
// $str = strval($str);
// //TODO
//}
};
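A short illustrative sketch (not part of the commit) of the count-based insert()/delete()/has() semantics. The include-path setup mirrors the test script further below and assumes the sketch file lives under tests/:

<?php
// Illustrative sketch only.
set_include_path(get_include_path() . PATH_SEPARATOR . dirname(__FILE__) . '/../');
require_once('phpfetcher.php');

$trie = new Phpfetcher_Util_Trie(array('http', 'https'));

// Each inserted string ends on a node whose 'count' records how many times
// it was inserted; has() only reports true while that count is positive.
$trie->insert('http');            // "http" now has count 2
$trie->delete('http');            // count drops back to 1
var_dump($trie->has('http'));     // bool(true)  -- still present
$trie->delete('http');            // count drops to 0
var_dump($trie->has('http'));     // bool(false) -- treated as absent
var_dump($trie->has('https'));    // bool(true)  -- unaffected, counts are per terminating node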
7 changes: 6 additions & 1 deletion README.md
@@ -1,5 +1,8 @@
#Phpfetcher

##Important Improvements Log
2015-10-26 Able to crawl a website's internal hyperlinks (e.g. links of the form "/entry").

##Chinese Description (Scroll Down to See The English Description)
A PHP crawler framework.
For the origins of the framework, see: http://blog.reetsee.com/archives/366
@@ -279,4 +282,6 @@ There are still many imperfect aspects of Phpfetcher, including multi-threading
But that is probably what makes this framework easy to learn and maintain.
I will not deny that there are design problems in addition to the missing features, and I will push the project forward as more developers ask for the features they need.
So far, this framework has met most of the demands of its small user group.
I hope you enjoy using Phpfetcher!


44 changes: 44 additions & 0 deletions tests/test_phpfetcher_util_trie.php
@@ -0,0 +1,44 @@
<?php
$test_include_path = dirname(__FILE__) . '/../';
set_include_path(get_include_path() . PATH_SEPARATOR . $test_include_path);

require_once('phpfetcher.php');

function print_trie(&$trie) {
echo "has ftp:" . var_export($trie->has("ftp"), true) . "\n";
echo "has http:" . var_export($trie->has("http"), true) . "\n";
echo "has https:" . var_export($trie->has("https"), true) . "\n";
echo "\n";
}

$arrSchemes = array(
"http",
"https",
"ftp",
);
$trie = new Phpfetcher_Util_Trie($arrSchemes);
print_trie($trie);

echo "delete 'abc'\n";
$trie->delete("abc");
print_trie($trie);

echo "delete 'ftp'\n";
$trie->delete("ftp");
print_trie($trie);

echo "delete 'http'\n";
$trie->delete("http");
print_trie($trie);

echo "insert 'ftp'\n";
$trie->insert("ftp");
print_trie($trie);

echo "delete 'https'\n";
$trie->delete("https");
print_trie($trie);

echo "insert 'http'\n";
$trie->insert("http");
print_trie($trie);
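
For reference, a run of the script above is expected to print the following: deleting the unknown key "abc" leaves the trie untouched, and deleting "http" does not affect "https" because each inserted string is counted on its own terminating node.

has ftp:true
has http:true
has https:true

delete 'abc'
has ftp:true
has http:true
has https:true

delete 'ftp'
has ftp:false
has http:true
has https:true

delete 'http'
has ftp:false
has http:false
has https:true

insert 'ftp'
has ftp:true
has http:false
has https:true

delete 'https'
has ftp:true
has http:false
has https:false

insert 'http'
has ftp:true
has http:true
has https:false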
