Skip to content

Commit

Permalink
add whitelistURL option
Browse files Browse the repository at this point in the history
  • Loading branch information
computermacgyver committed Nov 4, 2012
1 parent a0dc4ca commit b717db9
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 0 deletions.
18 changes: 18 additions & 0 deletions LIB_simple_spider.php
Expand Up @@ -45,6 +45,12 @@
$whitelistdomainlist_part=":$whitelistdomainlist:";
}

// URL whitelist: when enabled, split the ':'-separated phrase list once so
// that exclude_link() can test each link against the individual phrases.
global $whitelisturl,$whitelisturllist,$whitelisturllist_arr;
if ($whitelisturl) {
    // Must be stored in $whitelisturllist_arr: that is the global declared
    // above and the one exclude_link() reads.
    $whitelisturllist_arr=explode(":",$whitelisturllist);
}





Expand Down Expand Up @@ -339,6 +345,18 @@ function exclude_link($link)
$exclude=true;
}
}
}

// URL whitelist check: when $whitelisturl is enabled, a link is excluded
// unless it contains at least one of the whitelisted phrases.
global $whitelisturl,$whitelisturllist,$whitelisturllist_arr;
if ($whitelisturl) {
    // Use the pre-split phrase array when it is available; otherwise build it
    // from the raw ':'-separated list so this check never iterates a non-array.
    $whitelist_phrases=is_array($whitelisturllist_arr)
        ? $whitelisturllist_arr
        : explode(":",(string)$whitelisturllist);

    $found=false;
    foreach ($whitelist_phrases as $whitelist_phrase) {
        // Skip empty entries (e.g. from a stray ':'): an empty needle matches
        // every string on PHP 8 and raises a warning on older versions.
        if ($whitelist_phrase==='') {
            continue;
        }
        // Plain substring match anywhere in the link.
        if (strpos($link,$whitelist_phrase)!==false) {
            $found=true;
            break;
        }
    }
    if ($found===false) $exclude=true;
}

return $exclude;
Expand Down
9 changes: 9 additions & 0 deletions example_CONFIG_db.php
Expand Up @@ -28,6 +28,15 @@
$whitelistdomainlevel=2;
//list of domains separated with : (no starting / ending :)
$whitelistdomainlist="gov.uk:org.uk";

//Restrict crawling to a URL whitelist? true | false
//When true, a link is excluded unless it contains at least one phrase from $whitelisturllist.
$whitelisturl=false;
//List of phrases separated with : (no starting / ending :).
//A link passes when any ONE of the phrases appears anywhere in it (plain substring match).
//Because : is the separator, a phrase cannot itself contain : (so leave out "http://").
$whitelisturllist="example.com/this-part-only:anothersite.com:thirdsite.com/with-sub-section";




//Fetch only first part of each page, to avoid huge files? (Experimental!)
$fetchrangeonly=false;

Expand Down

0 comments on commit b717db9

Please sign in to comment.