Permalink
Browse files

Add ability to whitelist domains; remove UK-centric code

  • Loading branch information...
1 parent f2d4c0f commit de193c8af74506bda12635d1b027cf42b3d9a600 @computermacgyver committed Jul 20, 2012
Showing with 52 additions and 38 deletions.
  1. +1 −1 .gitignore
  2. +3 −2 LIB_db_functions.php
  3. +40 −27 LIB_simple_spider.php
  4. +8 −8 spider.php
View
2 .gitignore
@@ -1,3 +1,3 @@
CONFIG_db.php
.screenrc
-
+*~
View
5 LIB_db_functions.php
@@ -2,7 +2,7 @@
#Database Functions
-include("CONFIG_db.php");
+include_once("CONFIG_db.php");
# Above file defines the following variables:
#
# $db_host = "localhost";
@@ -11,6 +11,7 @@
# $db_name = "somedb";
## $operator_email is used to mail the user on script completion
# $operator_email = "j.random@example.com";
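+# $whitelistdomain (truthy = exclude links whose domain is not whitelisted), $whitelistdomainlevel (domain level passed to get_domain_part(), e.g. 2 => gov.uk), $whitelistdomainlist (":"-delimited list with leading and trailing ":", e.g. ":gov.uk:mod.uk:")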
#
function /*public*/ db_connect() {
@@ -82,7 +83,7 @@ function db_get_next_spider_target() {
global $MAX_PENETRATION;
#echo "Getting next target from database...\n";
$strSQL = "SELECT * from tblPages WHERE NOT bolHarvested AND iLevel < " . $MAX_PENETRATION .
- " AND strURL LIKE '%direct.gov.uk%' LIMIT 1";
+ " LIMIT 1";
return db_run_select($strSQL);
}
View
67 LIB_simple_spider.php
@@ -126,39 +126,51 @@ function get_domain($url) {
//remove www. as it is a default 3rd level domain that will usually be added if not present.
$url = str_replace("www.", "", $url);
+ $url = str_replace(":".get_port($url),"",$url);
+
// Remove page and directory references
if(stristr($url, "/"))
$url = substr($url, 0, strpos($url, "/"));
return $url;
}
-function get_second_level_domain($url) {
+function get_port($url) {
+ $url = str_replace(get_protocol($url), "", $url);
+ if (stristr($url,":")) {
+ $start=strpos($url,":");
+ $end=strpos($url,"/",$start);
+ echo "S=$start END=$end\n";
+ if ($end===false) return substr($url,$start+1);
+ return substr($url,$start+1,$end-1-$start);
+ } else {
+ return 80;
+ }
+}
+
+//return the last $level labels of the url's domain: 1 = top level (uk), 2 = gov.uk, 3 = direct.gov.uk
+function get_domain_part($url,$level) {
$d = get_domain($url);
- //Some issues with ports or other strange things, chop off at .uk
- $d = substr($d,0,strrpos($d,".uk")+3);
-
- if (substr_count($d,".")<3) return $d;
-
+ if (substr_count($d,".")<=$level-1) return $d;
$len = strlen($d);
- //find third to last period
- $pos = strrpos($d,".");
- if ($pos===false) return $d;
- $pos=$len - $pos + 2;
-
- $pos = strrpos($d,".",-1*$pos);
- if ($pos===false) return $d;
- $pos=$len - $pos + 2;
-
-
- $pos = strrpos($d,".",-1*$pos);
- if ($pos===false) return $d;
- $pos = $pos+1;
- return substr($d,$pos);
+ //find $level to last period
+ $pos = -1;
+ $count=0;
+ while ($count<$level) {
+ $count++;
+ $pos = strrpos($d,".",-1*$pos-1);
+ if ($pos===false) break;
+
+ echo "POS=$pos";
+
+ $pos=$len - $pos;
+ echo " POS'=$pos str: " .substr($d,-1*$pos) . "\n";
+ }
+ $pos = $pos-1;
+ return substr($d,-1*$pos);
}
-
function get_protocol($url) {
$proto="udef://";
@@ -298,12 +310,13 @@ function exclude_link($link)
$exclude=true;
}
}*/
- $domain = get_domain($link);
- if (strpos($domain,".gov.uk")===false && strpos($domain,"mod.uk")===false && strpos($domain,"parliament.uk")===false
- && strpos($domain,"bl.uk")===false && strpos($domain,"nls.uk")===false) {
- $exclude=true;
- }
-
+ global $whitelistdomain, $whitelistdomainlevel, $whitelistdomainlist;
+ if ($whitelistdomain) {
+ $domain = get_domain_part($link,$whitelistdomainlevel);
+ if (strpos($whitelistdomainlist,":".$domain.":")===false) {
+ $exclude=true;
+ }
+ }
return $exclude;
}
View
16 spider.php
@@ -3,14 +3,14 @@
date_default_timezone_set('Europe/London');
# Initialization
-include("LIB_http.php"); // http library
-include("LIB_parse.php"); // parse library
-include("LIB_resolve_addresses.php"); // address resolution library
-include("LIB_exclusion_list.php"); // list of excluded keywords
-include("LIB_simple_spider.php"); // spider routines used by this app.
-include("LIB_db_functions.php");
-include("LIB_encoding.php");
-include("CONFIG_db.php");
+include_once("LIB_http.php"); // http library
+include_once("LIB_parse.php"); // parse library
+include_once("LIB_resolve_addresses.php"); // address resolution library
+include_once("LIB_exclusion_list.php"); // list of excluded keywords
+include_once("LIB_simple_spider.php"); // spider routines used by this app.
+include_once("LIB_db_functions.php");
+include_once("LIB_encoding.php");
+include_once("CONFIG_db.php");
set_time_limit(0); // Don't let PHP timeout

0 comments on commit de193c8

Please sign in to comment.