Permalink
Browse files

example config file + move some config from spider to config

probably need to rename config_db to config (sorry)
  • Loading branch information...
1 parent de193c8 commit dcc4c8f220f728914b03a718aa1823b0ecbb0dfa @computermacgyver committed Jul 20, 2012
Showing with 41 additions and 8 deletions.
  1. +33 −0 CONFIG_db_example.php
  2. +8 −8 spider.php
View
@@ -0,0 +1,33 @@
+<?php
+
+// Example configuration: copy this file to CONFIG_db.php (the name spider.php includes) and edit the values.
+
+// Every setting assigned below is declared global so it stays visible even if this file is included from inside a function.
+global $db_host,$db_username,$db_password,$db_name,$operator_email,$whitelistdomain,$whitelistdomainlevel,$whitelistdomainlist;
+global $MAX_PENETRATION,$FETCH_DELAY,$SAME_DOMAIN_FETCH_DELAY,$ALLOW_OFFSITE,$ONLY_OFFSITE;
+
+// Database connection settings
+$db_host = "localhost";
+$db_username = "jrandom";
+$db_password = "ASDF!!1!one1";
+$db_name = "somedb";
+## $operator_email is used to mail the user on script completion
+$operator_email = "j.random@example.com";
+
+// Domain whitelist settings: $whitelistdomain, $whitelistdomainlevel, $whitelistdomainlist
+// The example values below crawl all .gov.uk and .org.uk sites
+
+//Restrict crawling to a whitelist? true | false
+$whitelistdomain=true;
+//What level of domain to match (1=tld, 2=sld, etc. e.g. 1=uk, 2=gov.uk, 3=direct.gov.uk)
+$whitelistdomainlevel=2;
+//List of domains starting, ending, and separated with ":".
+//Entries are written without a leading dot, in the same form as the level examples above (e.g. gov.uk).
+$whitelistdomainlist=":gov.uk:org.uk:";
+
+// Set spider penetration depth. If 0 crawl only pages in database.
+$MAX_PENETRATION = 5;
+// Wait one second between page fetches
+$FETCH_DELAY = 1;
+// Wait five seconds between page fetches on same domain (NOT IMPLEMENTED YET)
+$SAME_DOMAIN_FETCH_DELAY= 5;
+// Allow the spider to roam beyond the SEED_URL's domain (set to false to stay on the seed domain)
+$ALLOW_OFFSITE = true;
+// If true, only include URLs to remote domains
+$ONLY_OFFSITE = false;
+
+?>
View
@@ -3,22 +3,19 @@
date_default_timezone_set('Europe/London');
# Initialization
+
+include_once("CONFIG_db.php"); //Include configuration (do this first)
+
include_once("LIB_http.php"); // http library
include_once("LIB_parse.php"); // parse library
include_once("LIB_resolve_addresses.php"); // address resolution library
include_once("LIB_exclusion_list.php"); // list of excluded keywords
include_once("LIB_simple_spider.php"); // spider routines used by this app.
include_once("LIB_db_functions.php");
include_once("LIB_encoding.php");
-include_once("CONFIG_db.php");
-set_time_limit(0); // Don't let PHP timeout
-$MAX_PENETRATION = 5; // Set spider penetration depth
-$FETCH_DELAY = 1; // Wait one second between page fetches
-$ALLOW_OFFSITE = true; // Don't allow spider to roam from the SEED_URL's domain
-$ONLY_OFFSITE = false; // Only include URL's to remote domains
-#$spider_array = array();
+set_time_limit(0); // Don't let PHP timeout
db_connect();
@@ -96,7 +93,10 @@
if (!exclude_link($resolved_address)) {
#$link_array[] = $resolved_addres;
try {
- db_store_link($seed,$resolved_address);//_internal_only for only links in DB
+ if ($MAX_PENETRATION==0)//crawl only links in db
+ db_store_link_internal_only($seed,$resolved_address);
+ else //grow crawl list (possibly in conjunction with white list)
+ db_store_link($seed,$resolved_address);
} catch(Exception $e) {
echo "***ERROR***\n";
echo "Couldn't store: $resolved_address\n";

0 comments on commit dcc4c8f

Please sign in to comment.