
Merged in edits from LG; corrected small syntax errors

1 parent d4d76a8 commit f390cbfbd8ae18d3078bcb7c54d2f9d89a6d0c45 @computermacgyver committed Jul 22, 2012
Showing with 11 additions and 10 deletions.
  1. +4 −3 CONFIG_db_example.php
  2. +1 −1 LIB_db_functions.php
  3. +4 −4 LIB_http.php
  4. +2 −2 spider.php
CONFIG_db_example.php
@@ -8,9 +8,10 @@
$db_name = "somedb";
## $operator_email is used to mail the user on script completion
$operator_email = "j.random@example.com";
+
## User agent string to send in HTTP requests
-$user_agent = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.18) Gecko/2010021501 Ubuntu/8.04 (hardy) Firefox/3.0.18"
-$cookie_file_location = "/home/jrandom/cookies.txt"
+$user_agent = "Mozilla/5.0 phpWebCrawler Contact Info Here ";
+$cookie_file_location = "/home/jrandom/cookies.txt";
# $whitelistdomain, $whitelistdomainlevel, $whitelistdomainlist; //NEED to document SAH
//Crawl all .gov.uk and .org.uk sites
@@ -24,7 +25,7 @@
//Fetch only first part of each page, to avoid huge files?
$fetchrangeonly=true;
// If $fetchrangeonly=true, what range to fetch? Here, the first 100KB is specified.
-$maxfetchsize=100000
+$maxfetchsize=100000;
// Exclude these file extensions from being fetched:
$excludedextensions = array(".avi", ".mp4", ".mp3", ".wma", ".wmv", ".ogg", ".pdf", ".doc", ".xls", ".docx", ".xlsx", ".ppt", ".pptx", ".odt", ".ods", ".odp", ".flv");
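For context, here is a minimal standalone sketch of how the $excludedextensions list above can be matched against a URL suffix, mirroring the comparison fixed in LIB_http.php below. The helper name has_excluded_suffix() is hypothetical and not part of the repository.

<?php
// Sketch only; has_excluded_suffix() is a hypothetical helper, not repository code.
$excludedextensions = array(".avi", ".mp4", ".mp3", ".pdf", ".docx");

function has_excluded_suffix($url, $extensions) {
    foreach ($extensions as $dotext) {
        // Same comparison as the corrected line in LIB_http.php:
        // take the tail of the URL, dot included, and compare it to the extension.
        if (substr($url, -strlen($dotext)) === $dotext) {
            return true;
        }
    }
    return false;
}

var_dump(has_excluded_suffix("http://example.gov.uk/report.pdf", $excludedextensions));  // bool(true)
var_dump(has_excluded_suffix("http://example.gov.uk/index.html", $excludedextensions));  // bool(false)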
LIB_db_functions.php
@@ -89,7 +89,7 @@ function db_get_next_to_harvest() {
if ($result==NULL) {//try without domain table
$strSQL = "SELECT tblPages.*, CURRENT_TIMESTAMP AS dtLastAccessed from tblPages WHERE bolHarvested=0 LIMIT 1";
- print "$strSQL\n";
+ //print "$strSQL\n";
$result = db_run_select($strSQL);
if ($result == NULL) return $result; //No more pages
//else wait the appropriate time to return a page of the same domain
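The hunk above relies on db_run_select() returning NULL when no rows are found. Below is a hedged sketch of such a helper using mysqli; the actual implementation in LIB_db_functions.php may differ, and the connection credentials are placeholders.

<?php
// Hedged sketch of a db_run_select()-style helper using mysqli; the real
// LIB_db_functions.php code may differ, and the credentials are placeholders.
function db_run_select($strSQL) {
    $mysqli = new mysqli("localhost", "dbuser", "dbpassword", "somedb");
    if ($mysqli->connect_errno) {
        return NULL;                      // caller treats NULL as "nothing to return"
    }
    $result = $mysqli->query($strSQL);
    if ($result === false || $result->num_rows === 0) {
        $mysqli->close();
        return NULL;                      // matches the NULL checks in db_get_next_to_harvest()
    }
    $rows = $result->fetch_all(MYSQLI_ASSOC);
    $result->free();
    $mysqli->close();
    return $rows;
}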
LIB_http.php
@@ -145,7 +145,7 @@ function http_get_withheader_suffixcheck($target, $ref)
{
foreach ($excludedextensions as $dotext)
{
- if ($dotext == substr($target, -(strlen($dotext)))
+ if ($dotext == substr($target, -(strlen($dotext))))
{
# Create return array
$return_array['FILE'] = "";
@@ -350,9 +350,9 @@ function http($target, $ref, $method, $data_array, $incl_head)
curl_setopt($ch, CURLOPT_MAXREDIRS, 4); // Limit redirections to four
curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE); // Return in string
curl_setopt($ch, CURLOPT_HEADERFUNCTION, 'read_header'); // Callback function
- curl_setopt($ch, CURLOPT_HTTPHEADER,array('accept: text/*'); // Ask for text only
+ curl_setopt($ch, CURLOPT_HTTPHEADER,array('accept: text/*')); // Ask for text only
if ($fetchrangeonly == true)
- curl_setopt($ch, CURLOPT_RANGE, "0-".strval($maxfetchsize-1); // Size limit
+ curl_setopt($ch, CURLOPT_RANGE, "0-".strval($maxfetchsize-1)); // Size limit
# Create return array
$return_array['FILE'] = curl_exec($ch);
@@ -378,7 +378,7 @@ function read_header($ch, $string)
$length = strlen($string);
# echo "Header: $string<br />\n";
# XXX check http_parse_headers library is valid here. Otherwise, unpack from source
- $headerarray = http_parse_headers($string)
+ $headerarray = http_parse_headers($string);
if (array_key_exists('Content-Type', $headerarray))
{
if (preg_match( '/text\//', $headerarray['Content-Type']) == 0)
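Taken together, the corrected cURL calls amount to a range-limited, text-only fetch with a header callback. The self-contained sketch below assumes the $user_agent and $maxfetchsize values from CONFIG_db_example.php; the simplified read_header() and the example URL are stand-ins, not the library's code.

<?php
// Standalone sketch of the corrected cURL options; not the library's http() function.
$user_agent     = "Mozilla/5.0 phpWebCrawler Contact Info Here";
$maxfetchsize   = 100000;
$fetchrangeonly = true;

function read_header($ch, $string) {
    // A CURLOPT_HEADERFUNCTION callback must return the number of bytes it handled.
    return strlen($string);
}

$ch = curl_init("http://example.gov.uk/");
curl_setopt($ch, CURLOPT_USERAGENT, $user_agent);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);                  // Follow redirects...
curl_setopt($ch, CURLOPT_MAXREDIRS, 4);                          // ...but limit them to four
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);                  // Return body as a string
curl_setopt($ch, CURLOPT_HEADERFUNCTION, 'read_header');         // Header callback
curl_setopt($ch, CURLOPT_HTTPHEADER, array('Accept: text/*'));   // Ask for text only
if ($fetchrangeonly == true)
    curl_setopt($ch, CURLOPT_RANGE, "0-" . strval($maxfetchsize - 1)); // First 100KB only
$page = curl_exec($ch);
curl_close($ch);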
spider.php
@@ -53,7 +53,7 @@
if ($code!=200 || strpos(strtolower($content_type),"text")===false) {
print "Skipping....http_code is $code content_type is $content_type\n";
db_marked_processed($seed);
- $seed = db_get_next_to_process();
+ $seed = db_get_next_to_harvest();
continue;
}
$strHTML = $downloaded_page['FILE'];
@@ -64,7 +64,7 @@
} catch (Exception $e) {
echo "Exeception caught: $e . Skipping page\n";
db_marked_harvested($seed);
- $seed = db_get_next_to_process();
+ $seed = db_get_next_to_harvest();
continue;
}
} else {
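Both spider.php fixes change the "skip and move on" branches to fetch the next seed with db_get_next_to_harvest(). A hedged sketch of the surrounding loop shape follows; the db_* and http_* names come from the diff, but the control flow, the 'strURL' field, and the 'STATUS' keys are assumptions.

<?php
// Hedged sketch of the harvest loop; assumes LIB_db_functions.php, LIB_http.php,
// and the config have been included. 'strURL' and the STATUS keys are guesses.
$seed = db_get_next_to_harvest();
while ($seed != NULL) {
    $downloaded_page = http_get_withheader_suffixcheck($seed['strURL'], "");
    $code         = $downloaded_page['STATUS']['http_code'];
    $content_type = $downloaded_page['STATUS']['content_type'];

    if ($code != 200 || strpos(strtolower($content_type), "text") === false) {
        db_marked_processed($seed);
        $seed = db_get_next_to_harvest();   // the corrected call
        continue;
    }

    try {
        $strHTML = $downloaded_page['FILE'];
        // ... parse $strHTML and queue newly discovered links ...
        db_marked_harvested($seed);
    } catch (Exception $e) {
        echo "Exception caught: $e . Skipping page\n";
        db_marked_harvested($seed);
    }
    $seed = db_get_next_to_harvest();
}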
