Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Back out pre-fetch suffix check botch (exclusion_list covers same ground better); minor cleanups
  • Loading branch information
pmyteh committed Jul 23, 2012
1 parent f390cbf commit ac684e8
Show file tree
Hide file tree
Showing 4 changed files with 1 addition and 45 deletions.
2 changes: 0 additions & 2 deletions CONFIG_db_example.php
Expand Up @@ -26,8 +26,6 @@
$fetchrangeonly=true;
// If $fetchrangeonly=true, what range to fetch? Here, the first 100KB is specified.
$maxfetchsize=100000;
// Exclude these file extensions from being fetched:
$excludedextensions = array(".avi", ".mp4", ".mp3", ".wma", ".wmv", ".ogg", ".pdf", ".doc", ".xls", ".docx", ".xlsx", ".ppt", ".pptx", ".odt", ".ods", ".odp", ".flv");

// Set spider penetration depth. If 0 crawl only pages in database.
$MAX_PENETRATION = 5;
Expand Down
1 change: 0 additions & 1 deletion LIB_exclusion_list.php
Expand Up @@ -15,7 +15,6 @@
$exclusion_array[] = "/\.zip\b/i";
$exclusion_array[] = "/\.tar\b/i";
$exclusion_array[] = "/\.tar.gz\b/i";
$exclusion_array[] = "/\.xml\b/i";
$exclusion_array[] = "/\.flv\b/i";
$exclusion_array[] = "/\.avi\b/i";
$exclusion_array[] = "/\.wav\b/i";
Expand Down
36 changes: 0 additions & 36 deletions LIB_http.php
Expand Up @@ -124,42 +124,6 @@ function http_get($target, $ref)
return http($target, $ref, $method="GET", $data_array="", EXCL_HEAD);
}

/***********************************************************************
function http_get_withheader_suffixcheck($target, $ref)
-------------------------------------------------------------
DESCRIPTION:
Downloads an ASCII file with the http header. If the file has
a known filetype that we cannot handle, instead return an error.
INPUT:
$target The target file (to download)
$ref The server referer variable
OUTPUT:
$return_array['FILE'] = Contents of fetched file, will also
include the HTTP header if requested.
"" if file not fetched.
$return_array['STATUS'] = CURL generated status of transfer.
"" if file not fetched.
$return_array['ERROR'] = CURL generated error status
***********************************************************************/
function http_get_withheader_suffixcheck($target, $ref)
{
foreach ($excludedextensions as $dotext)
{
if ($dotext == substr($target, -(strlen($dotext))))
{
# Create return array
$return_array['FILE'] = "";
$return_array['STATUS'] = "";
$return_array['ERROR'] = "File extension on prohibited list.";
# Return results
return $return_array;
}
}

return http($target, $ref, $method="GET", $data_array="", INCL_HEAD);
}


/***********************************************************************
http_get_withheader($target, $ref)
-------------------------------------------------------------
Expand Down
7 changes: 1 addition & 6 deletions spider.php
Expand Up @@ -44,7 +44,7 @@
try {
$strURL = $seed["strURL"];
if (exclude_link($seed["strURL"])) throw new Exception("Page in excluded list: $strURL\n");
$downloaded_page = http_get_withheader_suffixcheck($seed["strURL"], "");
$downloaded_page = http_get_withheader($seed["strURL"], "");
# Catch fetch errors, oversize files, non-text extensions etc.
if ($downloaded_page['ERROR'] !== '') throw new Exception("Error fetching page: {$downloaded_page['ERROR']}");
$content_type=$downloaded_page['STATUS']['content_type'];
Expand Down Expand Up @@ -111,11 +111,6 @@

db_marked_harvested($seed);

echo "Pause...\n";

$wait = mt_rand(9000000,11000000);#9 to 11 seconds
usleep($wait); #(arg in microseconds)

$seed = db_get_next_to_harvest();
}
db_close();
Expand Down

0 comments on commit ac684e8

Please sign in to comment.