Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Back out pre-fetch suffix check botch (exclusion_list covers same ground better); minor cleanups
  • Loading branch information...
commit ac684e86be71dd15ccbe96cf277eefe5718ce116 1 parent f390cbf
@pmyteh pmyteh authored
View
2  CONFIG_db_example.php
@@ -26,8 +26,6 @@
$fetchrangeonly=true;
// If $fetchrangeonly=true, what range to fetch? Here, the first 100KB is specified.
$maxfetchsize=100000;
-// Exclude these file extensions from being fetched:
-$excludedextensions = array(".avi", ".mp4", ".mp3", ".wma", ".wmv", ".ogg", ".pdf", ".doc", ".xls", ".docx", ".xlsx", ".ppt", ".pptx", ".odt", ".ods", ".odp", ".flv");
// Set spider penetration depth. If 0 crawl only pages in database.
$MAX_PENETRATION = 5;
View
1  LIB_exclusion_list.php
@@ -15,7 +15,6 @@
$exclusion_array[] = "/\.zip\b/i";
$exclusion_array[] = "/\.tar\b/i";
$exclusion_array[] = "/\.tar.gz\b/i";
-$exclusion_array[] = "/\.xml\b/i";
$exclusion_array[] = "/\.flv\b/i";
$exclusion_array[] = "/\.avi\b/i";
$exclusion_array[] = "/\.wav\b/i";
View
36 LIB_http.php
@@ -125,42 +125,6 @@ function http_get($target, $ref)
}
/***********************************************************************
-function http_get_withheader_suffixcheck($target, $ref)
--------------------------------------------------------------
-DESCRIPTION:
- Downloads an ASCII file with the http header. If the file has
- a known filetype that we cannot handle, instead return an error.
-INPUT:
- $target The target file (to download)
- $ref The server referer variable
-OUTPUT:
- $return_array['FILE'] = Contents of fetched file, will also
- include the HTTP header if requested.
- "" if file not fetched.
- $return_array['STATUS'] = CURL generated status of transfer.
- "" if file not fetched.
- $return_array['ERROR'] = CURL generated error status
-***********************************************************************/
-function http_get_withheader_suffixcheck($target, $ref)
- {
- foreach ($excludedextensions as $dotext)
- {
- if ($dotext == substr($target, -(strlen($dotext))))
- {
- # Create return array
- $return_array['FILE'] = "";
- $return_array['STATUS'] = "";
- $return_array['ERROR'] = "File extension on prohibited list.";
- # Return results
- return $return_array;
- }
- }
-
- return http($target, $ref, $method="GET", $data_array="", INCL_HEAD);
- }
-
-
-/***********************************************************************
http_get_withheader($target, $ref)
-------------------------------------------------------------
DESCRIPTION:
View
7 spider.php
@@ -44,7 +44,7 @@
try {
$strURL = $seed["strURL"];
if (exclude_link($seed["strURL"])) throw new Exception("Page in excluded list: $strURL\n");
- $downloaded_page = http_get_withheader_suffixcheck($seed["strURL"], "");
+ $downloaded_page = http_get_withheader($seed["strURL"], "");
# Catch fetch errors, oversize files, non-text extensions etc.
if ($downloaded_page['ERROR'] !== '') throw new Exception("Error fetching page: {$downloaded_page['ERROR']}");
$content_type=$downloaded_page['STATUS']['content_type'];
@@ -111,11 +111,6 @@
db_marked_harvested($seed);
- echo "Pause...\n";
-
- $wait = mt_rand(9000000,11000000);#9 to 11 seconds
- usleep($wait); #(arg in microseconds)
-
$seed = db_get_next_to_harvest();
}
db_close();
Please sign in to comment.
Something went wrong with that request. Please try again.