From ac684e86be71dd15ccbe96cf277eefe5718ce116 Mon Sep 17 00:00:00 2001 From: pmyteh Date: Mon, 23 Jul 2012 12:17:41 +0100 Subject: [PATCH] Back out pre-fetch suffix check botch (exclusion_list covers same ground better); minor cleanups --- CONFIG_db_example.php | 2 -- LIB_exclusion_list.php | 1 - LIB_http.php | 36 ------------------------------------ spider.php | 7 +------ 4 files changed, 1 insertion(+), 45 deletions(-) diff --git a/CONFIG_db_example.php b/CONFIG_db_example.php index 54a5e4c..65ec991 100644 --- a/CONFIG_db_example.php +++ b/CONFIG_db_example.php @@ -26,8 +26,6 @@ $fetchrangeonly=true; // If $fetchrangeonly=true, what range to fetch? Here, the first 100KB is specified. $maxfetchsize=100000; -// Exclude these file extensions from being fetched: -$excludedextensions = array(".avi", ".mp4", ".mp3", ".wma", ".wmv", ".ogg", ".pdf", ".doc", ".xls", ".docx", ".xlsx", ".ppt", ".pptx", ".odt", ".ods", ".odp", ".flv"); // Set spider penetration depth. If 0 crawl only pages in database. $MAX_PENETRATION = 5; diff --git a/LIB_exclusion_list.php b/LIB_exclusion_list.php index f9df169..6e4914b 100644 --- a/LIB_exclusion_list.php +++ b/LIB_exclusion_list.php @@ -15,7 +15,6 @@ $exclusion_array[] = "/\.zip\b/i"; $exclusion_array[] = "/\.tar\b/i"; $exclusion_array[] = "/\.tar.gz\b/i"; -$exclusion_array[] = "/\.xml\b/i"; $exclusion_array[] = "/\.flv\b/i"; $exclusion_array[] = "/\.avi\b/i"; $exclusion_array[] = "/\.wav\b/i"; diff --git a/LIB_http.php b/LIB_http.php index e81d3b6..e059919 100644 --- a/LIB_http.php +++ b/LIB_http.php @@ -124,42 +124,6 @@ function http_get($target, $ref) return http($target, $ref, $method="GET", $data_array="", EXCL_HEAD); } -/*********************************************************************** -function http_get_withheader_suffixcheck($target, $ref) -------------------------------------------------------------- -DESCRIPTION: - Downloads an ASCII file with the http header. If the file has - a known filetype that we cannot handle, instead return an error. -INPUT: - $target The target file (to download) - $ref The server referer variable -OUTPUT: - $return_array['FILE'] = Contents of fetched file, will also - include the HTTP header if requested. - "" if file not fetched. - $return_array['STATUS'] = CURL generated status of transfer. - "" if file not fetched. - $return_array['ERROR'] = CURL generated error status -***********************************************************************/ -function http_get_withheader_suffixcheck($target, $ref) - { - foreach ($excludedextensions as $dotext) - { - if ($dotext == substr($target, -(strlen($dotext)))) - { - # Create return array - $return_array['FILE'] = ""; - $return_array['STATUS'] = ""; - $return_array['ERROR'] = "File extension on prohibited list."; - # Return results - return $return_array; - } - } - - return http($target, $ref, $method="GET", $data_array="", INCL_HEAD); - } - - /*********************************************************************** http_get_withheader($target, $ref) ------------------------------------------------------------- diff --git a/spider.php b/spider.php index 997b01c..bb6432c 100644 --- a/spider.php +++ b/spider.php @@ -44,7 +44,7 @@ try { $strURL = $seed["strURL"]; if (exclude_link($seed["strURL"])) throw new Exception("Page in excluded list: $strURL\n"); - $downloaded_page = http_get_withheader_suffixcheck($seed["strURL"], ""); + $downloaded_page = http_get_withheader($seed["strURL"], ""); # Catch fetch errors, oversize files, non-text extensions etc. if ($downloaded_page['ERROR'] !== '') throw new Exception("Error fetching page: {$downloaded_page['ERROR']}"); $content_type=$downloaded_page['STATUS']['content_type']; @@ -111,11 +111,6 @@ db_marked_harvested($seed); - echo "Pause...\n"; - - $wait = mt_rand(9000000,11000000);#9 to 11 seconds - usleep($wait); #(arg in microseconds) - $seed = db_get_next_to_harvest(); } db_close();