Permalink
Browse files

Refuse to fetch specified file extensions

  • Loading branch information...
1 parent 55ebd3c commit f0410d40139692b132c805892ecea7a964ead146 @pmyteh pmyteh committed Jul 22, 2012
Showing with 22 additions and 4 deletions.
  1. +2 −0 CONFIG_db_example.php
  2. +18 −4 LIB_http.php
  3. +2 −0 spider.php
View
@@ -25,6 +25,8 @@
// When true, only part of each file is requested (see $maxfetchsize below).
$fetchrangeonly=true;
// If $fetchrangeonly=true, what range to fetch? Here, the first 100KB is specified.
$maxfetchsize=100000;
// Exclude these file extensions from being fetched.
// Each entry includes the leading dot and is compared against the end of the URL.
$excludedextensions = array(".avi", ".mp4", ".mp3", ".wma", ".wmv", ".ogg", ".pdf", ".doc", ".xls", ".docx", ".xlsx", ".ppt", ".pptx", ".odt", ".ods", ".odp", ".flv");
// Set spider penetration depth. If 0 crawl only pages in database.
$MAX_PENETRATION = 5;
View
@@ -135,13 +135,27 @@ function http_get_withheader_suffixcheck($target, $ref)
$ref The server referer variable
OUTPUT:
$return_array['FILE'] = Contents of fetched file, will also
- include the HTTP header if requested
- $return_array['STATUS'] = CURL generated status of transfer
+ include the HTTP header if requested.
+ "" if file not fetched.
+ $return_array['STATUS'] = CURL generated status of transfer.
+ "" if file not fetched.
$return_array['ERROR'] = CURL generated error status
***********************************************************************/
function http_get_withheader_suffixcheck($target, $ref)
{
    # Refuse to fetch $target when it ends with one of the extensions listed
    # in $excludedextensions; otherwise GET it via http() with the HTTP
    # header included in the result.

    # The exclusion list is expected to be defined at global scope (see the
    # example configuration), so it must be imported into this function's
    # scope explicitly - a plain reference would be an undefined local.
    global $excludedextensions;

    if (is_array($excludedextensions))
    {
        foreach ($excludedextensions as $dotext)
        {
            $extlen = strlen($dotext);

            # An empty entry can never identify an extension; skip it.
            if ($extlen == 0)
            {
                continue;
            }

            # Compare the tail of the URL with the extension, ignoring case
            # so that "FILE.PDF" is refused just like "file.pdf".
            if (strtolower($dotext) === strtolower(substr($target, -$extlen)))
            {
                # Nothing was fetched: return the documented keys with empty
                # FILE/STATUS and an explanatory ERROR for the caller.
                $return_array = array();
                $return_array['FILE'] = "";
                $return_array['STATUS'] = "";
                $return_array['ERROR'] = "File extension on prohibited list.";
                return $return_array;
            }
        }
    }

    # Extension is allowed: perform a normal GET, header included.
    return http($target, $ref, "GET", "", INCL_HEAD);
}
@@ -320,7 +334,7 @@ function http($target, $ref, $method, $data_array, $incl_head)
curl_setopt ($ch, CURLOPT_POST, TRUE);
curl_setopt ($ch, CURLOPT_HTTPGET, FALSE);
}
- curl_setopt($ch, CURLOPT_HEADER, $incl_head); // Include head as needed
+ curl_setopt($ch, CURLOPT_HEADER, $incl_head); // Include head as needed
curl_setopt($ch, CURLOPT_NOBODY, FALSE); // Return body
}
View
@@ -45,6 +45,8 @@
$strURL = $seed["strURL"];
if (exclude_link($seed["strURL"])) throw new Exception("Page in excluded list: $strURL\n");
$downloaded_page = http_get_withheader_suffixcheck($seed["strURL"], "");
+ # Catch fetch errors, oversize files, non-text extensions etc.
+ if ($downloaded_page['ERROR'] !== '') throw new Exception("Error fetching page: {$downloaded_page['ERROR']}");
$content_type=$downloaded_page['STATUS']['content_type'];
$strStatus=$downloaded_page['STATUS'];
$code=$strStatus["http_code"];

0 comments on commit f0410d4

Please sign in to comment.