Permalink
Browse files

Latest Ultimate Web Scraper Toolkit.

  • Loading branch information...
cubiclesoft committed Apr 4, 2018
1 parent 39c4723 commit 171840645063fe6ce7aafd115aa724d6c188d62f
Showing with 292 additions and 45 deletions.
  1. +1 −1 providers/sso_google/index.php
  2. +1 −1 providers/sso_linkedin/index.php
  3. +2 −1 support/emulate_curl.php
  4. +65 −18 support/http.php
  5. +193 −19 support/tag_filter.php
  6. +30 −5 support/web_browser.php
@@ -318,7 +318,7 @@ public function ProcessFrontend()
)
);
$web = new WebBrowser();
$result = $web->Process($url, "auto", $options);
$result = $web->Process($url, $options);
if (!$result["success"]) $this->DisplayError(BB_Translate("Sign in failed. Error retrieving URL for Google access token. %s", $result["error"]));
else if ($result["response"]["code"] != 200) $this->DisplayError(BB_Translate("Sign in failed. The Google access token server returned: %s", $result["response"]["code"] . " " . $result["response"]["meaning"]));
@@ -330,7 +330,7 @@ public function ProcessFrontend()
)
);
$web = new WebBrowser();
$result = $web->Process($url, "auto", $options);
$result = $web->Process($url, $options);
if (!$result["success"]) $this->DisplayError(BB_Translate("Sign in failed. Error retrieving URL for LinkedIn access token. %s", $result["error"]));
else if ($result["response"]["code"] != 200) $this->DisplayError(BB_Translate("Sign in failed. The LinkedIn access token server returned: %s", $result["response"]["code"] . " " . $result["response"]["meaning"]));
@@ -872,7 +872,8 @@ function curl_exec($ch)
$curl_init__map[$key]["outputbody"] = false;
// Process the request.
$result = $curl_init__map[$key]["browser"]->Process($url, "", $options);
$options["profile"] = "";
$result = $curl_init__map[$key]["browser"]->Process($url, $options);
$curl_init__map[$key]["lastresult"] = $result;
// Deal with cookies.
@@ -1,6 +1,6 @@
<?php
// CubicleSoft PHP HTTP class.
// (C) 2016 CubicleSoft. All Rights Reserved.
// (C) 2017 CubicleSoft. All Rights Reserved.
class HTTP
{
@@ -252,6 +252,34 @@ public static function GetUserAgent($type)
return "";
}
// Returns a colon-separated SSL cipher list string for the requested profile.
//
// $type is compared case-insensitively.  "modern" and "old" each select their own list.
// Any other value (including the default "intermediate") selects the intermediate list.
// NOTE(review):  The profile names look like Mozilla's server-side TLS profiles - confirm the origin before updating the lists.
public static function GetSSLCiphers($type = "intermediate")
{
	// Cipher list last updated May 3, 2017.
	switch (strtolower($type))
	{
		case "modern":
			return "ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-SHA384:ECDHE-RSA-AES256-SHA384:ECDHE-ECDSA-AES128-SHA256:ECDHE-RSA-AES128-SHA256";

		case "old":
			return "ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-AES256-GCM-SHA384:DHE-RSA-AES128-GCM-SHA256:DHE-DSS-AES128-GCM-SHA256:kEDH+AESGCM:ECDHE-RSA-AES128-SHA256:ECDHE-ECDSA-AES128-SHA256:ECDHE-RSA-AES128-SHA:ECDHE-ECDSA-AES128-SHA:ECDHE-RSA-AES256-SHA384:ECDHE-ECDSA-AES256-SHA384:ECDHE-RSA-AES256-SHA:ECDHE-ECDSA-AES256-SHA:DHE-RSA-AES128-SHA256:DHE-RSA-AES128-SHA:DHE-DSS-AES128-SHA256:DHE-RSA-AES256-SHA256:DHE-DSS-AES256-SHA:DHE-RSA-AES256-SHA:ECDHE-RSA-DES-CBC3-SHA:ECDHE-ECDSA-DES-CBC3-SHA:EDH-RSA-DES-CBC3-SHA:AES128-GCM-SHA256:AES256-GCM-SHA384:AES128-SHA256:AES256-SHA256:AES128-SHA:AES256-SHA:AES:DES-CBC3-SHA:HIGH:SEED:!aNULL:!eNULL:!EXPORT:!DES:!RC4:!MD5:!PSK:!RSAPSK:!aDH:!aECDH:!EDH-DSS-DES-CBC3-SHA:!KRB5-DES-CBC3-SHA:!SRP";

		default:
			// "intermediate" and every unrecognized profile name end up here.
			return "ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-AES128-SHA256:ECDHE-RSA-AES128-SHA256:ECDHE-ECDSA-AES128-SHA:ECDHE-RSA-AES256-SHA384:ECDHE-RSA-AES128-SHA:ECDHE-ECDSA-AES256-SHA384:ECDHE-ECDSA-AES256-SHA:ECDHE-RSA-AES256-SHA:DHE-RSA-AES128-SHA256:DHE-RSA-AES128-SHA:DHE-RSA-AES256-SHA256:DHE-RSA-AES256-SHA:ECDHE-ECDSA-DES-CBC3-SHA:ECDHE-RSA-DES-CBC3-SHA:EDH-RSA-DES-CBC3-SHA:AES128-GCM-SHA256:AES256-GCM-SHA384:AES128-SHA256:AES256-SHA256:AES128-SHA:AES256-SHA:DES-CBC3-SHA:!DSS";
	}
}
// Builds a default set of SSL options with peer verification enabled.
//
// $cafile:  true adds 'auto_cainfo' => true, false adds no CA entry at all, and
//           any other value is stored as-is under 'cafile'.
// $cipherstype:  Profile name handed to GetSSLCiphers() to fill in 'ciphers'.
//
// Returns the options array.
public static function GetSafeSSLOpts($cafile = true, $cipherstype = "intermediate")
{
	// Result array last updated May 3, 2017.
	$opts = array();
	$opts["ciphers"] = self::GetSSLCiphers($cipherstype);
	$opts["disable_compression"] = true;
	$opts["allow_self_signed"] = false;
	$opts["verify_peer"] = true;
	$opts["verify_depth"] = 5;

	// An explicit false means the caller wants no CA information added.
	if ($cafile === false) return $opts;

	if ($cafile === true) $opts["auto_cainfo"] = true;
	else $opts["cafile"] = $cafile;

	return $opts;
}
// Reasonably parses RFC1123, RFC850, and asctime() dates.
public static function GetDateTimestamp($httpdate)
{
@@ -332,6 +360,21 @@ public static function NormalizeHeaders($headers)
return $result;
}
// Merges caller-supplied raw headers into $headers (modified in place).
//
// Each raw value is passed through HeaderValueCleanup() and skipped if the result is empty.
// For the rest, any existing entry stored under the HeaderNameCleanup() form of the name is
// removed, and the cleaned value is stored under the raw name exactly as the caller wrote it.
public static function MergeRawHeaders(&$headers, $rawheaders)
{
	foreach ($rawheaders as $rawname => $rawval)
	{
		$cleanval = self::HeaderValueCleanup($rawval);
		if ($cleanval == "") continue;

		// Drop the entry under the cleaned-up name so the raw entry replaces it.
		$cleanname = self::HeaderNameCleanup($rawname);
		if (isset($headers[$cleanname]))
		{
			unset($headers[$cleanname]);
		}

		$headers[$rawname] = $cleanval;
	}
}
public static function ExtractHeader($data)
{
$result = array();
@@ -416,7 +459,7 @@ private static function ProcessSSLOptions(&$options, $key, $host)
if (!isset($options["headers"]["Host"])) $options[$key]["CN_match"] = $host;
else
{
$info = self::ExtractURL("http://" . $options["headers"]["Host"]);
$info = self::ExtractURL("https://" . $options["headers"]["Host"]);
$options[$key]["CN_match"] = $info["host"];
}
}
@@ -429,7 +472,7 @@ private static function ProcessSSLOptions(&$options, $key, $host)
if (!isset($options["headers"]["Host"])) $options[$key]["SNI_server_name"] = $host;
else
{
$info = self::ExtractURL("http://" . $options["headers"]["Host"]);
$info = self::ExtractURL("https://" . $options["headers"]["Host"]);
$options[$key]["SNI_server_name"] = $info["host"];
}
}
@@ -705,9 +748,9 @@ public static function ProcessState(&$state)
{
$readfp = NULL;
$writefp = array($state["fp"]);
$exceptfp = NULL;
$exceptfp = array($state["fp"]);
$result = @stream_select($readfp, $writefp, $exceptfp, 0);
if ($result === false) return self::CleanupErrorState($state, array("success" => false, "error" => self::HTTPTranslate("A stream_select() failure occurred. Most likely cause: Connection failure."), "errorcode" => "stream_select_failed"));
if ($result === false || count($exceptfp)) return self::CleanupErrorState($state, array("success" => false, "error" => self::HTTPTranslate("A stream_select() failure occurred. Most likely cause: Connection failure."), "errorcode" => "stream_select_failed"));
if (!count($writefp)) return array("success" => false, "error" => self::HTTPTranslate("Connection not established yet."), "errorcode" => "no_data");
}
@@ -1295,7 +1338,7 @@ public static function RetrieveWebpage($url, $options = array())
if ($url["scheme"] != "http" && $url["scheme"] != "https") return array("success" => false, "error" => self::HTTPTranslate("RetrieveWebpage() only supports the 'http' and 'https' protocols."), "errorcode" => "protocol_check");
$secure = ($url["scheme"] == "https");
$protocol = ($secure ? (isset($options["protocol"]) && strtolower($options["protocol"]) == "ssl" ? "ssl" : "tls") : "tcp");
$protocol = ($secure ? (isset($options["protocol"]) ? strtolower($options["protocol"]) : "ssl") : "tcp");
if (function_exists("stream_get_transports") && !in_array($protocol, stream_get_transports())) return array("success" => false, "error" => self::HTTPTranslate("The desired transport protocol '%s' is not installed.", $protocol), "errorcode" => "transport_not_installed");
$host = str_replace(" ", "-", self::HeaderValueCleanup($url["host"]));
if ($host == "") return array("success" => false, "error" => self::HTTPTranslate("Invalid URL."));
@@ -1309,6 +1352,7 @@ public static function RetrieveWebpage($url, $options = array())
// Cleanup input headers.
if (!isset($options["headers"])) $options["headers"] = array();
$options["headers"] = self::NormalizeHeaders($options["headers"]);
if (isset($options["rawheaders"])) self::MergeRawHeaders($options["headers"], $options["rawheaders"]);
// Process the proxy URL (if specified).
$useproxy = (isset($options["proxyurl"]) && trim($options["proxyurl"]) != "");
@@ -1321,7 +1365,7 @@ public static function RetrieveWebpage($url, $options = array())
$proxyurl = self::ExtractURL($proxyurl);
$proxysecure = ($proxyurl["scheme"] == "https");
$proxyprotocol = ($proxysecure ? (isset($options["proxyprotocol"]) && strtolower($options["proxyprotocol"]) == "ssl" ? "ssl" : "tls") : "tcp");
$proxyprotocol = ($proxysecure ? (isset($options["proxyprotocol"]) ? strtolower($options["proxyprotocol"]) : "ssl") : "tcp");
if (function_exists("stream_get_transports") && !in_array($proxyprotocol, stream_get_transports())) return array("success" => false, "error" => self::HTTPTranslate("The desired transport proxy protocol '%s' is not installed.", $proxyprotocol), "errorcode" => "proxy_transport_not_installed");
$proxyhost = str_replace(" ", "-", self::HeaderValueCleanup($proxyurl["host"]));
$proxyport = ((int)$proxyurl["port"] ? (int)$proxyurl["port"] : ($proxysecure ? 443 : 80));
@@ -1338,15 +1382,16 @@ public static function RetrieveWebpage($url, $options = array())
$proxydata .= "Host: " . $host . ($defaultport ? "" : ":" . $port) . "\r\n";
$proxydata .= "Proxy-Connection: keep-alive\r\n";
if ($proxyusername != "") $proxydata .= "Proxy-Authorization: BASIC " . base64_encode($proxyusername . ":" . $proxypassword) . "\r\n";
if (isset($options["proxyheaders"]))
if (!isset($options["proxyheaders"])) $options["proxyheaders"] = array();
$options["proxyheaders"] = self::NormalizeHeaders($options["proxyheaders"]);
if (isset($options["rawproxyheaders"])) self::MergeRawHeaders($options["proxyheaders"], $options["rawproxyheaders"]);
unset($options["proxyheaders"]["Accept-Encoding"]);
foreach ($options["proxyheaders"] as $name => $val)
{
$options["proxyheaders"] = self::NormalizeHeaders($options["proxyheaders"]);
unset($options["proxyheaders"]["Accept-Encoding"]);
foreach ($options["proxyheaders"] as $name => $val)
{
if ($name != "Content-Type" && $name != "Content-Length" && $name != "Proxy-Connection" && $name != "Host") $proxydata .= $name . ": " . $val . "\r\n";
}
if ($name != "Content-Type" && $name != "Content-Length" && $name != "Proxy-Connection" && $name != "Host") $proxydata .= $name . ": " . $val . "\r\n";
}
$proxydata .= "\r\n";
if (isset($options["debug_callback"]) && is_callable($options["debug_callback"])) call_user_func_array($options["debug_callback"], array("rawproxyheaders", $proxydata, &$options["debug_callback_opts"]));
}
@@ -1520,12 +1565,13 @@ public static function RetrieveWebpage($url, $options = array())
{
$context = @stream_context_create();
if (isset($options["source_ip"])) $context["socket"] = array("bindto" => $options["source_ip"] . ":0");
if ($proxysecure && isset($options["proxysslopts"]) && is_array($options["proxysslopts"]))
if ($proxysecure)
{
if (!isset($options["proxysslopts"]) || !is_array($options["proxysslopts"])) $options["proxysslopts"] = self::GetSafeSSLOpts();
self::ProcessSSLOptions($options, "proxysslopts", $host);
foreach ($options["proxysslopts"] as $key => $val) @stream_context_set_option($context, "ssl", $key, $val);
}
$fp = @stream_socket_client($proxyprotocol . "://" . $proxyhost . ":" . $proxyport, $errornum, $errorstr, $options["proxyconnecttimeout"], STREAM_CLIENT_CONNECT | (isset($options["async"]) && $options["async"] ? STREAM_CLIENT_ASYNC_CONNECT : 0), $context);
$fp = @stream_socket_client($proxyprotocol . "://" . $proxyhost . ":" . $proxyport, $errornum, $errorstr, $options["proxyconnecttimeout"], (isset($options["async"]) && $options["async"] ? STREAM_CLIENT_ASYNC_CONNECT : STREAM_CLIENT_CONNECT), $context);
}
if ($fp === false) return array("success" => false, "error" => self::HTTPTranslate("Unable to establish a connection to '%s'.", ($proxysecure ? $proxyprotocol . "://" : "") . $proxyhost . ":" . $proxyport), "info" => $errorstr . " (" . $errornum . ")", "errorcode" => "proxy_connect");
@@ -1540,12 +1586,13 @@ public static function RetrieveWebpage($url, $options = array())
{
$context = @stream_context_create();
if (isset($options["source_ip"])) $context["socket"] = array("bindto" => $options["source_ip"] . ":0");
if ($secure && isset($options["sslopts"]) && is_array($options["sslopts"]))
if ($secure)
{
if (!isset($options["sslopts"]) || !is_array($options["sslopts"])) $options["sslopts"] = self::GetSafeSSLOpts();
self::ProcessSSLOptions($options, "sslopts", $host);
foreach ($options["sslopts"] as $key => $val) @stream_context_set_option($context, "ssl", $key, $val);
}
$fp = @stream_socket_client($protocol . "://" . $host . ":" . $port, $errornum, $errorstr, $options["connecttimeout"], STREAM_CLIENT_CONNECT | (isset($options["async"]) && $options["async"] ? STREAM_CLIENT_ASYNC_CONNECT : 0), $context);
$fp = @stream_socket_client($protocol . "://" . $host . ":" . $port, $errornum, $errorstr, $options["connecttimeout"], (isset($options["async"]) && $options["async"] ? STREAM_CLIENT_ASYNC_CONNECT : STREAM_CLIENT_CONNECT), $context);
}
if ($fp === false) return array("success" => false, "error" => self::HTTPTranslate("Unable to establish a connection to '%s'.", ($secure ? $protocol . "://" : "") . $host . ":" . $port), "info" => $errorstr . " (" . $errornum . ")", "errorcode" => "connect_failed");
Oops, something went wrong.

0 comments on commit 1718406

Please sign in to comment.