diff --git a/.travis.yml b/.travis.yml index a585d16..0af7ea6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,3 +1,4 @@ +sudo: false language: node_js @@ -7,10 +8,18 @@ node_js: - 8 - 6 -sudo: false +matrix: + include: + - node_js: node + env: BROTLI=1 + - node_js: 6 + env: BROTLI=1 + before_install: npm i --save-only request brotli +before_install: npm i --save-only request +install: npm i after_success: npm run coverage notifications: webhooks: https://www.travisbuddy.com/?insertMode=update - on_success: never \ No newline at end of file + on_success: never diff --git a/CHANGELOG.md b/CHANGELOG.md index 1e178c2..a0aaf7b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ ## Change Log +### v4.0.0 (22/04/2019) +- Randomize `User-Agent` header with random chrome browser +- Recaptcha solving support +- Brotli non-mandatory support +- Various code changes and improvements + ### v3.9.1 (11/04/2019) - Fix for the timeout parsing @@ -11,7 +17,7 @@ ### v3.7.0 (07/04/2019) - [#182](https://github.com/codemanki/cloudscraper/pull/182) Usage examples have been added. -- [#169](https://github.com/codemanki/cloudscraper/pull/169) Cloudscraper now automatically parses out timeout for a CF challenge. `cloudflareTimeout` still can be used, but will be deprecated soon +- [#169](https://github.com/codemanki/cloudscraper/pull/169) Cloudscraper now automatically parses out timeout for a CF challenge. ### v3.6.0 (03/04/2019) - [#180](https://github.com/codemanki/cloudscraper/pull/180) Update code to parse latest CF challenge diff --git a/README.md b/README.md index 85c067c..4cfd468 100644 --- a/README.md +++ b/README.md @@ -127,6 +127,10 @@ Cloudscraper wraps request and request-promise, so using cloudscraper is pretty .catch(function (err) { }); ``` + +## Recaptcha +Cloudscraper may help you with the recaptcha page. Take a look at [this example](https://github.com/codemanki/cloudscraper/blob/master/examples/solve-recaptcha.js). 
+ ## Defaults method `cloudscraper.defaults` is a very convenient way of extending the cloudscraper requests with any of your settings. @@ -151,12 +155,16 @@ var options = { jar: requestModule.jar(), // Custom cookie jar headers: { // User agent, Cache Control and Accept headers are required + // User agent is populated by a random UA. 'User-Agent': 'Ubuntu Chromium/34.0.1847.116 Chrome/34.0.1847.116 Safari/537.36', 'Cache-Control': 'private', 'Accept': 'application/xml,application/xhtml+xml,text/html;q=0.9, text/plain;q=0.8,image/png,*/*;q=0.5' }, - // Cloudflare requires a delay of 4 seconds, so wait for at least 5. + // Cloudscraper automatically parses out timeout required by Cloudflare. + // Override cloudflareTimeout to adjust it. cloudflareTimeout: 5000, + // Reduce Cloudflare's timeout to cloudflareMaxTimeout if it is excessive + cloudflareMaxTimeout: 30000, // followAllRedirects - follow non-GET HTTP 3xx responses as redirects followAllRedirects: true, // Support only this max challenges in row. If CF returns more, throw an error @@ -227,3 +235,4 @@ Current Cloudflare implementation requires browser to respect the timeout of 5 s * [request-promise](https://github.com/request/request-promise) + diff --git a/errors.js b/errors.js index f2975bf..fee304e 100644 --- a/errors.js +++ b/errors.js @@ -9,17 +9,17 @@ // 1. There is a non-enumerable errorType attribute. // 2. The error constructor is hidden from the stacktrace. -var EOL = require('os').EOL; -var original = require('request-promise-core/errors'); -var http = require('http'); +const EOL = require('os').EOL; +const original = require('request-promise-core/errors'); +const http = require('http'); -var BUG_REPORT = format([ +const BUG_REPORT = format([ '### Cloudflare may have changed their technique, or there may be a bug.', '### Bug Reports: https://github.com/codemanki/cloudscraper/issues', '### Check the detailed exception message that follows for the cause.' 
]); -var ERROR_CODES = { +const ERROR_CODES = { // Non-standard 5xx server error HTTP status codes '520': 'Web server is returning an unknown error', '521': 'Web server is down', @@ -48,22 +48,22 @@ ERROR_CODES[1006] = ERROR_CODES[1007] = ERROR_CODES[1008] = 'Access Denied: Your IP address has been banned'; -var OriginalError = original.RequestError; +const OriginalError = original.RequestError; -var RequestError = create('RequestError', 0); -var CaptchaError = create('CaptchaError', 1); +const RequestError = create('RequestError', 0); +const CaptchaError = create('CaptchaError', 1); // errorType 4 is a CloudflareError so this constructor is reused. -var CloudflareError = create('CloudflareError', 2, function (error) { +const CloudflareError = create('CloudflareError', 2, function (error) { if (!isNaN(error.cause)) { - var description = ERROR_CODES[error.cause] || http.STATUS_CODES[error.cause]; + const description = ERROR_CODES[error.cause] || http.STATUS_CODES[error.cause]; if (description) { error.message = error.cause + ', ' + description; } } }); -var ParserError = create('ParserError', 3, function (error) { +const ParserError = create('ParserError', 3, function (error) { error.message = BUG_REPORT + error.message; }); diff --git a/examples/solve-recaptcha.js b/examples/solve-recaptcha.js new file mode 100644 index 0000000..edb1094 --- /dev/null +++ b/examples/solve-recaptcha.js @@ -0,0 +1,19 @@ +#!/usr/bin/env node + +function solveReCAPTCHA (url, sitekey, callback) { + // Here you do some magic with the sitekey provided by cloudscraper +} + +function onCaptcha (options, response, body) { + const captcha = response.captcha; + // solveReCAPTCHA is a method that you should come up with: pass it the href and sitekey, and in return it will give you a response + solveReCAPTCHA(response.request.uri.href, captcha.siteKey, (error, gRes) => { + if (error) return void captcha.submit(error); + captcha.form['g-recaptcha-response'] = gRes; + captcha.submit(); + }); +} + 
+const cloudscraper = require('..').defaults({ onCaptcha }); +var uri = process.argv[2]; +cloudscraper.get({ uri: uri, headers: { cookie: 'captcha=1' } }).catch(console.warn).then(console.log); // eslint-disable-line promise/catch-or-return diff --git a/index.js b/index.js index 0201c4f..ab7bb5c 100644 --- a/index.js +++ b/index.js @@ -1,50 +1,31 @@ 'use strict'; -var vm = require('vm'); -var requestModule = require('request-promise'); -var errors = require('./errors'); -var decodeEmails = require('./lib/email-decode.js'); - -var USER_AGENTS = [ - 'Ubuntu Chromium/34.0.1847.116 Chrome/34.0.1847.116 Safari/537.36', - 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.21 (KHTML, like Gecko) konqueror/4.14.10 Safari/537.21', - 'Mozilla/5.0 (iPad; CPU OS 5_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko ) Version/5.1 Mobile/9B176 Safari/7534.48.3', - 'Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10', - 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7)', - 'Mozilla/5.0 (Windows Phone 8.1; ARM; Trident/7.0; Touch; rv:11.0; IEMobile/11.0; NOKIA; Lumia 630) like Gecko', - 'Mozilla/5.0 (compatible; MSIE 10.0; Windows Phone 8.0; Trident/6.0; IEMobile/10.0; ARM; Touch; NOKIA; Lumia 920)', - 'Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1', - 'Mozilla/5.0 (X11; Linux x86_64; rv:2.2a1pre) Gecko/20100101 Firefox/4.2a1pre', - 'Mozilla/5.0 (SymbianOS/9.1; U; en-us) AppleWebKit/413 (KHTML, like Gecko) Safari/413 es65', - 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5X Build/MDB08L) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.124 Mobile Safari/537.36', - 'Mozilla/5.0 (X11; U; FreeBSD i386; de-CH; rv:1.9.2.8) Gecko/20100729 Firefox/3.6.8' -]; - -var DEFAULT_USER_AGENT = randomUA(); - -var VM_OPTIONS = 
{ - contextOrigin: 'cloudflare:challenge.js', - contextCodeGeneration: { strings: true, wasm: false }, - timeout: 5000 -}; +const requestModule = require('request-promise'); +const sandbox = require('./lib/sandbox'); +const decodeEmails = require('./lib/email-decode.js'); +const getDefaultHeaders = require('./lib/headers'); +const brotli = require('./lib/brotli'); + +const { + RequestError, + CaptchaError, + CloudflareError, + ParserError +} = require('./errors'); + +const HOST = Symbol('host'); module.exports = defaults.call(requestModule); function defaults (params) { // isCloudScraper === !isRequestModule - var isRequestModule = this === requestModule; + const isRequestModule = this === requestModule; - var defaultParams = (!isRequestModule && this.defaultParams) || { + let defaultParams = (!isRequestModule && this.defaultParams) || { requester: requestModule, // Cookies should be enabled jar: requestModule.jar(), - headers: { - 'Connection': 'keep-alive', - 'User-Agent': DEFAULT_USER_AGENT, - 'Cache-Control': 'private', - 'Accept': 'application/xml,application/xhtml+xml,text/html;q=0.9, text/plain;q=0.8,image/png,*/*;q=0.5', - 'Accept-Language': 'en-US,en;q=0.9' - }, + headers: getDefaultHeaders({ 'Host': HOST }), // Reduce Cloudflare's timeout to cloudflareMaxTimeout if it is excessive cloudflareMaxTimeout: 30000, // followAllRedirects - follow non-GET HTTP 3xx responses as redirects @@ -52,14 +33,17 @@ function defaults (params) { // Support only this max challenges in row. 
If CF returns more, throw an error challengesToSolve: 3, // Remove Cloudflare's email protection - decodeEmails: false + decodeEmails: false, + // Support gzip encoded responses + gzip: true }; // Object.assign requires at least nodejs v4, request only test/supports v6+ defaultParams = Object.assign({}, defaultParams, params); - var cloudscraper = requestModule.defaults + const cloudscraper = requestModule.defaults .call(this, defaultParams, function (options) { + validateRequest(options); return performRequest(options, true); }); @@ -79,9 +63,7 @@ function defaults (params) { return cloudscraper; } -// This function is wrapped to ensure that we get new options on first call. -// The options object is reused in subsequent calls when calling it directly. -function performRequest (options, isFirstRequest) { +function validateRequest (options) { // Prevent overwriting realEncoding in subsequent calls if (!('realEncoding' in options)) { // Can't just do the normal options.encoding || 'utf8' @@ -105,15 +87,26 @@ function performRequest (options, isFirstRequest) { 'got ' + typeof (options.cloudflareMaxTimeout) + ' instead.'); } - // This should be the default export of either request or request-promise. - var requester = options.requester; - - if (typeof requester !== 'function') { + if (typeof options.requester !== 'function') { throw new TypeError('Expected `requester` option to be a function, got ' + - typeof (requester) + ' instead.'); + typeof (options.requester) + ' instead.'); } +} - var request = requester(options); +// This function is wrapped to ensure that we get new options on first call. +// The options object is reused in subsequent calls when calling it directly. +function performRequest (options, isFirstRequest) { + // This should be the default export of either request or request-promise. 
+ const requester = options.requester; + + // Note that request is always an instanceof ReadableStream, EventEmitter + // If the requester is request-promise, it is also thenable. + const request = requester(options); + + // We must define the host header ourselves to preserve case and order. + if (request.getHeader('host') === HOST) { + request.setHeader('host', request.uri.host); + } // If the requester is not request-promise, ensure we get a callback. if (typeof request.callback !== 'function') { @@ -139,7 +132,7 @@ function performRequest (options, isFirstRequest) { onRequestResponse(options, null, response, body); }); - // Indicate that this is a cloudscraper request, required by test/helper. + // Indicate that this is a cloudscraper request request.cloudscraper = true; return request; } @@ -147,12 +140,12 @@ function performRequest (options, isFirstRequest) { // The argument convention is options first where possible, options // always before response, and body always after response. function onRequestResponse (options, error, response, body) { - var callback = options.callback; + const callback = options.callback; // Encoding is null so body should be a buffer object if (error || !body || !body.toString) { // Pure request error (bad connection, wrong url, etc) - return callback(new errors.RequestError(error, options, response)); + return callback(new RequestError(error, options, response)); } response.responseStartTime = Date.now(); @@ -164,23 +157,33 @@ function onRequestResponse (options, error, response, body) { return callback(null, response, body); } + // Decompress brotli compressed responses + if (/\bbr\b/i.test('' + response.caseless.get('content-encoding'))) { + if (!brotli.isAvailable) { + const cause = 'Received a Brotli compressed response. 
Please install brotli'; + return callback(new RequestError(cause, options, response)); + } + + response.body = body = brotli.decompress(body); + } + if (response.isCloudflare && response.isHTML) { onCloudflareResponse(options, response, body); } else { - processResponseBody(options, response, body); + onRequestComplete(options, response, body); } } function onCloudflareResponse (options, response, body) { - var callback = options.callback; + const callback = options.callback; - var stringBody; - var isChallenge; - var isRedirectChallenge; + let stringBody; + let isChallenge; + let isRedirectChallenge; if (body.length < 1) { // This is a 4xx-5xx Cloudflare response with an empty body. - return callback(new errors.CloudflareError(response.statusCode, options, response)); + return callback(new CloudflareError(response.statusCode, options, response)); } stringBody = body.toString('utf8'); @@ -188,104 +191,106 @@ function onCloudflareResponse (options, response, body) { try { validate(options, response, stringBody); } catch (error) { + if (error instanceof CaptchaError && typeof options.onCaptcha === 'function') { + // Give users a chance to solve the reCAPTCHA via services such as anti-captcha.com + return onCaptcha(options, response, stringBody); + } + return callback(error); } isChallenge = stringBody.indexOf('a = document.getElementById(\'jschl-answer\');') !== -1; if (isChallenge) { - return solveChallenge(options, response, stringBody); + return onChallenge(options, response, stringBody); } isRedirectChallenge = stringBody.indexOf('You are being redirected') !== -1 || stringBody.indexOf('sucuri_cloudproxy_js') !== -1; if (isRedirectChallenge) { - return setCookieAndReload(options, response, stringBody); + return onRedirectChallenge(options, response, stringBody); } // 503 status is always a challenge if (response.statusCode === 503) { - return solveChallenge(options, response, stringBody); + return onChallenge(options, response, stringBody); } // All is good - 
processResponseBody(options, response, body); + onRequestComplete(options, response, body); } function validate (options, response, body) { - var match; + let match; // Finding captcha if (body.indexOf('why_captcha') !== -1 || /cdn-cgi\/l\/chk_captcha/i.test(body)) { - throw new errors.CaptchaError('captcha', options, response); + // Convenience boolean + response.isCaptcha = true; + throw new CaptchaError('captcha', options, response); } // Trying to find '1006' match = body.match(/<\w+\s+class="cf-error-code">(.*)<\/\w+>/i); if (match) { - var code = parseInt(match[1]); - throw new errors.CloudflareError(code, options, response); + let code = parseInt(match[1]); + throw new CloudflareError(code, options, response); } return false; } -function solveChallenge (options, response, body) { - var callback = options.callback; - var cause; - var error; +function onChallenge (options, response, body) { + const callback = options.callback; + const uri = response.request.uri; + // The query string to send back to Cloudflare + const payload = { /* s, jschl_vc, pass, jschl_answer */ }; + + let cause; + let error; if (options.challengesToSolve === 0) { cause = 'Cloudflare challenge loop'; - error = new errors.CloudflareError(cause, options, response); + error = new CloudflareError(cause, options, response); error.errorType = 4; return callback(error); } - var timeout = parseInt(options.cloudflareTimeout); - var uri = response.request.uri; - // The query string to send back to Cloudflare - // var payload = { s, jschl_vc, pass, jschl_answer }; - var payload = {}; - var sandbox; - var match; + let timeout = parseInt(options.cloudflareTimeout); + let match; match = body.match(/name="s" value="(.+?)"/); - if (match) { payload.s = match[1]; } match = body.match(/name="jschl_vc" value="(\w+)"/); - if (!match) { cause = 'challengeId (jschl_vc) extraction failed'; - return callback(new errors.ParserError(cause, options, response)); + return callback(new ParserError(cause, options, 
response)); } payload.jschl_vc = match[1]; match = body.match(/name="pass" value="(.+?)"/); - if (!match) { cause = 'Attribute (pass) value extraction failed'; - return callback(new errors.ParserError(cause, options, response)); + return callback(new ParserError(cause, options, response)); } payload.pass = match[1]; match = body.match(/getElementById\('cf-content'\)[\s\S]+?setTimeout.+?\r?\n([\s\S]+?a\.value\s*=.+?)\r?\n(?:[^{<>]*},\s*(\d{4,}))?/); - if (!match) { cause = 'setTimeout callback extraction failed'; - return callback(new errors.ParserError(cause, options, response)); + return callback(new ParserError(cause, options, response)); } if (isNaN(timeout)) { - if (match.length > 2) { + if (match[2] !== undefined) { timeout = parseInt(match[2]); if (timeout > options.cloudflareMaxTimeout) { @@ -297,7 +302,7 @@ function solveChallenge (options, response, body) { } } else { cause = 'Failed to parse challenge timeout'; - return callback(new errors.ParserError(cause, options, response)); + return callback(new ParserError(cause, options, response)); } } @@ -305,16 +310,16 @@ function solveChallenge (options, response, body) { response.challenge = match[1] + '; a.value'; try { - sandbox = createSandbox({ uri: uri, body: body }); - payload.jschl_answer = vm.runInNewContext(response.challenge, sandbox, VM_OPTIONS); + const ctx = new sandbox.Context({ hostname: uri.hostname, body }); + payload.jschl_answer = sandbox.eval(response.challenge, ctx); } catch (error) { error.message = 'Challenge evaluation failed: ' + error.message; - return callback(new errors.ParserError(error, options, response)); + return callback(new ParserError(error, options, response)); } if (isNaN(payload.jschl_answer)) { cause = 'Challenge answer is not a number'; - return callback(new errors.ParserError(cause, options, response)); + return callback(new ParserError(cause, options, response)); } // Prevent reusing the headers object to simplify unit testing. 
@@ -328,44 +333,151 @@ function solveChallenge (options, response, body) { } // Set the query string and decrement the number of challenges to solve. options.qs = payload; - options.challengesToSolve = options.challengesToSolve - 1; + options.challengesToSolve -= 1; // Make request with answer after delay. timeout -= Date.now() - response.responseStartTime; setTimeout(performRequest, timeout, options, false); } -function setCookieAndReload (options, response, body) { - var callback = options.callback; +// Parses the reCAPTCHA form and hands control over to the user +function onCaptcha (options, response, body) { + const callback = options.callback; + // UDF that has the responsibility of returning control back to cloudscraper + const handler = options.onCaptcha; + // The form data to send back to Cloudflare + const payload = { /* s, g-recaptcha-response */ }; + + let cause; + let match; + + match = body.match(/