diff --git a/configs/ghostery.js b/configs/ghostery.js index b221fef..d404a14 100644 --- a/configs/ghostery.js +++ b/configs/ghostery.js @@ -17,7 +17,8 @@ module.exports = { publish: publish.toEdge('browser-core', 'ghostery'), sourceMaps: false, format: 'common', - settings: { ...urls, + settings: { + ...urls, channel: 'CH80', MSGCHANNEL: 'web-extension', URL_CHANGED_EVENT_DEBOUNCE: 500, @@ -33,7 +34,10 @@ module.exports = { }, }, HUMAN_WEB_LITE_COLLECTOR_VIA_PROXY: 'https://collector-hpn.ghostery.net', - HUMAN_WEB_LITE_COLLECTOR_DIRECT: 'https://collector-hpn.ghostery.net' }, + HUMAN_WEB_LITE_COLLECTOR_DIRECT: 'https://collector-hpn.ghostery.net', + HUMAN_WEB_LITE_PATTERNS: 'https://cdn2.ghostery.com/human-web-android/patterns.json', + HUMAN_WEB_LITE_AUTO_TRIGGER: true + }, default_prefs: { 'modules.human-web.enabled': true, 'modules.antitracking.enabled': true, diff --git a/modules/human-web-lite/sources/html-parser.es b/modules/human-web-lite/sources/html-parser.es index cfef0ee..15c721b 100644 --- a/modules/human-web-lite/sources/html-parser.es +++ b/modules/human-web-lite/sources/html-parser.es @@ -6,23 +6,10 @@ * file, You can obtain one at https://mozilla.org/MPL/2.0/. */ -// TODO: here we need something that works in react-native. -// The following code is copied from human-web and probably -// will not work outside of the browser. -// -// If there are problems, maybe one of the libraries mentioned here -// can be used on Mobile: -// * https://stackoverflow.com/q/38343951/783510 -// -// Note: In jsdom, this implementation should work: -// -// return new JSDOM(html).window.document; -// -// However, it is hard to use jsdom outside of NodeJs. -// I failed to get it working in the Browser, for example. 
-// import window from '../core/globals-window'; +// TODO: consider using linkedom here as well +// (Note: we need something that works in react-native) export default function parseHtml(html) { if (!parseHtml.domParser) { parseHtml.domParser = new window.DOMParser(); diff --git a/modules/human-web-lite/sources/http.es b/modules/human-web-lite/sources/http.es index 92a2144..e6cf7ec 100644 --- a/modules/human-web-lite/sources/http.es +++ b/modules/human-web-lite/sources/http.es @@ -9,23 +9,90 @@ const SECOND = 1000; -// TODO: Can we use the "AbortController" API on Mobile? Otherwise, -// running jobs in background might be difficult because we risk that -// the app will be killed. -export async function anonymousHttpGet(url, { timeout = 15 * SECOND } = {}) { +/** + * Performs a HTTP Get. + * + * Optional: + * - headers: allows to overwrite headers + * - redirect: 'follow' or 'manual' (default; i.e. redirects are errors) + */ +export async function anonymousHttpGet(url, { + headers = null, + redirect = null, + timeout = 15 * SECOND, +} = {}) { const options = { credentials: 'omit', mode: 'no-cors', - redirect: 'manual', + redirect: redirect || 'manual', // TODO: Or maybe this does work? It is not part of the fetch standard, // but I have seen it in some react-native examples. // If it works, it could be used if AbortController is not available. timeout, }; - const response = await fetch(url, options); - if (!response.ok) { - throw new Error(`Failed to fetch url ${url}: ${response.statusText}`); + + // The following code overwrite the headers of the request. + // Note that "fetch" allows to overwrite headers in a simple declarative way, + // but unfortunately it is limited. For example, it is not possible to + // overwrite the cookie headers. The following code will work for all + // type of headers. + // + // The matching logic is not perfect but should be fairly accurate. 
+ // Ideally, we would want to run the handler only for the request that we + // are about to trigger, but not for any other requests to avoid unintended + // side-effects. To mitigate the risk, uninstall the handler at the first + // opportunity: either if it is called or if the request finished + // (and we know the handle will never be called). + let webRequestHandler; + const uninstallHandler = () => { + if (webRequestHandler) { + chrome.webRequest.onBeforeSendHeaders.removeListener(webRequestHandler); + webRequestHandler = null; + } + }; + const headerNames = Object.keys(headers || {}); + if (headerNames.length > 0) { + webRequestHandler = (details) => { + if (details.url !== url || details.type !== 'xmlhttprequest' || details.method !== 'GET') { + // does that match the request that we intended to trigger + return {}; + } + + // match: now we can already deregister the listener + // (it should not be executed multiple times) + uninstallHandler(); + const normalizedHeaders = headerNames.map(x => x.toLowerCase()); + + /* eslint-disable no-param-reassign */ + details.requestHeaders = details.requestHeaders.filter( + header => !normalizedHeaders.includes(header.name.toLowerCase()) + ); + + headerNames.forEach((name) => { + details.requestHeaders.push({ + name, + value: headers[name], + }); + }); + + return { + requestHeaders: details.requestHeaders + }; + }; + chrome.webRequest.onBeforeSendHeaders.addListener(webRequestHandler, { + urls: [url] + }, ['blocking', 'requestHeaders']); + } + + try { + const response = await fetch(url, options); + if (!response.ok) { + throw new Error(`Failed to fetch url ${url}: ${response.statusText}`); + } + uninstallHandler(); + return response.text(); + } finally { + uninstallHandler(); } - return response.text(); } diff --git a/modules/human-web-lite/sources/human-web-lite.es b/modules/human-web-lite/sources/human-web-lite.es index 644f845..fc0de2e 100644 --- a/modules/human-web-lite/sources/human-web-lite.es +++ 
b/modules/human-web-lite/sources/human-web-lite.es @@ -6,6 +6,8 @@ * file, You can obtain one at https://mozilla.org/MPL/2.0/. */ +import Patterns from './patterns'; +import PatternsUpdater from './patterns-updater'; import Sanitizer from './sanitizer'; import UrlAnalyzer from './url-analyzer'; import MessageSender from './message-sender'; @@ -21,8 +23,16 @@ export default class HumanWebLite { // to collect data. this.isActive = false; + this.patterns = new Patterns(); + this.patternsUpdater = new PatternsUpdater({ + config, + patterns: this.patterns, + storage, + storageKey: 'patterns', + }); + this.sanitizer = new Sanitizer(config); - this.urlAnalyzer = new UrlAnalyzer(); + this.urlAnalyzer = new UrlAnalyzer(this.patterns); this.persistedHashes = new PersistedHashes({ storage, storageKey: 'deduplication_hashes', @@ -32,6 +42,7 @@ export default class HumanWebLite { this.messageSender = new MessageSender(this.duplicateDetector, hpn); this.searchExtractor = new SearchExtractor({ config, + patterns: this.patterns, sanitizer: this.sanitizer, persistedHashes: this.persistedHashes, }); @@ -39,9 +50,7 @@ export default class HumanWebLite { } async init() { - // TODO: In a feature-complete implementation, you would need - // to have a mechanism to keep the extraction patterns up-to-date. - // As we have hard-coded patterns, there is nothing to do here. + await this.patternsUpdater.init(); this.isActive = true; } @@ -78,7 +87,19 @@ export default class HumanWebLite { return true; } - processPendingJobs() { + async processPendingJobs() { + await this._ensurePatternsAreUpToDate(); return this.jobScheduler.processPendingJobs(); } + + async _ensurePatternsAreUpToDate() { + // Currently, the PatternsUpdater needs to be externally triggered. + // This implementation detail could be avoided, if the PatternsUpdater + // could use a browser API like timers in persistent background pages + // or the Alert API (Manifest V3). 
+ // The "update" function is a quick operation except in the rare + // situation that the patterns are outdated and need to be fetched. + // Thus, there should be no harm in calling it here. + await this.patternsUpdater.update(); + } } diff --git a/modules/human-web-lite/sources/patterns-updater.es b/modules/human-web-lite/sources/patterns-updater.es new file mode 100644 index 0000000..33ca18a --- /dev/null +++ b/modules/human-web-lite/sources/patterns-updater.es @@ -0,0 +1,282 @@ +/*! + * Copyright (c) 2014-present Cliqz GmbH. All rights reserved. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ + +import logger from './logger'; + +const SECOND = 1000; +const MINUTE = 60 * SECOND; +const HOUR = 60 * MINUTE; + +/** + * If you need to introduce incompatible changes to the state + * persistence, you can bump this number to clear the persisted cache. + * It will be as if you start with a fresh installation of the extension. + * + * (If you are not sure whether the change is incompatible, it is + * a good idea to be conservative and bump this number anyway; it will + * only introduce little overhead.) + */ +const DB_VERSION = 1; + +function randBetween(min, max) { + return min + Math.random() * (max - min); +} + +function clamp({ min, max, value }) { + return Math.min(Math.max(min, value), max); +} + +/** + * Responsible for keeping the patterns up-to-date by polling + * the backend for changes. By design, it supports situations where + * the extension is frequently restarted; in other words, it is + * designed to work in an ephemeral service worker model and + * does not require a persistent background. + * + * The current implementation assumes that the "update" function + * gets triggered frequently enough to check for new patterns + * within the configured intervals. 
+ * + * (If the need to trigger the "update" function becomes a problem, + * it could be lifted by using alternative browser APIs such as + * the "Alarms API". But as calling update is a fast operation + * and the results are cached, it is safe to call it before + * each access to the patterns.) + */ +export default class PatternsUpdater { + constructor({ config, patterns, storage, storageKey, _fetchImpl }) { + this.patterns = patterns; + this.storage = storage; + this.storageKey = storageKey; + this.patternUpdateUrl = config.HUMAN_WEB_LITE_PATTERNS; + + // Update intervals: + // 1) standard polling interval + // (it can be configured conservatively to save bandwidth) + this.defaultUpdateInterval = { + min: 8 * HOUR, + max: 24 * HOUR, + }; + + // 2) accelerated polling intervals + // It can happen that a pattern update is followed quickly by another one; + // so, temporarily switch to a faster polling frequency to see these + // as fast as possible. It is desirable that clients do not fall into + // different populations with different pattern versions. + this.fastUpdateInterval = { + min: 30 * MINUTE, + max: 2 * HOUR, + }; + this._initEmptyCache(); + + // Wrap browser APIs (primarily to swap them out in the unit tests): + // this._fetch works like the native "fetch" + this._fetch = _fetchImpl || (() => { + try { + // Apologies, this part is a bit esoteric. Do not simplify without + // testing on different devices (Android)! + // Both the extra assignment and explicit wrapping are intended. + const nativeFetch = fetch; // fail fast if "fetch" is not available + return (...args) => nativeFetch(...args); + } catch (e) { + const msg = 'fetch API is not available. You should never see this warning in production! 
' + + 'For unit tests, you can pass _fetchImpl to provide an implementation.'; + logger.warn(msg); + return () => { + throw new Error(msg); + }; + } + })(); + } + + _initEmptyCache() { + this._persistedState = { + patterns: null, // string (unmodified as it came from the server) + skipAttemptsUntil: 0, // unix epoch + lastFetchAttempt: 0, // unix epoch + lastConfirmedModification: 0, // unix epoch + failedAttemptsInARow: 0, + dbVersion: DB_VERSION, + }; + } + + async _savePersistedState() { + return this.storage.set(this.storageKey, this._persistedState); + } + + async init({ now = Date.now() } = {}) { + let force = false; + try { + let persistedState = await this.storage.get(this.storageKey); + if (persistedState && persistedState.dbVersion !== DB_VERSION) { + logger.info('DB_VERSION changed. Discarding the cache...'); + persistedState = null; + } + if (persistedState && !this._timestampsLookValid(persistedState, now)) { + logger.warn('The timestamps in the pattern cache show indications that the system clock was off. Discarding the cache:', persistedState); + persistedState = null; + } + + if (persistedState) { + if (persistedState.patterns) { + this.patterns.updatePatterns(JSON.parse(persistedState.patterns)); + } + this._persistedState = persistedState; + } else { + logger.info('Pattern cache does not exist. This should only happen on the first time the extension is started.'); + force = true; + } + } catch (e) { + logger.warn('Failed to load cached patterns from disk. Forcing an update.'); + force = true; + } + + try { + await this.update({ force, now }); + } catch (e) { + logger.warn('Failed to fetch patterns', e); + } + } + + /** + * Poll for pattern updates. It will block until the update operation + * is finished. + * + * Cooldowns: + * ---------- + * To make this function easier to use, the caller is not responsible + * for throttling updates. In fact, it is better if the update function + * is called as often as possible. 
+ * + * Error handling: + * --------------- + * It is not the responsibility of the caller to schedule retry attempts, + * but only to guarantee that the "update" function is called regularly. + * As it is generally not useful to learn about failure, the returned + * promise will always resolve. But if you want to know whether it really + * succeeded or not - for logging or debugging purposes - you can override + * the default by passing "ignoreErrors: false". + */ + async update({ force = false, ignoreErrors = true, now = Date.now() } = {}) { + try { + await this._update({ force, now }); + } catch (e) { + if (!ignoreErrors) { + throw e; + } + logger.debug('Failed to update patterns. It is safe to continue.'); + } + } + + async _update({ force = false, now = Date.now() } = {}) { + if (!force && now < this._persistedState.skipAttemptsUntil) { + logger.debug('Cooldown not reached yet. Need to wait until', this._persistedState.skipAttemptsUntil, 'before updating patterns again.'); + if (this._persistedState.failedAttemptsInARow > 0) { + throw new Error('Unable to fetch patterns. Need to wait for the cooldown to finish before retrying...'); + } + return; + } + + const otherUpdate = this._pendingUpdate; + if (otherUpdate) { + logger.debug('Pattern update already in progress...'); + await otherUpdate; + return; + } + + let done; + this._pendingUpdate = new Promise((resolve) => { done = resolve; }); + try { + this._persistedState.lastFetchAttempt = now; + const url = this.patternUpdateUrl; + const response = await this._fetch(url, { + method: 'GET', + credentials: 'omit', + }); + if (!response.ok) { + throw new Error(`Failed to fetch patterns (${response.statusText}) from url=${url}`); + } + const newPatterns = await response.text(); + const rules = JSON.parse(newPatterns); + + // Bookkeeping to detect if the server just released an update. + // It is useful for debugging, but it also means we should + // temporarily poll more frequently for updates. 
+ const oldPatterns = this._persistedState.patterns; + this._persistedState.patterns = newPatterns; + const detectedModification = oldPatterns && oldPatterns !== newPatterns; + if (detectedModification) { + logger.info('The server released new patterns:', rules); + this._persistedState.lastConfirmedModification = now; + } + + // successful update: set cooldown + this._persistedState.failedAttemptsInARow = 0; + const { min, max } = detectedModification + ? this.fastUpdateInterval : this.defaultUpdateInterval; + this._persistedState.skipAttemptsUntil = now + randBetween(min, max); + + // apply the pattern update + if (!oldPatterns || detectedModification) { + this.patterns.updatePatterns(rules); + } + } catch (e) { + // the update failed: we have to retry, but approximate an + // exponential backoff to prevent a burst HTTP calls. + this._persistedState.failedAttemptsInARow += 1; + const avgCooldown = this._persistedState.failedAttemptsInARow * (10 * SECOND); + const noisyCooldown = randBetween(avgCooldown / 1.5, 1.5 * avgCooldown); + const finalCooldown = clamp({ + value: noisyCooldown, + min: 3 * SECOND, + max: 8 * HOUR, + }); + this._persistedState.skipAttemptsUntil = now + finalCooldown; + logger.warn('Failed to fetch pattern. Cooldown until:', + new Date(this._persistedState.skipAttemptsUntil), e); + throw e; + } finally { + try { + await this._savePersistedState(); + } catch (e) { + logger.warn('Failed to update patterns cache.', e); + } + this._pendingUpdate = null; + done(); + } + } + + /** + * Run some sanity checks on the timestamps. If any of the timestamps + * were produced from a clock that was in the future, updates will + * stop working. If we detect such a case, we should purge the cache + * and start from scratch. + * + * Staying forever on outdated patterns would also increase the risk + * of sticking out from the crowd. 
+ */ + _timestampsLookValid(persistedState, now = Date.now()) { + const maxCooldown = Math.max( + this.defaultUpdateInterval.min, + this.defaultUpdateInterval.max, + this.fastUpdateInterval.min, + this.fastUpdateInterval.max + ); + + // Small jumps are not a concern (some minutes). But it becomes a + // problem if the clock jumped too far into the future (months). + // Systems where the system clock is off are rare, but they + // can happen (e.g. https://github.com/systemd/systemd/issues/6036). + const allowedDrift = 5 * MINUTE; + const isOK = ts => ts < now + allowedDrift; + + return isOK(persistedState.skipAttemptsUntil - maxCooldown) + && isOK(persistedState.lastConfirmedModification) + && isOK(persistedState.lastFetchAttempt); + } +} diff --git a/modules/human-web-lite/sources/patterns.es b/modules/human-web-lite/sources/patterns.es new file mode 100644 index 0000000..8c534a5 --- /dev/null +++ b/modules/human-web-lite/sources/patterns.es @@ -0,0 +1,223 @@ +/*! + * Copyright (c) 2014-present Cliqz GmbH. All rights reserved. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ + +import logger from './logger'; +import { sanitizeUrl } from './sanitizer'; + +function expectString(arg) { + if (typeof arg !== 'string') { + throw new Error(`Expected string argument but got: ${arg}`); + } +} + +function expectInteger(arg) { + if ((typeof arg !== 'number') || (arg % 1 !== 0)) { + throw new Error(`Expected integer argument but got: ${arg}`); + } +} + +/** + * A list of predefined string transformations that can be specified + * in the DSL in the "transforms" definition. + * + * Notes: + * - All transformations are stateless and must be free of side-effects. + * - If a single step returns "null", the following steps will + * not be executed. 
+ * - The first argument is the current value (the accumulator), + * but extra parameters can be defined in the DSL; these will be + * passed to the function as additional arguments. + * + * Preventing remote code execution + * -------------------------------- + * + * The predefined functions need to be carefully checked. To illustrate + * the threat model, let us look at a constructed example first: + * + * badIdea: (x, param) => eval(param) + * + * Now, if an attacker compromises the servers and gets control to push + * malicious pattern updates, the function could be exploited: + * + * ["badIdea", ""]. + * + * Be careful not to introduce a function that allows an attack + * like that. That is why it is so important to keep the function free + * of side-effects! + * + * ---------------------------------------------------------------------- + * + * Additional warnings: + * + * 1) Do not allow DoS (be careful when looping; if possible avoid any loops): + * + * As long as the functions are free of side-effects, the worst possible + * attack would be denial-of-service (in other words, someone could push a + * rule that results in an infinite loop). So, also be careful when using + * explicit loops - there should be no need for it anyway. + * Best keep the transformations simple. + * + * 2) Do not trust the parameters: + * + * Note that an attacker will be able to control the arguments passed + * into the function: + * - extra parameters are under direct control (as they are taken + * from the rule definitions) + * - the first parameter (the accumulator) is more difficult to + * control but expect that it is prudent to assume that it can + * be controlled as well (e.g., if a user can be tricked to visit + * any website where the attacker can control text) + * + * As long as you avoid side-effects and loops, critical exploits + * are not possible, but again there are DoS type attacks. 
+ * + * For instance, if you are writing a rule with an parameter that will + * be used as a regular expression, be careful. What will happen if the + * attacker pushes a rule with a long regular expression that may lead + * to exponential backtracking? Think about these kind of attacks and + * about mitigations (e.g. reject overly long parameters). + * Again, it is best to keep the functions simple to avoid any surprises. + * + * ---------------------------------------------------------------------- + * + * Error handling: + * + * 1) Throwing an exception is supported. In that case, expect the whole + * rule to be skipped (no message will be sent). In other words, reserve + * it for unexpected cases. + * 2) Returning "null"/"undefined" has the semantic of stopping the + * execution without an error. It is still possible that a + * message will be sent, but with a missing value. + */ +const TRANSFORMS = new Map(Object.entries({ + /** + * Extracts a given query parameter and decodes it. + * + * Example ["queryParam", "foo"]: + * - "https://example.test/path?foo=bar+baz" -> "bar baz" + * - "/example.test/path?foo=bar+baz" -> "bar baz" + * - "/example.test/path" -> null + * - "This is a string but not an URL" -> null + */ + queryParam: (x, queryParam) => { + expectString(x); + expectString(queryParam); + try { + // we only need the query parameter, but to handle relative + // URLs we have to pass a base URL (any domain will work) + return new URL(x, 'http://x').searchParams.get(queryParam); + } catch (e) { + return null; + } + }, + + /** + * Given a URL, it runs a set of extra checks to filter out + * parts that may be sensitive (i.e. keeping only the hostname), + * or even drop it completely. 
+ */ + maskU: (x) => { + expectString(x); + try { + return sanitizeUrl(x).safeUrl; + } catch (e) { + return null; + } + }, + + split: (x, splitON, arrPos) => { + expectString(x); + expectString(splitON); + expectInteger(arrPos); + + const parts = x.split(splitON); + if (parts.length === 1) { + return null; + } + return parts[arrPos] ?? null; + }, + + trySplit: (x, splitON, arrPos) => { + expectString(x); + expectString(splitON); + expectInteger(arrPos); + + return x.split(splitON)[arrPos] || x; + }, + + decodeURIComponent: (x) => { + expectString(x); + return decodeURIComponent(x); + }, +})); + +export function lookupBuiltinTransform(name) { + const transform = TRANSFORMS.get(name); + if (transform) { + return transform; + } + throw new Error(`Unknown transformation: "${name}"`); +} + +/** + * Represents the currently active rules. + * + * It is updated by the PatternsUpdater, which polls + * the server for updates. + */ +export default class Patterns { + constructor() { + this._rules = {}; + } + + updatePatterns(rules) { + logger.info('Loaded patterns:', rules); + this._rules = rules; + } + + /** + * Grants access to the active patterns. It is guaranteed that the + * returned object will not be modified. + * + * If you plan to perform multiple operations, it is recommended + * to call this function once and then operate on this snapshot. + * Even though it is unlikely, patterns can change at any point + * in time. As long as you operate on the snapshot, you do not have + * to worry about it. + */ + getRulesSnapshot() { + return this._rules; + } + + /** + * Constructs a "doublefetchRequest" object, which defines the doublefetch + * requests for the given URL. + * + * Example outputs: + * 1. { url: 'https://example.test/foo', redirect: 'follow', headers: { Cookie: 'bar' } } + * - allow redirects and overwrite the "Cookie" HTTP headers (as 'Cookie: bar') + * 2. 
{ url: 'https://example.test/foo' } + * - do not allow redirects and do not overwrite headers + */ + createDoublefetchRequest(msgType, url) { + if (!this._rules[msgType]) { + return null; + } + const doublefetchRequest = { url }; + if (this._rules[msgType].doublefetch) { + const { headers, followRedirects } = this._rules[msgType].doublefetch; + if (followRedirects) { + doublefetchRequest.redirect = 'follow'; + } + if (headers) { + doublefetchRequest.headers = headers; + } + } + return doublefetchRequest; + } +} diff --git a/modules/human-web-lite/sources/sanitizer.es b/modules/human-web-lite/sources/sanitizer.es index c5164e5..e0b48f1 100644 --- a/modules/human-web-lite/sources/sanitizer.es +++ b/modules/human-web-lite/sources/sanitizer.es @@ -9,57 +9,50 @@ import logger from './logger'; import prefs from '../core/prefs'; -// STUB: To have something to start with. Note that these rules -// are not a full port of original rules in human-web.es. -// -// I copied the subset of the rules that we extracted for the -// server (where it is used a last defence against old clients). -// -/* eslint-disable */ -function isSuspiciousQueryStub(query) { - logger.info('[STUB] isSuspiciousQuery is not fully ported.'); +/** + * Returns true if the given string contains any text that looks + * like an email address. The check is conservative, that means + * false positives are expected, but false negatives are not. 
+ */ +function checkForEmail(str) { + return /[a-z0-9\-_@]+(@|%40|%(25)+40)[a-z0-9\-_]+\.[a-z0-9\-_]/i.test(str); +} - // copied from the navigation extension client code (human-web.es): - function checkForLongNumber(str, max_number_length) { - const cstr = str.replace(/[^A-Za-z0-9]/g, ''); - - let lcn = 0; - let maxlcn = 0; - let maxlcnpos = null; - - for (let i=0;i= '0' && cstr[i] <= '9') { - lcn+=1; - } else { - if (lcn > maxlcn) { - maxlcn = lcn; - maxlcnpos = i; - lcn = 0; - } else { - lcn = 0; - } - } - } +function checkForLongNumber(str, maxNumberLength) { + const cstr = str.replace(/[^A-Za-z0-9]/g, ''); + + let lcn = 0; + let maxlcn = 0; + let maxlcnpos = null; - if (lcn > maxlcn) { + for (let i = 0; i < cstr.length; i += 1) { + if (cstr[i] >= '0' && cstr[i] <= '9') { + lcn += 1; + } else if (lcn > maxlcn) { maxlcn = lcn; - maxlcnpos = cstr.length; + maxlcnpos = i; lcn = 0; } else { lcn = 0; } + } - if (maxlcnpos!=null && maxlcn > max_number_length) { - return cstr.slice(maxlcnpos-maxlcn, maxlcnpos); - } - - return null; + if (lcn > maxlcn) { + maxlcn = lcn; + maxlcnpos = cstr.length; + lcn = 0; + } else { + lcn = 0; } - // copied from the navigation extension client code (human-web.es): - function checkForEmail(str) { - return /[a-z0-9\-_@]+(@|%40|%(25)+40)[a-z0-9\-_]+\.[a-z0-9\-_]/i.test(str); + if (maxlcnpos != null && maxlcn > maxNumberLength) { + return cstr.slice(maxlcnpos - maxlcn, maxlcnpos); } + return null; +} + +function isSuspiciousQueryStub(query) { + logger.info('[STUB] isSuspiciousQuery is not fully ported.'); // Remove the msg if the query is too long, if (query.length > 50) { @@ -78,18 +71,18 @@ function isSuspiciousQueryStub(query) { return true; } - //Remove if email (exact), even if not totally well formed + // Remove if email (exact), even if not totally well formed if (checkForEmail(query)) { return true; } - //Remove if query looks like an http password + // Remove if query looks like an http password if (/[^:]+:[^@]+@/.test(query)) { 
return true; } const v = query.split(' '); - for (let i=0; i 20) { return true; } @@ -100,7 +93,117 @@ function isSuspiciousQueryStub(query) { return false; } -/* eslint-enable */ + +function tryParseUrl(url) { + try { + return new URL(url); + } catch (e) { + return null; + } +} + +function checkForInternalIp(hostname) { + // TODO: this could be extended to detect more cases + return hostname === 'localhost' || hostname === '127.0.0.1'; +} + +/** + * There should be no reason for these URLs to show up, but if they do + * we should never send them to the backend. Especially, "moz-extension" + * is problematic, as it includes an id that is unique per user and + * can be used to link messages. + */ +function urlLeaksExtensionId(url) { + return url.startsWith('moz-extension://') || url.startsWith('chrome-extension://'); +} + +/** + * Sanity checks to protect against accidentally sending sensitive URLs. + * + * There are three possible outcomes: + * 1) "safe": URL can be accepted as is + * 2) "truncated": URL may have sensitive parts but can be truncated + * (i.e. keep the hostname but remove the rest) + * 3) "dropped": URL is corrupted or unsafe + * + * Expectations: this function should be seen as an additional layer of defence, + * but do not expect it to detect all situations. Instead, make sure to extract + * only URLs where the context is safe. Otherwise, you are expecting too + * much from this static classifier. + * + * When changing or adding rules here, it is OK to be conservative. Since + * classification errors are expected, rather err on the side of + * dropping (or truncating) too much. 
+ */ +export function sanitizeUrl(url) { + const accept = () => ({ result: 'safe', safeUrl: url }); + const drop = reason => ({ result: 'dropped', safeUrl: null, reason }); + + // first run some sanity check on the structure of the URL + const parsedUrl = tryParseUrl(url); + if (!parsedUrl) { + return drop('invalid URL'); + } + if (parsedUrl.username) { + return drop('URL sets username'); + } + if (parsedUrl.password) { + return drop('URL sets password'); + } + if (parsedUrl.port && (parsedUrl.port !== '80' && parsedUrl.port !== '443')) { + return drop('URL has uncommon port'); + } + if (parsedUrl.protocol !== 'http:' && parsedUrl.protocol !== 'https:') { + return drop('URL has uncommon protocol'); + } + if (checkForInternalIp(parsedUrl.hostname)) { + return drop('URL is not public'); + } + if (urlLeaksExtensionId(url)) { + return drop('URL leaks extension ID'); + } + + try { + // At this point, the most problematic URLs should be gone; + // now we can also decide to truncate it by limiting it to the hostname. + // + // Often, that is a good compromise, as it still provides value + // but the risk that it contains sensitive information is limited. + // Note that even on https, the hostname will be shared in plaintext, + // so it is less likely that sites include secrets or personal + // identifiers in the hostname. 
+ const truncate = (reason) => { + const safeUrl = `${parsedUrl.protocol}//${parsedUrl.hostname}/ (PROTECTED)`; + logger.debug('sanitizeUrl truncated URL:', url, '->', safeUrl); + return { + result: 'truncated', + safeUrl, + reason, + }; + }; + + // TODO: these rules could use some polishing + if (parsedUrl.hostname.length > 50) { + return drop('hostname too long'); + } + if (url.length > 200) { + return truncate('url too long'); + } + + const decodedUrl = decodeURIComponent(url); + if (checkForEmail(url) || checkForEmail(decodedUrl)) { + return truncate('potential email found'); + } + + // TODO: check each path and query parameter and truncate if there + // are fields that could be tokens, secrets, names or logins. + + return accept(); + } catch (e) { + logger.warn(`Unexpected error in sanitizeUrl. Skipping url=${url}`, e); + return drop('Unexpected error'); + } +} /** * Set of heuristics to prevent accidentally leaking sensitive data. @@ -148,22 +251,21 @@ export default class Sanitizer { } maskURL(url) { - logger.info('[STUB] maskURL does nothing'); - return url; + if (sanitizeUrl(url).result === 'safe') { + return url; + } + return null; } /** - * Knowing the country of the sender is useful in Human Web data. - * For example, is allows to build separate search indexes for - * US, French, or German users. - * * As long as there are enough other users, revealing the country - * will not compromise anonymity. If the user base is too low + * will not compromise anonymity. Only if the user base is too low * (e.g., Liechtenstein), we have to be careful. In that case, - * do not reveal the country to mitigate fingerprinting attacks. + * do not reveal the country, otherwise fingerprinting attacks + * could be possible. * - * As the number of users varies between products, we get the - * information from the config. + * As the expected number of users varies between products, + * the information needs to be provided by the config. 
*/ getSafeCountryCode() { const ctry = prefs.get('config_location', null); diff --git a/modules/human-web-lite/sources/search-extractor.es b/modules/human-web-lite/sources/search-extractor.es index 58b3a9b..b8427e1 100644 --- a/modules/human-web-lite/sources/search-extractor.es +++ b/modules/human-web-lite/sources/search-extractor.es @@ -6,12 +6,15 @@ * file, You can obtain one at https://mozilla.org/MPL/2.0/. */ +/* eslint-disable no-continue */ + import parseHtml from './html-parser'; import logger from './logger'; import { truncatedHash } from '../core/helpers/md5'; import random from '../core/crypto/random'; import { anonymousHttpGet } from './http'; import { getTimeAsYYYYMMDD } from '../hpn-lite/timestamps'; +import { lookupBuiltinTransform } from './patterns'; function doublefetchQueryHash(query, type) { // defines a cooldown to avoid performing unnecessary @@ -43,8 +46,57 @@ function chooseExpiration() { return Math.max(tillCooldown, tillNextUtcDay) + randomNoise; } +function runSelector(item, selector, attr) { + const elem = selector ? item.querySelector(selector) : item; + if (elem) { + if (attr === 'textContent') { + return elem.textContent; + } + if (attr === 'href') { + return elem.href; + } + if (elem.hasAttribute(attr)) { + return elem.getAttribute(attr); + } + } + return null; +} + +function runTransforms(value, transformSteps = []) { + if (!Array.isArray(transformSteps)) { + throw new Error('Transform definitions must be array.'); + } + if (value === undefined || value === null) { + return null; + } + let tmpValue = value; + for (const step of transformSteps) { + const [name, ...args] = step; + const transform = lookupBuiltinTransform(name); + tmpValue = transform(tmpValue, ...args); + } + return tmpValue ?? 
null; +} + +function findFirstMatch(rootItem, selectorDef) { + // special case: allows to define multiple rules (first matching rule wins) + if (selectorDef.firstMatch) { + for (const { select, attr, transform = [] } of selectorDef.firstMatch) { + const match = runSelector(rootItem, select, attr) ?? null; + if (match !== null) { + return runTransforms(match, transform); + } + } + return null; + } + + // default case: only one rule + return runSelector(rootItem, selectorDef.select, selectorDef.attr) ?? null; +} + export default class SearchExtractor { - constructor({ config, sanitizer, persistedHashes }) { + constructor({ config, patterns, sanitizer, persistedHashes }) { + this.patterns = patterns; this.sanitizer = sanitizer; this.persistedHashes = persistedHashes; this.channel = config.HW_CHANNEL; @@ -53,7 +105,7 @@ export default class SearchExtractor { } } - async runJob({ type, query, doublefetchUrl }) { + async runJob({ type, query, doublefetchRequest }) { function discard(reason = '') { logger.debug('No messages found for query:', query, 'Reason:', reason); return { @@ -76,17 +128,20 @@ export default class SearchExtractor { let doc; try { - const html = await anonymousHttpGet(doublefetchUrl); + const html = await anonymousHttpGet(doublefetchRequest.url, { + headers: doublefetchRequest.headers, + redirect: doublefetchRequest.redirect, + }); doc = await parseHtml(html); } catch (e) { // unblock the hash to allow retries later // (at this point, the error could be caused by a network error, // so it is still possible that a retry later could work.) 
- logger.info('Failed to fetch query:', doublefetchUrl, e); + logger.info('Failed to fetch query:', doublefetchRequest.url, e); await this.persistedHashes.delete(queryHash).catch(() => {}); throw e; } - const messages = this.extractMessages({ doc, type, query, doublefetchUrl }); + const messages = this.extractMessages({ doc, type, query, doublefetchRequest }); if (messages.length === 0) { return discard('No content found.'); } @@ -95,49 +150,126 @@ export default class SearchExtractor { return { messages }; } - extractMessages({ doc, type, query, doublefetchUrl }) { - // TODO: it should be possible to update patterns without new releases - // (e.g., by porting content-extractor functionality is not option) - - // STUB: hard-coded rules for queries to have something to test with. - if (type !== 'search-go') { - return []; - } - const rso = doc.getElementById('rso'); - if (!rso) { + extractMessages({ doc, type, query, doublefetchRequest }) { + const rules = this.patterns.getRulesSnapshot(); + if (!rules[type]) { return []; } - const results = []; - [].forEach.call(rso.querySelectorAll('div.mnr-c.xpd.O9g5cc.uUPGi'), (x) => { - const url = (x.querySelector('a') || {}).href; - const title = (x.querySelector('a > div > div') || { textContent: '' }).textContent; - const age = (x.querySelector('.BmP5tf .wuQ4Ob') || { textContent: '' }).textContent.split('·')[0]; - const missingKeyword = (x.querySelector('.TXwUJf a.fl') || { textContent: '' }).textContent; - if (url && title) { - results.push({ t: title, u: url, age: age || null, m: missingKeyword || null }); + const found = {}; + + const { input = {}, output = {} } = rules[type]; + for (const [selector, selectorDef] of Object.entries(input)) { + found[selector] = found[selector] || {}; + if (selectorDef.first) { + const item = doc.querySelector(selector); + if (item) { + for (const [key, def] of Object.entries(selectorDef.first)) { + const value = findFirstMatch(item, def); + found[selector][key] = runTransforms(value, 
def.transform); + } + } + } else if (selectorDef.all) { + const rootItems = doc.querySelectorAll(selector); + if (rootItems) { + found[selector] = found[selector] || {}; + for (const [key, def] of Object.entries(selectorDef.all)) { + found[selector][key] = []; + for (const rootItem of rootItems) { + const item = findFirstMatch(rootItem, def); + found[selector][key].push(runTransforms(item, def.transform)); + } + } + } + } else { + throw new Error('Internal error: bad selector (expected "first" or "all")'); } - }); - if (results.length === 0) { - return []; } - // TODO: to simplify delayed sending (and resending after errors), - // we should not immediately fill in the ts, but only before sending - const msg = { - type: 'humanweb', - action: 'hwlite.query', - payload: { - r: { ...results }, - q: query, - qurl: doublefetchUrl, - ctry: this.sanitizer.getSafeCountryCode(), - }, - ver: '2.8', - channel: this.channel, - ts: getTimeAsYYYYMMDD(), - 'anti-duplicates': Math.floor(random() * 10000000), + // meta fields, which are provided instead of being extracted + const context = { + q: query ?? null, + qurl: doublefetchRequest.url ?? null, + ctry: this.sanitizer.getSafeCountryCode(), }; - return [msg]; + const isPresent = x => x !== null && x !== undefined && x !== ''; + + // Now combine the results to build the messages as specified + // in the "output" section of the patterns. + // + // Message payload + // --------------- + // There are three origins of the data: + // 1) a single keys + // (extracted from an input with a "first" section) + // 2) array entries that need to be merged + // (extracted from an input with an "all" section) + // 3) special entries provided in the context + // + // Filtering: + // ---------- + // By default, all keys of a message have to be present (where empty arrays + // and empty strings are considered to absent). The default behaviour can be + // overwritten by setting the "optional" property of a field. 
Also, the merging
+    // of arrays can allow entries with missing values by overwriting the
+    // "requiredKeys" property. If not specified, all keys of the array entry need
+    // to be present; otherwise, the entry will be skipped.
+    const messages = [];
+    nextaction: // eslint-disable-line no-labels, no-restricted-syntax
+    for (const [action, schema] of Object.entries(output)) {
+      const payload = {};
+      for (const { key, source, requiredKeys, optional = false } of schema.fields) {
+        if (source) {
+          if (!input[source]) {
+            throw new Error(`Output rule for action=${action} references invalid input source=${source}`);
+          }
+          if (input[source].first) {
+            // case 1: single extracted value
+            if (!optional && !isPresent(found[source][key])) {
+              continue nextaction; // eslint-disable-line no-labels
+            }
+            payload[key] = found[source][key] ?? null;
+          } else if (input[source].all) {
+            // case 2: merge the fields from an array of previously extracted values
+            const results = [];
+            const innerKeys = Object.keys(input[source].all);
+            for (const innerKey of innerKeys) {
+              found[source][innerKey].forEach((value, idx) => {
+                results[idx] = results[idx] || {};
+                results[idx][innerKey] = value ?? null;
+              });
+            }
+
+            // check if all required data was found
+            // (by default, all keys in the fields need to be present)
+            const required = requiredKeys || innerKeys;
+            const allFieldsPresent = entry => required.every(x => isPresent(entry[x]));
+            const cleanedResults = results.filter(allFieldsPresent);
+            if (cleanedResults.length === 0 && !optional) {
+              continue nextaction; // eslint-disable-line no-labels
+            }
+            payload[key] = { ...cleanedResults };
+          } else {
+            throw new Error(`Output rule for action=${action} does not match input key=${key}`);
+          }
+        } else {
+          // case 3: access special keys from the context
+          if (!optional && !isPresent(context[key])) {
+            continue nextaction; // eslint-disable-line no-labels
+          }
+          payload[key] = context[key] ?? 
null; + } + } + messages.push({ + type: 'humanweb', + action, + payload, + ver: '2.9', + channel: this.channel, + ts: getTimeAsYYYYMMDD(), + 'anti-duplicates': Math.floor(random() * 10000000), + }); + } + return messages; } } diff --git a/modules/human-web-lite/sources/url-analyzer.es b/modules/human-web-lite/sources/url-analyzer.es index 5481a37..1eb40b6 100644 --- a/modules/human-web-lite/sources/url-analyzer.es +++ b/modules/human-web-lite/sources/url-analyzer.es @@ -7,10 +7,11 @@ */ import { parse } from '../core/url'; +import logger from './logger'; export default class UrlAnalyzer { - updatePatterns() { - // TODO: STUB (get patterns from server) + constructor(patterns) { + this.patterns = patterns; } parseSearchLinks(url) { @@ -28,7 +29,13 @@ export default class UrlAnalyzer { if (query) { const query_ = encodeURIComponent(query).replace(/%20/g, '+'); const doublefetchUrl = `https://${parsedUrl.host}/search?q=${query_}`; - return { found: true, type: 'search-go', query, doublefetchUrl }; + const type = 'search-go'; + const doublefetchRequest = this.patterns.createDoublefetchRequest(type, doublefetchUrl); + if (!doublefetchRequest) { + logger.info('Matching rule for', url, 'skipped (no matching server side rules exist)'); + return { found: false }; + } + return { found: true, type, query, doublefetchRequest }; } } diff --git a/modules/human-web-lite/tests/unit/fixtures/search-extractor/android/go/bad-driver-macbook-2022-03-08/page.html.gz b/modules/human-web-lite/tests/unit/fixtures/search-extractor/android/go/bad-driver-macbook-2022-03-08/page.html.gz new file mode 100644 index 0000000..c31f550 Binary files /dev/null and b/modules/human-web-lite/tests/unit/fixtures/search-extractor/android/go/bad-driver-macbook-2022-03-08/page.html.gz differ diff --git a/modules/human-web-lite/tests/unit/fixtures/search-extractor/android/go/bad-driver-macbook-2022-03-08/scenario.json 
b/modules/human-web-lite/tests/unit/fixtures/search-extractor/android/go/bad-driver-macbook-2022-03-08/scenario.json new file mode 100644 index 0000000..5889b81 --- /dev/null +++ b/modules/human-web-lite/tests/unit/fixtures/search-extractor/android/go/bad-driver-macbook-2022-03-08/scenario.json @@ -0,0 +1,74 @@ +{ + "url": "https://www.google.com/search?hl=en&q=how%20to%20delete%20a%20bad%20driver%20macbook", + "type": "search-go", + "query": "how to delete a bad driver macbook", + "ctry": "de", + + "mustContain": [ + { + "type": "humanweb", + "action": "hwlite.query", + "payload": { + "r": { + "0": { + "t": "Here's how to uninstall drivers on your Mac - MacPaw", + "u": "https://macpaw.com/how-to/uninstall-drivers-mac", + "age": "16.04.2021", + "m": null + }, + "1": { + "t": "How to find and uninstall drivers. - Apple Support Communities", + "u": "https://discussions.apple.com/thread/3562907", + "age": null, + "m": "bad" + }, + "2": { + "t": "How do I uninstall different Mac OS drivers that I don't need? 
- Quora", + "u": "https://www.quora.com/How-do-I-uninstall-different-Mac-OS-drivers-that-I-dont-need", + "age": null, + "m": null + }, + "3": { + "t": "How to remove a driver on Mac : r/MacOS - Reddit", + "u": "https://www.reddit.com/r/MacOS/comments/52x7jq/how_to_remove_a_driver_on_mac/", + "age": null, + "m": "bad" + }, + "4": { + "t": "How to uninstall drivers on Mac in just a few clicks - YouTube", + "u": "https://m.youtube.com/watch?v=voC1lH_ux2g", + "age": null, + "m": "bad" + }, + "5": { + "t": "How to Find and Remove Potential Driver Conflicts in macOS", + "u": "https://www.makeuseof.com/how-to-fix-driver-conflicts-macos/", + "age": "20.01.2022", + "m": null + }, + "6": { + "t": "Remove Reckless Driving Completely under Mac OS X", + "u": "https://cleanmacapp.com/remove-reckless-driving-for-mac.html", + "age": null, + "m": null + }, + "7": { + "t": "How to Uninstall Drivers on Mac - Cyclonis", + "u": "https://www.cyclonis.com/how-to-uninstall-drivers-on-mac/", + "age": null, + "m": "bad" + }, + "8": { + "t": "Quick Guide to Completely Uninstall Programs on Mac - Techsviewer", + "u": "https://techsviewer.com/uninstall-programs-on-mac/", + "age": null, + "m": null + } + }, + "q": "how to delete a bad driver macbook", + "qurl": "https://www.google.com/search?hl=en&q=how%20to%20delete%20a%20bad%20driver%20macbook", + "ctry": "de" + } + } + ] +} diff --git a/modules/human-web-lite/tests/unit/fixtures/search-extractor/android/go/baerentatze-2022-03-08/page.html.gz b/modules/human-web-lite/tests/unit/fixtures/search-extractor/android/go/baerentatze-2022-03-08/page.html.gz new file mode 100644 index 0000000..90182b0 Binary files /dev/null and b/modules/human-web-lite/tests/unit/fixtures/search-extractor/android/go/baerentatze-2022-03-08/page.html.gz differ diff --git a/modules/human-web-lite/tests/unit/fixtures/search-extractor/android/go/baerentatze-2022-03-08/scenario.json 
b/modules/human-web-lite/tests/unit/fixtures/search-extractor/android/go/baerentatze-2022-03-08/scenario.json new file mode 100644 index 0000000..d6d65cc --- /dev/null +++ b/modules/human-web-lite/tests/unit/fixtures/search-extractor/android/go/baerentatze-2022-03-08/scenario.json @@ -0,0 +1,80 @@ +{ + "url": "https://www.google.com/search?q=b%C3%A4rentatze", + "type": "search-go", + "query": "bärentatze", + "ctry": "de", + + "mustContain": [ + { + "type": "humanweb", + "action": "hwlite.query", + "payload": { + "r": { + "0": { + "t": "Bärentatzen von Henrietta | Chefkoch", + "u": "https://www.chefkoch.de/rezepte/185611079707271/Baerentatzen.html", + "age": null, + "m": null + }, + "1": { + "t": "Herrlich mürbe Bärentatzen mit Schokolade - Backen macht glücklich", + "u": "https://www.backenmachtgluecklich.de/rezepte/baerentatzen.html", + "age": "08.11.2021", + "m": null + }, + "2": { + "t": "Bärentatze (Gebäck) - Wikipedia", + "u": "https://de.m.wikipedia.org/wiki/B%C3%A4rentatze_(Geb%C3%A4ck)", + "age": null, + "m": null + }, + "3": { + "t": "Bärentatze - Wikipedia", + "u": "https://de.m.wikipedia.org/wiki/B%C3%A4rentatze", + "age": null, + "m": null + }, + "4": { + "t": "Bärentatzen ( Originalrezept ) - Rezept - kochbar.de", + "u": "https://www.kochbar.de/rezept/261825/Baerentatzen-Originalrezept.html", + "age": null, + "m": null + }, + "5": { + "t": "Weiche Bärentatze (Acanthus mollis) - perfekte Stauden & Ratgeber", + "u": "https://www.baumschule-horstmann.de/weiche-baerentatze-698_61561.html", + "age": null, + "m": null + }, + "6": { + "t": "33 Bärentatze-Ideen | tatze, bär, indianische tattoos - Pinterest", + "u": "https://www.pinterest.de/gerd5983/b%C3%A4rentatze/", + "age": null, + "m": null + }, + "7": { + "t": "Rezept Bärentatzen Weihnachtsplätzchen - Küchengötter", + "u": "https://www.kuechengoetter.de/rezepte/baerentatzen-weihnachtsplaetzchen-37399", + "age": null, + "m": null + }, + "8": { + "t": "Bärentatze-Pedale Preisvergleich | Günstig bei idealo 
kaufen", + "u": "https://www.idealo.de/preisvergleich/ProductCategory/13092F1946874.html", + "age": null, + "m": null + }, + "9": { + "t": "Acanthus mollis, Weiche Bärentatze - Stanze Gartencenter in Hannover ...", + "u": "https://www.stanze-gartencenter.de/artikel/568/acanthus-mollis", + "age": null, + "m": null + } + }, + "q": "bärentatze", + "qurl": "https://www.google.com/search?q=b%C3%A4rentatze", + "ctry": "de" + } + } + ] +} diff --git a/modules/human-web-lite/tests/unit/fixtures/search-extractor/android/go/ukraine-2022-03-08/page.html.gz b/modules/human-web-lite/tests/unit/fixtures/search-extractor/android/go/ukraine-2022-03-08/page.html.gz new file mode 100644 index 0000000..e808916 Binary files /dev/null and b/modules/human-web-lite/tests/unit/fixtures/search-extractor/android/go/ukraine-2022-03-08/page.html.gz differ diff --git a/modules/human-web-lite/tests/unit/fixtures/search-extractor/android/go/ukraine-2022-03-08/scenario.json b/modules/human-web-lite/tests/unit/fixtures/search-extractor/android/go/ukraine-2022-03-08/scenario.json new file mode 100644 index 0000000..a4c6730 --- /dev/null +++ b/modules/human-web-lite/tests/unit/fixtures/search-extractor/android/go/ukraine-2022-03-08/scenario.json @@ -0,0 +1,68 @@ +{ + "url": "https://www.google.com/search?q=ukraine", + "type": "search-go", + "query": "ukraine", + "ctry": "de", + + "mustContain": [ + { + "type": "humanweb", + "action": "hwlite.query", + "payload": { + "r": { + "0": { + "t": "Liveblog zum Ukraine-Krieg: ++ Mehr als zwei Millionen ...", + "u": "https://www.tagesschau.de/newsticker/liveblog-ukraine-dienstag-101.html", + "age": "vor 1 Stunde", + "m": null + }, + "1": { + "t": "Ukraine bestätigt geöffnete Fluchtkorridore | tagesschau.de", + "u": "https://www.tagesschau.de/ausland/europa/russland-ukraine-krieg-verhandlungen-107.html", + "age": "vor 1 Stunde", + "m": null + }, + "2": { + "t": "Ukraine - Wikipedia", + "u": "https://de.m.wikipedia.org/wiki/Ukraine", + "age": "vor 4 Stunden", + 
"m": null + }, + "3": { + "t": "++ Ukraine-Krieg: Russlands Invasion erreicht nächste ...", + "u": "https://www.fr.de/politik/ukraine-krieg-konflikt-russland-kiew-putin-selenskyj-drohung-mariupol-news-ticker-zr-91391160.html", + "age": "vor 4 Stunden", + "m": null + }, + "4": { + "t": "Ukraine - DER SPIEGEL", + "u": "https://www.spiegel.de/thema/ukraine/", + "age": "vor 8 Stunden", + "m": null + }, + "5": { + "t": "Krieg in der Ukraine: News zum Thema | ZEIT ONLINE", + "u": "https://www.zeit.de/thema/krieg-in-ukraine", + "age": "vor 4 Stunden", + "m": null + }, + "6": { + "t": "Russland greift Ukraine an - Aktuelle News im Liveblog - ZDF", + "u": "https://www.zdf.de/nachrichten/politik/ukraine-russland-konflikt-blog-100.html", + "age": "vor 3 Stunden", + "m": null + }, + "7": { + "t": "Ukraine: Reisewarnung/Ausreiseaufforderung", + "u": "https://www.auswaertiges-amt.de/de/aussenpolitik/laender/ukraine-node/ukrainesicherheit/201946", + "age": "vor 4 Tagen", + "m": null + } + }, + "q": "ukraine", + "qurl": "https://www.google.com/search?q=ukraine", + "ctry": "de" + } + } + ] +} diff --git a/modules/human-web-lite/tests/unit/fixtures/search-extractor/go/bad-driver-macbook-2021-10-13/page.html.gz b/modules/human-web-lite/tests/unit/fixtures/search-extractor/ios/go/bad-driver-macbook-2021-10-13/page.html.gz similarity index 100% rename from modules/human-web-lite/tests/unit/fixtures/search-extractor/go/bad-driver-macbook-2021-10-13/page.html.gz rename to modules/human-web-lite/tests/unit/fixtures/search-extractor/ios/go/bad-driver-macbook-2021-10-13/page.html.gz diff --git a/modules/human-web-lite/tests/unit/fixtures/search-extractor/go/bad-driver-macbook-2021-10-13/scenario.json b/modules/human-web-lite/tests/unit/fixtures/search-extractor/ios/go/bad-driver-macbook-2021-10-13/scenario.json similarity index 100% rename from modules/human-web-lite/tests/unit/fixtures/search-extractor/go/bad-driver-macbook-2021-10-13/scenario.json rename to 
modules/human-web-lite/tests/unit/fixtures/search-extractor/ios/go/bad-driver-macbook-2021-10-13/scenario.json diff --git a/modules/human-web-lite/tests/unit/patterns-updater-test.es b/modules/human-web-lite/tests/unit/patterns-updater-test.es new file mode 100644 index 0000000..3b80d92 --- /dev/null +++ b/modules/human-web-lite/tests/unit/patterns-updater-test.es @@ -0,0 +1,350 @@ +/*! + * Copyright (c) 2014-present Cliqz GmbH. All rights reserved. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at https://mozilla.org/MPL/2.0/. + */ + +/* global chai, describeModule */ + +/* eslint-disable no-await-in-loop */ + +const expect = chai.expect; + +const SECOND = 1000; +const MINUTE = 60 * SECOND; +const HOUR = 60 * MINUTE; +const DAY = 24 * HOUR; +const WEEK = 7 * DAY; + +// Some big value which should be large enough to +// make sure all cooldowns have expired. +const SKIP_ALL_COOLDOWNS = WEEK; + +function generateTestRules(msgName) { + return { + [msgName]: { + input: { + 'div.someCssSelector': { + first: { + test: { + attr: 'textContent', + }, + } + }, + output: { + 'some-output-msg': { + fields: [], + }, + }, + }, + }, + }; +} + +// Some patterns that can be used to simulate pattern released. +// The actual content does not matter, as the PatternUpdater is not +// aware of the DSL and is thus agnostic to the content of the rules. +// +// But what is important is that the Pattern class, which holds the +// the currently active rules, will always start with an empty rule set +// when the extension is loaded. 
+const EMPTY_PATTERN = JSON.stringify({}); +const SOME_NON_EMPTY_PATTERN = JSON.stringify(generateTestRules('some-test-message')); +const ANOTHER_NON_EMPTY_PATTERN = JSON.stringify(generateTestRules('another-test-message')); + +function mockFetch(patternsUrl, serverPatterns) { + expect(patternsUrl).to.exist; + expect(serverPatterns).to.exist; + + const expectedFetchArgs = { + url: patternsUrl, + options: { + method: 'GET', + credentials: 'omit', + }, + }; + + const mock = { + stats: { + attemptedRequests: 0, + }, + fetchImpl: async (url, options = {}) => { + mock.stats.attemptedRequests += 1; + expect({ url, options }).to.deep.equal(expectedFetchArgs); + + if (serverPatterns.simulateNetworkError) { + throw new Error('Stub server has been configured to fail with a network error'); + } + + if (serverPatterns.simulateBadResponse) { + return { + ok: false, + statusText: 'Stub server has been configured to fail (this is expected).', + }; + } + + return { + ok: true, + text: async () => serverPatterns.value, + }; + }, + }; + return mock; +} + +function mockStorage(storageKey) { + return { + async get(key) { + expect(key).to.equal(storageKey); + return this._content; + }, + async set(key, obj) { + expect(key).to.equal(storageKey); + this._content = obj; + } + }; +} + +// stub implementation of the "Patterns" class +function mockPatterns() { + return { + _rules: {}, + _updateHistory: [], + + updatePatterns(rules) { + this._rules = rules; + this._updateHistory.push(rules); + }, + + getRulesSnapshot() { + return this._rules; + }, + }; +} + +export default describeModule('human-web-lite/patterns-updater', + () => ({ + 'platform/globals': { + default: {}, + }, + }), + () => { + describe('#PatternsUpdater', function () { + let PatternsUpdater; + let uut; + let config; + let storage; + let fetchMock; + + // These stubs allow the tests to simulate that the server releases + // new rules and allows to check what rules would be active in the client. 
+      let clientPatterns;
+      let serverPatterns;
+
+      function releasePatterns(pattern) {
+        if (typeof pattern === 'string') {
+          serverPatterns.value = pattern;
+        } else {
+          serverPatterns.value = JSON.stringify(pattern);
+        }
+      }
+
+      function networkIsDown() {
+        serverPatterns.simulateNetworkError = true;
+      }
+
+      function networkIsUp() {
+        serverPatterns.simulateNetworkError = false;
+      }
+
+      function serverRespondsWithNon2xx() {
+        serverPatterns.simulateBadResponse = true;
+      }
+
+      function serverRespondsWith2xx() {
+        serverPatterns.simulateBadResponse = false;
+      }
+
+      function expectLoadedPatternsToBe(pattern) {
+        const expectedRules = (typeof pattern === 'string') ? JSON.parse(pattern) : pattern;
+        expect(clientPatterns.getRulesSnapshot()).to.deep.equal(expectedRules);
+      }
+
+      beforeEach(function () {
+        PatternsUpdater = this.module().default;
+        clientPatterns = mockPatterns();
+        serverPatterns = {
+          value: EMPTY_PATTERN,
+        };
+
+        config = {
+          HUMAN_WEB_LITE_PATTERNS: 'https://patterns-location.test',
+        };
+        const storageKey = 'some-storage-key';
+        storage = mockStorage(storageKey);
+        fetchMock = mockFetch(config.HUMAN_WEB_LITE_PATTERNS, serverPatterns);
+        uut = new PatternsUpdater({
+          config,
+          patterns: clientPatterns,
+          storage,
+          storageKey,
+          _fetchImpl: (...args) => fetchMock.fetchImpl(...args),
+        });
+      });
+
+      describe('on a fresh extension installation', function () {
+        it('should update to the latest patterns', async () => {
+          expectLoadedPatternsToBe(EMPTY_PATTERN);
+          releasePatterns(SOME_NON_EMPTY_PATTERN);
+          expectLoadedPatternsToBe(EMPTY_PATTERN);
+
+          await uut.init();
+
+          expectLoadedPatternsToBe(SOME_NON_EMPTY_PATTERN);
+        });
+
+        describe('should handle error cases during startup gracefully', function () {
+          beforeEach(function () {
+            releasePatterns(SOME_NON_EMPTY_PATTERN);
+          });
+
+          it('network is down', async () => {
+            networkIsDown();
+
+            await uut.init();
+
+            expectLoadedPatternsToBe(EMPTY_PATTERN);
+          });
+
+          it('server responds with non-2xx', async 
() => { + serverRespondsWithNon2xx(); + + await uut.init(); + + expectLoadedPatternsToBe(EMPTY_PATTERN); + }); + + it('server responds with something that is not well-formed', async () => { + releasePatterns('some text that is not JSON'); + + await uut.init(); + + expectLoadedPatternsToBe(EMPTY_PATTERN); + }); + + it('server first fails but then responds', async () => { + releasePatterns(SOME_NON_EMPTY_PATTERN); + serverRespondsWithNon2xx(); + + const ts = Date.now(); + await uut.init({ now: ts }); + expectLoadedPatternsToBe(EMPTY_PATTERN); + + serverRespondsWith2xx(); + await uut.update({ now: ts + SKIP_ALL_COOLDOWNS }); + expectLoadedPatternsToBe(SOME_NON_EMPTY_PATTERN); + }); + }); + + describe('should retry if it is unable fetch patterns on startup', function () { + beforeEach(function () { + releasePatterns(SOME_NON_EMPTY_PATTERN); + networkIsDown(); + }); + + it('update patterns if the network is responding again later', async () => { + const ts = Date.now(); + + const outageDuration = 5; + await uut.init({ now: ts }); + for (let i = 1; i <= outageDuration; i += 1) { + await uut.update({ now: ts + i * MINUTE }); + expectLoadedPatternsToBe(EMPTY_PATTERN); + } + + networkIsUp(); + for (let i = outageDuration; i <= 10 * outageDuration; i += 1) { + await uut.update({ now: ts + i * MINUTE }); + } + expectLoadedPatternsToBe(SOME_NON_EMPTY_PATTERN); + }); + }); + + describe('[error handling]', function () { + let ts; + + beforeEach(async () => { + // setup: the client has already loaded some pattern ... + ts = Date.now(); + releasePatterns(SOME_NON_EMPTY_PATTERN); + await uut.init({ now: ts }); + expectLoadedPatternsToBe(SOME_NON_EMPTY_PATTERN); + + // ... 
but then the nextwork is down and updates are failing + releasePatterns(ANOTHER_NON_EMPTY_PATTERN); + ts += SKIP_ALL_COOLDOWNS; + networkIsDown(); + }); + + function assertClientHasNotUpdatedYet() { + expectLoadedPatternsToBe(SOME_NON_EMPTY_PATTERN); + } + + it('cooldown as failures (ignoreErrors: false)', async () => { + for (let i = 0; i < 10; i += 1) { + // detect failure to update + let detected = false; + try { + await uut.update({ now: ts, ignoreErrors: false }); + } catch (e) { + detected = true; + } + expect(detected).to.equal(true); + assertClientHasNotUpdatedYet(); + + // suppress error + await uut.update({ now: ts, ignoreErrors: true }); + assertClientHasNotUpdatedYet(); + + // by default, it should be suppressed + await uut.update({ now: ts }); + assertClientHasNotUpdatedYet(); + } + }); + }); + + describe('[parallel updates]', function () { + it('should handle concurrent updates', async () => { + let ts = Date.now(); + releasePatterns(SOME_NON_EMPTY_PATTERN); + await uut.init({ now: ts }); + expectLoadedPatternsToBe(SOME_NON_EMPTY_PATTERN); + + // new release is out + ts += SKIP_ALL_COOLDOWNS; + releasePatterns(ANOTHER_NON_EMPTY_PATTERN); + + // When doing multiple parallel fetch requests, only one + // HTTP request should be make and all update operations + // have to block until the update is completed. 
+ expect(fetchMock.stats.attemptedRequests).to.equal(1); + const parallelUpdates = []; + for (let i = 0; i < 100; i += 1) { + parallelUpdates.push((async (uut_, fetchMock_) => { + expectLoadedPatternsToBe(SOME_NON_EMPTY_PATTERN); + await uut_.update({ now: ts, ignoreErrors: false }); + expect(fetchMock_.stats.attemptedRequests).to.equal(2); + expectLoadedPatternsToBe(ANOTHER_NON_EMPTY_PATTERN); + })(uut, fetchMock)); + } + await Promise.all(parallelUpdates); + expectLoadedPatternsToBe(ANOTHER_NON_EMPTY_PATTERN); + expect(fetchMock.stats.attemptedRequests).to.equal(2); + }); + }); + }); + }); + }); + diff --git a/modules/human-web-lite/tests/unit/search-extractor-test.es b/modules/human-web-lite/tests/unit/search-extractor-test.es index 3817324..a3a7956 100644 --- a/modules/human-web-lite/tests/unit/search-extractor-test.es +++ b/modules/human-web-lite/tests/unit/search-extractor-test.es @@ -14,7 +14,11 @@ const fs = require('fs'); const zlib = require('zlib'); const FileHound = require('filehound'); const stripJsonComments = require('strip-json-comments'); -const { mockDocumentWith, allSupportedParsers } = require('../../human-web/unit/dom-parsers'); +const { mockDocumentWith /* , allSupportedParsers */ } = require('../../human-web/unit/dom-parsers'); + +// TODO: for completeness support linkedom (allSupportedParsers) +// (not urgent, since linkedom is not used on Mobile) +const allSupportedParsers = ['jsdom']; const EMPTY_HTML_PAGE = ` @@ -56,6 +60,101 @@ function findAllFixtures() { .map(file => path.relative(FIXTURES_BASE_PATH, file)); } + +const ANDROID_PATTERNS = { + 'search-go': { + input: { + '#main div div[data-hveid] div.ZINbbc.xpd.O9g5cc.uUPGi': { + all: { + u: { + select: 'a', + attr: 'href', + transform: [['queryParam', 'q']], + }, + t: { + select: 'a > h3 > div', + attr: 'textContent', + }, + age: { + firstMatch: [{ + select: 'div.kCrYT > div > div.BNeawe.s3v9rd.AP7Wnd > div > div:not(.MSiauf) > div > span.xUrNXd.xUrNXd.UMOHqf + br + 
span.xUrNXd.UMOHqf', + attr: 'textContent', + }, { + select: 'div.kCrYT > div > div.BNeawe.s3v9rd.AP7Wnd > div > div:not(.MSiauf) > div > span.xUrNXd.xUrNXd.UMOHqf', + attr: 'textContent', + }], + }, + m: { + select: 'span.Tmh7uc.UMOHqf', + attr: 'textContent', + }, + } + }, + }, + output: { + 'hwlite.query': { + fields: [ + { + key: 'r', + source: '#main div div[data-hveid] div.ZINbbc.xpd.O9g5cc.uUPGi', + requiredKeys: ['t', 'u'], + }, + { key: 'q' }, + { key: 'qurl' }, + { key: 'ctry' }, + ], + }, + }, + }, +}; + +const IOS_PATTERNS = { + 'search-go': { + input: { + '#rso div.mnr-c.xpd.O9g5cc.uUPGi': { + all: { + u: { + select: 'a', + attr: 'href', + }, + t: { + select: 'a > div > div', + attr: 'textContent', + }, + age: { + select: '.BmP5tf .wuQ4Ob', + attr: 'textContent', + transform: [['trySplit', '·', 0]], + }, + m: { + select: '.TXwUJf a.fl', + attr: 'textContent', + }, + } + }, + }, + output: { + 'hwlite.query': { + fields: [ + { + key: 'r', + source: '#rso div.mnr-c.xpd.O9g5cc.uUPGi', + requiredKeys: ['t', 'u'], + }, + { key: 'q' }, + { key: 'qurl' }, + { key: 'ctry' }, + ], + }, + }, + }, +}; + +const PATTERN_FIXTURES = { + android: ANDROID_PATTERNS, + ios: IOS_PATTERNS, +}; + export default describeModule('human-web-lite/search-extractor', () => ({ 'platform/globals': { @@ -70,12 +169,14 @@ export default describeModule('human-web-lite/search-extractor', allSupportedParsers.forEach((domParserLib) => { describe(`with ${domParserLib}`, function () { let SearchExtractor; + let Patterns; let uut; let mockWindow; let doc; let fixture; let config; let ctry; + let patterns; let sanitizer; let persistedHashes; @@ -91,6 +192,11 @@ export default describeModule('human-web-lite/search-extractor', expect({ ...fixture, html: '' }).to.include.keys('url', 'type', 'query', 'ctry'); ctry = fixture.ctry; setupDocument(fixture.html); + + const target = _path.split('/')[0]; + if (PATTERN_FIXTURES[target]) { + patterns.updatePatterns(PATTERN_FIXTURES[target]); + } } catch 
(e) { throw new Error(`Failed to load test fixture "${_path}": ${e}`, e); } @@ -133,9 +239,12 @@ export default describeModule('human-web-lite/search-extractor', } }; - beforeEach(function () { + beforeEach(async function () { SearchExtractor = this.module().default; + Patterns = (await this.system.import('human-web-lite/patterns')).default; + config = { HW_CHANNEL: 'test-channel' }; + patterns = new Patterns(); ctry = 'test-ctry'; sanitizer = { getSafeCountryCode() { @@ -143,7 +252,7 @@ export default describeModule('human-web-lite/search-extractor', } }; persistedHashes = {}; - uut = new SearchExtractor({ config, sanitizer, persistedHashes }); + uut = new SearchExtractor({ config, patterns, sanitizer, persistedHashes }); }); afterEach(function () { @@ -161,7 +270,9 @@ export default describeModule('human-web-lite/search-extractor', doc, type: 'search-go', query: 'dummy query', - doublefetchUrl: 'https://dummy.test/', + doublefetchRequest: { + url: 'https://dummy.test/', + }, }); expect(messages).to.deep.equal([]); }); @@ -179,7 +290,9 @@ export default describeModule('human-web-lite/search-extractor', doc, type: fixture.type, query: fixture.query, - doublefetchUrl: fixture.url, + doublefetchRequest: { + url: fixture.url, + }, }); // Then diff --git a/modules/human-web-lite/tests/unit/url-analyzer-test.es b/modules/human-web-lite/tests/unit/url-analyzer-test.es index 63a433f..6758a8a 100644 --- a/modules/human-web-lite/tests/unit/url-analyzer-test.es +++ b/modules/human-web-lite/tests/unit/url-analyzer-test.es @@ -11,6 +11,27 @@ const expect = chai.expect; const fc = require('fast-check'); +/** Builds the doublefetchRequest object that the tests expect for a "search-go" url: the url itself, redirect "follow", and a placeholder Cookie header. */ function searchGoDoublefetch(url) { + return { + url, + redirect: 'follow', + headers: { + Cookie: 'SOME_DUMMY_VALUE', + }, + }; +} +/** Returns a stub of the patterns dependency that is passed to UrlAnalyzer: createDoublefetchRequest(msgType, url) answers only for msgType "search-go" and throws for any other type. */ +function fakePatterns() { + return { + createDoublefetchRequest(msgType, url) { + if (msgType === 'search-go') { + return searchGoDoublefetch(url); + } + throw new Error(`Unexpected msgType: ${msgType}`); + } + }; +} + export default
describeModule('human-web-lite/url-analyzer', () => ({ 'platform/globals': { @@ -24,7 +45,8 @@ export default describeModule('human-web-lite/url-analyzer', beforeEach(function () { UrlAnalyzer = this.module().default; - uut = new UrlAnalyzer(); + const patterns = fakePatterns(); + uut = new UrlAnalyzer(patterns); }); it('should detect the query "trump alaska"', function () { @@ -33,7 +55,7 @@ export default describeModule('human-web-lite/url-analyzer', found: true, type: 'search-go', query: 'trump alaska', - doublefetchUrl: 'https://www.google.de/search?q=trump+alaska', + doublefetchRequest: searchGoDoublefetch('https://www.google.de/search?q=trump+alaska'), }); }); @@ -43,7 +65,7 @@ export default describeModule('human-web-lite/url-analyzer', found: true, type: 'search-go', query: 'a+b', - doublefetchUrl: 'https://www.google.com/search?q=a%2Bb', + doublefetchRequest: searchGoDoublefetch('https://www.google.com/search?q=a%2Bb'), }); }); @@ -53,7 +75,7 @@ export default describeModule('human-web-lite/url-analyzer', found: true, type: 'search-go', query: 'c# how to read a file', - doublefetchUrl: 'https://www.google.com/search?q=c%23+how+to+read+a+file', + doublefetchRequest: searchGoDoublefetch('https://www.google.com/search?q=c%23+how+to+read+a+file'), }); }); @@ -63,7 +85,7 @@ export default describeModule('human-web-lite/url-analyzer', found: true, type: 'search-go', query: 'a+b', - doublefetchUrl: 'https://www.google.com/search?q=a%2Bb', + doublefetchRequest: searchGoDoublefetch('https://www.google.com/search?q=a%2Bb'), }); });