From 4663fd4a1bde5e57e8474f35146eb52c2f7ba72e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Berson?= Date: Wed, 24 Apr 2019 11:12:46 +0200 Subject: [PATCH 1/4] clean-up filters parsing benchmark --- bench/micro.js | 21 ++++++--------------- bench/run_benchmark.js | 4 +++- 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/bench/micro.js b/bench/micro.js index ad85db32dd..8e605e500e 100644 --- a/bench/micro.js +++ b/bench/micro.js @@ -36,28 +36,19 @@ function benchStringTokenize({ filters }) { return dummy; } -function benchParsingImpl(lists, { loadNetworkFilters, loadCosmeticFilters }) { - let dummy = 0; - - for (let i = 0; i < lists.length; i += 1) { - dummy = (dummy + adblocker.parseFilters(lists[i], { - loadNetworkFilters, - loadCosmeticFilters, - }).networkFilters.length) >>> 0; - } - - return dummy; +function benchParsingImpl(lists, options) { + return adblocker.parseFilters(lists, options); } -function benchCosmeticsFiltersParsing({ lists }) { - return benchParsingImpl(lists, { +function benchCosmeticsFiltersParsing({ combinedLists }) { + return benchParsingImpl(combinedLists, { loadCosmeticFilters: true, loadNetworkFilters: false, }); } -function benchNetworkFiltersParsing({ lists }) { - return benchParsingImpl(lists, { +function benchNetworkFiltersParsing({ combinedLists }) { + return benchParsingImpl(combinedLists, { loadCosmeticFilters: false, loadNetworkFilters: true, }); diff --git a/bench/run_benchmark.js b/bench/run_benchmark.js index 70c378ecfe..86c56066b8 100644 --- a/bench/run_benchmark.js +++ b/bench/run_benchmark.js @@ -80,7 +80,8 @@ function runMicroBenchmarks(lists, resources) { }, true /* Also serialize engine */); const filters = getFiltersFromLists(lists); - const { networkFilters, cosmeticFilters } = parseFilters(filters.join('\n')); + const combinedLists = filters.join('\n'); + const { networkFilters, cosmeticFilters } = parseFilters(combinedLists); const results = {}; // Arguments shared among benchmarks @@ -92,6 +93,7 @@ function runMicroBenchmarks(lists, resources) { serialized, networkFilters, cosmeticFilters, + combinedLists, }; [ From fdf00d23e06e2e6a06a1e73b49dd492ff0436c2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Berson?= Date: Wed, 24 Apr 2019 11:45:28 +0200 Subject: [PATCH 2/4] speed-up parseFilters and detectFilterType (10-15% gain) --- bench/micro.js | 10 ++-- src/lists.ts | 124 +++++++++++++++++++++++++++++++++---------------- 2 files changed, 88 insertions(+), 46 deletions(-) diff --git a/bench/micro.js b/bench/micro.js index 8e605e500e..c7d936a069 100644 --- a/bench/micro.js +++ b/bench/micro.js @@ -1,6 +1,6 @@ /* eslint-disable no-bitwise */ -const adblocker = require('../'); +const { FiltersEngine, fastHash, tokenize, parseFilters } = require('../'); const { createEngine } = require('./utils'); @@ -17,13 +17,13 @@ function benchEngineSerialization({ engine }) { } function benchEngineDeserialization({ serialized }) { - return adblocker.FiltersEngine.deserialize(serialized); + return FiltersEngine.deserialize(serialized); } function benchStringHashing({ filters }) { let dummy = 0; for (let i = 0; i < filters.length; i += 1) { - dummy = (dummy + adblocker.fastHash(filters[i])) % 1000000000; + dummy = (dummy + fastHash(filters[i])) >>> 0; } return dummy; } @@ -31,13 +31,13 @@ function benchStringHashing({ filters }) { function benchStringTokenize({ filters }) { let dummy = 0; for (let i = 0; i < filters.length; i += 1) { - dummy = (dummy + adblocker.tokenize(filters[i]).length) % 1000000000; + dummy = (dummy + tokenize(filters[i]).length) >>> ; } return dummy; } function benchParsingImpl(lists, options) { - return adblocker.parseFilters(lists, options); + return parseFilters(lists, options); } function benchCosmeticsFiltersParsing({ combinedLists }) { diff --git a/src/lists.ts b/src/lists.ts index b4ab5a1051..930abe791f 100644 --- a/src/lists.ts +++ b/src/lists.ts @@ -3,8 +3,6 @@ import CosmeticFilter from './filters/cosmetic'; import NetworkFilter from './filters/network'; import { fastStartsWith, fastStartsWithFrom } from './utils'; -const SPACE = /\s/; - const enum FilterType { NOT_SUPPORTED, NETWORK, @@ -18,50 +16,89 @@ const enum FilterType { * `NetworkFilter` or `CosmeticFilter`. */ function detectFilterType(line: string): FilterType { + // Ignore empty line + if (line.length === 0 || line.length === 1) { + return FilterType.NOT_SUPPORTED; + } + // Ignore comments + const firstCharCode: number = line.charCodeAt(0); + const secondCharCode: number = line.charCodeAt(1); if ( - line.length === 1 || - line.charAt(0) === '!' || - (line.charAt(0) === '#' && SPACE.test(line.charAt(1))) || - fastStartsWith(line, '[Adblock') + firstCharCode === 33 /* '!' */ || + (firstCharCode === 35 /* '#' */ && secondCharCode <= 32) || + (firstCharCode === 91 /* '[' */ && fastStartsWith(line, '[Adblock')) ) { return FilterType.NOT_SUPPORTED; } - if (fastStartsWith(line, '|') || fastStartsWith(line, '@@|')) { + // Fast heuristics to detect network filters + const lastCharCode: number = line.charCodeAt(line.length - 1); + if ( + firstCharCode === 36 /* '$' */ || + firstCharCode === 38 /* '&' */ || + firstCharCode === 42 /* '*' */ || + firstCharCode === 45 /* '-' */ || + firstCharCode === 46 /* '.' */ || + firstCharCode === 47 /* '/' */ || + firstCharCode === 58 /* ':' */ || + firstCharCode === 61 /* '=' */ || + firstCharCode === 63 /* '?' */ || + firstCharCode === 64 /* '@' */ || + firstCharCode === 95 /* '_' */ || + firstCharCode === 124 /* '|' */ || + lastCharCode === 124 /* '|' */ + ) { return FilterType.NETWORK; } // Ignore Adguard cosmetics - // `$$` - if (line.indexOf('$$') !== -1) { - return FilterType.NOT_SUPPORTED; + // `$$` = HTML filtering rules + const dollarIndex: number = line.indexOf('$'); + if (dollarIndex !== -1 && dollarIndex !== line.length - 1) { + const afterDollarIndex = dollarIndex + 1; + const afterDollarCharCode = line.charCodeAt(afterDollarIndex); + + // Ignore Adguard HTML rewrite rules + if ( + afterDollarCharCode === 36 /* '$' */ || + (afterDollarCharCode === 64 /* '@' */ && + fastStartsWithFrom(line, /* $@$ */ '@$', afterDollarIndex)) + ) { + return FilterType.NOT_SUPPORTED; + } } // Check if filter is cosmetics - const sharpIndex = line.indexOf('#'); - if (sharpIndex !== -1) { + const sharpIndex: number = line.indexOf('#'); + if (sharpIndex !== -1 && sharpIndex !== line.length - 1) { const afterSharpIndex = sharpIndex + 1; + const afterSharpCharCode = line.charCodeAt(afterSharpIndex); - // Ignore Adguard cosmetics - // `#$#` `#@$#` - // `#%#` `#@%#` - // `#?#` if ( - fastStartsWithFrom(line, /* #@$# */ '@$#', afterSharpIndex) || - fastStartsWithFrom(line, /* #@%# */ '@%#', afterSharpIndex) || - fastStartsWithFrom(line, /* #%# */ '%#', afterSharpIndex) || - fastStartsWithFrom(line, /* #$# */ '$#', afterSharpIndex) || - fastStartsWithFrom(line, /* #?# */ '?#', afterSharpIndex) - ) { - return FilterType.NOT_SUPPORTED; - } else if ( - fastStartsWithFrom(line, /* ## */ '#', afterSharpIndex) || - fastStartsWithFrom(line, /* #@# */ '@#', afterSharpIndex) + afterSharpCharCode === 35 /* '#'*/ || + (afterSharpCharCode === 64 /* '@' */ && + fastStartsWithFrom(line, /* #@# */ '@#', afterSharpIndex)) ) { // Parse supported cosmetic filter // `##` `#@#` return FilterType.COSMETIC; + } else if ( + (afterSharpCharCode === 64 /* '@'*/ && + (fastStartsWithFrom(line, /* #@$# */ '@$#', afterSharpIndex) || + fastStartsWithFrom(line, /* #@%# */ '@%#', afterSharpIndex))) || + (afterSharpCharCode === 37 /* '%' */ && + fastStartsWithFrom(line, /* #%# */ '%#', afterSharpIndex)) || + (afterSharpCharCode === 36 /* '$' */ && + fastStartsWithFrom(line, /* #$# */ '$#', afterSharpIndex)) || + (afterSharpCharCode === 63 /* '?' */ && + fastStartsWithFrom(line, /* #?# */ '?#', afterSharpIndex)) + ) { + // Ignore Adguard cosmetics + // `#$#` `#@$#` + // `#%#` `#@%#` + // `#?#` + return FilterType.NOT_SUPPORTED; } } @@ -95,26 +132,31 @@ export function parseFilters( const networkFilters: NetworkFilter[] = []; const cosmeticFilters: CosmeticFilter[] = []; - const lines = list.split('\n'); for (let i = 0; i < lines.length; i += 1) { - const line = lines[i].trim(); + let line = lines[i]; - if (line.length > 0) { - const filterType = detectFilterType(line); + // Check if `line` should be trimmed before parsing + const isTrimmingNeeded = + line.length > 1 && (line.charCodeAt(0) <= 32 || line.charCodeAt(line.length - 1) <= 32); + if (isTrimmingNeeded) { + line = line.trim(); + } - if (filterType === FilterType.NETWORK && config.loadNetworkFilters) { - const filter = NetworkFilter.parse(line, config.debug); - if (filter !== null) { - networkFilters.push(filter); - } - } else if (filterType === FilterType.COSMETIC && config.loadCosmeticFilters) { - const filter = CosmeticFilter.parse(line, config.debug); - if (filter !== null) { - if (config.loadGenericCosmeticsFilters === true || filter.isGenericHide() === false) { - cosmeticFilters.push(filter); - } + // Detect if filter is supported, network or cosmetic + const filterType = detectFilterType(line); + + if (filterType === FilterType.NETWORK && config.loadNetworkFilters === true) { + const filter = NetworkFilter.parse(line, config.debug); + if (filter !== null) { + networkFilters.push(filter); + } + } else if (filterType === FilterType.COSMETIC && config.loadCosmeticFilters === true) { + const filter = CosmeticFilter.parse(line, config.debug); + if (filter !== null) { + if (config.loadGenericCosmeticsFilters === true || filter.isGenericHide() === false) { + cosmeticFilters.push(filter); } } } From 67181cd82537ab58305b0c7552f46830c3c26d79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Berson?= Date: Wed, 24 Apr 2019 11:59:59 +0200 Subject: [PATCH 3/4] speed-up CosmeticFilter.parse (15% gain) --- bench/micro.js | 2 +- src/filters/cosmetic.ts | 78 +++++++++++++++++++++++------------------ 2 files changed, 44 insertions(+), 36 deletions(-) diff --git a/bench/micro.js b/bench/micro.js index c7d936a069..f688c38f9b 100644 --- a/bench/micro.js +++ b/bench/micro.js @@ -31,7 +31,7 @@ function benchStringHashing({ filters }) { function benchStringTokenize({ filters }) { let dummy = 0; for (let i = 0; i < filters.length; i += 1) { - dummy = (dummy + tokenize(filters[i]).length) >>> ; + dummy = (dummy + tokenize(filters[i]).length) >>> 0; } return dummy; } diff --git a/src/filters/cosmetic.ts b/src/filters/cosmetic.ts index ca1ccae395..f72cd4d919 100644 --- a/src/filters/cosmetic.ts +++ b/src/filters/cosmetic.ts @@ -179,10 +179,9 @@ export default class CosmeticFilter implements IFilter { * used to parse tens of thousands of lines. */ public static parse(line: string, debug: boolean = false): CosmeticFilter | null { - // Mask to store attributes - // Each flag (unhide, scriptInject, etc.) takes only 1 bit - // at a specific offset defined in COSMETICS_MASK. - // cf: COSMETICS_MASK for the offset of each property + // Mask to store attributes. Each flag (unhide, scriptInject, etc.) takes + // only 1 bit at a specific offset defined in COSMETICS_MASK. cf: + // COSMETICS_MASK for the offset of each property let mask = 0; let selector: string | undefined; let hostnames: Uint32Array | undefined; @@ -225,38 +224,40 @@ export default class CosmeticFilter implements IFilter { const hostnamesArray: number[] = []; const notHostnamesArray: number[] = []; - // TODO - this could be done without any string copy - line - .slice(0, sharpIndex) - .split(',') - .forEach((hostname) => { - if (hasUnicode(hostname)) { - hostname = toASCII(hostname); - mask = setBit(mask, COSMETICS_MASK.isUnicode); - } + const parts = line.slice(0, sharpIndex).split(','); + for (let i = 0; i < parts.length; i += 1) { + let hostname = parts[i]; + if (hasUnicode(hostname)) { + hostname = toASCII(hostname); + mask = setBit(mask, COSMETICS_MASK.isUnicode); + } - const negation: boolean = hostname[0] === '~'; - const entity: boolean = hostname.endsWith('.*'); + const negation: boolean = hostname.charCodeAt(0) === 126 /* '~' */; + const entity: boolean = + hostname.charCodeAt(hostname.length - 1) === 42 /* '*' */ && + hostname.charCodeAt(hostname.length - 2) === 46 /* '.' */; - const start: number = negation ? 1 : 0; - const end: number = entity ? hostname.length - 2 : hostname.length; + const start: number = negation ? 1 : 0; + const end: number = entity ? hostname.length - 2 : hostname.length; - const hash = hashHostnameBackward(hostname.slice(start, end)); + const hash = hashHostnameBackward( + negation === true || entity === true ? hostname.slice(start, end) : hostname, + ); - if (negation) { - if (entity) { - notEntitiesArray.push(hash); - } else { - notHostnamesArray.push(hash); - } + if (negation) { + if (entity) { + notEntitiesArray.push(hash); } else { - if (entity) { - entitiesArray.push(hash); - } else { - hostnamesArray.push(hash); - } + notHostnamesArray.push(hash); } - }); + } else { + if (entity) { + entitiesArray.push(hash); + } else { + hostnamesArray.push(hash); + } + } + } if (entitiesArray.length !== 0) { entities = new Uint32Array(entitiesArray).sort(); @@ -276,14 +277,16 @@ export default class CosmeticFilter implements IFilter { } // We should not have unhide without any hostname - // NOTE: it does not make sense either to only have a negated domain or - // entity (e.g.: ~domain.com or ~entity.*), these are thus ignored. if (getBit(mask, COSMETICS_MASK.unhide) && hostnames === undefined && entities === undefined) { return null; } // Deal with script:inject and script:contains - if (fastStartsWithFrom(line, 'script:', suffixStartIndex)) { + if ( + line.length - suffixStartIndex > 7 && + line.charCodeAt(suffixStartIndex) === 115 /* 's' */ && + fastStartsWithFrom(line, 'script:', suffixStartIndex) + ) { // script:inject(.......) // ^ ^ // script:contains(/......./) @@ -306,7 +309,11 @@ export default class CosmeticFilter implements IFilter { } selector = line.slice(scriptSelectorIndexStart, scriptSelectorIndexEnd); - } else if (fastStartsWithFrom(line, '+js(', suffixStartIndex)) { + } else if ( + line.length - suffixStartIndex > 4 && + line.charCodeAt(suffixStartIndex) === 43 /* '+' */ && + fastStartsWithFrom(line, '+js(', suffixStartIndex) + ) { mask = setBit(mask, COSMETICS_MASK.scriptInject); selector = line.slice(suffixStartIndex + 4, line.length - 1); } else { @@ -663,7 +670,8 @@ export default class CosmeticFilter implements IFilter { // Note, we do not need to use negated domains or entities as tokens here // since they will by definition not match on their own, unless accompanied - // by a domain or entity. + // by a domain or entity. Instead, they are handled in + // `CosmeticFilterBucket.getCosmeticsFilters`. if (this.hostnames !== undefined) { for (let i = 0; i < this.hostnames.length; i += 1) { From e0ba6ed23895b0d6fe47d9e10474ce2161f2d23f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Berson?= Date: Wed, 24 Apr 2019 15:01:51 +0200 Subject: [PATCH 4/4] speed-up NetworkFilter.parse (10% gain) --- src/filters/network.ts | 165 +++++++++++++++++++++-------------------- src/utils.ts | 4 +- 2 files changed, 87 insertions(+), 82 deletions(-) diff --git a/src/filters/network.ts b/src/filters/network.ts index 7605c67f25..396c911cb1 100644 --- a/src/filters/network.ts +++ b/src/filters/network.ts @@ -13,6 +13,8 @@ import { fastStartsWithFrom, getBit, hasUnicode, + isAlpha, + isDigit, setBit, tokenizeFilterInPlace, tokenizeInPlace, @@ -23,6 +25,12 @@ const TOKENS_BUFFER = new TokensBuffer(200); const HTTP_HASH = fastHash('http'); const HTTPS_HASH = fastHash('https'); +function isAllowedHostname(ch: number): boolean { + return ( + isDigit(ch) || isAlpha(ch) || ch === 95 /* '_' */ || ch === 45 /* '-' */ || ch === 46 /* '.' */ + ); +} + /** * Masks used to store options of network filters in a bitmask. */ @@ -142,8 +150,6 @@ function computeFilterId( return hash >>> 0; } -const SEPARATOR = /[/^*]/; - /** * Compiles a filter pattern to a regex. This is only performed *lazily* for * filters containing at least a * or ^ symbol. Because Regexes are expansive, @@ -208,7 +214,7 @@ export default class NetworkFilter implements IFilter { let filterIndexEnd: number = line.length; // @@filter == Exception - if (fastStartsWith(line, '@@')) { + if (line.charCodeAt(0) === 64 /* '@' */ && line.charCodeAt(1) === 64 /* '@' */) { filterIndexStart += 2; mask = setBit(mask, NETWORK_FILTER_MASK.isException); } @@ -225,29 +231,19 @@ export default class NetworkFilter implements IFilter { // --------------------------------------------------------------------- // // parseOptions - // TODO: This could be implemented without string copy, - // using indices, like in main parsing functions. - const rawOptions = line.slice(optionsIndex + 1); - const options = rawOptions.split(','); + // --------------------------------------------------------------------- // + const options = line.slice(optionsIndex + 1).split(','); for (let i = 0; i < options.length; i += 1) { const rawOption = options[i]; - let negation = false; - let option = rawOption; - - // Check for negation: ~option - if (fastStartsWith(option, '~')) { - negation = true; - option = option.slice(1); - } else { - negation = false; - } + const negation = rawOption.charCodeAt(0) === 126 /* '~' */; + let option = negation === true ? rawOption.slice(1) : rawOption; // Check for options: option=value1|value2 let optionValue: string = ''; - if (option.indexOf('=') !== -1) { - const optionAndValues = option.split('=', 2); - option = optionAndValues[0]; - optionValue = optionAndValues[1]; + const indexOfEqual: number = option.indexOf('='); + if (indexOfEqual !== -1) { + optionValue = option.slice(indexOfEqual + 1); + option = option.slice(0, indexOfEqual); } switch (option) { @@ -259,7 +255,7 @@ export default class NetworkFilter implements IFilter { for (let j = 0; j < optionValues.length; j += 1) { const value: string = optionValues[j]; if (value) { - if (fastStartsWith(value, '~')) { + if (value.charCodeAt(0) === 126 /* '~' */) { optNotDomainsArray.push(fastHash(value.slice(1))); } else { optDomainsArray.push(fastHash(value)); @@ -417,72 +413,77 @@ export default class NetworkFilter implements IFilter { // Identify kind of pattern // Deal with hostname pattern - if (line[filterIndexEnd - 1] === '|') { + if (line.charCodeAt(filterIndexEnd - 1) === 124 /* '|' */) { mask = setBit(mask, NETWORK_FILTER_MASK.isRightAnchor); filterIndexEnd -= 1; } - if (fastStartsWithFrom(line, '||', filterIndexStart)) { - mask = setBit(mask, NETWORK_FILTER_MASK.isHostnameAnchor); - filterIndexStart += 2; - } else if (line[filterIndexStart] === '|') { - mask = setBit(mask, NETWORK_FILTER_MASK.isLeftAnchor); - filterIndexStart += 1; + if (line.charCodeAt(filterIndexStart) === 124 /* '|' */) { + if (line.charCodeAt(filterIndexStart + 1) === 124 /* '|' */) { + mask = setBit(mask, NETWORK_FILTER_MASK.isHostnameAnchor); + filterIndexStart += 2; + } else { + mask = setBit(mask, NETWORK_FILTER_MASK.isLeftAnchor); + filterIndexStart += 1; + } } - const isRegex = checkIsRegex(line, filterIndexStart, filterIndexEnd); - mask = setNetworkMask(mask, NETWORK_FILTER_MASK.isRegex, isRegex); + // const isRegex = checkIsRegex(line, filterIndexStart, filterIndexEnd); + // mask = setNetworkMask(mask, NETWORK_FILTER_MASK.isRegex, isRegex); if (getBit(mask, NETWORK_FILTER_MASK.isHostnameAnchor)) { - if (isRegex) { - // Split at the first '/', '*' or '^' character to get the hostname - // and then the pattern. - // TODO - this could be made more efficient if we could match between two - // indices. Once again, we have to do more work than is really needed. - const firstSeparator = line.search(SEPARATOR); - // NOTE: `firstSeparator` shall never be -1 here since `isRegex` is true. - // This means there must be at least an occurrence of `*` or `^` - // somewhere. + // Split at the first character which is not allowed in a hostname + let firstSeparator = filterIndexStart; + while ( + firstSeparator < filterIndexEnd && + isAllowedHostname(line.charCodeAt(firstSeparator)) === true + ) { + firstSeparator += 1; + } + // No separator found so hostname has full length + if (firstSeparator === filterIndexEnd) { + hostname = line.slice(filterIndexStart, filterIndexEnd); + filterIndexStart = filterIndexEnd; + // mask = setBit(mask, NETWORK_FILTER_MASK.isLeftAnchor); + } else { + // Found a separator hostname = line.slice(filterIndexStart, firstSeparator); filterIndexStart = firstSeparator; - - // If the only symbol remaining for the selector is '^' then ignore it - // but set the filter as right anchored since there should not be any - // other label on the right - if (filterIndexEnd - filterIndexStart === 1 && line[filterIndexStart] === '^') { - mask = clearBit(mask, NETWORK_FILTER_MASK.isRegex); - filterIndexStart = filterIndexEnd; - mask = setNetworkMask(mask, NETWORK_FILTER_MASK.isRightAnchor, true); + const separatorCode = line.charCodeAt(firstSeparator); + + if (separatorCode === 94 /* '^' */) { + // If the only symbol remaining for the selector is '^' then ignore it + // but set the filter as right anchored since there should not be any + // other label on the right + if (filterIndexEnd - filterIndexStart === 1) { + filterIndexStart = filterIndexEnd; + mask = setBit(mask, NETWORK_FILTER_MASK.isRightAnchor); + } else { + mask = setBit(mask, NETWORK_FILTER_MASK.isRegex); + mask = setBit(mask, NETWORK_FILTER_MASK.isLeftAnchor); + } + } else if (separatorCode === 42 /* '*' */) { + mask = setBit(mask, NETWORK_FILTER_MASK.isRegex); } else { - mask = setNetworkMask(mask, NETWORK_FILTER_MASK.isLeftAnchor, true); - mask = setNetworkMask( - mask, - NETWORK_FILTER_MASK.isRegex, - checkIsRegex(line, filterIndexStart, filterIndexEnd), - ); - } - } else { - // Look for next / - const slashIndex = line.indexOf('/', filterIndexStart); - if (slashIndex !== -1) { - hostname = line.slice(filterIndexStart, slashIndex); - filterIndexStart = slashIndex; mask = setBit(mask, NETWORK_FILTER_MASK.isLeftAnchor); - } else { - hostname = line.slice(filterIndexStart, filterIndexEnd); - filterIndexStart = filterIndexEnd; } } } // Remove trailing '*' - if (filterIndexEnd - filterIndexStart > 0 && line[filterIndexEnd - 1] === '*') { + if ( + filterIndexEnd - filterIndexStart > 0 && + line.charCodeAt(filterIndexEnd - 1) === 42 /* '*' */ + ) { filterIndexEnd -= 1; } // Remove leading '*' if the filter is not hostname anchored. - if (filterIndexEnd - filterIndexStart > 0 && line[filterIndexStart] === '*') { + if ( + filterIndexEnd - filterIndexStart > 0 && + line.charCodeAt(filterIndexStart) === 42 /* '*' */ + ) { mask = clearBit(mask, NETWORK_FILTER_MASK.isLeftAnchor); filterIndexStart += 1; } @@ -527,11 +528,13 @@ export default class NetworkFilter implements IFilter { if (filterIndexEnd - filterIndexStart > 0) { filter = line.slice(filterIndexStart, filterIndexEnd).toLowerCase(); mask = setNetworkMask(mask, NETWORK_FILTER_MASK.isUnicode, hasUnicode(filter)); - mask = setNetworkMask( - mask, - NETWORK_FILTER_MASK.isRegex, - checkIsRegex(filter, 0, filter.length), - ); + if (getBit(mask, NETWORK_FILTER_MASK.isRegex) === false) { + mask = setNetworkMask( + mask, + NETWORK_FILTER_MASK.isRegex, + checkIsRegex(filter, 0, filter.length), + ); + } } // TODO @@ -1233,15 +1236,16 @@ function setNetworkMask(mask: number, m: number, value: boolean): number { /** * Check if the sub-string contained between the indices start and end is a - * regex filter (it contains a '*' or '^' char). Here we are limited by the - * capability of javascript to check the presence of a pattern between two - * indices (same for Regex...). - * // TODO - we could use sticky regex here + * regex filter (it contains a '*' or '^' char). */ function checkIsRegex(filter: string, start: number, end: number): boolean { - const starIndex = filter.indexOf('*', start); - const separatorIndex = filter.indexOf('^', start); - return (starIndex !== -1 && starIndex < end) || (separatorIndex !== -1 && separatorIndex < end); + const indexOfSeparator = filter.indexOf('^', start); + if (indexOfSeparator !== -1 && indexOfSeparator < end) { + return true; + } + + const indexOfWildcard = filter.indexOf('*', start); + return indexOfWildcard !== -1 && indexOfWildcard < end; } /** @@ -1428,14 +1432,15 @@ function checkPatternHostnameLeftRightAnchorFilter( // ||pattern + left-anchor => This means that a plain pattern needs to appear // exactly after the hostname, with nothing in between. function checkPatternHostnameLeftAnchorFilter(filter: NetworkFilter, request: Request): boolean { - if (isAnchoredByHostname(filter.getHostname(), request.hostname)) { + const filterHostname = filter.getHostname(); + if (isAnchoredByHostname(filterHostname, request.hostname)) { // Since this is not a regex, the filter pattern must follow the hostname // with nothing in between. So we extract the part of the URL following // after hostname and will perform the matching on it. return fastStartsWithFrom( request.url, filter.getFilter(), - request.url.indexOf(filter.getHostname()) + filter.getHostname().length, + request.url.indexOf(filterHostname) + filterHostname.length, ); } diff --git a/src/utils.ts b/src/utils.ts index 2e895b00cb..ca6fc259d3 100644 --- a/src/utils.ts +++ b/src/utils.ts @@ -73,13 +73,13 @@ export function fastStartsWithFrom(haystack: string, needle: string, start: numb } // Efficient manuel lexer -function isDigit(ch: number): boolean { +export function isDigit(ch: number): boolean { // 48 == '0' // 57 == '9' return ch >= 48 && ch <= 57; } -function isAlpha(ch: number): boolean { +export function isAlpha(ch: number): boolean { // Force to lower-case ch |= 32; // 65 == 'A'