Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parsefilters optimizations #142

Merged
merged 4 commits into from Apr 26, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
29 changes: 10 additions & 19 deletions bench/micro.js
@@ -1,6 +1,6 @@
/* eslint-disable no-bitwise */

const adblocker = require('../');
const { FiltersEngine, fastHash, tokenize, parseFilters } = require('../');
const { createEngine } = require('./utils');


Expand All @@ -17,47 +17,38 @@ function benchEngineSerialization({ engine }) {
}

function benchEngineDeserialization({ serialized }) {
return adblocker.FiltersEngine.deserialize(serialized);
return FiltersEngine.deserialize(serialized);
}

function benchStringHashing({ filters }) {
let dummy = 0;
for (let i = 0; i < filters.length; i += 1) {
dummy = (dummy + adblocker.fastHash(filters[i])) % 1000000000;
dummy = (dummy + fastHash(filters[i])) >>> 0;
}
return dummy;
}

function benchStringTokenize({ filters }) {
let dummy = 0;
for (let i = 0; i < filters.length; i += 1) {
dummy = (dummy + adblocker.tokenize(filters[i]).length) % 1000000000;
dummy = (dummy + tokenize(filters[i]).length) >>> 0;
}
return dummy;
}

function benchParsingImpl(lists, { loadNetworkFilters, loadCosmeticFilters }) {
let dummy = 0;

for (let i = 0; i < lists.length; i += 1) {
dummy = (dummy + adblocker.parseFilters(lists[i], {
loadNetworkFilters,
loadCosmeticFilters,
}).networkFilters.length) >>> 0;
}

return dummy;
function benchParsingImpl(lists, options) {
return parseFilters(lists, options);
}

function benchCosmeticsFiltersParsing({ lists }) {
return benchParsingImpl(lists, {
function benchCosmeticsFiltersParsing({ combinedLists }) {
return benchParsingImpl(combinedLists, {
loadCosmeticFilters: true,
loadNetworkFilters: false,
});
}

function benchNetworkFiltersParsing({ lists }) {
return benchParsingImpl(lists, {
function benchNetworkFiltersParsing({ combinedLists }) {
return benchParsingImpl(combinedLists, {
loadCosmeticFilters: false,
loadNetworkFilters: true,
});
Expand Down
4 changes: 3 additions & 1 deletion bench/run_benchmark.js
Expand Up @@ -80,7 +80,8 @@ function runMicroBenchmarks(lists, resources) {
}, true /* Also serialize engine */);

const filters = getFiltersFromLists(lists);
const { networkFilters, cosmeticFilters } = parseFilters(filters.join('\n'));
const combinedLists = filters.join('\n');
const { networkFilters, cosmeticFilters } = parseFilters(combinedLists);
const results = {};

// Arguments shared among benchmarks
Expand All @@ -92,6 +93,7 @@ function runMicroBenchmarks(lists, resources) {
serialized,
networkFilters,
cosmeticFilters,
combinedLists,
};

[
Expand Down
78 changes: 43 additions & 35 deletions src/filters/cosmetic.ts
Expand Up @@ -179,10 +179,9 @@ export default class CosmeticFilter implements IFilter {
* used to parse tens of thousands of lines.
*/
public static parse(line: string, debug: boolean = false): CosmeticFilter | null {
// Mask to store attributes
// Each flag (unhide, scriptInject, etc.) takes only 1 bit
// at a specific offset defined in COSMETICS_MASK.
// cf: COSMETICS_MASK for the offset of each property
// Mask to store attributes. Each flag (unhide, scriptInject, etc.) takes
// only 1 bit at a specific offset defined in COSMETICS_MASK. cf:
// COSMETICS_MASK for the offset of each property
let mask = 0;
let selector: string | undefined;
let hostnames: Uint32Array | undefined;
Expand Down Expand Up @@ -225,38 +224,40 @@ export default class CosmeticFilter implements IFilter {
const hostnamesArray: number[] = [];
const notHostnamesArray: number[] = [];

// TODO - this could be done without any string copy
line
.slice(0, sharpIndex)
.split(',')
.forEach((hostname) => {
if (hasUnicode(hostname)) {
hostname = toASCII(hostname);
mask = setBit(mask, COSMETICS_MASK.isUnicode);
}
const parts = line.slice(0, sharpIndex).split(',');
for (let i = 0; i < parts.length; i += 1) {
let hostname = parts[i];
if (hasUnicode(hostname)) {
hostname = toASCII(hostname);
mask = setBit(mask, COSMETICS_MASK.isUnicode);
}

const negation: boolean = hostname[0] === '~';
const entity: boolean = hostname.endsWith('.*');
const negation: boolean = hostname.charCodeAt(0) === 126 /* '~' */;
const entity: boolean =
hostname.charCodeAt(hostname.length - 1) === 42 /* '*' */ &&
hostname.charCodeAt(hostname.length - 2) === 46 /* '.' */;

const start: number = negation ? 1 : 0;
const end: number = entity ? hostname.length - 2 : hostname.length;
const start: number = negation ? 1 : 0;
const end: number = entity ? hostname.length - 2 : hostname.length;

const hash = hashHostnameBackward(hostname.slice(start, end));
const hash = hashHostnameBackward(
negation === true || entity === true ? hostname.slice(start, end) : hostname,
);

if (negation) {
if (entity) {
notEntitiesArray.push(hash);
} else {
notHostnamesArray.push(hash);
}
if (negation) {
if (entity) {
notEntitiesArray.push(hash);
} else {
if (entity) {
entitiesArray.push(hash);
} else {
hostnamesArray.push(hash);
}
notHostnamesArray.push(hash);
}
});
} else {
if (entity) {
entitiesArray.push(hash);
} else {
hostnamesArray.push(hash);
}
}
}

if (entitiesArray.length !== 0) {
entities = new Uint32Array(entitiesArray).sort();
Expand All @@ -276,14 +277,16 @@ export default class CosmeticFilter implements IFilter {
}

// We should not have unhide without any hostname
// NOTE: it does not make sense either to only have a negated domain or
// entity (e.g.: ~domain.com or ~entity.*), these are thus ignored.
if (getBit(mask, COSMETICS_MASK.unhide) && hostnames === undefined && entities === undefined) {
return null;
}

// Deal with script:inject and script:contains
if (fastStartsWithFrom(line, 'script:', suffixStartIndex)) {
if (
line.length - suffixStartIndex > 7 &&
line.charCodeAt(suffixStartIndex) === 115 /* 's' */ &&
fastStartsWithFrom(line, 'script:', suffixStartIndex)
) {
// script:inject(.......)
// ^ ^
// script:contains(/......./)
Expand All @@ -306,7 +309,11 @@ export default class CosmeticFilter implements IFilter {
}

selector = line.slice(scriptSelectorIndexStart, scriptSelectorIndexEnd);
} else if (fastStartsWithFrom(line, '+js(', suffixStartIndex)) {
} else if (
line.length - suffixStartIndex > 4 &&
line.charCodeAt(suffixStartIndex) === 43 /* '+' */ &&
fastStartsWithFrom(line, '+js(', suffixStartIndex)
) {
mask = setBit(mask, COSMETICS_MASK.scriptInject);
selector = line.slice(suffixStartIndex + 4, line.length - 1);
} else {
Expand Down Expand Up @@ -663,7 +670,8 @@ export default class CosmeticFilter implements IFilter {

// Note, we do not need to use negated domains or entities as tokens here
// since they will by definition not match on their own, unless accompanied
// by a domain or entity.
// by a domain or entity. Instead, they are handled in
// `CosmeticFilterBucket.getCosmeticsFilters`.

if (this.hostnames !== undefined) {
for (let i = 0; i < this.hostnames.length; i += 1) {
Expand Down