Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Better URL Matching #2517

Closed
wants to merge 48 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
9a98219
Nice
Gusted May 14, 2020
46dc274
wwops
Gusted May 14, 2020
dcc9181
and last requirement
Gusted May 14, 2020
dcc1f53
Code-Style
Gusted May 14, 2020
df5c58b
Test cases :)
Gusted May 14, 2020
8e43f70
Custom regex
Gusted May 14, 2020
2aa1cbf
Discord
Gusted Jun 17, 2020
a6551b0
The big update
Gusted Aug 17, 2020
9fa506d
Woops the big update
Gusted Aug 17, 2020
e505ac8
The big update of URL-Patch
Gusted Aug 17, 2020
8a0a646
Explicity state specify subdomains.
Gusted Nov 1, 2020
c86eed1
Merge branch 'master' into URL-Patch
Gusted Nov 1, 2020
cddea76
More test cases
Gusted Nov 1, 2020
59c0928
Merge branch 'master' into URL-Patch
Gusted Dec 3, 2020
df6d2e5
Remove unused file
Gusted Dec 3, 2020
bdf0850
Match base only
Gusted Dec 10, 2020
e6eea7b
Merge branch 'master' into URL-Patch
Gusted Dec 10, 2020
f8416b0
Code-Style
Gusted Dec 10, 2020
5c3b7d8
Ensure IPv4 base
Gusted Dec 10, 2020
86d6f0e
Remove debugger
Gusted Dec 10, 2020
f1e6efc
Merge branch 'master' into URL-Patch
Gusted Dec 16, 2020
18d7c79
Prioritize ? over ://
Gusted Jan 6, 2021
d5fad02
Merge branch 'master' into URL-Patch
Gusted Jan 6, 2021
90a6617
Test to ensure no false-positive IPV6
Gusted Jan 10, 2021
e04dee9
Merge branch 'master' into URL-Patch
Gusted Jan 10, 2021
a8f0633
Merge branch 'master' into URL-Patch
Gusted Jan 20, 2021
19abf15
Merge branch 'master' into URL-Patch
Gusted Mar 7, 2021
9ca6a22
Fix double top-TLD dot
Gusted Mar 22, 2021
40bdf95
Code-Style
Gusted Mar 22, 2021
8573b0b
Merge branch 'master' into URL-Patch
Gusted Mar 22, 2021
4c93197
Merge branch 'master' into URL-Patch
Gusted Apr 30, 2021
a6cd2bb
Add comments
Gusted May 1, 2021
ce84021
Code-style
Gusted May 1, 2021
16b352a
Add more edge case
Gusted Aug 4, 2021
4624c33
Merge branch 'master' into URL-Patch
Gusted Aug 4, 2021
a0c150c
Code-Style
Gusted Aug 4, 2021
ae97780
Drop whitespace change
bershanskiy Sep 23, 2021
6c7fdf8
Do not remove www. from domain
bershanskiy Sep 23, 2021
eec6745
Merge branch 'master' into URL-Patch
bershanskiy Sep 23, 2021
89406f9
Update tests to match new behavior
bershanskiy Sep 23, 2021
5d886d2
Update and extend isURLMatched and isURLInList tests
bershanskiy Sep 23, 2021
e21c26d
Add comment
bershanskiy Sep 24, 2021
c5d0780
Merge branch 'master' into HEAD
bershanskiy Oct 2, 2021
6d6b9d4
Merge branch 'master' into HEAD
bershanskiy Oct 10, 2021
409c7da
Merge branch 'master' into URL-Patch
bershanskiy Oct 22, 2021
1e38a97
Merge branch 'master' into URL-Patch
bershanskiy Oct 25, 2021
60d7171
Merge remote-tracking branch 'upstream/master' into HEAD
bershanskiy Oct 29, 2021
09a4ff8
Add unit test for PDF names with [ and ] in them.
bershanskiy Oct 29, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
19 changes: 0 additions & 19 deletions src/utils/ipv6.ts

This file was deleted.

120 changes: 120 additions & 0 deletions src/utils/matching.ts
@@ -0,0 +1,120 @@
/** A compiled pattern together with whether the pattern was prefixed with `!`. */
interface MatchInterface {
    regexp: RegExp;
    negated: boolean;
}

/** Compiled patterns, keyed by the exact pattern string that was passed to `makeRegexp`. */
const regexpCache = new Map<string, MatchInterface>();

/**
 * Compiles a pattern into a regular expression.
 *
 * Supported forms:
 * - `!pattern` marks the result as negated; the `!` is not part of the expression.
 * - `/source/flags` is compiled as a regular expression literal.
 *   If the closing slash is missing, everything after the opening slash
 *   is used as the source and no flags are applied.
 * - Anything else is matched literally and case-insensitively from the start
 *   of the input, where `*` matches any run of characters. The match must not
 *   be followed by a letter, digit or dot, so `example.com` does not match
 *   `example.community` or `example.com.evil.org`.
 *
 * @param pattern Pattern to compile.
 * @returns The compiled expression and its negation flag.
 * @throws SyntaxError when a `/source/flags` pattern is not a valid regular expression.
 */
function makeRegexp(pattern: string): MatchInterface {
    const cached = regexpCache.get(pattern);
    if (cached) {
        return cached;
    }

    const negated = pattern[0] === '!';
    const body = negated ? pattern.slice(1) : pattern;

    // Check if the pattern is a regular expression literal.
    if (body[0] === '/') {
        const closing = body.lastIndexOf('/');
        // closing === 0 means the only slash is the opening one.
        const source = closing > 0 ? body.slice(1, closing) : body.slice(1);
        const flags = closing > 0 ? body.slice(closing + 1) : '';
        const regexp = new RegExp(source, flags);
        const regObject: MatchInterface = {regexp, negated};
        // Global and sticky expressions carry `lastIndex` between calls,
        // so sharing one instance would make results depend on call history.
        if (!regexp.global && !regexp.sticky) {
            regexpCache.set(pattern, regObject);
        }
        return regObject;
    }

    // Escape every regexp metacharacter, then turn the escaped `*` back into a wildcard.
    const escaped = body
        .replace(/[|\\{}()[\]^$+*?.]/g, '\\$&')
        .replace(/-/g, '\\x2d')
        .replace(/\\\*/g, '[\\s\\S]*');
    // The lookahead makes sure the exact domain is matched and not a longer one.
    const regexp = new RegExp(`^${escaped}(?![A-Za-z0-9.])`, 'i');
    const regObject: MatchInterface = {regexp, negated};
    // Store under the string the caller passed (including any `!`),
    // which is the same string the lookup above uses.
    regexpCache.set(pattern, regObject);
    return regObject;
}

/**
 * Sanitizes a URL or URL pattern so that different inputs
 * with the same meaning produce the same output.
 *
 * The following parts are removed, in this order:
 * a leading `^`, a trailing `$`, the query string (everything from the first `?`),
 * the scheme together with its slashes, and one trailing slash.
 * A leading `!` (negation marker) is kept in front of the result.
 *
 * @param input URL or URL pattern.
 * @returns The sanitized string.
 */
function sanitazeInput(input: string): string {
    // Set the negation marker aside, otherwise scheme removal would swallow it.
    const negation = input[0] === '!' ? '!' : '';
    const sanitized = input
        .slice(negation.length)
        .replace(/^\^/, '')
        .replace(/\$$/, '')
        // The query goes first so that a `//` inside it is never mistaken for a scheme.
        .replace(/\?.*$/, '')
        // Regular URLs have two slashes, while local file URLs have three:
        // file:///C:/path/to/file
        // Only a `//` that comes before any single slash is a scheme separator,
        // a `//` deeper in the path is left alone.
        .replace(/^[^/]*\/{2,3}/, '')
        .replace(/\/$/, '');
    return negation + sanitized;
}

/**
 * Checks whether a URL matches a single pattern.
 *
 * Regular expression patterns (`/source/flags`, optionally prefixed with `!`)
 * are tested against the URL exactly as given. All other patterns are compared
 * after both the URL and the pattern have been sanitized.
 * A pattern prefixed with `!` inverts the result.
 *
 * @param input URL to check.
 * @param pattern Pattern to check against.
 * @returns `false` when either argument is empty, otherwise whether the URL matches.
 * @throws SyntaxError when a regular expression pattern is not valid.
 */
export function isMatch(input: string, pattern: string): boolean {
    if (input === '' || pattern === '') {
        return false;
    }

    // Regular expression patterns must not be sanitized: that would strip
    // their closing slash and cut them at the first `?`.
    const isRegexPattern = pattern[0] === '/' || (pattern[0] === '!' && pattern[1] === '/');
    const compiledRegexp = makeRegexp(isRegexPattern ? pattern : sanitazeInput(pattern));
    const subject = isRegexPattern ? input : sanitazeInput(input);

    // A user supplied `g` or `y` flag makes the expression stateful, always start from 0.
    compiledRegexp.regexp.lastIndex = 0;
    const match = compiledRegexp.regexp.test(subject);

    // If it's negated make sure the result is inverted.
    return compiledRegexp.negated ? !match : match;
}

/**
 * Checks whether a URL is selected by a list of patterns.
 *
 * Patterns are evaluated in order and the last matching pattern decides:
 * a plain pattern includes the URL, a pattern prefixed with `!` excludes it.
 * When every pattern in the list is negated, the URL is selected unless
 * one of them matches it.
 *
 * Regular expression patterns (`/source/flags`, optionally prefixed with `!`)
 * are tested against the URL exactly as given, all other patterns are compared
 * after both the URL and the pattern have been sanitized.
 *
 * @param input URL to check.
 * @param patterns Patterns to check against.
 * @returns `false` when the URL or the list is empty, otherwise whether the URL is selected.
 * @throws SyntaxError when a regular expression pattern is not valid.
 */
export function isInPattern(input: string, patterns: readonly string[]): boolean {
    if (input === '' || patterns.length === 0) {
        return false;
    }

    const sanitizedInput = sanitazeInput(input);
    let included = false;
    let excluded = false;
    let negatives = 0;

    for (const pattern of patterns) {
        // Regular expression patterns must not be sanitized: that would strip
        // their closing slash and cut them at the first `?`.
        const isRegexPattern = pattern[0] === '/' || (pattern[0] === '!' && pattern[1] === '/');
        const {regexp, negated} = makeRegexp(isRegexPattern ? pattern : sanitazeInput(pattern));
        if (negated) {
            negatives++;
        }

        // A user supplied `g` or `y` flag makes the expression stateful, always start from 0.
        regexp.lastIndex = 0;
        if (!regexp.test(isRegexPattern ? input : sanitizedInput)) {
            continue;
        }
        if (negated) {
            excluded = true;
        } else {
            // A later plain pattern overrides an earlier exclusion.
            excluded = false;
            included = true;
        }
    }

    // A list made only of exclusions selects everything it does not exclude.
    if (negatives === patterns.length) {
        return !excluded;
    }
    return included && !excluded;
}
83 changes: 3 additions & 80 deletions src/utils/url.ts
@@ -1,5 +1,5 @@
import type {UserSettings} from '../definitions';
import {isIPV6, compareIPV6} from './ipv6';
import {isInPattern, isMatch} from './matching';
import {isThunderbird} from './platform';

let anchor: HTMLAnchorElement;
Expand Down Expand Up @@ -88,12 +88,7 @@ export function compareURLPatterns(a: string, b: string) {
* @param list List to search into.
*/
export function isURLInList(url: string, list: string[]) {
for (let i = 0; i < list.length; i++) {
if (isURLMatched(url, list[i])) {
return true;
}
}
return false;
return isInPattern(url, list);
}

/**
Expand All @@ -102,79 +97,7 @@ export function isURLInList(url: string, list: string[]) {
* @param urlTemplate URL template ("google.*", "youtube.com" etc).
*/
export function isURLMatched(url: string, urlTemplate: string): boolean {
const isFirstIPV6 = isIPV6(url);
const isSecondIPV6 = isIPV6(urlTemplate);
if (isFirstIPV6 && isSecondIPV6) {
return compareIPV6(url, urlTemplate);
} else if (!isFirstIPV6 && !isSecondIPV6) {
const regex = createUrlRegex(urlTemplate);
return Boolean(url.match(regex));
}
return false;
}

function createUrlRegex(urlTemplate: string): RegExp {
urlTemplate = urlTemplate.trim();
const exactBeginning = (urlTemplate[0] === '^');
const exactEnding = (urlTemplate[urlTemplate.length - 1] === '$');

urlTemplate = (urlTemplate
.replace(/^\^/, '') // Remove ^ at start
.replace(/\$$/, '') // Remove $ at end
.replace(/^.*?\/{2,3}/, '') // Remove scheme
.replace(/\?.*$/, '') // Remove query
.replace(/\/$/, '') // Remove last slash
);

let slashIndex: number;
let beforeSlash: string;
let afterSlash: string;
if ((slashIndex = urlTemplate.indexOf('/')) >= 0) {
beforeSlash = urlTemplate.substring(0, slashIndex); // google.*
afterSlash = urlTemplate.replace(/\$/g, '').substring(slashIndex); // /login/abc
} else {
beforeSlash = urlTemplate.replace(/\$/g, '');
}

//
// SCHEME and SUBDOMAINS

let result = (exactBeginning ?
'^(.*?\\:\\/{2,3})?' // Scheme
: '^(.*?\\:\\/{2,3})?([^\/]*?\\.)?' // Scheme and subdomains
);

//
// HOST and PORT

const hostParts = beforeSlash.split('.');
result += '(';
for (let i = 0; i < hostParts.length; i++) {
if (hostParts[i] === '*') {
hostParts[i] = '[^\\.\\/]+?';
}
}
result += hostParts.join('\\.');
result += ')';

//
// PATH and QUERY

if (afterSlash) {
result += '(';
result += afterSlash.replace('/', '\\/');
result += ')';
}

result += (exactEnding ?
'(\\/?(\\?[^\/]*?)?)$' // All following queries
: '(\\/?.*?)$' // All following paths and queries
);

//
// Result

return new RegExp(result, 'i');
return isMatch(url, urlTemplate);
}

export function isPDF(url: string) {
Expand Down
10 changes: 1 addition & 9 deletions tests/generators/utils/parse.tests.ts
Expand Up @@ -296,9 +296,6 @@ test('The generic fix appears first', () => {
}, {
'url': ['long.sub.example.com'],
'directive':'long'
}, {
'url': ['sub.example.com'],
'directive':'sub'
}]);
});

Expand Down Expand Up @@ -349,7 +346,7 @@ test('Fixes appear only once', () => {
}]);
});

test('Implied wildcards', () => {
test('No implied wildcards', () => {
interface TestFix {
url: string[];
directive: string[];
Expand Down Expand Up @@ -386,10 +383,5 @@ test('Implied wildcards', () => {
{
'url': ['*'],
'directive': 'hello world'
}, {
'url': [
'example.com',
],
'directive': 'one'
}]);
});